From 4f7e8e7495e755601dc7acd4280e25ecc059f74b Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 14 Nov 2019 10:08:07 -0700 Subject: [PATCH 01/50] Xpetra: Re-enabled fast TwoMatrixAdd path --- packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp b/packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp index 09f27b3d9a50..cf2d0aa8eced 100644 --- a/packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp +++ b/packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp @@ -110,7 +110,6 @@ namespace Xpetra { if(transposeA) Aprime = transposer_type(Aprime).createTranspose(); //Decide whether the fast code path can be taken. - /* if(A.isFillComplete() && B.isFillComplete()) { RCP C = rcp(new tcrs_matrix_type(Aprime->getRowMap(), 0)); @@ -122,7 +121,6 @@ namespace Xpetra { } else { - */ //Slow case - one or both operands are non-fill complete. //TODO: deprecate this. //Need to compute the explicit transpose before add if transposeA and/or transposeB. @@ -152,7 +150,7 @@ namespace Xpetra { *Bprime, false, beta, C); return rcp(new CrsWrap(rcp_implicit_cast(rcp(new XTCrsType(C))))); - //} + } } #endif } From ef301bcdf5ef6dfdcc0bc18fb4317348efda0b0d Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 20 Nov 2019 16:05:22 -0700 Subject: [PATCH 02/50] TpetraTSQR: Make headers use double quotes instead of angle brackets While I'm at it, fix other weird formatting. 
--- packages/tpetra/tsqr/src/TbbTsqr.hpp | 16 +- .../tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp | 9 +- .../tsqr/src/TbbTsqr_CacheBlockTask.hpp | 17 +- .../tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp | 15 +- .../tpetra/tsqr/src/TbbTsqr_FactorTask.hpp | 13 +- .../tsqr/src/TbbTsqr_FillWithZerosTask.hpp | 87 ++-- .../tpetra/tsqr/src/TbbTsqr_Partitioner.hpp | 36 +- .../tsqr/src/TbbTsqr_RevealRankTask.hpp | 10 +- packages/tpetra/tsqr/src/TbbTsqr_TbbMgs.hpp | 32 +- .../tsqr/src/TbbTsqr_TbbParallelTsqr.hpp | 25 +- .../tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp | 20 +- .../tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp | 44 +- .../tsqr/src/TbbTsqr_UnCacheBlockTask.hpp | 11 +- packages/tpetra/tsqr/src/Tsqr.hpp | 28 +- packages/tpetra/tsqr/src/TsqrAdaptor.hpp | 123 ++--- packages/tpetra/tsqr/src/TsqrCommFactory.hpp | 16 +- packages/tpetra/tsqr/src/TsqrFactory.hpp | 45 +- .../tsqr/src/TsqrFactory_SequentialTsqr.hpp | 10 +- .../tpetra/tsqr/src/TsqrFactory_TbbTsqr.hpp | 5 +- packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp | 20 +- packages/tpetra/tsqr/src/Tsqr_ApplyType.cpp | 31 +- packages/tpetra/tsqr/src/Tsqr_ApplyType.hpp | 5 +- .../tpetra/tsqr/src/Tsqr_CacheBlocker.hpp | 8 +- .../tsqr/src/Tsqr_CacheBlockingStrategy.hpp | 3 - packages/tpetra/tsqr/src/Tsqr_Combine.hpp | 205 ++++---- .../tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp | 12 +- .../tsqr/src/Tsqr_CombineBenchmarker.hpp | 16 +- .../tpetra/tsqr/src/Tsqr_CombineDefault.hpp | 11 +- .../tpetra/tsqr/src/Tsqr_CombineNative.hpp | 8 +- packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 17 +- packages/tpetra/tsqr/src/Tsqr_CombineTest.hpp | 5 +- packages/tpetra/tsqr/src/Tsqr_ConfigDefs.hpp | 4 +- packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp | 14 +- .../tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp | 487 +++++++++--------- packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp | 15 +- .../tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp | 18 +- .../tpetra/tsqr/src/Tsqr_GlobalTimeStats.cpp | 6 +- .../tpetra/tsqr/src/Tsqr_GlobalTimeStats.hpp | 4 +- 
.../tpetra/tsqr/src/Tsqr_GlobalVerify.hpp | 16 +- .../tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp | 14 +- .../tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp | 24 +- packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp | 6 +- packages/tpetra/tsqr/src/Tsqr_MatView.hpp | 35 +- packages/tpetra/tsqr/src/Tsqr_Matrix.hpp | 11 +- .../tpetra/tsqr/src/Tsqr_MessengerBase.hpp | 8 +- packages/tpetra/tsqr/src/Tsqr_Mgs.hpp | 115 ++--- packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp | 430 ++++++++-------- packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp | 26 +- .../tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp | 22 +- packages/tpetra/tsqr/src/Tsqr_ParTest.hpp | 23 +- packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp | 38 +- .../tsqr/src/Tsqr_Random_GlobalMatrix.hpp | 9 +- .../tsqr/src/Tsqr_Random_MatrixGenerator.hpp | 8 +- .../tsqr/src/Tsqr_Random_NormalGenerator.hpp | 4 +- packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp | 26 +- packages/tpetra/tsqr/src/Tsqr_SeqTest.hpp | 7 +- .../tsqr/src/Tsqr_SequentialCholeskyQR.hpp | 16 +- .../tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp | 205 ++++---- .../tpetra/tsqr/src/Tsqr_StatTimeMonitor.cpp | 20 +- .../tpetra/tsqr/src/Tsqr_StatTimeMonitor.hpp | 7 +- packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp | 28 +- packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp | 87 ++-- packages/tpetra/tsqr/src/Tsqr_TestUtils.hpp | 2 - .../tpetra/tsqr/src/Tsqr_TeuchosMessenger.hpp | 227 ++++---- packages/tpetra/tsqr/src/Tsqr_TimeStats.cpp | 4 +- packages/tpetra/tsqr/src/Tsqr_TimeStats.hpp | 2 - .../tpetra/tsqr/src/Tsqr_TrivialMessenger.hpp | 133 +++-- .../tpetra/tsqr/src/Tsqr_TrivialTimer.cpp | 6 +- .../tpetra/tsqr/src/Tsqr_TrivialTimer.hpp | 18 +- packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp | 19 +- packages/tpetra/tsqr/src/Tsqr_Util.hpp | 6 +- .../tpetra/tsqr/src/Tsqr_generateStack.hpp | 12 +- .../tpetra/tsqr/src/Tsqr_nodeTestProblem.hpp | 7 +- .../tsqr/src/Tsqr_printGlobalMatrix.hpp | 13 +- .../tsqr/src/Tsqr_verifyTimerConcept.hpp | 25 +- .../tpetra/tsqr/test/Tsqr_TestCombine.cpp | 4 +- 
.../tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp | 22 +- .../tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp | 14 +- .../tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp | 3 - packages/tpetra/tsqr/test/Tsqr_TestLapack.cpp | 2 - .../tpetra/tsqr/test/Tsqr_TestSeqTsqr.cpp | 4 +- .../tpetra/tsqr/test/Tsqr_TestTbbTsqr.cpp | 4 +- 82 files changed, 1338 insertions(+), 1825 deletions(-) diff --git a/packages/tpetra/tsqr/src/TbbTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr.hpp index 8e379dbca34a..996d76e94eec 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -45,22 +43,18 @@ #ifndef __TSQR_TbbTsqr_hpp #define __TSQR_TbbTsqr_hpp -#include -#include -#include -#include -#include -// #include - +#include "TbbTsqr_TbbParallelTsqr.hpp" +#include "Tsqr_TimeStats.hpp" +#include "Teuchos_ParameterList.hpp" +#include "Teuchos_ParameterListExceptions.hpp" +#include "Teuchos_Time.hpp" #include #include #include // std::pair #include - namespace TSQR { namespace TBB { - /// \class TbbTsqr /// \brief Intranode TSQR, parallelized with Intel TBB /// diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp index e10521e046fd..62609a868499 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -43,11 +41,8 @@ #define __TSQR_TBB_ApplyTask_hpp #include -#include -#include - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// +#include "TbbTsqr_Partitioner.hpp" +#include "Tsqr_SequentialTsqr.hpp" namespace TSQR { namespace TBB { diff --git a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp index 41ba8436d5f1..602e5c275777 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -43,22 +41,17 @@ #define __TSQR_TBB_CacheBlockTask_hpp #include -#include -#include - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// +#include "TbbTsqr_Partitioner.hpp" +#include "Tsqr_SequentialTsqr.hpp" namespace TSQR { namespace TBB { - /// \class CacheBlockTask /// \brief TBB task for recursive TSQR cache blocking phase. /// /// "Cache blocking" here means copying the input matrix, which is /// stored with noncontiguous cache blocks, to the output matrix, /// which is stored with contiguous cache blocks. 
- /// template class CacheBlockTask : public tbb::task { public: @@ -84,11 +77,11 @@ namespace TSQR { using tbb::task; if (P_first_ > P_last_ || A_out_.empty() || A_in_.empty()) - return NULL; + return nullptr; else if (P_first_ == P_last_) { execute_base_case (); - return NULL; + return nullptr; } else { @@ -109,7 +102,7 @@ namespace TSQR { if (out_split.second.empty() || out_split.second.nrows() == 0) { execute_base_case (); - return NULL; + return nullptr; } // "c": continuation task diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp index 4c29646ebd91..377e3c16495f 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -43,18 +41,13 @@ #define __TSQR_TBB_ExplicitQTask_hpp #include -#include -#include - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// +#include "TbbTsqr_Partitioner.hpp" +#include "Tsqr_SequentialTsqr.hpp" namespace TSQR { namespace TBB { - /// \class ExplicitQTask /// \brief TBB task for recursive TSQR "compute explicit Q" phase. - /// template< class LocalOrdinal, class Scalar > class ExplicitQTask : public tbb::task { public: @@ -66,8 +59,6 @@ namespace TSQR { typedef std::pair const_split_t; public: - /// \brief Constructor. 
- /// ExplicitQTask (const size_t P_first__, const size_t P_last__, mat_view_type Q_out, @@ -149,9 +140,7 @@ namespace TSQR { } } }; - } // namespace TBB } // namespace TSQR - #endif // __TSQR_TBB_ExplicitQTask_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp index bf21e571f85f..876cfa876cec 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -43,20 +41,15 @@ #define __TSQR_TBB_FactorTask_hpp #include -#include -#include -#include +#include "TbbTsqr_Partitioner.hpp" +#include "Tsqr_SequentialTsqr.hpp" +#include "Teuchos_Assert.hpp" #include -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - namespace TSQR { namespace TBB { - /// \class FactorTask /// \brief TBB task for recursive TSQR factorization phase. - /// template class FactorTask : public tbb::task { public: diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp index a45538e64848..b8e2e21651cc 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -43,18 +41,13 @@ #define __TSQR_TBB_FillWithZerosTask_hpp #include -#include -#include - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// +#include "TbbTsqr_Partitioner.hpp" +#include "Tsqr_SequentialTsqr.hpp" namespace TSQR { namespace TBB { - /// \class FillWithZerosTask /// \brief TBB task for recursive TSQR "fill with zeros" phase. - /// template class FillWithZerosTask : public tbb::task { public: @@ -78,48 +71,46 @@ namespace TSQR { tbb::task* execute () { - if (P_first_ > P_last_ || C_.empty()) - return NULL; - else if (P_first_ == P_last_) - { + if (P_first_ > P_last_ || C_.empty()) { + return nullptr; + } + else if (P_first_ == P_last_) { + execute_base_case (); + return nullptr; + } + else { + // Recurse on two intervals: [P_first, P_mid] and + // [P_mid+1, P_last]. + const size_t P_mid = (P_first_ + P_last_) / 2; + split_type C_split = + partitioner_.split (C_, P_first_, P_mid, P_last_, + contiguous_cache_blocks_); + // The partitioner may decide that the current block C_ + // has too few rows to be worth splitting. In that case, + // C_split.second (the bottom block) will be empty. We + // can deal with this by treating it as the base case. + if (C_split.second.empty() || C_split.second.nrows() == 0) { execute_base_case (); - return NULL; + return nullptr; } - else - { - // Recurse on two intervals: [P_first, P_mid] and - // [P_mid+1, P_last]. - const size_t P_mid = (P_first_ + P_last_) / 2; - split_type C_split = - partitioner_.split (C_, P_first_, P_mid, P_last_, - contiguous_cache_blocks_); - // The partitioner may decide that the current block C_ - // has too few rows to be worth splitting. In that case, - // C_split.second (the bottom block) will be empty. We - // can deal with this by treating it as the base case. 
- if (C_split.second.empty() || C_split.second.nrows() == 0) - { - execute_base_case (); - return NULL; - } - // "c": continuation task - tbb::empty_task& c = - *new( allocate_continuation() ) tbb::empty_task; - // Recurse on the split - FillWithZerosTask& topTask = *new( c.allocate_child() ) - FillWithZerosTask (P_first_, P_mid, C_split.first, seq_, - contiguous_cache_blocks_); - FillWithZerosTask& botTask = *new( c.allocate_child() ) - FillWithZerosTask (P_mid+1, P_last_, C_split.second, seq_, - contiguous_cache_blocks_); - // Set reference count of parent (in this case, the - // continuation task) to 2 (since 2 children -- no - // additional task since no waiting). - c.set_ref_count (2); - c.spawn (botTask); - return &topTask; // scheduler bypass optimization - } + // "c": continuation task + tbb::empty_task& c = + *new( allocate_continuation() ) tbb::empty_task; + // Recurse on the split + FillWithZerosTask& topTask = *new( c.allocate_child() ) + FillWithZerosTask (P_first_, P_mid, C_split.first, seq_, + contiguous_cache_blocks_); + FillWithZerosTask& botTask = *new( c.allocate_child() ) + FillWithZerosTask (P_mid+1, P_last_, C_split.second, seq_, + contiguous_cache_blocks_); + // Set reference count of parent (in this case, the + // continuation task) to 2 (since 2 children -- no + // additional task since no waiting). + c.set_ref_count (2); + c.spawn (botTask); + return &topTask; // scheduler bypass optimization + } } private: diff --git a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp b/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp index bf86c2e09bc4..a1aa38f7e4a9 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp @@ -34,15 +34,13 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_TBB_Partitioner_hpp #define __TSQR_TBB_Partitioner_hpp -#include +#include "Tsqr_MatView.hpp" #include // size_t #include @@ -50,13 +48,9 @@ #include #include -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - namespace TSQR { namespace TBB { - - template< class Ordinal, class Scalar > + template class Partitioner { private: bool @@ -67,20 +61,18 @@ namespace TSQR { using std::invalid_argument; using std::ostringstream; - if (nrows < ncols) - { - ostringstream os; - os << "Partitioner::should_split: nrows (= " << nrows - << ") < ncols (= " << ncols << ")"; - throw invalid_argument (os.str()); - } - else if (num_partitions == 0) - { - ostringstream os; - os << "Partitioner::should_split: nrows (= " << nrows - << ") < ncols (= " << ncols << ")"; - throw invalid_argument (os.str()); - } + if (nrows < ncols) { + ostringstream os; + os << "Partitioner::should_split: nrows (= " << nrows + << ") < ncols (= " << ncols << ")"; + throw invalid_argument (os.str()); + } + else if (num_partitions == 0) { + ostringstream os; + os << "Partitioner::should_split: nrows (= " << nrows + << ") < ncols (= " << ncols << ")"; + throw invalid_argument (os.str()); + } // FIXME (mfh 11 Jul 2010) Need more overflow checks here. return static_cast(nrows) / num_partitions >= static_cast(ncols); } diff --git a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp index f22ba6b19962..7cc815a330d9 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? 
Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -43,15 +41,11 @@ #define __TSQR_TBB_RevealRankTask_hpp #include -#include -#include - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// +#include "TbbTsqr_Partitioner.hpp" +#include "Tsqr_SequentialTsqr.hpp" namespace TSQR { namespace TBB { - /// \class RevealRankTask /// \brief TBB task for recursive TSQR "rank-revealing" phase. /// diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbMgs.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbMgs.hpp index 618e9e8bf37e..53a473d2e5f7 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbMgs.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbMgs.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -48,27 +46,16 @@ #include #include // std::pair -#include -#include -#include - -#include +#include "Tsqr_MessengerBase.hpp" +#include "Teuchos_ScalarTraits.hpp" +#include "Tsqr_Util.hpp" +#include "Teuchos_RCP.hpp" #include #include #include #include -// #define TBB_MGS_DEBUG 1 -#ifdef TBB_MGS_DEBUG -# include -using std::cerr; -using std::endl; -#endif // TBB_MGS_DEBUG - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - namespace TSQR { namespace TBB { @@ -397,19 +384,8 @@ namespace TSQR { for (LocalOrdinal i = 0; i < j; ++i) { const Scalar* const q = &A_local[i*lda_local]; R[i + j*ldr] = ops.project (nrows_local, q, v); -#ifdef TBB_MGS_DEBUG - if (my_rank == 0) { - cerr << "(i,j) = (" << i << "," << j << "): coeff = " - << R[i + j*ldr] << endl; - } -#endif // TBB_MGS_DEBUG } const magnitude_type denom = ops.norm2 (nrows_local, v); -#ifdef TBB_MGS_DEBUG - if (my_rank == 0) { - cerr << "j = " << j << ": denom = " << denom << endl; - } -#endif // TBB_MGS_DEBUG // FIXME (mfh 29 Apr 2010) // diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp index 0c301a4545b5..7fdca2200dcc 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -44,25 +42,20 @@ #include #include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - +#include "TbbTsqr_FactorTask.hpp" +#include "TbbTsqr_ApplyTask.hpp" +#include "TbbTsqr_ExplicitQTask.hpp" +#include "TbbTsqr_RevealRankTask.hpp" +#include "TbbTsqr_CacheBlockTask.hpp" +#include "TbbTsqr_UnCacheBlockTask.hpp" +#include "TbbTsqr_FillWithZerosTask.hpp" +#include "Tsqr_ApplyType.hpp" +#include "Teuchos_ScalarTraits.hpp" #include #include - namespace TSQR { namespace TBB { - /// \class TbbParallelTsqr /// \brief Parallel implementation of \c TbbTsqr. /// \author Mark Hoemmen diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp index aac1aac9fe6c..a18e0c643509 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp @@ -34,34 +34,26 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_TbbRecursiveTsqr_hpp #define __TSQR_TbbRecursiveTsqr_hpp -#include -#include -#include -#include - +#include "Tsqr_ApplyType.hpp" +#include "Tsqr_CacheBlocker.hpp" +#include "Tsqr_SequentialTsqr.hpp" +#include "TbbTsqr_Partitioner.hpp" #include #include #include // std::pair #include -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - namespace TSQR { namespace TBB { - /// \class TbbRecursiveTsqr /// \brief Non-parallel "functioning stub" implementation of \c TbbTsqr. 
- /// - template< class LocalOrdinal, class Scalar > + template class TbbRecursiveTsqr { public: /// \brief Constructor. @@ -273,6 +265,6 @@ namespace TSQR { } // namespace TBB } // namespace TSQR -#include +#include "TSQR/TBB/TbbRecursiveTsqr_Def.hpp" #endif // __TSQR_TbbRecursiveTsqr_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp index c2bfed252b68..32ed5dbdc9fd 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp @@ -34,26 +34,14 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_TBB_TbbRecursiveTsqr_Def_hpp #define __TSQR_TBB_TbbRecursiveTsqr_Def_hpp -#include -#include - -// #define TBB_DEBUG 1 -#ifdef TBB_DEBUG -# include -using std::cerr; -using std::endl; -#endif // TBB_DEBUG - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// +#include "TbbTsqr_TbbRecursiveTsqr.hpp" +#include "Tsqr_Util.hpp" namespace TSQR { namespace TBB { @@ -73,12 +61,6 @@ namespace TSQR { CacheBlocker< LocalOrdinal, Scalar > blocker (Q_out.nrows(), Q_out.ncols(), seq_.cache_blocking_strategy()); -#ifdef TBB_DEBUG - cerr << "explicit_Q_helper: On P_first = " << P_first - << ", filling Q_out with zeros:" << endl - << "Q_out is " << Q_out.nrows() << " x " << Q_out.ncols() - << " with leading dimension " << Q_out.lda() << endl; -#endif // TBB_DEBUG // Fill my partition with zeros. blocker.fill_with_zeros (Q_out, contiguous_cache_blocks); @@ -154,12 +136,6 @@ namespace TSQR { // If we're completely done, extract the final R factor from // the topmost partition. 
if (depth == 0) { -#ifdef TBB_DEBUG - cerr << "factor_helper: On P_first = " << P_first - << ", extracting R:" << endl - << "A_top is " << A_top.nrows() << " x " << A_top.ncols() - << " with leading dimension " << A_top.lda(); -#endif // TBB_DEBUG seq_.extract_R (A_top.nrows(), A_top.ncols(), A_top.get(), A_top.lda(), R, ldr, contiguous_cache_blocks); } @@ -204,13 +180,6 @@ namespace TSQR { mat_view& C, const bool contiguous_cache_blocks) const { -#ifdef TBB_DEBUG - cerr << "build_partition_array: [" << P_first << ", " << P_last << "]:" << endl - << "Q is " << Q.nrows() << " x " << Q.ncols() << " w/ LDA = " - << Q.lda() << endl << "C is " << C.nrows() << " x " << C.ncols() - << " w/ LDA = " << C.lda() << endl; -#endif // TBB_DEBUG - if (P_first > P_last) return; else if (P_first == P_last) @@ -252,12 +221,6 @@ namespace TSQR { const bool contiguous_cache_blocks) const { typedef std::pair< const_mat_view, mat_view > apply_t; -#ifdef TBB_DEBUG - cerr << "apply_helper: [" << P_first << ", " << P_last << "]:" << endl - << "Q is " << Q.nrows() << " x " << Q.ncols() << " w/ LDA = " - << Q.lda() << endl << "C is " << C.nrows() << " x " << C.ncols() - << " w/ LDA = " << C.lda() << endl; -#endif // TBB_DEBUG if (apply_helper_empty (P_first, P_last, Q, C)) return; @@ -267,9 +230,6 @@ namespace TSQR { seq_.apply ("N", Q.nrows(), Q.ncols(), Q.get(), Q.lda(), seq_outputs[P_first], C.ncols(), C.get(), C.lda(), contiguous_cache_blocks); -#ifdef TBB_DEBUG - cerr << "BOO!!!" << endl; -#endif // TBB_DEBUG } else { diff --git a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp index c323929ae91c..351228c64f22 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -43,22 +41,17 @@ #define __TSQR_TBB_UnCacheBlockTask_hpp #include -#include -#include - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// +#include "TbbTsqr_Partitioner.hpp" +#include "Tsqr_SequentialTsqr.hpp" namespace TSQR { namespace TBB { - /// \class UnCacheBlockTask /// \brief TBB task for recursive TSQR un-(cache blocking) phase. /// /// "Un-(cache blocking)" here means copying the input matrix, /// which is stored with contiguous cache blocks, to the output /// matrix, which is stored with noncontiguous cache blocks. - /// template class UnCacheBlockTask : public tbb::task { public: diff --git a/packages/tpetra/tsqr/src/Tsqr.hpp b/packages/tpetra/tsqr/src/Tsqr.hpp index 0cfc7f1f9f03..f0fa8051b0a8 100644 --- a/packages/tpetra/tsqr/src/Tsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr.hpp @@ -34,31 +34,26 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER /// \file Tsqr.hpp /// \brief Parallel Tall Skinny QR (TSQR) implementation -/// + #ifndef __TSQR_Tsqr_hpp #define __TSQR_Tsqr_hpp -#include -#include -#include -#include -#include -#include - -#include -#include -#include - +#include "Tsqr_ApplyType.hpp" +#include "Tsqr_Matrix.hpp" +#include "Tsqr_MessengerBase.hpp" +#include "Tsqr_DistTsqr.hpp" +#include "Tsqr_SequentialTsqr.hpp" +#include "Tsqr_Util.hpp" +#include "Teuchos_as.hpp" +#include "Teuchos_ScalarTraits.hpp" +#include "Teuchos_SerialDenseMatrix.hpp" namespace TSQR { - /// \class Tsqr /// \brief Parallel Tall Skinny QR (TSQR) factorization /// \author Mark Hoemmen @@ -94,10 +89,9 @@ namespace TSQR { /// are the same (int, in the case of Epetra). For other /// distributed linear algebra libraries, such as Tpetra, the /// local and global ordinal types may be different. - /// template > + class NodeTsqrType = SequentialTsqr> class Tsqr { public: typedef MatView mat_view_type; diff --git a/packages/tpetra/tsqr/src/TsqrAdaptor.hpp b/packages/tpetra/tsqr/src/TsqrAdaptor.hpp index e697216b895c..e8b3f23cf6fe 100644 --- a/packages/tpetra/tsqr/src/TsqrAdaptor.hpp +++ b/packages/tpetra/tsqr/src/TsqrAdaptor.hpp @@ -34,30 +34,25 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER - #ifndef __TSQR_Trilinos_TsqrAdaptor_hpp #define __TSQR_Trilinos_TsqrAdaptor_hpp /// \file TsqrAdaptor.hpp /// \brief Abstract interface between TSQR and multivector type -/// -#include -#include -#include -#include -#include -#include + +#include "Tsqr_ConfigDefs.hpp" +#include "Teuchos_SerialDenseMatrix.hpp" +#include "TsqrTypeAdaptor.hpp" +#include "TsqrCommFactory.hpp" +#include "Tsqr_GlobalVerify.hpp" +#include "Teuchos_ScalarTraits.hpp" #include #include - namespace TSQR { - /// \namespace Trilinos /// \brief Interface between TSQR implementation and "the rest of Trilinos." /// @@ -66,7 +61,6 @@ namespace TSQR { /// interface to other linear algebra libraries, but requires its /// own special TSQR adaptor). namespace Trilinos { - /// \class TsqrAdaptor /// \brief Abstract interface between TSQR and multivector type /// @@ -227,15 +221,15 @@ namespace TSQR { // This is guaranteed to be _correct_ for any Node type, but // won't necessary be efficient. The desired model is that // A_local requires no copying. - Teuchos::ArrayRCP< scalar_type > A_local = fetchNonConstView (A); + Teuchos::ArrayRCP A_local = fetchNonConstView (A); // Reshape R if necessary. This operation zeros out all the // entries of R, which is what we want anyway. 
- if (R.numRows() != ncols || R.numCols() != ncols) - { - if (0 != R.shape (ncols, ncols)) - throw std::runtime_error ("Failed to reshape matrix R"); + if (R.numRows() != ncols || R.numCols() != ncols) { + if (0 != R.shape (ncols, ncols)) { + throw std::runtime_error ("Failed to reshape matrix R"); } + } return pTsqr_->factor (nrowsLocal, ncols, A_local.get(), LDA, R.values(), R.stride(), contiguousCacheBlocks); } @@ -282,16 +276,15 @@ namespace TSQR { local_ordinal_type nrowsLocal_out, ncols_out, LDQ_out; fetchDims (Q_out, nrowsLocal_out, ncols_out, LDQ_out); - if (nrowsLocal_out != nrowsLocal) - { - std::ostringstream os; - os << "TSQR explicit Q: input Q factor\'s node-local part has a di" - "fferent number of rows (" << nrowsLocal << ") than output Q fac" - "tor\'s node-local part (" << nrowsLocal_out << ")."; - throw std::runtime_error (os.str()); - } - ArrayRCP< const scalar_type > pQin = fetchConstView (Q_in); - ArrayRCP< scalar_type > pQout = fetchNonConstView (Q_out); + if (nrowsLocal_out != nrowsLocal) { + std::ostringstream os; + os << "TSQR explicit Q: input Q factor\'s node-local part has a di" + "fferent number of rows (" << nrowsLocal << ") than output Q fac" + "tor\'s node-local part (" << nrowsLocal_out << ")."; + throw std::runtime_error (os.str()); + } + ArrayRCP pQin = fetchConstView (Q_in); + ArrayRCP pQout = fetchNonConstView (Q_out); pTsqr_->explicit_Q (nrowsLocal, ncols_in, pQin.get(), LDQ_in, factorOutput, @@ -369,24 +362,22 @@ namespace TSQR { local_ordinal_type nrowsLocal_out, ncols_out, LDA_out; fetchDims (A_out, nrowsLocal_out, ncols_out, LDA_out); - if (nrowsLocal_out != nrowsLocal) - { - std::ostringstream os; - os << "TSQR cache block: the input matrix\'s node-local part has a" - " different number of rows (" << nrowsLocal << ") than the outpu" - "t matrix\'s node-local part (" << nrowsLocal_out << ")."; - throw std::runtime_error (os.str()); - } - else if (ncols_out != ncols) - { - std::ostringstream os; - os << "TSQR cache block: 
the input matrix\'s node-local part has a" - " different number of columns (" << ncols << ") than the output " - "matrix\'s node-local part (" << ncols_out << ")."; - throw std::runtime_error (os.str()); - } - ArrayRCP< const scalar_type > pA_in = fetchConstView (A_in); - ArrayRCP< scalar_type > pA_out = fetchNonConstView (A_out); + if (nrowsLocal_out != nrowsLocal) { + std::ostringstream os; + os << "TSQR cache block: the input matrix\'s node-local part has a" + " different number of rows (" << nrowsLocal << ") than the outpu" + "t matrix\'s node-local part (" << nrowsLocal_out << ")."; + throw std::runtime_error (os.str()); + } + else if (ncols_out != ncols) { + std::ostringstream os; + os << "TSQR cache block: the input matrix\'s node-local part has a" + " different number of columns (" << ncols << ") than the output " + "matrix\'s node-local part (" << ncols_out << ")."; + throw std::runtime_error (os.str()); + } + ArrayRCP pA_in = fetchConstView (A_in); + ArrayRCP pA_out = fetchNonConstView (A_out); pTsqr_->cache_block (nrowsLocal, ncols, pA_out.get(), pA_in.get(), LDA_in); } @@ -410,24 +401,22 @@ namespace TSQR { local_ordinal_type nrowsLocal_out, ncols_out, LDA_out; fetchDims (A_out, nrowsLocal_out, ncols_out, LDA_out); - if (nrowsLocal_out != nrowsLocal) - { - std::ostringstream os; - os << "TSQR un-cache-block: the input matrix\'s node-local part ha" - "s a different number of rows (" << nrowsLocal << ") than the ou" - "tput matrix\'s node-local part (" << nrowsLocal_out << ")."; - throw std::runtime_error (os.str()); - } - else if (ncols_out != ncols) - { - std::ostringstream os; - os << "TSQR cache block: the input matrix\'s node-local part has a" - " different number of columns (" << ncols << ") than the output " - "matrix\'s node-local part (" << ncols_out << ")."; - throw std::runtime_error (os.str()); - } - ArrayRCP< const scalar_type > pA_in = fetchConstView (A_in); - ArrayRCP< scalar_type > pA_out = fetchNonConstView (A_out); + if (nrowsLocal_out 
!= nrowsLocal) { + std::ostringstream os; + os << "TSQR un-cache-block: the input matrix\'s node-local part ha" + "s a different number of rows (" << nrowsLocal << ") than the ou" + "tput matrix\'s node-local part (" << nrowsLocal_out << ")."; + throw std::runtime_error (os.str()); + } + else if (ncols_out != ncols) { + std::ostringstream os; + os << "TSQR cache block: the input matrix\'s node-local part has a" + " different number of columns (" << ncols << ") than the output " + "matrix\'s node-local part (" << ncols_out << ")."; + throw std::runtime_error (os.str()); + } + ArrayRCP pA_in = fetchConstView (A_in); + ArrayRCP pA_out = fetchNonConstView (A_out); pTsqr_->un_cache_block (nrowsLocal, ncols, pA_out.get(), LDA_out, pA_in.get()); } @@ -464,15 +453,14 @@ namespace TSQR { throw std::runtime_error ("R must have no fewer rows than columns"); // Const views suffice for verification - ArrayRCP< const scalar_type > A_ptr = fetchConstView (A); - ArrayRCP< const scalar_type > Q_ptr = fetchConstView (Q); + ArrayRCP A_ptr = fetchConstView (A); + ArrayRCP Q_ptr = fetchConstView (Q); return global_verify (nrowsLocal_A, ncols_A, A_ptr.get(), LDA, Q_ptr.get(), LDQ, R.values(), R.stride(), pScalarMessenger_.get()); } protected: - /// \brief A "nonconstructor constructor." /// /// This method initializes the adaptor, as a constructor would @@ -567,7 +555,6 @@ namespace TSQR { /// factorization. 
tsqr_ptr pTsqr_; }; - } // namespace Trilinos } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/TsqrCommFactory.hpp b/packages/tpetra/tsqr/src/TsqrCommFactory.hpp index 10dc9b01853f..abf7813845b3 100644 --- a/packages/tpetra/tsqr/src/TsqrCommFactory.hpp +++ b/packages/tpetra/tsqr/src/TsqrCommFactory.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos: Node API and Parallel Node Kernels // Copyright (2008) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -47,8 +45,8 @@ /// /// \warning TSQR users should not include this file directly. -#include -#include +#include "Teuchos_RCP.hpp" +#include "Tsqr_MessengerBase.hpp" namespace TSQR { namespace Trilinos { @@ -83,8 +81,8 @@ namespace TSQR { /// suitable for TSQR. virtual void makeMessengers (const comm_ptr& comm, - scalar_messenger_ptr& scalarMessenger, - ordinal_messenger_ptr& ordinalMessenger) = 0; + scalar_messenger_ptr& scalarMessenger, + ordinal_messenger_ptr& ordinalMessenger) = 0; //! Virtual destructor for memory safety. 
virtual ~CommFactory () {} diff --git a/packages/tpetra/tsqr/src/TsqrFactory.hpp b/packages/tpetra/tsqr/src/TsqrFactory.hpp index bcb8c8450541..ad4be2e7f831 100644 --- a/packages/tpetra/tsqr/src/TsqrFactory.hpp +++ b/packages/tpetra/tsqr/src/TsqrFactory.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos: Node API and Parallel Node Kernels // Copyright (2008) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -47,11 +45,10 @@ /// /// \warning TSQR users should _not_ include this file directly. -#include -#include -#include -#include - +#include "Tsqr_NodeTsqrFactory.hpp" +#include "Teuchos_Comm.hpp" +#include "Tsqr_MessengerBase.hpp" +#include "Tsqr.hpp" namespace TSQR { namespace Trilinos { @@ -125,21 +122,21 @@ namespace TSQR { /// \return The node_tsqr_type instance that implements TSQR. 
Teuchos::RCP makeTsqr (const Teuchos::RCP& plist, - Teuchos::RCP& nodeTsqr, - Teuchos::RCP& distTsqr) + Teuchos::RCP& nodeTsqr, + Teuchos::RCP& distTsqr) { - using Teuchos::RCP; - using Teuchos::rcp; + using Teuchos::RCP; + using Teuchos::rcp; - nodeTsqr = makeNodeTsqr (plist); - distTsqr = makeDistTsqr (plist); - return rcp (new tsqr_type (nodeTsqr, distTsqr)); + nodeTsqr = makeNodeTsqr (plist); + distTsqr = makeDistTsqr (plist); + return rcp (new tsqr_type (nodeTsqr, distTsqr)); } void prepareTsqr - const Teuchos::RCP& messenger, + const Teuchos::RCP& messenger, //! Virtual destructor for memory safety of derived classes. virtual ~TsqrFactory () {}; @@ -147,7 +144,7 @@ namespace TSQR { private: /// \brief Instantiate and return the TSQR's intranode object. /// - /// \param plist [in/out] Same as the epinonymous input of + /// \param plist [in/out] Same as the epinonymous input of /// \c makeTsqr(). /// /// \return The node_tsqr_type object that TSQR will use for the @@ -167,7 +164,7 @@ namespace TSQR { virtual Teuchos::RCP makeNodeTsqr (const Teuchos::RCP& plist) const { - return Teuchos::rcp (new node_tsqr_type (plist)); + return Teuchos::rcp (new node_tsqr_type (plist)); } /// \brief Instantiate and return TSQR's internode object. @@ -175,7 +172,7 @@ namespace TSQR { /// \param messenger [in] Object used by TSQR for communicating /// between MPI processes. /// - /// \param plist [in/out] Same as the epinonymous input of + /// \param plist [in/out] Same as the epinonymous input of /// \c makeTsqr(). /// /// \return The dist_tsqr_type object that TSQR will use for the @@ -188,10 +185,10 @@ namespace TSQR { /// varies for different dist_tsqr_type types. 
virtual Teuchos::RCP makeDistTsqr (const Teuchos::RCP& messenger, - const Teuchos::RCP& plist) const + const Teuchos::RCP& plist) const { - (void) plist; - return Teuchos::rcp (new dist_tsqr_type (messenger)); + (void) plist; + return Teuchos::rcp (new dist_tsqr_type (messenger)); } }; } // namespace Trilinos diff --git a/packages/tpetra/tsqr/src/TsqrFactory_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/TsqrFactory_SequentialTsqr.hpp index aa6cab4b2d8a..258753d83ed6 100644 --- a/packages/tpetra/tsqr/src/TsqrFactory_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/TsqrFactory_SequentialTsqr.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos: Node API and Parallel Node Kernels // Copyright (2008) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -44,13 +42,11 @@ /// \file TsqrFactory_SequentialTsqr.hpp /// \brief Declaration and definition of SequentialTsqrFactory. 
-/// #include "Tsqr_SequentialTsqr.hpp" #include "Tsqr.hpp" #include "Teuchos_ParameterListExceptions.hpp" - namespace TSQR { namespace Trilinos { diff --git a/packages/tpetra/tsqr/src/TsqrFactory_TbbTsqr.hpp b/packages/tpetra/tsqr/src/TsqrFactory_TbbTsqr.hpp index 312d95e734cc..4e5d22e1403c 100644 --- a/packages/tpetra/tsqr/src/TsqrFactory_TbbTsqr.hpp +++ b/packages/tpetra/tsqr/src/TsqrFactory_TbbTsqr.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -46,13 +44,12 @@ /// /// \warning Trilinos users should _not_ include this file directly. -#include +#include "Tsqr_ConfigDefs.hpp" #ifdef HAVE_KOKKOSTSQR_TBB # include "TbbTsqr.hpp" #endif // HAVE_KOKKOSTSQR_TBB - namespace TSQR { namespace Trilinos { diff --git a/packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp b/packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp index e2acd102efe2..5e6dccdbb87a 100644 --- a/packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp +++ b/packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos: Node API and Parallel Node Kernels // Copyright (2008) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -34,26 +34,24 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER /// \file TsqrTypeAdaptor.hpp -/// \brief Traits class mapping between multivector type and TSQR implementation types. +/// \brief Traits class mapping between multivector type and TSQR +/// implementation types. /// /// \warning Trilinos users should not include this file directly. + #ifndef __TSQR_Trilinos_TsqrTypeAdaptor_hpp #define __TSQR_Trilinos_TsqrTypeAdaptor_hpp -#include -#include -#include - +#include "Teuchos_RCP.hpp" +#include "TsqrFactory.hpp" +#include "Tsqr.hpp" namespace TSQR { namespace Trilinos { - /// \class UndefinedComm /// \brief Class used to catch undefined specializations of \c TsqrTypeAdaptor. class UndefinedComm {}; diff --git a/packages/tpetra/tsqr/src/Tsqr_ApplyType.cpp b/packages/tpetra/tsqr/src/Tsqr_ApplyType.cpp index 28b0352aa001..becacf3daa25 100644 --- a/packages/tpetra/tsqr/src/Tsqr_ApplyType.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_ApplyType.cpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos: Node API and Parallel Node Kernels // Copyright (2008) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -34,15 +34,12 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER -#include +#include "Tsqr_ApplyType.hpp" #include - namespace TSQR { ApplyType::ApplyType (const std::string& op) : type_ (decide_apply_type (op)), @@ -64,7 +61,7 @@ namespace TSQR { const ApplyType ApplyType::Transpose = ApplyType ("T"); const ApplyType ApplyType::ConjugateTranspose = ApplyType ("C"); - std::string + std::string ApplyType::enumToLapackString (const ApplyType::ApplyType_ theType) { if (theType == NoTranspose_) @@ -77,26 +74,26 @@ namespace TSQR { throw std::logic_error("Invalid ApplyType: should never get here"); } - bool - ApplyType::decide_transposed (const std::string& op) const + bool + ApplyType::decide_transposed (const std::string& op) const { if (op[0] == 'N' || op[0] == 'n') return false; else { - const char validTransposeLetters[] = "TtCcHh"; - const int numValidTransposeLetters = 6; + const char validTransposeLetters[] = "TtCcHh"; + const int numValidTransposeLetters = 6; - for (int k = 0; k < numValidTransposeLetters; ++k) - if (op[0] == validTransposeLetters[k]) - return true; + for (int k = 0; k < numValidTransposeLetters; ++k) + if (op[0] == validTransposeLetters[k]) + return true; - throw std::invalid_argument ("Invalid \"op\" argument \"" + op + "\""); + throw std::invalid_argument ("Invalid \"op\" argument \"" + op + "\""); } } ApplyType::ApplyType_ - ApplyType::decide_apply_type (const std::string& op) const + ApplyType::decide_apply_type (const std::string& op) const { if (op[0] == 'T' || op[0] == 't') return Transpose_; diff --git a/packages/tpetra/tsqr/src/Tsqr_ApplyType.hpp b/packages/tpetra/tsqr/src/Tsqr_ApplyType.hpp index aa8be847cd38..b2f9102c1708 100644 --- a/packages/tpetra/tsqr/src/Tsqr_ApplyType.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_ApplyType.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -45,10 +43,9 @@ #ifndef __TSQR_TsqrApplyType_hpp #define __TSQR_TsqrApplyType_hpp -#include +#include "Tsqr_ConfigDefs.hpp" #include - namespace TSQR { /// \class ApplyType diff --git a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp index 8440bee3b944..35d216262bc9 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp @@ -34,17 +34,15 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_CacheBlocker_hpp #define __TSQR_CacheBlocker_hpp -#include -#include -#include +#include "Tsqr_CacheBlockingStrategy.hpp" +#include "Tsqr_MatView.hpp" +#include "Tsqr_Util.hpp" #include #include diff --git a/packages/tpetra/tsqr/src/Tsqr_CacheBlockingStrategy.hpp b/packages/tpetra/tsqr/src/Tsqr_CacheBlockingStrategy.hpp index d5b8b2286b41..aa70035044ac 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CacheBlockingStrategy.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CacheBlockingStrategy.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -48,7 +46,6 @@ #include #include // std::pair - namespace TSQR { /// \class CacheBlockingStrategy diff --git a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp index fe982612540d..925d900b1f93 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos: Node API and Parallel Node Kernels // Copyright (2008) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -34,24 +34,20 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER /// \file Tsqr_Combine.hpp /// \brief TSQR's six computational kernels. 
-/// + #ifndef __TSQR_Combine_hpp #define __TSQR_Combine_hpp -#include -#include -#include - +#include "Teuchos_ScalarTraits.hpp" +#include "Tsqr_ApplyType.hpp" +#include "Tsqr_CombineNative.hpp" namespace TSQR { - /// \class Combine /// \brief TSQR's six computational kernels /// \author Mark Hoemmen @@ -61,36 +57,41 @@ namespace TSQR { /// each represent an n x n upper triangular matrix, A represents an /// m x n cache block, and C_1 and C_2 represent cache blocks with /// some number of columns p: - /// - Factor A (factor_first) - /// - Apply Q factor of A to C (apply_first) - /// - Factor [R; A] (factor_inner) - /// - Factor [R_1; R_2] (factor_pair) - /// - Apply Q factor of [R; A] to [C_1; C_2] (apply_inner) - /// - Apply Q factor of [R_1; R_2] to [C_1; C_2] (apply_pair) + /// + ///
    + ///
  • Factor A (factor_first)
  • + ///
  • Apply Q factor of A to C (apply_first)
  • + ///
  • Factor [R; A] (factor_inner)
  • + ///
  • Factor [R_1; R_2] (factor_pair)
  • + ///
  • Apply Q factor of [R; A] to [C_1; C_2] (apply_inner)
  • + ///
  • Apply Q factor of [R_1; R_2] to [C_1; C_2] (apply_pair)
  • + ///
/// /// \tparam Ordinal Type of indices into matrices. /// \tparam Scalar Type of entries of matrices. /// \tparam CombineImpl Type of a particular implementation of /// Combine. Its public interface must contain this class' /// interface. - /// + /// /// All Combine methods are implemented using CombineImpl methods /// with the same name. TSQR includes three implementations of the /// CombineImpl interface: - /// - \c CombineDefault, which uses LAPACK and copies in and out of - /// scratch space that it owns, - /// - \c CombineNative, a C++ in-place (no scratch space) generic - /// implementation), and - /// - \c CombineFortran, a Fortran 9x in-place implementation for - /// LAPACK's four data types S, D, C, Z. /// - /// The default CombineImpl is \c CombineNative, since that should - /// work for any Ordinal and Scalar types for which LAPACK and BLAS are implemented. + ///
    + ///
  • CombineDefault, which uses LAPACK and copies in and out of + /// scratch space that it owns,
  • + ///
  • CombineNative, a C++ in-place (no scratch space) generic + /// implementation), and
  • + ///
  • CombineFortran, a Fortran 9x in-place implementation for + /// LAPACK's four data types (S, D, C, and Z).
  • + ///
/// - template< class Ordinal, - class Scalar, - class CombineImpl = CombineNative::isComplex> > + /// The default CombineImpl is CombineNative, since that should work + /// for any Ordinal and Scalar types for which LAPACK and BLAS are implemented. + template< class Ordinal, + class Scalar, + class CombineImpl = CombineNative::isComplex> > class Combine { public: /// \typedef scalar_type @@ -108,7 +109,7 @@ namespace TSQR { /// Whether or not the QR factorizations computed by methods of /// this class produce an R factor with all nonnegative diagonal - /// entries. + /// entries. static bool QR_produces_R_factor_with_nonnegative_diagonal() { return combine_impl_type::QR_produces_R_factor_with_nonnegative_diagonal(); } @@ -119,7 +120,7 @@ namespace TSQR { /// (with leading dimension lda). Overwrite the upper triangle of /// A with the resulting R factor, and the lower trapezoid of A /// (along with the length ncols tau array) with the implicitly - /// stored Q factor. + /// stored Q factor. /// /// \param nrows [in] Number of rows in A /// \param ncols [in] Number of columns in A @@ -128,16 +129,16 @@ namespace TSQR { /// On output: upper triangle contains the R factor, and lower /// part contains the implicitly stored Q factor. /// \param lda [in] Leading dimension of A - /// \param tau [out] Array of length ncols; on output, the + /// \param tau [out] Array of length ncols; on output, the /// scaling factors for the Householder reflectors /// \param work [out] Workspace array of length ncols void factor_first (const Ordinal nrows, - const Ordinal ncols, - Scalar A[], - const Ordinal lda, - Scalar tau[], - Scalar work[]) const + const Ordinal ncols, + Scalar A[], + const Ordinal lda, + Scalar tau[], + Scalar work[]) const { return impl_.factor_first (nrows, ncols, A, lda, tau, work); } @@ -148,18 +149,18 @@ namespace TSQR { /// implicitly in A and tau, to the matrix C. 
void apply_first (const ApplyType& applyType, - const Ordinal nrows, - const Ordinal ncols_C, - const Ordinal ncols_A, - const Scalar A[], - const Ordinal lda, - const Scalar tau[], - Scalar C[], - const Ordinal ldc, - Scalar work[]) const + const Ordinal nrows, + const Ordinal ncols_C, + const Ordinal ncols_A, + const Scalar A[], + const Ordinal lda, + const Scalar tau[], + Scalar C[], + const Ordinal ldc, + Scalar work[]) const { - return impl_.apply_first (applyType, nrows, ncols_C, ncols_A, - A, lda, tau, C, ldc, work); + return impl_.apply_first (applyType, nrows, ncols_C, ncols_A, + A, lda, tau, C, ldc, work); } /// Apply the result of \c factor_inner(). @@ -180,11 +181,11 @@ namespace TSQR { /// \param m [in] number of rows of A /// \param ncols_C [in] number of columns of [C_top; C_bot] /// \param ncols_Q [in] number of columns of [R; A] - /// \param A [in] m by ncols_Q matrix, in which the Householder + /// \param A [in] m by ncols_Q matrix, in which the Householder /// reflectors representing the Q factor are stored /// \param lda [in] leading dimension of A /// \param tau [in] array of length ncols_Q, storing the scaling - /// factors for the Householder reflectors representing Q + /// factors for the Householder reflectors representing Q /// \param C_top [inout] ncols_Q by ncols_C matrix /// \param ldc_top [in] leading dimension of C_top /// \param C_bot [inout] m by ncols_C matrix @@ -192,21 +193,21 @@ namespace TSQR { /// \param work [out] workspace array of length ncols_C void apply_inner (const ApplyType& apply_type, - const Ordinal m, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const Scalar A[], - const Ordinal lda, - const Scalar tau[], - Scalar C_top[], - const Ordinal ldc_top, - Scalar C_bot[], - const Ordinal ldc_bot, - Scalar work[]) const + const Ordinal m, + const Ordinal ncols_C, + const Ordinal ncols_Q, + const Scalar A[], + const Ordinal lda, + const Scalar tau[], + Scalar C_top[], + const Ordinal ldc_top, + Scalar C_bot[], + 
const Ordinal ldc_bot, + Scalar work[]) const { - impl_.apply_inner (apply_type, m, ncols_C, ncols_Q, - A, lda, tau, - C_top, ldc_top, C_bot, ldc_bot, work); + impl_.apply_inner (apply_type, m, ncols_C, ncols_Q, + A, lda, tau, + C_top, ldc_top, C_bot, ldc_bot, work); } /// \brief Factor [R; A] for square upper triangular R and cache block A. @@ -218,7 +219,7 @@ namespace TSQR { /// belongs to the sequential part (i.e., operating on cache blocks on /// a single processor). Only the first cache block $A_0$ is factored /// as $Q_0 R_0 = A_0$ (see tsqr_factor_first); subsequent cache blocks - /// $A_k$ are factored using this routine, which combines them with + /// $A_k$ are factored using this routine, which combines them with /// $R_{k-1}$. /// /// Here is the matrix to factor: @@ -234,56 +235,56 @@ namespace TSQR { /// A_k]$) entirely in $A_k$ (specifically, in all of $A_k$, not just /// below the diagonal). /// - /// \param m [in] Number of rows in the "bottom" block to factor. - /// The number of rows in the top block doesn't matter, given the + /// \param m [in] Number of rows in the "bottom" block to factor. + /// The number of rows in the top block doesn't matter, given the /// assumptions above, as long as $m_{k-1} \geq n$. /// \param n [in] Number of columns (same in both blocks) /// \param R [inout] "Top" upper triangular n by n block $R_{k-1}$. /// Overwritten with the new R factor $R_k$ of $[R_{k-1}; A_k]$. /// \param ldr [in] Leading dimension of R - /// \param A [inout] "Bottom" dense m by n block $A_k$. Overwritten - /// with the Householder reflectors representing the Q factor of + /// \param A [inout] "Bottom" dense m by n block $A_k$. Overwritten + /// with the Householder reflectors representing the Q factor of /// $[R_{k-1}; A_k]$. - /// \param tau [out] Scaling factors of the Householder reflectors. + /// \param tau [out] Scaling factors of the Householder reflectors. /// Corresponds to the TAU output of LAPACK's _GEQRF. 
- /// \param work [out] Workspace (length >= n; don't need lwork or + /// \param work [out] Workspace (length >= n; don't need lwork or /// workspace query) void factor_inner (const Ordinal m, - const Ordinal n, - Scalar R[], - const Ordinal ldr, - Scalar A[], - const Ordinal lda, - Scalar tau[], - Scalar work[]) const + const Ordinal n, + Scalar R[], + const Ordinal ldr, + Scalar A[], + const Ordinal lda, + Scalar tau[], + Scalar work[]) const { impl_.factor_inner (m, n, R, ldr, A, lda, tau, work); } - /// \brief Factor the pair of square upper triangular matrices [R_top; R_bot]. + /// \brief Factor the pair of square upper triangular matrices [R_top; R_bot]. /// /// Store the resulting R factor in R_top, and the resulting /// Householder reflectors implicitly in R_bot and tau. /// /// \param n [in] Number of rows and columns of each of R_top and R_bot - /// \param R_top [inout] n by n upper triangular matrix + /// \param R_top [inout] n by n upper triangular matrix /// \param ldr_top [in] Leading dimension of R_top - /// \param R_bot [inout] n by n upper triangular matrix + /// \param R_bot [inout] n by n upper triangular matrix /// \param ldr_bot [in] Leading dimension of R_bot /// \param tau [out] Scaling factors for Householder reflectors /// \param work [out] Workspace array (of length >= n) /// void factor_pair (const Ordinal n, - Scalar R_top[], - const Ordinal ldr_top, - Scalar R_bot[], - const Ordinal ldr_bot, - Scalar tau[], - Scalar work[]) const + Scalar R_top[], + const Ordinal ldr_top, + Scalar R_bot[], + const Ordinal ldr_bot, + Scalar tau[], + Scalar work[]) const { - impl_.factor_pair (n, R_top, ldr_top, R_bot, ldr_bot, tau, work); + impl_.factor_pair (n, R_top, ldr_top, R_bot, ldr_bot, tau, work); } /// \brief Apply the result of \c factor_pair(). @@ -298,20 +299,20 @@ namespace TSQR { /// means apply Q^T, and ConjugateTranspose means apply Q^H. 
void apply_pair (const ApplyType& apply_type, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const Scalar R_bot[], - const Ordinal ldr_bot, - const Scalar tau[], - Scalar C_top[], - const Ordinal ldc_top, - Scalar C_bot[], - const Ordinal ldc_bot, - Scalar work[]) const + const Ordinal ncols_C, + const Ordinal ncols_Q, + const Scalar R_bot[], + const Ordinal ldr_bot, + const Scalar tau[], + Scalar C_top[], + const Ordinal ldc_top, + Scalar C_bot[], + const Ordinal ldc_bot, + Scalar work[]) const { - impl_.apply_pair (apply_type, ncols_C, ncols_Q, - R_bot, ldr_bot, tau, - C_top, ldc_top, C_bot, ldc_bot, work); + impl_.apply_pair (apply_type, ncols_C, ncols_Q, + R_bot, ldr_bot, tau, + C_top, ldc_top, C_bot, ldc_bot, work); } private: diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp index 81cbf96ee729..c4575b98b3ac 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp @@ -34,20 +34,18 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Test_CombineBenchmark_hpp #define __TSQR_Test_CombineBenchmark_hpp -#include -#include -#include -#include +#include "Tsqr_ConfigDefs.hpp" +#include "Tsqr_CombineBenchmarker.hpp" +#include "Tsqr_CombineDefault.hpp" +#include "Tsqr_CombineNative.hpp" #ifdef HAVE_KOKKOSTSQR_FORTRAN -# include +# include "Tsqr_CombineFortran.hpp" #endif // HAVE_KOKKOSTSQR_FORTRAN #include diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp index e5d50a523d24..546c6097efd5 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp @@ -34,22 +34,20 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __Tsqr_CombineBenchmarker_hpp #define __Tsqr_CombineBenchmarker_hpp -#include -#include -#include -#include +#include "Tsqr_ConfigDefs.hpp" +#include "Tsqr_Random_NormalGenerator.hpp" +#include "Tsqr_Random_MatrixGenerator.hpp" +#include "Tsqr_verifyTimerConcept.hpp" -#include -#include -#include +#include "Tsqr_ApplyType.hpp" +#include "Tsqr_Matrix.hpp" +#include "Tsqr_Util.hpp" #include #include diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp index b853266704b6..b81b4b39f5fe 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -45,11 +43,10 @@ #ifndef __TSQR_CombineDefault_hpp #define __TSQR_CombineDefault_hpp -#include - -#include -#include -#include +#include "Teuchos_ScalarTraits.hpp" +#include "Tsqr_ApplyType.hpp" +#include "Teuchos_LAPACK.hpp" +#include "Tsqr_Matrix.hpp" #include #include diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp index b567d5b1b40f..4b7e0db3e138 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -45,12 +43,10 @@ #ifndef __TSQR_CombineNative_hpp #define __TSQR_CombineNative_hpp -#include -#include - +#include "Teuchos_LAPACK.hpp" +#include "Teuchos_ScalarTraits.hpp" #include "Tsqr_ApplyType.hpp" #include "Tsqr_CombineDefault.hpp" - #include "Kokkos_Core.hpp" #include "KokkosBlas2_gemv.hpp" #include "Kokkos_ArithTraits.hpp" diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index 1569e448f381..5a3e52afb231 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -34,23 +34,20 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER -#include #include "Tsqr_CombineTest.hpp" -#include -#include +#include "Tsqr_Random_NormalGenerator.hpp" +#include "Tsqr_Random_MatrixGenerator.hpp" -#include -#include -#include -#include +#include "Tsqr_Combine.hpp" +#include "Tsqr_LocalVerify.hpp" +#include "Tsqr_Matrix.hpp" +#include "Tsqr_Util.hpp" -#include +#include "Teuchos_Assert.hpp" #include #include diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.hpp index f52e906e18fc..bb4900d1e804 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -50,8 +48,7 @@ #ifndef __TSQR_Test_CombineTest_hpp #define __TSQR_Test_CombineTest_hpp -#include - +#include "Tsqr_ConfigDefs.hpp" namespace TSQR { namespace Test { diff --git a/packages/tpetra/tsqr/src/Tsqr_ConfigDefs.hpp b/packages/tpetra/tsqr/src/Tsqr_ConfigDefs.hpp index ab9fffe84127..07404f002215 100644 --- a/packages/tpetra/tsqr/src/Tsqr_ConfigDefs.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_ConfigDefs.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -47,7 +45,7 @@ // Users should not include TpetraTSQR_config.h directly. // Include Tsqr_ConfigDefs.hpp instead. 
-#include +#include "TpetraTSQR_config.h" /// \namespace TSQR /// \brief Implementation of the Tall Skinny QR (TSQR) factorization. diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp index e0742db9bce5..502d18bd96e2 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -45,13 +43,13 @@ #ifndef __TSQR_Tsqr_DistTsqr_hpp #define __TSQR_Tsqr_DistTsqr_hpp -#include -#include -#include -#include -#include -#include // std::pair +#include "Tsqr_DistTsqrHelper.hpp" +#include "Tsqr_DistTsqrRB.hpp" +#include "Teuchos_ParameterList.hpp" +#include "Teuchos_ParameterListAcceptorDefaultBase.hpp" +#include "Teuchos_ScalarTraits.hpp" +#include // std::pair namespace TSQR { /// \class DistTsqr diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp index 25dfa328a0fe..d6ad83b7f57e 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos: Node API and Parallel Node Kernels // Copyright (2008) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. 
-// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -34,25 +34,22 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Tsqr_DistTsqrHelper_hpp #define __TSQR_Tsqr_DistTsqrHelper_hpp -#include -#include -#include -#include +#include "Tsqr_MatView.hpp" +#include "Tsqr_MessengerBase.hpp" +#include "Tsqr_Combine.hpp" +#include "Tsqr_Util.hpp" #include // std::min, std::max #include #include #include - namespace TSQR { /// \class DistTsqrHelper @@ -68,21 +65,21 @@ namespace TSQR { void factor_pair (const LocalOrdinal ncols, - std::vector< Scalar >& R_mine, - const LocalOrdinal P_mine, - const LocalOrdinal P_other, - const LocalOrdinal tag, - MessengerBase* const messenger, - std::vector >& Q_factors, - std::vector >& tau_arrays, - std::vector& work) + std::vector< Scalar >& R_mine, + const LocalOrdinal P_mine, + const LocalOrdinal P_other, + const LocalOrdinal tag, + MessengerBase* const messenger, + std::vector >& Q_factors, + std::vector >& tau_arrays, + std::vector& work) { using std::endl; using std::ostringstream; using std::vector; if (P_mine == P_other) - return; // nothing to do + return; // nothing to do const int P_top = std::min (P_mine, P_other); const int P_bot = std::max (P_mine, P_other); @@ -96,144 +93,144 @@ namespace TSQR { Combine< LocalOrdinal, Scalar > combine; if (P_mine == P_top) - { - combine.factor_pair (ncols, &R_mine[0], ldr, &R_other[0], ldr, &tau[0], &work[0]); - Q_factors.push_back (R_other); - tau_arrays.push_back (tau); - } + { + combine.factor_pair (ncols, &R_mine[0], ldr, &R_other[0], ldr, &tau[0], &work[0]); + Q_factors.push_back (R_other); + tau_arrays.push_back (tau); + } 
else if (P_mine == P_bot) - { - combine.factor_pair (ncols, &R_other[0], ldr, &R_mine[0], ldr, &tau[0], &work[0]); - Q_factors.push_back (R_mine); - // Make sure that the "bottom" processor gets the current R - // factor, which is returned in R_mine. - copy_matrix (ncols, ncols, &R_mine[0], ldr, &R_other[0], ldr); - tau_arrays.push_back (tau); - } + { + combine.factor_pair (ncols, &R_other[0], ldr, &R_mine[0], ldr, &tau[0], &work[0]); + Q_factors.push_back (R_mine); + // Make sure that the "bottom" processor gets the current R + // factor, which is returned in R_mine. + copy_matrix (ncols, ncols, &R_mine[0], ldr, &R_other[0], ldr); + tau_arrays.push_back (tau); + } else - { - // mfh 16 Apr 2010: the troubles with assert statements are as follows: - // - // 1. They go away in a release build. - // 2. They don't often print out useful diagnostic information. - // 3. If you mistype the assert, like "assert(errcode = 1);" instead of - // "assert(errcode == 1)", you'll get false positives. - ostringstream os; - os << "Should never get here: P_mine (= " << P_mine - << ") not one of P_top, P_bot = " << P_top << ", " << P_bot; - throw std::logic_error (os.str()); - } + { + // mfh 16 Apr 2010: the troubles with assert statements are as follows: + // + // 1. They go away in a release build. + // 2. They don't often print out useful diagnostic information. + // 3. If you mistype the assert, like "assert(errcode = 1);" instead of + // "assert(errcode == 1)", you'll get false positives. 
+ ostringstream os; + os << "Should never get here: P_mine (= " << P_mine + << ") not one of P_top, P_bot = " << P_top << ", " << P_bot; + throw std::logic_error (os.str()); + } } void factor_helper (const LocalOrdinal ncols, - std::vector< Scalar >& R_mine, - const LocalOrdinal my_rank, - const LocalOrdinal P_first, - const LocalOrdinal P_last, - const LocalOrdinal tag, - MessengerBase< Scalar >* const messenger, - std::vector< std::vector< Scalar > >& Q_factors, - std::vector< std::vector< Scalar > >& tau_arrays, - std::vector< Scalar >& work) + std::vector< Scalar >& R_mine, + const LocalOrdinal my_rank, + const LocalOrdinal P_first, + const LocalOrdinal P_last, + const LocalOrdinal tag, + MessengerBase< Scalar >* const messenger, + std::vector< std::vector< Scalar > >& Q_factors, + std::vector< std::vector< Scalar > >& tau_arrays, + std::vector< Scalar >& work) { using std::endl; using std::ostringstream; using std::vector; if (P_last <= P_first) - return; + return; else - { - const int P = P_last - P_first + 1; - // Whether the interval [P_first, P_last] has an even number of - // elements. Our interval splitting scheme ensures that the - // interval [P_first, P_mid - 1] always has an even number of - // elements. - const bool b_even = (P % 2 == 0); - // We split the interval [P_first, P_last] into 2 intervals: - // [P_first, P_mid-1], and [P_mid, P_last]. We bias the - // splitting procedure so that the lower interval always has an - // even number of processor ranks, and never has fewer processor - // ranks than the higher interval. - const int P_mid = b_even ? (P_first + P/2) : (P_first + P/2 + 1); - - if (my_rank < P_mid) // Interval [P_first, P_mid-1] - { - factor_helper (ncols, R_mine, my_rank, P_first, P_mid - 1, - tag + 1, messenger, Q_factors, tau_arrays, work); - - // If there aren't an even number of processors in the - // original interval, then the last processor in the lower - // interval has to skip this round. 
- if (b_even || my_rank < P_mid - 1) - { - const int my_offset = my_rank - P_first; - const int P_other = P_mid + my_offset; - if (P_other < P_mid || P_other > P_last) - throw std::logic_error ("P_other not in [P_mid,P_last] range"); - - factor_pair (ncols, R_mine, my_rank, P_other, tag, - messenger, Q_factors, tau_arrays, work); - } - - // If I'm skipping this round, get the "current" R factor - // from P_mid. - if (! b_even && my_rank == P_mid - 1) - { - const int theTag = 142; // magic constant - messenger->recv (&R_mine[0], ncols*ncols, P_mid, theTag); - } - } - else // Interval [P_mid, P_last] - { - factor_helper (ncols, R_mine, my_rank, P_mid, P_last, - tag + 1, messenger, Q_factors, tau_arrays, work); - - const int my_offset = my_rank - P_mid; - const int P_other = P_first + my_offset; - - if (P_other < P_first || P_other >= P_mid) - throw std::logic_error ("P_other not in [P_first,P_mid-1] range"); - factor_pair (ncols, R_mine, my_rank, P_other, tag, - messenger, Q_factors, tau_arrays, work); - - // If Proc P_mid-1 is skipping this round, Proc P_mid will - // send it the "current" R factor. - if (! b_even) - { - const int theTag = 142; // magic constant - messenger->send (&R_mine[0], ncols*ncols, P_mid-1, theTag); - } - } - } + { + const int P = P_last - P_first + 1; + // Whether the interval [P_first, P_last] has an even number of + // elements. Our interval splitting scheme ensures that the + // interval [P_first, P_mid - 1] always has an even number of + // elements. + const bool b_even = (P % 2 == 0); + // We split the interval [P_first, P_last] into 2 intervals: + // [P_first, P_mid-1], and [P_mid, P_last]. We bias the + // splitting procedure so that the lower interval always has an + // even number of processor ranks, and never has fewer processor + // ranks than the higher interval. + const int P_mid = b_even ? 
(P_first + P/2) : (P_first + P/2 + 1); + + if (my_rank < P_mid) // Interval [P_first, P_mid-1] + { + factor_helper (ncols, R_mine, my_rank, P_first, P_mid - 1, + tag + 1, messenger, Q_factors, tau_arrays, work); + + // If there aren't an even number of processors in the + // original interval, then the last processor in the lower + // interval has to skip this round. + if (b_even || my_rank < P_mid - 1) + { + const int my_offset = my_rank - P_first; + const int P_other = P_mid + my_offset; + if (P_other < P_mid || P_other > P_last) + throw std::logic_error ("P_other not in [P_mid,P_last] range"); + + factor_pair (ncols, R_mine, my_rank, P_other, tag, + messenger, Q_factors, tau_arrays, work); + } + + // If I'm skipping this round, get the "current" R factor + // from P_mid. + if (! b_even && my_rank == P_mid - 1) + { + const int theTag = 142; // magic constant + messenger->recv (&R_mine[0], ncols*ncols, P_mid, theTag); + } + } + else // Interval [P_mid, P_last] + { + factor_helper (ncols, R_mine, my_rank, P_mid, P_last, + tag + 1, messenger, Q_factors, tau_arrays, work); + + const int my_offset = my_rank - P_mid; + const int P_other = P_first + my_offset; + + if (P_other < P_first || P_other >= P_mid) + throw std::logic_error ("P_other not in [P_first,P_mid-1] range"); + factor_pair (ncols, R_mine, my_rank, P_other, tag, + messenger, Q_factors, tau_arrays, work); + + // If Proc P_mid-1 is skipping this round, Proc P_mid will + // send it the "current" R factor. + if (! 
b_even) + { + const int theTag = 142; // magic constant + messenger->send (&R_mine[0], ncols*ncols, P_mid-1, theTag); + } + } + } } void apply_pair (const ApplyType& apply_type, - const LocalOrdinal ncols_C, - const LocalOrdinal ncols_Q, - Scalar C_mine[], - const LocalOrdinal ldc_mine, - Scalar C_other[], // contiguous ncols_C x ncols_C scratch - const LocalOrdinal P_mine, - const LocalOrdinal P_other, - const LocalOrdinal tag, - MessengerBase< Scalar >* const messenger, - const std::vector< Scalar >& Q_cur, - const std::vector< Scalar >& tau_cur, - std::vector< Scalar >& work) + const LocalOrdinal ncols_C, + const LocalOrdinal ncols_Q, + Scalar C_mine[], + const LocalOrdinal ldc_mine, + Scalar C_other[], // contiguous ncols_C x ncols_C scratch + const LocalOrdinal P_mine, + const LocalOrdinal P_other, + const LocalOrdinal tag, + MessengerBase< Scalar >* const messenger, + const std::vector< Scalar >& Q_cur, + const std::vector< Scalar >& tau_cur, + std::vector< Scalar >& work) { using std::endl; using std::ostringstream; using std::vector; if (P_mine == P_other) - return; // nothing to do - + return; // nothing to do + const int P_top = std::min (P_mine, P_other); const int P_bot = std::max (P_mine, P_other); - + const LocalOrdinal nelts = ncols_C * ncols_C; const LocalOrdinal ldq = ncols_Q; const LocalOrdinal ldc_other = ncols_C; @@ -244,132 +241,132 @@ namespace TSQR { Combine< LocalOrdinal, Scalar > combine; if (P_mine == P_top) - combine.apply_pair (apply_type, ncols_C, ncols_Q, &Q_cur[0], ldq, - &tau_cur[0], C_mine, ldc_mine, C_other, ldc_other, - &work[0]); + combine.apply_pair (apply_type, ncols_C, ncols_Q, &Q_cur[0], ldq, + &tau_cur[0], C_mine, ldc_mine, C_other, ldc_other, + &work[0]); else if (P_mine == P_bot) - combine.apply_pair (apply_type, ncols_C, ncols_Q, &Q_cur[0], ldq, - &tau_cur[0], C_other, ldc_other, C_mine, ldc_mine, - &work[0]); + combine.apply_pair (apply_type, ncols_C, ncols_Q, &Q_cur[0], ldq, + &tau_cur[0], C_other, ldc_other, C_mine, 
ldc_mine, + &work[0]); else - { - ostringstream os; - os << "Should never get here: P_mine (= " << P_mine - << ") not one of P_top, P_bot = " << P_top << ", " << P_bot; - throw std::logic_error (os.str()); - } + { + ostringstream os; + os << "Should never get here: P_mine (= " << P_mine + << ") not one of P_top, P_bot = " << P_top << ", " << P_bot; + throw std::logic_error (os.str()); + } } void apply_helper (const ApplyType& apply_type, - const LocalOrdinal ncols_C, - const LocalOrdinal ncols_Q, - Scalar C_mine[], - const LocalOrdinal ldc_mine, - Scalar C_other[], // contiguous ncols_C x ncols_C scratch - const LocalOrdinal my_rank, - const LocalOrdinal P_first, - const LocalOrdinal P_last, - const LocalOrdinal tag, - MessengerBase< Scalar >* const messenger, - const std::vector< std::vector< Scalar > >& Q_factors, - const std::vector< std::vector< Scalar > >& tau_arrays, - const LocalOrdinal cur_pos, - std::vector< Scalar >& work) + const LocalOrdinal ncols_C, + const LocalOrdinal ncols_Q, + Scalar C_mine[], + const LocalOrdinal ldc_mine, + Scalar C_other[], // contiguous ncols_C x ncols_C scratch + const LocalOrdinal my_rank, + const LocalOrdinal P_first, + const LocalOrdinal P_last, + const LocalOrdinal tag, + MessengerBase< Scalar >* const messenger, + const std::vector< std::vector< Scalar > >& Q_factors, + const std::vector< std::vector< Scalar > >& tau_arrays, + const LocalOrdinal cur_pos, + std::vector< Scalar >& work) { using std::endl; using std::ostringstream; using std::vector; if (P_last <= P_first) - return; + return; else - { - const int P = P_last - P_first + 1; - // Whether the interval [P_first, P_last] has an even number of - // elements. Our interval splitting scheme ensures that the - // interval [P_first, P_mid - 1] always has an even number of - // elements. - const bool b_even = (P % 2 == 0); - // We split the interval [P_first, P_last] into 2 intervals: - // [P_first, P_mid-1], and [P_mid, P_last]. 
We bias the - // splitting procedure so that the lower interval always has an - // even number of processor ranks, and never has fewer processor - // ranks than the higher interval. - const int P_mid = b_even ? (P_first + P/2) : (P_first + P/2 + 1); - - if (my_rank < P_mid) // Interval [P_first, P_mid - 1] - { - const bool b_participating = b_even || my_rank < P_mid - 1; - - if (cur_pos < 0) - { - ostringstream os; - os << "On Proc " << my_rank << ": cur_pos (= " << cur_pos - << ") < 0; lower interval [" << P_first << "," << (P_mid-1) - << "]; original interval [" << P_first << "," << P_last - << "]" << endl; - throw std::logic_error (os.str()); - } - - // If there aren't an even number of processors in the - // original interval, then the last processor in the lower - // interval has to skip this round. Since we skip this - // round, don't decrement cur_pos (else we'll skip an entry - // and eventually fall off the front of the array. - int new_cur_pos; - if (b_even || my_rank < P_mid - 1) - { - if (! 
b_participating) - throw std::logic_error("Should never get here"); - - const int my_offset = my_rank - P_first; - const int P_other = P_mid + my_offset; - // assert (P_mid <= P_other && P_other <= P_last); - if (P_other < P_mid || P_other > P_last) - throw std::logic_error("Should never get here"); - - apply_pair (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, - C_other, my_rank, P_other, tag, messenger, - Q_factors[cur_pos], tau_arrays[cur_pos], work); - new_cur_pos = cur_pos - 1; - } - else - { - if (b_participating) - throw std::logic_error("Should never get here"); - - new_cur_pos = cur_pos; - } - apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, - C_other, my_rank, P_first, P_mid - 1, tag + 1, - messenger, Q_factors, tau_arrays, new_cur_pos, - work); - } - else - { - if (cur_pos < 0) - { - ostringstream os; - os << "On Proc " << my_rank << ": cur_pos (= " << cur_pos - << ") < 0; upper interval [" << P_mid << "," << P_last - << "]; original interval [" << P_first << "," << P_last - << "]" << endl; - throw std::logic_error (os.str()); - } - - const int my_offset = my_rank - P_mid; - const int P_other = P_first + my_offset; - // assert (0 <= P_other && P_other < P_mid); - apply_pair (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, - C_other, my_rank, P_other, tag, messenger, - Q_factors[cur_pos], tau_arrays[cur_pos], work); - apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, - C_other, my_rank, P_mid, P_last, tag + 1, - messenger, Q_factors, tau_arrays, cur_pos - 1, - work); - } - } + { + const int P = P_last - P_first + 1; + // Whether the interval [P_first, P_last] has an even number of + // elements. Our interval splitting scheme ensures that the + // interval [P_first, P_mid - 1] always has an even number of + // elements. + const bool b_even = (P % 2 == 0); + // We split the interval [P_first, P_last] into 2 intervals: + // [P_first, P_mid-1], and [P_mid, P_last]. 
We bias the + // splitting procedure so that the lower interval always has an + // even number of processor ranks, and never has fewer processor + // ranks than the higher interval. + const int P_mid = b_even ? (P_first + P/2) : (P_first + P/2 + 1); + + if (my_rank < P_mid) // Interval [P_first, P_mid - 1] + { + const bool b_participating = b_even || my_rank < P_mid - 1; + + if (cur_pos < 0) + { + ostringstream os; + os << "On Proc " << my_rank << ": cur_pos (= " << cur_pos + << ") < 0; lower interval [" << P_first << "," << (P_mid-1) + << "]; original interval [" << P_first << "," << P_last + << "]" << endl; + throw std::logic_error (os.str()); + } + + // If there aren't an even number of processors in the + // original interval, then the last processor in the lower + // interval has to skip this round. Since we skip this + // round, don't decrement cur_pos (else we'll skip an entry + // and eventually fall off the front of the array. + int new_cur_pos; + if (b_even || my_rank < P_mid - 1) + { + if (! 
b_participating) + throw std::logic_error("Should never get here"); + + const int my_offset = my_rank - P_first; + const int P_other = P_mid + my_offset; + // assert (P_mid <= P_other && P_other <= P_last); + if (P_other < P_mid || P_other > P_last) + throw std::logic_error("Should never get here"); + + apply_pair (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, + C_other, my_rank, P_other, tag, messenger, + Q_factors[cur_pos], tau_arrays[cur_pos], work); + new_cur_pos = cur_pos - 1; + } + else + { + if (b_participating) + throw std::logic_error("Should never get here"); + + new_cur_pos = cur_pos; + } + apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, + C_other, my_rank, P_first, P_mid - 1, tag + 1, + messenger, Q_factors, tau_arrays, new_cur_pos, + work); + } + else + { + if (cur_pos < 0) + { + ostringstream os; + os << "On Proc " << my_rank << ": cur_pos (= " << cur_pos + << ") < 0; upper interval [" << P_mid << "," << P_last + << "]; original interval [" << P_first << "," << P_last + << "]" << endl; + throw std::logic_error (os.str()); + } + + const int my_offset = my_rank - P_mid; + const int P_other = P_first + my_offset; + // assert (0 <= P_other && P_other < P_mid); + apply_pair (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, + C_other, my_rank, P_other, tag, messenger, + Q_factors[cur_pos], tau_arrays[cur_pos], work); + apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, + C_other, my_rank, P_mid, P_last, tag + 1, + messenger, Q_factors, tau_arrays, cur_pos - 1, + work); + } + } } }; diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp index cd96eb0b95df..b13c888e659d 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp @@ -35,8 +35,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER */ @@ -44,13 +42,13 @@ #ifndef __TSQR_DistTsqrRB_hpp #define __TSQR_DistTsqrRB_hpp -#include -#include -#include -#include +#include "Tsqr_ApplyType.hpp" +#include "Tsqr_Combine.hpp" +#include "Tsqr_Matrix.hpp" +#include "Tsqr_StatTimeMonitor.hpp" -#include -#include +#include "Teuchos_ScalarTraits.hpp" +#include "Teuchos_TimeMonitor.hpp" #include #include @@ -58,7 +56,6 @@ #include #include - namespace TSQR { /// \namespace details diff --git a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp index 1460b7d8b864..ede5bb95dc35 100644 --- a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp @@ -34,23 +34,20 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Test_FullTsqrTest_hpp #define __TSQR_Test_FullTsqrTest_hpp -#include -#include -#include -#include -//#include -#include -#include +#include "Tsqr.hpp" +#include "Tsqr_Random_NormalGenerator.hpp" +#include "Tsqr_Random_GlobalMatrix.hpp" +#include "Tsqr_TestSetup.hpp" +#include "Tsqr_GlobalVerify.hpp" +#include "Tsqr_TeuchosMessenger.hpp" #include "Tsqr_TestUtils.hpp" -#include +#include "Teuchos_ScalarTraits.hpp" #include #include @@ -61,7 +58,6 @@ namespace TSQR { /// \class TsqrInaccurate /// \brief Signals that a TSQR test failed due to insufficient accuracy. - /// class TsqrInaccurate : public std::exception { public: //! 
Constructor diff --git a/packages/tpetra/tsqr/src/Tsqr_GlobalTimeStats.cpp b/packages/tpetra/tsqr/src/Tsqr_GlobalTimeStats.cpp index 542448255c22..b146b28449e0 100644 --- a/packages/tpetra/tsqr/src/Tsqr_GlobalTimeStats.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_GlobalTimeStats.cpp @@ -35,14 +35,12 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER */ -#include -#include +#include "Tsqr_GlobalTimeStats.hpp" +#include "Tsqr_MessengerBase.hpp" #include namespace TSQR { diff --git a/packages/tpetra/tsqr/src/Tsqr_GlobalTimeStats.hpp b/packages/tpetra/tsqr/src/Tsqr_GlobalTimeStats.hpp index f5212301b14d..3b26725aef79 100644 --- a/packages/tpetra/tsqr/src/Tsqr_GlobalTimeStats.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_GlobalTimeStats.hpp @@ -35,8 +35,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER */ @@ -44,7 +42,7 @@ #ifndef __TSQR_GlobalTimeStats_hpp #define __TSQR_GlobalTimeStats_hpp -#include +#include "Tsqr_TimeStats.hpp" namespace TSQR { diff --git a/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp b/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp index 50f9f58841e9..9ff5b04ffd24 100644 --- a/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp @@ -34,27 +34,21 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Tsqr_GlobalVerify_hpp #define __TSQR_Tsqr_GlobalVerify_hpp -#include -#include -#include - -#include -#include - +#include "Tsqr_LocalVerify.hpp" +#include "Tsqr_MessengerBase.hpp" +#include "Tsqr_Util.hpp" +#include "Teuchos_BLAS.hpp" +#include "Teuchos_ScalarTraits.hpp" #include // std::pair #include - namespace TSQR { - /// \class GlobalSummer /// /// Compute a global sum of (magnitudes of) Scalar values, returning diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp index 410f6b2fc2f6..7f1f2ffe4858 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp @@ -34,23 +34,21 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER /// \file Tsqr_KokkosNodeTsqr.hpp /// \brief Parallel intranode TSQR implemented using the Kokkos Node API. 
-/// + #ifndef __TSQR_KokkosNodeTsqr_hpp #define __TSQR_KokkosNodeTsqr_hpp -#include -#include -#include +#include "Tsqr_CacheBlocker.hpp" +#include "Tsqr_Combine.hpp" +#include "Tsqr_NodeTsqr.hpp" -#include -#include +#include "Teuchos_ParameterListAcceptorDefaultBase.hpp" +#include "Teuchos_ScalarTraits.hpp" //#define KNR_DEBUG 1 #ifdef KNR_DEBUG diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp index 9803efc40c6e..6c50d18fabea 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp @@ -34,25 +34,21 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Test_KokkosNodeTsqrTest_hpp #define __TSQR_Test_KokkosNodeTsqrTest_hpp -#include -#include -#include -#include -#include -#include - -#include -#include -#include - +#include "Tsqr_nodeTestProblem.hpp" +#include "Tsqr_verifyTimerConcept.hpp" +#include "Tsqr_Random_NormalGenerator.hpp" +#include "Tsqr_LocalVerify.hpp" +#include "Tsqr_Matrix.hpp" +#include "Tsqr_KokkosNodeTsqr.hpp" +#include "Teuchos_ScalarTraits.hpp" +#include "Teuchos_Time.hpp" +#include "Teuchos_TypeNameTraits.hpp" #include #include #include @@ -60,7 +56,6 @@ namespace TSQR { namespace Test { - /// \fn verifyKokkosNodeTsqr /// \brief Test accuracy of KokkosNodeTsqr's QR factorization. /// @@ -87,7 +82,6 @@ namespace TSQR { /// for a script to parse. /// \param debug [in] Whether to print extra debugging output to /// stderr. 
- /// template void verifyKokkosNodeTsqr (const Teuchos::RCP& node, diff --git a/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp b/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp index c2938b21e91c..8e04d6ed85ba 100644 --- a/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp @@ -34,16 +34,14 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Tsqr_LocalVerify_hpp #define __TSQR_Tsqr_LocalVerify_hpp -#include -#include +#include "Tsqr_Util.hpp" +#include "Teuchos_BLAS.hpp" #include #include #include // std::pair, std::make_pair diff --git a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp index 541614691bcb..2f3d99cef4ba 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp @@ -34,16 +34,12 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Tsqr_MatView_hpp #define __TSQR_Tsqr_MatView_hpp -#include // NULL - // Define for bounds checking and other safety features, undefine for speed. // #define TSQR_MATVIEW_DEBUG 1 @@ -173,9 +169,7 @@ namespace TSQR { typedef Ordinal ordinal_type; typedef Scalar* pointer_type; - /// \note g++ with -Wall wants A_ to be initialized after lda_, - /// otherwise it emits a compiler warning. 
- MatView () : nrows_(0), ncols_(0), lda_(0), A_(NULL) {} + MatView () = default; MatView (const Ordinal num_rows, const Ordinal num_cols, @@ -191,23 +185,10 @@ namespace TSQR { #endif // TSQR_MATVIEW_DEBUG } - MatView (const MatView& view) : - nrows_(view.nrows()), - ncols_(view.ncols()), - lda_(view.lda()), - A_(view.get()) - {} - - //! Assignment operator: Does a shallow (pointer) assignment. - MatView& operator= (const MatView& view) { - if (this != &view) { - nrows_ = view.nrows (); - ncols_ = view.ncols (); - A_ = view.get (); - lda_ = view.lda (); - } - return *this; - } + MatView (const MatView& view) = default; + MatView& operator= (const MatView& view) = default; + MatView (MatView&& view) = default; + MatView& operator= (MatView&& view) = default; /// \note The function is const, only because returning a /// reference to the matrix data doesn't change any members of @@ -387,8 +368,10 @@ namespace TSQR { } private: - ordinal_type nrows_, ncols_, lda_; - scalar_type* A_; + ordinal_type nrows_ = 0; + ordinal_type ncols_ = 0; + ordinal_type lda_ = 0; + scalar_type* A_ = nullptr; }; diff --git a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp index df1e7feaa452..a5c975f70f9c 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp @@ -34,27 +34,20 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Tsqr_Matrix_hpp #define __TSQR_Tsqr_Matrix_hpp -#include -#include - +#include "Tsqr_Util.hpp" +#include "Tsqr_MatView.hpp" #include #include #include #include -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - namespace TSQR { - /// \class Matrix /// \brief A column-oriented dense matrix /// \author Mark Hoemmen diff --git a/packages/tpetra/tsqr/src/Tsqr_MessengerBase.hpp b/packages/tpetra/tsqr/src/Tsqr_MessengerBase.hpp index 7a452e52662b..2c087b4a66e8 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MessengerBase.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MessengerBase.hpp @@ -34,16 +34,13 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Tsqr_MessengerBase_hpp #define __TSQR_Tsqr_MessengerBase_hpp -#include - +#include "Tsqr_ConfigDefs.hpp" namespace TSQR { /// \class MessengerBase @@ -56,8 +53,7 @@ namespace TSQR { template class MessengerBase { public: - //! Virtual destructor for memory safety of derived classes. - virtual ~MessengerBase() {} + virtual ~MessengerBase() = default; /// Send sendData[0:sendCount-1] to process destProc. /// diff --git a/packages/tpetra/tsqr/src/Tsqr_Mgs.hpp b/packages/tpetra/tsqr/src/Tsqr_Mgs.hpp index f3bf84b9aae2..ce560b906a26 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Mgs.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Mgs.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -47,19 +45,10 @@ #include #include // std::pair -#include -#include - -#include -#include - -// #define MGS_DEBUG 1 -#ifdef MGS_DEBUG -# include -using std::cerr; -using std::endl; -#endif // MGS_DEBUG - +#include "Tsqr_MessengerBase.hpp" +#include "Tsqr_Util.hpp" +#include "Teuchos_RCP.hpp" +#include "Teuchos_ScalarTraits.hpp" namespace TSQR { @@ -77,7 +66,7 @@ namespace TSQR { /// /// \param messenger [in/out] Communicator wrapper instance. /// - MGS (const Teuchos::RCP< MessengerBase< Scalar > >& messenger) : + MGS (const Teuchos::RCP>& messenger) : messenger_ (messenger) {} /// \brief Does the R factor have a nonnegative diagonal? @@ -102,7 +91,7 @@ namespace TSQR { const LocalOrdinal ldr); private: - Teuchos::RCP > messenger_; + Teuchos::RCP> messenger_; }; @@ -145,18 +134,9 @@ namespace TSQR { { Scalar local_result (0); -#ifdef MGS_DEBUG - // for (LocalOrdinal k = 0; k != nrows_local; ++k) - // cerr << "(x[" << k << "], y[" << k << "]) = (" << x_local[k] << "," << y_local[k] << ")" << " "; - // cerr << endl; -#endif // MGS_DEBUG - - for (LocalOrdinal i = 0; i < nrows_local; ++i) + for (LocalOrdinal i = 0; i < nrows_local; ++i) { local_result += x_local[i] * STS::conjugate (y_local[i]); - -#ifdef MGS_DEBUG - // cerr << "-- Final value on this proc = " << local_result << endl; -#endif // MGS_DEBUG + } // FIXME (mfh 23 Apr 2010) Does MPI_SUM do the right thing for // complex or otherwise general MPI data types? Perhaps an MPI_Op @@ -173,22 +153,18 @@ namespace TSQR { // Doing the right thing in the complex case requires taking // an absolute value. We want to avoid this additional cost // in the real case, which is why we check is_complex. 
- if (STS::isComplex) - { - for (LocalOrdinal i = 0; i < nrows_local; ++i) - { - const Scalar xi = STS::magnitude (x_local[i]); - localResult += xi * xi; - } + if (STS::isComplex) { + for (LocalOrdinal i = 0; i < nrows_local; ++i) { + const Scalar xi = STS::magnitude (x_local[i]); + localResult += xi * xi; } - else - { - for (LocalOrdinal i = 0; i < nrows_local; ++i) - { - const Scalar xi = x_local[i]; - localResult += xi * xi; - } + } + else { + for (LocalOrdinal i = 0; i < nrows_local; ++i) { + const Scalar xi = x_local[i]; + localResult += xi * xi; } + } const Scalar globalResult = messenger_->globalSum (localResult); // sqrt doesn't make sense if the type of Scalar is complex, // even if the imaginary part of global_result is zero. @@ -206,11 +182,10 @@ namespace TSQR { } private: - Teuchos::RCP< MessengerBase< Scalar > > messenger_; + Teuchos::RCP> messenger_; }; } // namespace details - template void MGS::mgs (const LocalOrdinal nrows_local, @@ -222,38 +197,28 @@ namespace TSQR { { details::MgsOps ops (messenger_); - for (LocalOrdinal j = 0; j < ncols; ++j) - { - Scalar* const v = &A_local[j*lda_local]; - for (LocalOrdinal i = 0; i < j; ++i) - { - const Scalar* const q = &A_local[i*lda_local]; - R[i + j*ldr] = ops.project (nrows_local, q, v); -#ifdef MGS_DEBUG - if (my_rank == 0) - cerr << "(i,j) = (" << i << "," << j << "): coeff = " << R[i + j*ldr] << endl; -#endif // MGS_DEBUG - } - const magnitude_type denom = ops.norm2 (nrows_local, v); -#ifdef MGS_DEBUG - if (my_rank == 0) - cerr << "j = " << j << ": denom = " << denom << endl; -#endif // MGS_DEBUG - - // FIXME (mfh 29 Apr 2010) - // - // NOTE IMPLICIT CAST. This should work for complex numbers. - // If it doesn't work for your Scalar data type, it means that - // you need a different data type for the diagonal elements of - // the R factor, than you need for the other elements. 
This - // is unlikely if we're comparing MGS against a Householder QR - // factorization; I don't really understand how the latter - // would work (not that it couldn't be given a sensible - // interpretation) in the case of Scalars that aren't plain - // old real or complex numbers. - R[j + j*ldr] = Scalar (denom); - ops.scale (nrows_local, v, denom); + for (LocalOrdinal j = 0; j < ncols; ++j) { + Scalar* const v = &A_local[j*lda_local]; + for (LocalOrdinal i = 0; i < j; ++i) { + const Scalar* const q = &A_local[i*lda_local]; + R[i + j*ldr] = ops.project (nrows_local, q, v); } + const magnitude_type denom = ops.norm2 (nrows_local, v); + + // FIXME (mfh 29 Apr 2010) + // + // NOTE IMPLICIT CAST. This should work for complex numbers. + // If it doesn't work for your Scalar data type, it means that + // you need a different data type for the diagonal elements of + // the R factor, than you need for the other elements. This + // is unlikely if we're comparing MGS against a Householder QR + // factorization; I don't really understand how the latter + // would work (not that it couldn't be given a sensible + // interpretation) in the case of Scalars that aren't plain + // old real or complex numbers. + R[j + j*ldr] = Scalar (denom); + ops.scale (nrows_local, v, denom); + } } } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp index b128325db5af..873c5627e5b0 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp @@ -34,26 +34,22 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Test_MgsTest_hpp #define __TSQR_Test_MgsTest_hpp -#include - -#include +#include "Tsqr_ConfigDefs.hpp" +#include "Tsqr_Mgs.hpp" #ifdef HAVE_KOKKOSTSQR_TBB -# include +# include "TbbTsqr_TbbMgs.hpp" #endif // HAVE_KOKKOSTSQR_TBB -#include -#include -#include -#include - -#include +#include "Tsqr_TestSetup.hpp" +#include "Tsqr_GlobalVerify.hpp" +#include "Tsqr_printGlobalMatrix.hpp" +#include "Tsqr_verifyTimerConcept.hpp" +#include "Teuchos_RCP.hpp" #include #include @@ -69,18 +65,19 @@ namespace TSQR { static std::string mgs_human_readable_name (const std::string& which) { - if (which == "MpiSeqMGS") + if (which == "MpiSeqMGS") { return std::string ("MPI parallel / sequential MGS"); - else if (which == "MpiTbbMGS") - { + } + else if (which == "MpiTbbMGS") { #ifdef HAVE_KOKKOSTSQR_TBB - return std::string ("MPI parallel / TBB parallel MGS"); + return std::string ("MPI parallel / TBB parallel MGS"); #else - throw std::logic_error("MGS not built with Intel TBB support"); + throw std::logic_error("MGS not built with Intel TBB support"); #endif // HAVE_KOKKOSTSQR_TBB - } - else + } + else { throw std::logic_error("Unknown MGS implementation type \"" + which + "\""); + } } template< class MgsType > @@ -108,12 +105,11 @@ namespace TSQR { orthogonalizer.mgs (Q_local.nrows(), Q_local.ncols(), Q_local.get(), Q_local.lda(), R.get(), R.lda()); - if (b_debug) - { - messenger->barrier(); - if (messenger->rank() == 0) - cerr << "-- Finished MGS::mgs" << endl; - } + if (b_debug) { + messenger->barrier(); + if (messenger->rank() == 0) + cerr << "-- Finished MGS::mgs" << endl; + } } }; @@ -137,141 +133,138 @@ namespace TSQR { const bool b_extra_debug = false; const int nprocs = scalarComm->size(); const int my_rank = scalarComm->rank(); - if (b_debug) - { - scalarComm->barrier(); - if (my_rank == 0) - cerr << "mgs_verify:" << endl; - 
scalarComm->barrier(); + if (b_debug) { + scalarComm->barrier(); + if (my_rank == 0) { + cerr << "mgs_verify:" << endl; } + scalarComm->barrier(); + } const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs); // Set up storage for the test problem - Matrix< Ordinal, Scalar > A_local (nrows_local, ncols); - if (std::numeric_limits< Scalar >::has_quiet_NaN) + Matrix A_local (nrows_local, ncols); + if (std::numeric_limits::has_quiet_NaN) { A_local.fill (std::numeric_limits< Scalar >::quiet_NaN()); - Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0)); + } + Matrix R (ncols, ncols, Scalar(0)); // Generate the test problem. distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get()); - if (b_debug) - { - scalarComm->barrier(); - if (my_rank == 0) - cerr << "-- Generated test problem." << endl; + if (b_debug) { + scalarComm->barrier(); + if (my_rank == 0) { + cerr << "-- Generated test problem." << endl; } + } // Make sure that the test problem (the matrix to factor) was // distributed correctly. - if (b_extra_debug && b_debug) - { - if (my_rank == 0) + if (b_extra_debug && b_debug) { + if (my_rank == 0) { cerr << "Test matrix A:" << endl; - scalarComm->barrier(); - printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); - scalarComm->barrier(); } + scalarComm->barrier(); + printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); + scalarComm->barrier(); + } // Factoring the matrix stored in A_local overwrites it, so we // copy A_local into Q_local. MGS orthogonalization does not // support contiguously stored cache blocks, unlike TSQR, so we // don't have to consider whether or not to rearrange cache // blocks here (unlike with TSQR). 
- Matrix< Ordinal, Scalar > Q_local (A_local); + Matrix Q_local (A_local); - if (b_debug) - { - scalarComm->barrier(); - if (my_rank == 0) - cerr << "-- Starting verification" << endl; + if (b_debug) { + scalarComm->barrier(); + if (my_rank == 0) { + cerr << "-- Starting verification" << endl; } + } - if (which == "MpiTbbMGS") - { + if (which == "MpiTbbMGS") { #ifdef HAVE_KOKKOSTSQR_TBB - typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type; - mgs_type mgser (scalarComm); - MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug); + typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type; + mgs_type mgser (scalarComm); + MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug); #else - throw std::logic_error("MGS not built with Intel TBB support"); + throw std::logic_error("MGS not built with Intel TBB support"); #endif // HAVE_KOKKOSTSQR_TBB - } - else if (which == "MpiSeqMGS") - { - typedef MGS< Ordinal, Scalar > mgs_type; - mgs_type mgser (scalarComm); - MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug); - } - else + } + else if (which == "MpiSeqMGS") { + typedef MGS mgs_type; + mgs_type mgser (scalarComm); + MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug); + } + else { throw std::logic_error ("Invalid MGS implementation type \"" + which + "\""); + } // Print out the Q and R factors - if (b_extra_debug && b_debug) - { - if (my_rank == 0) - cerr << endl << "Q factor:" << endl; - scalarComm->barrier (); - printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); - scalarComm->barrier (); - if (my_rank == 0) - { - cerr << endl << "R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.get(), R.lda()); - cerr << endl; - } - scalarComm->barrier (); + if (b_extra_debug && b_debug) { + if (my_rank == 0) { + cerr << endl << "Q factor:" << endl; + } + scalarComm->barrier (); + printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); + scalarComm->barrier 
(); + if (my_rank == 0) { + cerr << endl << "R factor:" << endl; + print_local_matrix (cerr, ncols, ncols, R.get(), R.lda()); + cerr << endl; } + scalarComm->barrier (); + } // Test accuracy of the resulting factorization - std::vector< magnitude_type > results = + std::vector results = global_verify (nrows_local, ncols, A_local.get(), A_local.lda(), Q_local.get(), Q_local.lda(), R.get(), R.lda(), scalarComm.get()); - if (b_debug) - { - scalarComm->barrier(); - if (my_rank == 0) - cerr << "-- Finished global_verify" << endl; - scalarComm->barrier(); + if (b_debug) { + scalarComm->barrier(); + if (my_rank == 0) { + cerr << "-- Finished global_verify" << endl; } + scalarComm->barrier(); + } // Print the results on Proc 0. - if (my_rank == 0) - { - if (human_readable) - { - cout << mgs_human_readable_name(which) << endl - << "# rows = " << nrows_global << endl - << "# columns = " << ncols << endl - << "# MPI processes = " << nprocs << endl; - if (which == "MpiTbbTSQR") - cout << "# cores per process = " << num_cores << endl; - cout << "Absolute residual $\\|A - Q*R\\|_2: " - << results[0] << endl - << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: " - << results[1] << endl - << "Test matrix norm $\\| A \\|_F$: " - << results[2] << endl - << endl; - } - else - { - cout << which - << "," << nrows_global - << "," << ncols - << "," << nprocs; - if (which == "MpiTbbTSQR") - cout << "," << num_cores << endl; - cout << "," << results[0] - << "," << results[1] - << "," << results[2] - << endl; - } + if (my_rank == 0) { + if (human_readable) { + cout << mgs_human_readable_name(which) << endl + << "# rows = " << nrows_global << endl + << "# columns = " << ncols << endl + << "# MPI processes = " << nprocs << endl; + if (which == "MpiTbbTSQR") { + cout << "# cores per process = " << num_cores << endl; + } + cout << "Absolute residual $\\|A - Q*R\\|_2: " + << results[0] << endl + << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: " + << results[1] << endl + << "Test matrix norm $\\| A 
\\|_F$: " + << results[2] << endl + << endl; } + else { + cout << which + << "," << nrows_global + << "," << ncols + << "," << nprocs; + if (which == "MpiTbbTSQR") { + cout << "," << num_cores << endl; + } + cout << "," << results[0] + << "," << results[1] + << "," << results[2] + << endl; + } + } } - - template< class MgsBase, class TimerType > + template static double // returns timing in s do_mgs_benchmark (MgsBase& orthogonalizer, Matrix< typename MgsBase::ordinal_type, typename MgsBase::scalar_type >& Q_local, @@ -282,7 +275,7 @@ namespace TSQR { typedef typename MgsBase::ordinal_type ordinal_type; using std::cout; - TSQR::Test::verifyTimerConcept< TimerType >(); + TSQR::Test::verifyTimerConcept(); const ordinal_type nrows_local = Q_local.nrows(); const ordinal_type ncols = Q_local.ncols(); @@ -296,19 +289,17 @@ namespace TSQR { // Name of timer doesn't matter here; we only need the timing. TimerType timer("MGS"); timer.start(); - for (int trial_num = 0; trial_num < num_trials; ++trial_num) - { - // Orthogonalize the columns of A using MGS. Don't worry about - // the fact that we're overwriting the input; this is a - // benchmark, not a numerical verification test. (We have the - // latter implemented as mgs_verify() in this file.) - orthogonalizer.mgs (nrows_local, ncols, Q_local.get(), - Q_local.lda(), R.get(), R.lda()); - // Timings in debug mode likely won't make sense, because - // Proc 0 is outputting the debug messages to cerr. - // Nevertheless, we don't put any "if(b_debug)" calls in the - // timing loop. - } + for (int trial_num = 0; trial_num < num_trials; ++trial_num) { + // Orthogonalize the columns of A using MGS. Don't worry + // about the fact that we're overwriting the input; this is a + // benchmark, not a numerical verification test. (We have the + // latter implemented as mgs_verify() in this file.) 
+ orthogonalizer.mgs (nrows_local, ncols, Q_local.get(), + Q_local.lda(), R.get(), R.lda()); + // Timings in debug mode likely won't make sense, because Proc + // 0 is outputting the debug messages to cerr. Nevertheless, + // we don't put any "if(b_debug)" calls in the timing loop. + } // Compute the resulting total time (in seconds) to execute // num_trials runs of :mgs(). The time may differ on different // MPI processes. @@ -316,154 +307,151 @@ namespace TSQR { return mgs_timing; } - template< class Ordinal, class Scalar, class Generator, class TimerType > + template void benchmarkMgs (const std::string& which, Generator& generator, const int ntrials, const Ordinal nrows_global, const Ordinal ncols, - const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm, - const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm, + const Teuchos::RCP>& ordinalComm, + const Teuchos::RCP>& scalarComm, const int num_cores, const bool human_readable, const bool b_debug) { - typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type; + typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; using std::cerr; using std::cout; using std::endl; - TSQR::Test::verifyTimerConcept< TimerType >(); + TSQR::Test::verifyTimerConcept(); const bool b_extra_debug = false; const int nprocs = scalarComm->size(); const int my_rank = scalarComm->rank(); - if (b_debug) - { - scalarComm->barrier(); - if (my_rank == 0) - cerr << "mgs_benchmark:" << endl; - scalarComm->barrier(); + if (b_debug) { + scalarComm->barrier(); + if (my_rank == 0) { + cerr << "mgs_benchmark:" << endl; } + scalarComm->barrier(); + } const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs); // Set up storage for the test problem. 
Matrix A_local (nrows_local, ncols); - if (std::numeric_limits< Scalar >::has_quiet_NaN) + if (std::numeric_limits< Scalar >::has_quiet_NaN) { A_local.fill (std::numeric_limits< Scalar >::quiet_NaN()); + } Matrix R (ncols, ncols, Scalar(0)); // Generate the test problem. distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get()); - if (b_debug) - { - scalarComm->barrier(); - if (my_rank == 0) - cerr << "-- Generated test problem." << endl; + if (b_debug) { + scalarComm->barrier(); + if (my_rank == 0) { + cerr << "-- Generated test problem." << endl; } + } // Make sure that the test problem (the matrix to factor) was // distributed correctly. - if (b_extra_debug && b_debug) - { - if (my_rank == 0) - cerr << "Test matrix A:" << endl; - scalarComm->barrier (); - printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); - scalarComm->barrier (); + if (b_extra_debug && b_debug) { + if (my_rank == 0) { + cerr << "Test matrix A:" << endl; } + scalarComm->barrier (); + printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); + scalarComm->barrier (); + } // Factoring the matrix stored in A_local overwrites it, so we // make a copy of A_local. MGS orthogonalization does not // support contiguously stored cache blocks, unlike TSQR, so we // don't have to consider whether or not to rearrange cache // blocks here (unlike with TSQR). - Matrix< Ordinal, Scalar > Q_local (A_local); + Matrix Q_local (A_local); - if (b_debug) - { - scalarComm->barrier(); - if (my_rank == 0) - cerr << "-- Starting timing loop" << endl; + if (b_debug) { + scalarComm->barrier(); + if (my_rank == 0) { + cerr << "-- Starting timing loop" << endl; } + } // Set up MGS and run the benchmark. 
double mgs_timing; // Total run time in seconds of all ntrials trials - if (which == "MpiTbbMGS") - { + if (which == "MpiTbbMGS") { #ifdef HAVE_KOKKOSTSQR_TBB - typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type; - mgs_type mgser (scalarComm); - mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R, - ntrials, human_readable); + typedef TSQR::TBB::TbbMgs mgs_type; + mgs_type mgser (scalarComm); + mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R, + ntrials, human_readable); #else - throw std::logic_error("MGS not built with Intel TBB support"); + throw std::logic_error("MGS not built with Intel TBB support"); #endif // HAVE_KOKKOSTSQR_TBB - } - else if (which == "MpiSeqMGS") - { - typedef MGS< Ordinal, Scalar > mgs_type; - mgs_type mgser (scalarComm); - mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R, - ntrials, human_readable); - } - else + } + else if (which == "MpiSeqMGS") { + typedef MGS mgs_type; + mgs_type mgser (scalarComm); + mgs_timing = do_mgs_benchmark (mgser, Q_local, R, + ntrials, human_readable); + } + else { throw std::logic_error ("Invalid MGS implementation type \"" + which + "\""); + } - if (b_debug) - { - scalarComm->barrier(); - if (my_rank == 0) - cerr << "-- Finished timing loop" << endl; + if (b_debug) { + scalarComm->barrier(); + if (my_rank == 0) { + cerr << "-- Finished timing loop" << endl; } + } // Find the min and max MGS timing on all processors. const double min_mgs_timing = scalarComm->globalMin (mgs_timing); const double max_mgs_timing = scalarComm->globalMax (mgs_timing); - if (b_debug) - { - scalarComm->barrier(); - if (my_rank == 0) - cerr << "-- Computed min and max timings" << endl; + if (b_debug) { + scalarComm->barrier(); + if (my_rank == 0) { + cerr << "-- Computed min and max timings" << endl; } + } // Print the results on Proc 0. 
- if (my_rank == 0) - { - if (human_readable) - { - cout << mgs_human_readable_name(which) << ":" << endl - << "# rows = " << nrows_global << endl - << "# columns = " << ncols << endl - << "# MPI processes = " << nprocs << endl; - if (which == "MpiTbbTSQR") - cout << "# cores per process = " << num_cores << endl; - cout << "# trials = " << ntrials << endl - << "Min total time (s) over all MPI processes = " - << min_mgs_timing << endl - << "Max total time (s) over all MPI processes = " - << max_mgs_timing << endl - << endl; - } - else - { - cout << which - << "," << nrows_global - << "," << ncols - << "," << nprocs; - if (which == "MpiTbbTSQR") - cout << "," << num_cores << endl; - cout << "," << ntrials - << "," << min_mgs_timing - << "," << max_mgs_timing - << endl; - } + if (my_rank == 0) { + if (human_readable) { + cout << mgs_human_readable_name(which) << ":" << endl + << "# rows = " << nrows_global << endl + << "# columns = " << ncols << endl + << "# MPI processes = " << nprocs << endl; + if (which == "MpiTbbTSQR") { + cout << "# cores per process = " << num_cores << endl; + } + cout << "# trials = " << ntrials << endl + << "Min total time (s) over all MPI processes = " + << min_mgs_timing << endl + << "Max total time (s) over all MPI processes = " + << max_mgs_timing << endl + << endl; + } + else { + cout << which + << "," << nrows_global + << "," << ncols + << "," << nprocs; + if (which == "MpiTbbTSQR") { + cout << "," << num_cores << endl; + } + cout << "," << ntrials + << "," << min_mgs_timing + << "," << max_mgs_timing + << endl; } + } } - - } // namespace Test } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp index 84cbbda1decd..11af6c10ff09 100644 --- a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp @@ -34,30 +34,25 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER /// \file Tsqr_NodeTsqr.hpp /// \brief Common interface and functionality for intranode TSQR. -/// + #ifndef __TSQR_Tsqr_NodeTsqr_hpp #define __TSQR_Tsqr_NodeTsqr_hpp -#include -#include - -#include -#include -#include -#include -#include - +#include "Tsqr_ApplyType.hpp" +#include "Tsqr_Matrix.hpp" +#include "Teuchos_as.hpp" +#include "Teuchos_Describable.hpp" +#include "Teuchos_LAPACK.hpp" +#include "Teuchos_ScalarTraits.hpp" +#include "Teuchos_TypeNameTraits.hpp" #include namespace TSQR { - /// \class NodeTsqr /// \brief Common interface and functionality for intranode TSQR. /// @@ -91,7 +86,6 @@ namespace TSQR { /// would not be useful. This is because ultimately each subclass /// is bound to a Kokkos Node type, and those only use compile-time /// polymorphism. - /// template class NodeTsqr : public Teuchos::Describable { public: @@ -102,10 +96,10 @@ namespace TSQR { typedef ConstMatView const_mat_view_type; //! Constructor - NodeTsqr() {} + NodeTsqr() = default; //! Virtual destructor, for memory safety of derived classes. - virtual ~NodeTsqr() {} + virtual ~NodeTsqr() = default; /// \brief Whether this object is ready to perform computations. /// diff --git a/packages/tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp b/packages/tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp index a9ee4da7602c..08172ac0a484 100644 --- a/packages/tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp @@ -34,29 +34,27 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_NodeTsqrFactory_hpp #define __TSQR_NodeTsqrFactory_hpp -#include -#include +#include "Tsqr_ConfigDefs.hpp" +#include "Kokkos_DefaultNode.hpp" #ifdef HAVE_KOKKOSTSQR_TBB -# include +# include "TbbTsqr.hpp" #endif // HAVE_KOKKOSTSQR_TBB -#include -#include +#include "Tsqr_KokkosNodeTsqr.hpp" +#include "Tsqr_SequentialTsqr.hpp" -#include -#include -#include -#include -#include +#include "Teuchos_ParameterList.hpp" +#include "Teuchos_ParameterListExceptions.hpp" +#include "Teuchos_RCP.hpp" +#include "Teuchos_ScalarTraits.hpp" +#include "Teuchos_TypeNameTraits.hpp" #include diff --git a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp b/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp index 08a6ba00868d..ddfa3e4c39e0 100644 --- a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp @@ -34,36 +34,29 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Test_DistTest_hpp #define __TSQR_Test_DistTest_hpp -#include -#include -#include - -#include -#include -#include -#include -#include - +#include "Tsqr_ConfigDefs.hpp" +#include "Tsqr_Random_NormalGenerator.hpp" +#include "Tsqr_verifyTimerConcept.hpp" +#include "Tsqr_generateStack.hpp" +#include "Tsqr_DistTsqr.hpp" +#include "Tsqr_GlobalTimeStats.hpp" +#include "Tsqr_GlobalVerify.hpp" +#include "Tsqr_printGlobalMatrix.hpp" #include #include #include #include - namespace TSQR { namespace Test { - /// \class DistTsqrVerifier /// \brief Generic version of \c DistTsqr accuracy test. 
- /// template class DistTsqrVerifier { TSQR::Random::NormalGenerator gen_; diff --git a/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp b/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp index aec3b6e51e61..a94f2a248ade 100644 --- a/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp @@ -34,22 +34,18 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_RMessenger_hpp #define __TSQR_RMessenger_hpp -#include -#include -#include - +#include "Tsqr_MatView.hpp" +#include "Tsqr_MessengerBase.hpp" +#include "Teuchos_RCP.hpp" #include #include - namespace TSQR { /// \class RMessenger @@ -66,6 +62,8 @@ namespace TSQR { typedef MessengerBase< Scalar > messenger_type; typedef Teuchos::RCP< messenger_type > messenger_ptr; + RMessenger () = delete; + /// \brief Constructor /// /// \param messenger [in/out] Pointer to the communicator wrapper. @@ -120,13 +118,9 @@ namespace TSQR { return *this; } - private: messenger_ptr messenger_; - std::vector< Scalar > buffer_; - - //! Default construction doesn't make sense, so we forbid it syntactically. - RMessenger (); + std::vector buffer_; /// \brief Buffer length as a function of R factor dimension. 
/// @@ -148,12 +142,11 @@ namespace TSQR { const Ordinal buf_length = buffer_length (ncols); buffer_.resize (buf_length); iter_type iter = buffer_.begin(); - for (view_ordinal_type j = 0; j < ncols; ++j) - { - const view_scalar_type* const R_j = &R(0,j); - std::copy (R_j, R_j + (j+1), iter); - iter += (j+1); - } + for (view_ordinal_type j = 0; j < ncols; ++j) { + const view_scalar_type* const R_j = &R(0,j); + std::copy (R_j, R_j + (j+1), iter); + iter += (j+1); + } } template @@ -165,11 +158,10 @@ namespace TSQR { const view_ordinal_type ncols = R.ncols(); const_iter_type iter = buffer_.begin(); - for (view_ordinal_type j = 0; j < ncols; ++j) - { - std::copy (iter, iter + (j+1), &R(0,j)); - iter += (j+1); - } + for (view_ordinal_type j = 0; j < ncols; ++j) { + std::copy (iter, iter + (j+1), &R(0,j)); + iter += (j+1); + } } }; diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp index 74a7bb6be61c..2e1bb2a9198e 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -45,10 +43,8 @@ #include "Tsqr_Matrix.hpp" #include "Tsqr_Random_MatrixGenerator.hpp" #include "Tsqr_RMessenger.hpp" - -#include -#include - +#include "Teuchos_BLAS.hpp" +#include "Teuchos_ScalarTraits.hpp" #include #include #include @@ -56,7 +52,6 @@ #include #include - namespace TSQR { namespace Random { diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp index 394c6303ec70..e2ac67fc3719 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp @@ -34,17 +34,15 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Random_MatrixGenerator_hpp #define __TSQR_Random_MatrixGenerator_hpp -#include -#include -#include +#include "Tsqr_Matrix.hpp" +#include "Teuchos_LAPACK.hpp" +#include "Teuchos_ScalarTraits.hpp" #include #include #include diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_NormalGenerator.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_NormalGenerator.hpp index f632ce33177d..1a862aa7d951 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_NormalGenerator.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_NormalGenerator.hpp @@ -34,15 +34,13 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Random_NormalGenerator_hpp #define __TSQR_Random_NormalGenerator_hpp -#include +#include "Teuchos_LAPACK.hpp" #include #include diff --git a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp b/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp index 2dd19dfaeaf3..a2fa8436f5a2 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp @@ -34,25 +34,19 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - +#include "Tsqr_SeqTest.hpp" +#include "Tsqr_Random_NormalGenerator.hpp" +#include "Tsqr_nodeTestProblem.hpp" +#include "Tsqr_verifyTimerConcept.hpp" +#include "Tsqr_LocalVerify.hpp" +#include "Tsqr_Matrix.hpp" +#include "Tsqr_SequentialTsqr.hpp" +#include "Tsqr_Util.hpp" +#include "Teuchos_LAPACK.hpp" +#include "Teuchos_Time.hpp" #include #include // size_t definition #include diff --git a/packages/tpetra/tsqr/src/Tsqr_SeqTest.hpp b/packages/tpetra/tsqr/src/Tsqr_SeqTest.hpp index 6dc6c4b6ef42..9f290c2e9c53 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SeqTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SeqTest.hpp @@ -34,24 +34,19 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Test_SeqTest_hpp #define __TSQR_Test_SeqTest_hpp -#include - +#include "Tsqr_ConfigDefs.hpp" #include // size_t definition #include #include - namespace TSQR { namespace Test { - /// \brief Test accuracy of SequentialTsqr. /// /// Test the accuracy of our sequential TSQR implementation diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp index 6e10db76e12c..ebd23b6b2850 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp @@ -34,22 +34,18 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Tsqr_SequentialCholeskyQR_hpp #define __TSQR_Tsqr_SequentialCholeskyQR_hpp -#include -#include -#include -#include - -#include -#include - +#include "Tsqr_MatView.hpp" +#include "Tsqr_CacheBlockingStrategy.hpp" +#include "Tsqr_CacheBlocker.hpp" +#include "Tsqr_Util.hpp" +#include "Teuchos_BLAS.hpp" +#include "Teuchos_LAPACK.hpp" #include #include #include diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp index 1f1e9852e2f5..707d51641707 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp @@ -34,31 +34,27 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER /// \file Tsqr_SequentialTsqr.hpp /// \brief Implementation of the sequential cache-blocked part of TSQR. -/// + #ifndef __TSQR_Tsqr_SequentialTsqr_hpp #define __TSQR_Tsqr_SequentialTsqr_hpp -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - +#include "Tsqr_ApplyType.hpp" +#include "Tsqr_Matrix.hpp" +#include "Tsqr_CacheBlockingStrategy.hpp" +#include "Tsqr_CacheBlocker.hpp" +#include "Tsqr_Combine.hpp" +#include "Tsqr_LocalVerify.hpp" +#include "Tsqr_NodeTsqr.hpp" +#include "Tsqr_Util.hpp" +#include "Teuchos_Describable.hpp" +#include "Teuchos_ParameterList.hpp" +#include "Teuchos_ParameterListExceptions.hpp" +#include "Teuchos_ScalarTraits.hpp" #include #include #include @@ -66,9 +62,7 @@ #include // std::pair #include - namespace TSQR { - /// \class SequentialTsqr /// \brief Sequential cache-blocked TSQR factorization. /// \author Mark Hoemmen @@ -116,28 +110,20 @@ namespace TSQR { /// are not currently thread safe. \c TbbTsqr uses SequentialTsqr /// in parallel to implement each thread's cache-blocked TSQR. /// This can be fixed as soon as RCPs are made thread safe. 
- /// template class SequentialTsqr : - public NodeTsqr > > + public NodeTsqr>> { public: - typedef LocalOrdinal ordinal_type; - typedef Scalar scalar_type; - - typedef MatView mat_view_type; - typedef ConstMatView const_mat_view_type; - - typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; - typedef typename NodeTsqr > >::factor_output_type FactorOutput; + using ordinal_type = LocalOrdinal; + using scalar_type = Scalar; + using mat_view_type = MatView; + using const_mat_view_type = ConstMatView; + using magnitude_type = typename Teuchos::ScalarTraits::magnitudeType; + using FactorOutput = typename NodeTsqr>>::factor_output_type; private: - typedef typename FactorOutput::const_iterator FactorOutputIter; - typedef typename FactorOutput::const_reverse_iterator FactorOutputReverseIter; - typedef std::pair block_pair_type; - typedef std::pair const_block_pair_type; - typedef Teuchos::BLAS blas_type; - /// \brief Factor the first cache block of the matrix. /// /// Compute the QR factorization of the first cache block A_top. @@ -356,14 +342,17 @@ namespace TSQR { size_t cacheSizeHint = 0; size_t sizeOfScalar = sizeof(Scalar); - try { + if (params->isType (cacheSizeHintName)) { cacheSizeHint = params->get (cacheSizeHintName); - } catch (InvalidParameter&) { + } + else { params->set (cacheSizeHintName, cacheSizeHint); } - try { + + if (params->isType (sizeOfScalarName)) { sizeOfScalar = params->get (sizeOfScalarName); - } catch (InvalidParameter&) { + } + else { params->set (sizeOfScalarName, sizeOfScalar); } @@ -395,7 +384,7 @@ namespace TSQR { /// /// See the \c NodeTsqr documentation for details. bool QR_produces_R_factor_with_nonnegative_diagonal () const { - typedef Combine combine_type; + using combine_type = Combine; return combine_type::QR_produces_R_factor_with_nonnegative_diagonal(); } @@ -463,13 +452,12 @@ namespace TSQR { mat_view_type R_view = factor_first_block (combine, A_cur, tau_first, work); tau_arrays.push_back (tau_first); - while (! 
A_rest.empty()) - { - A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); - std::vector tau (ncols); - combine_factor (combine, R_view, A_cur, tau, work); - tau_arrays.push_back (tau); - } + while (! A_rest.empty()) { + A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); + std::vector tau (ncols); + combine_factor (combine, R_view, A_cur, tau, work); + tau_arrays.push_back (tau); + } return tau_arrays; } @@ -541,13 +529,12 @@ namespace TSQR { mat_view_type R_view = factor_first_block (combine, A_cur, tau_first, work); tau_arrays.push_back (tau_first); - while (! A_rest.empty()) - { - A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); - std::vector< Scalar > tau (ncols); - combine_factor (combine, R_view, A_cur, tau, work); - tau_arrays.push_back (tau); - } + while (! A_rest.empty()) { + A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); + std::vector tau (ncols); + combine_factor (combine, R_view, A_cur, tau, work); + tau_arrays.push_back (tau); + } // Copy the R factor resulting from the factorization out of // R_view (a view of the topmost cache block of A) into the R @@ -588,17 +575,17 @@ namespace TSQR { LocalOrdinal count = 0; const_mat_view_type A_rest (nrows, ncols, A, lda); - if (A_rest.empty()) + if (A_rest.empty()) { return count; + } const_mat_view_type A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); ++count; // first factor step - while (! A_rest.empty()) - { - A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); - ++count; // next factor step - } + while (! 
A_rest.empty()) { + A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); + ++count; // next factor step + } return count; } @@ -618,20 +605,19 @@ namespace TSQR { const bool contiguous_cache_blocks) const { // Quick exit and error tests - if (ncols_Q == 0 || ncols_C == 0 || nrows == 0) + if (ncols_Q == 0 || ncols_C == 0 || nrows == 0) { return; - else if (ldc < nrows) - { - std::ostringstream os; - os << "SequentialTsqr::apply: ldc (= " << ldc << ") < nrows (= " << nrows << ")"; - throw std::invalid_argument (os.str()); - } - else if (ldq < nrows) - { - std::ostringstream os; - os << "SequentialTsqr::apply: ldq (= " << ldq << ") < nrows (= " << nrows << ")"; - throw std::invalid_argument (os.str()); - } + } + else if (ldc < nrows) { + std::ostringstream os; + os << "SequentialTsqr::apply: ldc (= " << ldc << ") < nrows (= " << nrows << ")"; + throw std::invalid_argument (os.str()); + } + else if (ldq < nrows) { + std::ostringstream os; + os << "SequentialTsqr::apply: ldq (= " << ldq << ") < nrows (= " << nrows << ")"; + throw std::invalid_argument (os.str()); + } // If contiguous cache blocks are used, then we have to use the // same convention as we did for factor(). Otherwise, we are @@ -663,40 +649,36 @@ namespace TSQR { // not modified. mat_view_type C_top = blocker.top_block (C_rest, contiguous_cache_blocks); - if (transposed) - { - const_mat_view_type Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks); - mat_view_type C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks); - - // Apply the topmost block of Q. - FactorOutputIter tau_iter = tau_arrays.begin(); - const std::vector& tau = *tau_iter++; - apply_first_block (combine, apply_type, Q_cur, tau, C_cur, work); - - while (! 
Q_rest.empty()) - { - Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks); - C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks); - combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work); - } + if (transposed) { + const_mat_view_type Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks); + mat_view_type C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks); + + // Apply the topmost block of Q. + auto tau_iter = tau_arrays.begin(); + const std::vector& tau = *tau_iter++; + apply_first_block (combine, apply_type, Q_cur, tau, C_cur, work); + + while (! Q_rest.empty()) { + Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks); + C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks); + combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work); } - else - { - // Start with the last local Q factor and work backwards up the matrix. - FactorOutputReverseIter tau_iter = tau_arrays.rbegin(); - - const_mat_view_type Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks); - mat_view_type C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks); - - while (! Q_rest.empty()) - { - combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work); - Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks); - C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks); - } - // Apply to last (topmost) cache block. - apply_first_block (combine, apply_type, Q_cur, *tau_iter++, C_cur, work); + } + else { + // Start with the last local Q factor and work backwards up the matrix. + auto tau_iter = tau_arrays.rbegin(); + + const_mat_view_type Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks); + mat_view_type C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks); + + while (! 
Q_rest.empty()) { + combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work); + Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks); + C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks); } + // Apply to last (topmost) cache block. + apply_first_block (combine, apply_type, Q_cur, *tau_iter++, C_cur, work); + } } /// \brief Compute the explicit Q factor from the result of factor(). @@ -725,8 +707,9 @@ namespace TSQR { // itself contains the first ncols_C columns of the identity // matrix. fill_with_zeros (nrows, ncols_C, C, ldc, contiguous_cache_blocks); - for (LocalOrdinal j = 0; j < ncols_C; ++j) - C_top(j, j) = Scalar(1); + for (LocalOrdinal j = 0; j < ncols_C; ++j) { + C_top(j, j) = Scalar(1.0); + } // Apply the Q factor to C, to extract the first ncols_C columns // of Q in explicit form. @@ -764,7 +747,8 @@ namespace TSQR { // restructuring of this code would parallelize nicely using // OpenMP. CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); - blas_type blas; + + Teuchos::BLAS blas; mat_view_type Q_rest (nrows, ncols, Q, ldq); Matrix Q_cur_copy (LocalOrdinal(0), LocalOrdinal(0)); // will be resized @@ -779,8 +763,8 @@ namespace TSQR { deep_copy (Q_cur_copy, Q_cur); // Q_cur := Q_cur_copy * B. blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.nrows (), ncols, ncols, - Scalar (1), Q_cur_copy.get (), Q_cur_copy.lda (), B, ldb, - Scalar (0), Q_cur.get (), Q_cur.lda ()); + Scalar (1.0), Q_cur_copy.get (), Q_cur_copy.lda (), + B, ldb, Scalar (0.0), Q_cur.get (), Q_cur.lda ()); } } @@ -856,7 +840,6 @@ namespace TSQR { } protected: - /// \brief Return the topmost cache block of the matrix C. 
/// /// NodeTsqr's top_block() method must be implemented using diff --git a/packages/tpetra/tsqr/src/Tsqr_StatTimeMonitor.cpp b/packages/tpetra/tsqr/src/Tsqr_StatTimeMonitor.cpp index 96ef5032bd6a..bddfceaf9008 100644 --- a/packages/tpetra/tsqr/src/Tsqr_StatTimeMonitor.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_StatTimeMonitor.cpp @@ -35,13 +35,11 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER */ -#include +#include "Tsqr_StatTimeMonitor.hpp" namespace TSQR { @@ -61,20 +59,4 @@ namespace TSQR { stats_.update (curTime); } -#if 0 - /// \brief Return total elapsed time of a particular timer - /// - /// Return the total elapsed time of a particular timer. - /// Ensures that the timer is not running (which would break - /// totalElapsedTime()). - static double - fetchTime (const Teuchos::RCP< Teuchos::Time >& timer) - { - if (timer->isRunning()) - timer->stop(); - return timer->totalElapsedTime(); - } -#endif // 0 - - } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_StatTimeMonitor.hpp b/packages/tpetra/tsqr/src/Tsqr_StatTimeMonitor.hpp index 713c04c97f9a..d3db20a1a926 100644 --- a/packages/tpetra/tsqr/src/Tsqr_StatTimeMonitor.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_StatTimeMonitor.hpp @@ -35,8 +35,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER */ @@ -44,8 +42,8 @@ #ifndef __TSQR_StatTimeMonitor_hpp #define __TSQR_StatTimeMonitor_hpp -#include -#include +#include "Teuchos_Time.hpp" +#include "Tsqr_TimeStats.hpp" namespace TSQR { @@ -58,7 +56,6 @@ namespace TSQR { /// /// \note Implementers: You may safely add new statistics to /// TimeStats without needing to change this class. - /// class StatTimeMonitor { public: /// \brief Constructor diff --git a/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp index d610b2c67a8f..a34e69f494a3 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp @@ -34,30 +34,26 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Test_TbbTest_hpp #define __TSQR_Test_TbbTest_hpp -#include -#include -#include - +#include "Tsqr_nodeTestProblem.hpp" +#include "Tsqr_verifyTimerConcept.hpp" +#include "Tsqr_Random_NormalGenerator.hpp" -#include -#include -#include -#include +#include "Tsqr_LocalVerify.hpp" +#include "Tsqr_Matrix.hpp" +#include "Tsqr_Util.hpp" +#include "TbbTsqr.hpp" -#include -#include +#include "Teuchos_LAPACK.hpp" +#include "Teuchos_Time.hpp" #include #include // size_t definition -//#include #include #include #include @@ -71,16 +67,12 @@ using std::cerr; using std::cout; using std::endl; -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - namespace TSQR { namespace Test { - /// Test the accuracy of Intel TBB TSQR on an nrows by ncols /// matrix (using the given number of cores and the given cache /// block 
size (in bytes)), and print the results to stdout. - template< class Ordinal, class Scalar > + template void verifyTbbTsqr (const std::string& scalarTypeName, TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator, diff --git a/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp b/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp index f81ba3b9b00d..b514294b4436 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos: Node API and Parallel Node Kernels // Copyright (2008) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -34,37 +34,32 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_TestSetup_hpp #define __TSQR_TestSetup_hpp -#include -#include -#include -#include - +#include "Tsqr_MessengerBase.hpp" +#include "Tsqr_Random_GlobalMatrix.hpp" +#include "Tsqr_Matrix.hpp" +#include "Teuchos_ScalarTraits.hpp" #include - -namespace TSQR { +namespace TSQR { namespace Test { - template - Ordinal - numLocalRows (const Ordinal nrowsGlobal, - const CommOrdinal myRank, - const CommOrdinal nprocs) + Ordinal + numLocalRows (const Ordinal nrowsGlobal, + const CommOrdinal myRank, + const CommOrdinal nprocs) { const Ordinal nrowsLocal = nrowsGlobal / Ordinal(nprocs); const Ordinal remainder = nrowsGlobal - nrowsLocal * Ordinal(nprocs); if (myRank != nprocs - 1) - return nrowsLocal; + return nrowsLocal; else - return nrowsLocal + remainder; + return nrowsLocal + remainder; } /// \param generator [in/out] Proc 0 is the only MPI process that @@ -76,9 +71,9 @@ namespace TSQR { template void distributedTestProblem (Generator& generator, - MatrixViewType& A_local, - MessengerBase* const ordinalComm, - MessengerBase* const scalarComm) + MatrixViewType& A_local, + MessengerBase* const ordinalComm, + MessengerBase* const scalarComm) { typedef typename MatrixViewType::ordinal_type ordinal_type; typedef typename MatrixViewType::scalar_type scalar_type; @@ -87,33 +82,31 @@ namespace TSQR { const int myRank = scalarComm->rank(); const ordinal_type ncols = A_local.ncols(); - if (myRank == 0) - { - // Generate some singular values for the test problem. - std::vector< magnitude_type > singular_values (ncols); - singular_values[0] = 1.0; - for (ordinal_type k = 1; k < ncols; ++k) - singular_values[k] = singular_values[k-1] / double(2); + if (myRank == 0) { + // Generate some singular values for the test problem. 
+ std::vector singular_values (ncols); + singular_values[0] = 1.0; + for (ordinal_type k = 1; k < ncols; ++k) + singular_values[k] = singular_values[k-1] / double(2); - // Generate the test problem. All MPI processes - // participate, but only Proc 0 generates the (pseudo)random - // numbers. - TSQR::Random::randomGlobalMatrix (&generator, A_local, - &singular_values[0], ordinalComm, - scalarComm); - } - else - { - // This helps C++ deduce the type; the values aren't read on - // this proc. - magnitude_type singular_values[1]; + // Generate the test problem. All MPI processes + // participate, but only Proc 0 generates the (pseudo)random + // numbers. + TSQR::Random::randomGlobalMatrix (&generator, A_local, + &singular_values[0], ordinalComm, + scalarComm); + } + else { + // This helps C++ deduce the type; the values aren't read on + // this proc. + magnitude_type singular_values[1]; - // All MPI processes participate in the distribution of the - // test matrix. - TSQR::Random::randomGlobalMatrix (&generator, A_local, - &singular_values[0], ordinalComm, - scalarComm); - } + // All MPI processes participate in the distribution of the + // test matrix. + TSQR::Random::randomGlobalMatrix (&generator, A_local, + &singular_values[0], ordinalComm, + scalarComm); + } } } // namespace Test } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_TestUtils.hpp b/packages/tpetra/tsqr/src/Tsqr_TestUtils.hpp index 3fbdbd126344..14a971477dd9 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TestUtils.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TestUtils.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER diff --git a/packages/tpetra/tsqr/src/Tsqr_TeuchosMessenger.hpp b/packages/tpetra/tsqr/src/Tsqr_TeuchosMessenger.hpp index ede5ac79077e..1abbe209995f 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TeuchosMessenger.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TeuchosMessenger.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos: Node API and Parallel Node Kernels // Copyright (2008) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -34,22 +34,17 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_TeuchosMessenger_hpp #define __TSQR_TeuchosMessenger_hpp -#include -#include - +#include "Teuchos_CommHelpers.hpp" +#include "Tsqr_MessengerBase.hpp" #include - -namespace TSQR { - +namespace TSQR { /// \class TeuchosMessenger /// \brief Communication object for TSQR /// @@ -63,13 +58,13 @@ namespace TSQR { template class TeuchosMessenger : public MessengerBase { public: - typedef Teuchos::RCP > comm_ptr; + typedef Teuchos::RCP> comm_ptr; //! Constructor, taking the communicator object to wrap. TeuchosMessenger (const comm_ptr& pComm) : pComm_ (pComm) {} //! Virtual destructor for memory safety of derived classes. - virtual ~TeuchosMessenger() {} + virtual ~TeuchosMessenger() = default; /// \brief Send sendData[0:sendCount-1] to process destProc. 
/// @@ -77,11 +72,11 @@ namespace TSQR { /// \param sendCount [in] Number of elements in the array /// \param destProc [in] Rank of destination process /// \param tag [in] MPI tag (ignored) - void - send (const Datum sendData[], - const int sendCount, - const int destProc, - const int tag) + void + send (const Datum sendData[], + const int sendCount, + const int destProc, + const int tag) { // NOTE (mfh 14 June 2010): Teuchos generates "tag" arguments to // MPI calls internally, so we ignore the tag here. I don't use @@ -95,11 +90,11 @@ namespace TSQR { /// \param recvCount [in] Number of elements to receive in the array /// \param srcProc [in] Rank of sending process /// \param tag [in] MPI tag (ignored) - void - recv (Datum recvData[], - const int recvCount, - const int srcProc, - const int tag) + void + recv (Datum recvData[], + const int recvCount, + const int srcProc, + const int tag) { // NOTE (mfh 14 June 2010): Teuchos generates "tag" arguments to // MPI calls internally, so we ignore the tag here. I don't use @@ -125,138 +120,132 @@ namespace TSQR { /// this process is sending data, and from which this process is /// receiving data) /// \param tag [in] MPI tag (ignored) - void - swapData (const Datum sendData[], - Datum recvData[], - const int sendRecvCount, - const int destProc, - const int tag) + void + swapData (const Datum sendData[], + Datum recvData[], + const int sendRecvCount, + const int destProc, + const int tag) { - if (destProc == rank()) - { - // If the sending and receiving processes are the same, - // then all we need to do is copy the data. Hopefully in - // that case you aren't aliasing. std::copy assumes that - // the third argument does not point to an element in the - // range of the first two arguments. 
- std::copy (sendData, sendData+sendRecvCount, recvData); - } - else - { - using Teuchos::RCP; - using Teuchos::ArrayRCP; - using Teuchos::CommRequest; + if (destProc == rank()) { + // If the sending and receiving processes are the same, + // then all we need to do is copy the data. Hopefully in + // that case you aren't aliasing. std::copy assumes that + // the third argument does not point to an element in the + // range of the first two arguments. + std::copy (sendData, sendData+sendRecvCount, recvData); + } + else { + using Teuchos::RCP; + using Teuchos::ArrayRCP; + using Teuchos::CommRequest; - const int srcProc = Teuchos::rank (*pComm_); + const int srcProc = Teuchos::rank (*pComm_); - // If we can prove that sendData and recvData don't alias - // one another, use an isend and an ireceive to exchange - // them. (Our test may not necessarily be safe in general, - // since we only check whether the pointers are equal and - // not whether the arrays overlap. However, it is safe for - // the specific case of TSQR.) - // - // Otherwise, if the arrays do alias one another, safely - // perform a send and then a receive (or a receive and then - // a send, depending on whether this MPI process is the - // source or destination process). - // - // (It would be nice if Teuchos had a sendRecv() routine, as - // of summer 2010 when this code was written. As it stands, - // we have to do a send and then a receive.) - if (sendData == recvData) - { - // The smaller-rank process sends first, and the - // larger-rank process receives first. - // - // Teuchos::send() and Teuchos::recv() are blocking, - // so we may safely write to recvBuf even if it - // aliases sendBuf. 
- if (srcProc < destProc) - { - Teuchos::send (*pComm_, sendRecvCount, sendData, destProc); - Teuchos::receive (*pComm_, destProc, sendRecvCount, recvData); - } - else - { - Teuchos::receive (*pComm_, destProc, sendRecvCount, recvData); - Teuchos::send (*pComm_, sendRecvCount, sendData, destProc); - } - } - else - { - ArrayRCP sendBuf (sendData, 0, sendRecvCount, false); - ArrayRCP recvBuf (recvData, 0, sendRecvCount, false); + // If we can prove that sendData and recvData don't alias one + // another, use an isend and an ireceive to exchange them. + // (Our test may not necessarily be safe in general, since we + // only check whether the pointers are equal and not whether + // the arrays overlap. However, it is safe for the specific + // case of TSQR.) + // + // Otherwise, if the arrays do alias one another, safely + // perform a send and then a receive (or a receive and then a + // send, depending on whether this MPI process is the source + // or destination process). + // + // (It would be nice if Teuchos had a sendRecv() routine, as + // of summer 2010 when this code was written. As it stands, + // we have to do a send and then a receive.) + if (sendData == recvData) { + // The smaller-rank process sends first, and the + // larger-rank process receives first. + // + // Teuchos::send() and Teuchos::recv() are blocking, + // so we may safely write to recvBuf even if it + // aliases sendBuf. 
+ if (srcProc < destProc) { + Teuchos::send (*pComm_, sendRecvCount, sendData, destProc); + Teuchos::receive (*pComm_, destProc, sendRecvCount, recvData); + } + else + { + Teuchos::receive (*pComm_, destProc, sendRecvCount, recvData); + Teuchos::send (*pComm_, sendRecvCount, sendData, destProc); + } + } + else { + ArrayRCP sendBuf (sendData, 0, sendRecvCount, false); + ArrayRCP recvBuf (recvData, 0, sendRecvCount, false); - RCP > sendReq, recvReq; - if (srcProc < destProc) - { - sendReq = Teuchos::isend (*pComm_, sendBuf, destProc); - recvReq = Teuchos::ireceive (*pComm_, recvBuf, destProc); - } - else - { - recvReq = Teuchos::ireceive (*pComm_, recvBuf, destProc); - sendReq = Teuchos::isend (*pComm_, sendBuf, destProc); - } - // Wait on both the send and the receive to complete. The - // two can happen independently, because sendBuf and recvBuf - // are different. (We assert no aliasing of buffers here, - // and we've also checked above that destProc != rank().) - Teuchos::waitAll (*pComm_, Teuchos::tuple (sendReq, recvReq)); - } - } + RCP > sendReq, recvReq; + if (srcProc < destProc) { + sendReq = Teuchos::isend (*pComm_, sendBuf, destProc); + recvReq = Teuchos::ireceive (*pComm_, recvBuf, destProc); + } + else + { + recvReq = Teuchos::ireceive (*pComm_, recvBuf, destProc); + sendReq = Teuchos::isend (*pComm_, sendBuf, destProc); + } + // Wait on both the send and the receive to complete. The + // two can happen independently, because sendBuf and recvBuf + // are different. (We assert no aliasing of buffers here, + // and we've also checked above that destProc != rank().) + Teuchos::waitAll (*pComm_, Teuchos::tuple (sendReq, recvReq)); + } + } } //! Sum inDatum on all processors, and return the result. 
- Datum - globalSum (const Datum& inDatum) + Datum + globalSum (const Datum& inDatum) { Datum outDatum; - Teuchos::reduceAll (*pComm_, Teuchos::REDUCE_SUM, inDatum, - Teuchos::outArg(outDatum)); + Teuchos::reduceAll (*pComm_, Teuchos::REDUCE_SUM, inDatum, + Teuchos::outArg(outDatum)); return outDatum; } /// \brief Compute the global minimum over all processors. /// /// Assumes that Datum objects are less-than comparable. - Datum + Datum globalMin (const Datum& inDatum) { Datum outDatum; - Teuchos::reduceAll (*pComm_, Teuchos::REDUCE_MIN, inDatum, - Teuchos::outArg(outDatum)); + Teuchos::reduceAll (*pComm_, Teuchos::REDUCE_MIN, inDatum, + Teuchos::outArg(outDatum)); return outDatum; } /// \brief Compute the global maximum over all processors. /// /// Assumes that Datum objects are less-than comparable. - Datum + Datum globalMax (const Datum& inDatum) { Datum outDatum; - Teuchos::reduceAll (*pComm_, Teuchos::REDUCE_MAX, inDatum, - Teuchos::outArg(outDatum)); + Teuchos::reduceAll (*pComm_, Teuchos::REDUCE_MAX, inDatum, + Teuchos::outArg(outDatum)); return outDatum; } //! Sum inData[0:count-1] over all processors into outData. void - globalVectorSum (const Datum inData[], - Datum outData[], - const int count) + globalVectorSum (const Datum inData[], + Datum outData[], + const int count) { - Teuchos::reduceAll (*pComm_, Teuchos::REDUCE_SUM, count, - inData, outData); + Teuchos::reduceAll (*pComm_, Teuchos::REDUCE_SUM, count, + inData, outData); } //! Broadcast data[0:count-1] from root to all processors. void - broadcast (Datum data[], - const int count, - const int root) + broadcast (Datum data[], + const int count, + const int root) { Teuchos::broadcast (*pComm_, root, count, data); } @@ -271,8 +260,6 @@ namespace TSQR { void barrier () const { Teuchos::barrier (*pComm_); } private: - - //! Shared pointer to the the underlying communicator object. 
comm_ptr pComm_; }; } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_TimeStats.cpp b/packages/tpetra/tsqr/src/Tsqr_TimeStats.cpp index 0565998f5ecb..e0eedc473910 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TimeStats.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_TimeStats.cpp @@ -35,13 +35,11 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER */ -#include +#include "Tsqr_TimeStats.hpp" #include namespace TSQR { diff --git a/packages/tpetra/tsqr/src/Tsqr_TimeStats.hpp b/packages/tpetra/tsqr/src/Tsqr_TimeStats.hpp index 2af946aa6236..98c8dfcc8554 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TimeStats.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TimeStats.hpp @@ -35,8 +35,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER */ diff --git a/packages/tpetra/tsqr/src/Tsqr_TrivialMessenger.hpp b/packages/tpetra/tsqr/src/Tsqr_TrivialMessenger.hpp index 824b728cfcd2..5e7b2c544f43 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TrivialMessenger.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TrivialMessenger.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos: Node API and Parallel Node Kernels // Copyright (2008) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. 
-// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -34,24 +34,19 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_TrivialMessenger_hpp #define __TSQR_TrivialMessenger_hpp -#include - +#include "Tsqr_MessengerBase.hpp" #include #include #include #include - -namespace TSQR { - +namespace TSQR { /// \class TrivialMessenger /// \brief Noncommunicating "communication" object for TSQR. /// @@ -66,10 +61,10 @@ namespace TSQR { class TrivialMessenger : public MessengerBase { public: //! Trivial / default constructor, since no member data. - TrivialMessenger () {} + TrivialMessenger () = default; //! Virtual destructor for memory safety of derived classes. - virtual ~TrivialMessenger() {} + virtual ~TrivialMessenger() = default; /// \brief Send sendData[0:sendCount-1] to process destProc. /// @@ -77,11 +72,11 @@ namespace TSQR { /// \param sendCount [in] Number of elements in the array /// \param destProc [in] Rank of destination process /// \param tag [in] MPI tag (ignored) - void - send (const Datum sendData[], - const int sendCount, - const int destProc, - const int tag) + void + send (const Datum sendData[], + const int sendCount, + const int destProc, + const int tag) {} /// \brief Receive recvData[0:recvCount-1] from process srcProc. 
@@ -90,11 +85,11 @@ namespace TSQR { /// \param recvCount [in] Number of elements to receive in the array /// \param srcProc [in] Rank of sending process /// \param tag [in] MPI tag (ignored) - void - recv (Datum recvData[], - const int recvCount, - const int srcProc, - const int tag) + void + recv (Datum recvData[], + const int recvCount, + const int srcProc, + const int tag) {} /// \brief Exchange data between processors. @@ -113,36 +108,36 @@ namespace TSQR { /// this process is sending data, and from which this process is /// receiving data) /// \param tag [in] MPI tag (ignored) - void - swapData (const Datum sendData[], - Datum recvData[], - const int sendRecvCount, - const int destProc, - const int tag) + void + swapData (const Datum sendData[], + Datum recvData[], + const int sendRecvCount, + const int destProc, + const int tag) { if (destProc != rank()) - { - std::ostringstream os; - os << "Destination rank " << destProc << " is invalid. The only " - << "valid rank for TSQR::TrivialMessenger is 0 (zero)."; - throw std::invalid_argument (os.str()); - } + { + std::ostringstream os; + os << "Destination rank " << destProc << " is invalid. The only " + << "valid rank for TSQR::TrivialMessenger is 0 (zero)."; + throw std::invalid_argument (os.str()); + } else if (sendRecvCount < 0) - { - std::ostringstream os; - os << "sendRecvCount = " << sendRecvCount << " is invalid: " - << "only nonnegative values are allowed."; - throw std::invalid_argument (os.str()); - } + { + std::ostringstream os; + os << "sendRecvCount = " << sendRecvCount << " is invalid: " + << "only nonnegative values are allowed."; + throw std::invalid_argument (os.str()); + } else if (sendRecvCount == 0) - return; // No data to exchange - else - safeCopy (sendData, recvData, sendRecvCount); + return; // No data to exchange + else + safeCopy (sendData, recvData, sendRecvCount); } //! Sum inDatum on all processors, and return the result. 
- Datum - globalSum (const Datum& inDatum) + Datum + globalSum (const Datum& inDatum) { Datum outDatum (inDatum); return outDatum; @@ -151,7 +146,7 @@ namespace TSQR { /// \brief Compute the global minimum over all processors. /// /// Assumes that Datum objects are less-than comparable. - Datum + Datum globalMin (const Datum& inDatum) { Datum outDatum (inDatum); @@ -161,7 +156,7 @@ namespace TSQR { /// \brief Compute the global maximum over all processors. /// /// Assumes that Datum objects are less-than comparable. - Datum + Datum globalMax (const Datum& inDatum) { Datum outDatum (inDatum); @@ -170,18 +165,18 @@ namespace TSQR { //! Sum inData[0:count-1] over all processors into outData. void - globalVectorSum (const Datum inData[], - Datum outData[], - const int count) + globalVectorSum (const Datum inData[], + Datum outData[], + const int count) { safeCopy (inData, outData, count); } //! Broadcast data[0:count-1] from root to all processors. void - broadcast (Datum data[], - const int count, - const int root) + broadcast (Datum data[], + const int count, + const int root) {} //! Return this process' rank. @@ -196,30 +191,30 @@ namespace TSQR { private: /// \brief Copy count elements of inData into outData. - /// + /// /// Attempt to detect aliasing, and use a method appropriate for /// either the nonaliased or the aliased case. void safeCopy (const Datum inData[], - Datum outData[], - const int count) + Datum outData[], + const int count) { // Check for nonaliasing of inData and outData. if (&inData[count-1] < &outData[0] || - &outData[count-1] < &inData[0]) - // The arrays don't overlap, so we can call std::copy. - // std::copy assumes that the third argument does not - // point to an element in the range of the first two - // arguments. - std::copy (inData, inData+count, outData); + &outData[count-1] < &inData[0]) + // The arrays don't overlap, so we can call std::copy. 
+ // std::copy assumes that the third argument does not + // point to an element in the range of the first two + // arguments. + std::copy (inData, inData+count, outData); else - { - // If inData and outData do alias one another, use - // the buffer as intermediate scratch space. - buf_.resize (count); - std::copy (inData, inData+count, buf_.begin()); - std::copy (buf_.begin(), buf_.end(), outData); - } + { + // If inData and outData do alias one another, use + // the buffer as intermediate scratch space. + buf_.resize (count); + std::copy (inData, inData+count, buf_.begin()); + std::copy (buf_.begin(), buf_.end(), outData); + } } /// Buffer to guard against incorrect behavior for aliased arrays. diff --git a/packages/tpetra/tsqr/src/Tsqr_TrivialTimer.cpp b/packages/tpetra/tsqr/src/Tsqr_TrivialTimer.cpp index aca1b666977a..ea14574286d5 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TrivialTimer.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_TrivialTimer.cpp @@ -35,14 +35,12 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER */ -#include -#include +#include "Tsqr_TrivialTimer.hpp" +#include "Tsqr_verifyTimerConcept.hpp" namespace TSQR { diff --git a/packages/tpetra/tsqr/src/Tsqr_TrivialTimer.hpp b/packages/tpetra/tsqr/src/Tsqr_TrivialTimer.hpp index 3750a489afdb..2e600a8cc7b9 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TrivialTimer.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TrivialTimer.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos: Node API and Parallel Node Kernels // Copyright (2008) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. 
-// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -35,8 +35,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER */ @@ -46,15 +44,11 @@ #include -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - namespace TSQR { - /// \class TrivialTimer /// \brief Satisfies TimerType concept trivially. /// - /// This is a "prototype" for the TimerType concept; it satisfies + /// This is an "archetype" for the TimerType concept; it satisfies /// the concept trivially. class TrivialTimer { public: @@ -70,7 +64,7 @@ namespace TSQR { /// actually return valid times. However, it satisfies our /// TimerType concept. void start (bool reset = false); - + //! Stop the timer and return (fake) elapsed time. double stop (); @@ -91,7 +85,7 @@ namespace TSQR { /// The \c stop() method computes a fake timing result based on /// the counter value. size_t counter_; - + //! Whether this timer is running bool isRunning_; diff --git a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp index e94e2c98815c..201d8a0db6c3 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp @@ -34,33 +34,28 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Test_TsqrTest_hpp #define __TSQR_Test_TsqrTest_hpp -#include +#include "Tsqr.hpp" #ifdef HAVE_KOKKOSTSQR_TBB -# include +# include "TbbTsqr.hpp" #endif // HAVE_KOKKOSTSQR_TBB -#include -#include -#include -#include -#include - +#include "Tsqr_TestSetup.hpp" +#include "Tsqr_GlobalVerify.hpp" +#include "Tsqr_printGlobalMatrix.hpp" +#include "Tsqr_verifyTimerConcept.hpp" +#include "Teuchos_ScalarTraits.hpp" #include // size_t #include #include #include - namespace TSQR { namespace Test { - template class TsqrVerifier { public: diff --git a/packages/tpetra/tsqr/src/Tsqr_Util.hpp b/packages/tpetra/tsqr/src/Tsqr_Util.hpp index 1576e23a592c..9224e4e5bfd8 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Util.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Util.hpp @@ -34,19 +34,16 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER /// \file Tsqr_Util.hpp /// \brief Utilities for TSQR (the Tall Skinny QR factorization) -/// #ifndef __TSQR_Tsqr_Util_hpp #define __TSQR_Tsqr_Util_hpp -#include +#include "Teuchos_ScalarTraits.hpp" #ifdef HAVE_KOKKOSTSQR_COMPLEX # include @@ -55,7 +52,6 @@ #include #include - namespace TSQR { /// \class ScalarPrinter diff --git a/packages/tpetra/tsqr/src/Tsqr_generateStack.hpp b/packages/tpetra/tsqr/src/Tsqr_generateStack.hpp index e2f54bd95db8..a432dc4d6962 100644 --- a/packages/tpetra/tsqr/src/Tsqr_generateStack.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_generateStack.hpp @@ -34,28 +34,24 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Test_generateStack_hpp #define __TSQR_Test_generateStack_hpp -#include -#include -#include -#include +#include "Tsqr_Matrix.hpp" +#include "Tsqr_Util.hpp" +#include "Tsqr_Random_MatrixGenerator.hpp" +#include "Tsqr_RMessenger.hpp" #include #include #include #include - namespace TSQR { namespace Test { - /// \brief Generate a random "R stack" test problem on one MPI process. /// /// Generate a (pseudo)random test problem consisting of numProcs diff --git a/packages/tpetra/tsqr/src/Tsqr_nodeTestProblem.hpp b/packages/tpetra/tsqr/src/Tsqr_nodeTestProblem.hpp index 916f59669333..10e7a03f442c 100644 --- a/packages/tpetra/tsqr/src/Tsqr_nodeTestProblem.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_nodeTestProblem.hpp @@ -34,20 +34,17 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __TSQR_Test_nodeTestProblem_hpp #define __TSQR_Test_nodeTestProblem_hpp -#include -#include +#include "Tsqr_Random_MatrixGenerator.hpp" +#include "Teuchos_ScalarTraits.hpp" #include #include - namespace TSQR { namespace Test { diff --git a/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp b/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp index 6ada217fbabc..ec9cdaa39a82 100644 --- a/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp @@ -34,26 +34,21 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER #ifndef __Tsqr_printGlobalMatrix_hpp #define __Tsqr_printGlobalMatrix_hpp -#include -#include -#include -#include - +#include "Tsqr_MessengerBase.hpp" +#include "Tsqr_Util.hpp" +#include "Tsqr_Matrix.hpp" +#include "Teuchos_ScalarTraits.hpp" #include #include #include - namespace TSQR { - /// \fn printGlobalMatrix /// /// Print a dense matrix distributed in block row fashion among all diff --git a/packages/tpetra/tsqr/src/Tsqr_verifyTimerConcept.hpp b/packages/tpetra/tsqr/src/Tsqr_verifyTimerConcept.hpp index bd9292749ec0..09d70eb3eec5 100644 --- a/packages/tpetra/tsqr/src/Tsqr_verifyTimerConcept.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_verifyTimerConcept.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos: Node API and Parallel Node Kernels // Copyright (2008) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -44,9 +42,6 @@ #include -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - namespace TSQR { namespace Test { /// \function verifyTimerConcept @@ -94,27 +89,27 @@ namespace TSQR { std::string timerName = timer.name(); if (timerName != "NameOfTimer") - throw std::logic_error ("TimerType does not correctly store the timer name"); + throw std::logic_error ("TimerType does not correctly store the timer name"); // Test default argument of start() if (timer.isRunning()) - throw std::logic_error ("TimerType does not correctly initialize isRunning"); + throw std::logic_error ("TimerType does not correctly initialize isRunning"); timer.start (); if (! timer.isRunning()) - throw std::logic_error ("TimerType does not correctly set isRunning"); + throw std::logic_error ("TimerType does not correctly set isRunning"); double result1 = timer.stop(); if (timer.isRunning()) - throw std::logic_error ("TimerType does not correctly reset isRunning"); + throw std::logic_error ("TimerType does not correctly reset isRunning"); // Test nondefault argument of start() if (timer.isRunning()) - throw std::logic_error ("TimerType does not correctly initialize isRunning"); + throw std::logic_error ("TimerType does not correctly initialize isRunning"); timer.start (true); if (! 
timer.isRunning()) - throw std::logic_error ("TimerType does not correctly set isRunning"); + throw std::logic_error ("TimerType does not correctly set isRunning"); double result2 = timer.stop(); if (timer.isRunning()) - throw std::logic_error ("TimerType does not correctly reset isRunning"); + throw std::logic_error ("TimerType does not correctly reset isRunning"); return result1 + result2; } diff --git a/packages/tpetra/tsqr/test/Tsqr_TestCombine.cpp b/packages/tpetra/tsqr/test/Tsqr_TestCombine.cpp index 8a3b848b2622..9e1344065d38 100644 --- a/packages/tpetra/tsqr/test/Tsqr_TestCombine.cpp +++ b/packages/tpetra/tsqr/test/Tsqr_TestCombine.cpp @@ -34,12 +34,10 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER -#include +#include "Tsqr_ConfigDefs.hpp" #include "Teuchos_ConfigDefs.hpp" // HAVE_MPI #include "Teuchos_Tuple.hpp" #ifdef HAVE_MPI diff --git a/packages/tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp index cbb5acec4dd9..33210c6c81f4 100644 --- a/packages/tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp +++ b/packages/tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp @@ -34,26 +34,24 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER -#include +#include "Tsqr_ConfigDefs.hpp" #ifdef HAVE_MPI -# include -# include +# include "Teuchos_GlobalMPISession.hpp" +# include "Teuchos_oblackholestream.hpp" #endif // HAVE_MPI -#include -#include -#include -#include -#include +#include "Teuchos_CommandLineProcessor.hpp" +#include "Teuchos_DefaultComm.hpp" +#include "Teuchos_RCP.hpp" +#include "Teuchos_Time.hpp" +#include "Teuchos_StandardCatchMacros.hpp" -#include -#include +#include "Tsqr_ParTest.hpp" +#include "Tsqr_TeuchosMessenger.hpp" #ifdef HAVE_KOKKOSTSQR_COMPLEX # include diff --git a/packages/tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp index 1960774c6fcd..c60d652fc651 100644 --- a/packages/tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp +++ b/packages/tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp @@ -34,20 +34,18 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER -#include +#include "Tsqr_FullTsqrTest.hpp" #ifdef HAVE_MPI -# include -# include +# include "Teuchos_GlobalMPISession.hpp" +# include "Teuchos_oblackholestream.hpp" #endif // HAVE_MPI -#include -#include -#include +#include "Teuchos_CommandLineProcessor.hpp" +#include "Teuchos_DefaultComm.hpp" +#include "Teuchos_StandardCatchMacros.hpp" #ifdef HAVE_KOKKOSTSQR_COMPLEX # include diff --git a/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp index fb6ee22b8e40..84e21e75b49f 100644 --- a/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp +++ b/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER @@ -53,7 +51,6 @@ # include #endif // HAVE_KOKKOSTSQR_COMPLEX - namespace { // // Instantiate and return a Kokkos Node instance with the given diff --git a/packages/tpetra/tsqr/test/Tsqr_TestLapack.cpp b/packages/tpetra/tsqr/test/Tsqr_TestLapack.cpp index cc583ddac034..3c4da413287b 100644 --- a/packages/tpetra/tsqr/test/Tsqr_TestLapack.cpp +++ b/packages/tpetra/tsqr/test/Tsqr_TestLapack.cpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER diff --git a/packages/tpetra/tsqr/test/Tsqr_TestSeqTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestSeqTsqr.cpp index cb0f800d8566..26c4222dea57 100644 --- a/packages/tpetra/tsqr/test/Tsqr_TestSeqTsqr.cpp +++ b/packages/tpetra/tsqr/test/Tsqr_TestSeqTsqr.cpp @@ -34,12 +34,10 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER -#include +#include "Tsqr_ConfigDefs.hpp" #include "Teuchos_ConfigDefs.hpp" // HAVE_MPI #include "Teuchos_Tuple.hpp" #ifdef HAVE_MPI diff --git a/packages/tpetra/tsqr/test/Tsqr_TestTbbTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestTbbTsqr.cpp index 308adaa5cb71..e70a8c1c3b3c 100644 --- a/packages/tpetra/tsqr/test/Tsqr_TestTbbTsqr.cpp +++ b/packages/tpetra/tsqr/test/Tsqr_TestTbbTsqr.cpp @@ -34,12 +34,10 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ //@HEADER -#include +#include "Tsqr_ConfigDefs.hpp" #include "Teuchos_ConfigDefs.hpp" // HAVE_MPI #include "Teuchos_Tuple.hpp" #ifdef HAVE_MPI From 85ee5bc65c1f1e40973058555bfd5887b99e47d8 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 20 Nov 2019 17:44:29 -0700 Subject: [PATCH 03/50] TSQR: Revive KokkosNodeTsqr with host-only Kokkos::parallel_for --- .../tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp | 862 +++++++----------- .../tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp | 87 +- packages/tpetra/tsqr/test/CMakeLists.txt | 10 + .../tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp | 557 +++++------ 4 files changed, 616 insertions(+), 900 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp index 7f1f2ffe4858..c16904f1c99c 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp @@ -38,7 +38,7 @@ //@HEADER /// \file Tsqr_KokkosNodeTsqr.hpp -/// \brief Parallel intranode TSQR implemented using the Kokkos Node API. +/// \brief Parallel intranode TSQR implemented using Kokkos::parallel_for. #ifndef __TSQR_KokkosNodeTsqr_hpp #define __TSQR_KokkosNodeTsqr_hpp @@ -48,17 +48,10 @@ #include "Tsqr_NodeTsqr.hpp" #include "Teuchos_ParameterListAcceptorDefaultBase.hpp" -#include "Teuchos_ScalarTraits.hpp" - -//#define KNR_DEBUG 1 -#ifdef KNR_DEBUG -# include -#endif // KNR_DEBUG +#include "Kokkos_Core.hpp" namespace TSQR { - namespace details { - /// \brief Half-exclusive range of my partition's cache block indices. 
/// /// \c FactorFirstPass (used by the factor() method of \c @@ -92,16 +85,7 @@ namespace TSQR { const int numPartitions, const CacheBlockingStrategy& strategy) { -#ifdef KNR_DEBUG - using std::cerr; - using std::endl; - // cerr << "cacheBlockIndexRange(numRows=" << numRows - // << ", numCols=" << numCols - // << ", partitionIndex=" << partitionIndex - // << ", numPartitions=" << numPartitions - // << ", strategy)" << endl; -#endif // KNR_DEBUG - + using LO = LocalOrdinal; // The input index is a zero-based index of the current // partition (not the "current cache block" -- a partition // contains zero or more cache blocks). If the input index is @@ -117,61 +101,45 @@ namespace TSQR { // Return an empty partition (an empty cache block range) if // the partition index is out of range. - if (partitionIndex >= numPartitions) - return std::make_pair (LocalOrdinal(0), LocalOrdinal(0)); + if (partitionIndex >= numPartitions) { + return {0, 0}; + } - const LocalOrdinal numRowsCacheBlock = + const LO numRowsCacheBlock = strategy.cache_block_num_rows (numCols); - const LocalOrdinal numCacheBlocks = + const LO numCacheBlocks = strategy.num_cache_blocks (numRows, numCols, numRowsCacheBlock); -#ifdef KNR_DEBUG - // cerr << "numRowsCacheBlock=" << numRowsCacheBlock - // << ", numCacheBlocks=" << numCacheBlocks - // << endl; -#endif // KNR_DEBUG - // Figure out how many cache blocks my partition contains. If // the number of partitions doesn't evenly divide the number // of cache blocks, we spread out the remainder among the // first few threads. - const LocalOrdinal quotient = numCacheBlocks / numPartitions; - const LocalOrdinal remainder = numCacheBlocks - quotient * numPartitions; - const LocalOrdinal myNumCacheBlocks = - (partitionIndex < remainder) ? 
(quotient + 1) : quotient; - -#ifdef KNR_DEBUG - // cerr << "Partition " << partitionIndex << ": quotient=" << quotient - // << ", remainder=" << remainder << ", myNumCacheBlocks=" - // << myNumCacheBlocks << endl; -#endif // KNR_DEBUG + const LO quotient = numCacheBlocks / numPartitions; + const LO remainder = numCacheBlocks - quotient * numPartitions; + const LO myNumCacheBlocks = (partitionIndex < remainder) ? + (quotient + 1) : quotient; // If there are no cache blocks, there is nothing to factor. // Return an empty cache block range to indicate this. - if (myNumCacheBlocks == 0) - return std::make_pair (LocalOrdinal(0), LocalOrdinal(0)); + if (myNumCacheBlocks == 0) { + return {0, 0}; + } // Index of my first cache block (inclusive). - const LocalOrdinal myFirstCacheBlockIndex = - (partitionIndex < remainder) ? + const LO myFirstCacheBlockIndex = (partitionIndex < remainder) ? partitionIndex * (quotient+1) : remainder * (quotient+1) + (partitionIndex - remainder) * quotient; // Index of my last cache block (exclusive). - const LocalOrdinal myLastCacheBlockIndex = - (partitionIndex+1 < remainder) ? + const LO myLastCacheBlockIndex = (partitionIndex+1 < remainder) ? (partitionIndex+1) * (quotient+1) : remainder * (quotient+1) + (partitionIndex+1 - remainder) * quotient; - // Sanity check. 
- if (myLastCacheBlockIndex <= myFirstCacheBlockIndex) - { - std::ostringstream os; - os << "Partition " << (partitionIndex+1) << " of " - << numPartitions << ": My range of cache block indices [" - << myFirstCacheBlockIndex << ", " << myLastCacheBlockIndex - << ") is empty."; - throw std::logic_error(os.str()); - } - return std::make_pair (myFirstCacheBlockIndex, myLastCacheBlockIndex); + TEUCHOS_TEST_FOR_EXCEPTION + (myLastCacheBlockIndex <= myFirstCacheBlockIndex, + std::logic_error, "Partition " << (partitionIndex+1) << " of " + << numPartitions << ": My range of cache block indices [" + << myFirstCacheBlockIndex << ", " << myLastCacheBlockIndex + << ") is empty."); + return {myFirstCacheBlockIndex, myLastCacheBlockIndex}; } @@ -237,22 +205,16 @@ namespace TSQR { factor (const std::pair cbIndices, const int partitionIndex) const { -#ifdef KNR_DEBUG - using std::cerr; - using std::endl; -#endif // KNR_DEBUG - - typedef CacheBlockRange range_type; + const char suffix[] = " Please report this bug to the Tpetra developers."; + using cb_range_type = CacheBlockRange; // Workspace is created here, because it must not be shared // among threads. std::vector work (A_.ncols()); // Range of cache blocks to factor. - range_type cbRange (A_, strategy_, - cbIndices.first, - cbIndices.second, - contiguousCacheBlocks_); + cb_range_type cbRange (A_, strategy_, cbIndices.first, + cbIndices.second, contiguousCacheBlocks_); // Iterator in the forward direction over the range of cache // blocks to factor. typedef typename CacheBlockRange::iterator range_iter_type; @@ -263,12 +225,11 @@ namespace TSQR { if (A_top.empty ()) { return A_top; } - TEUCHOS_TEST_FOR_EXCEPTION(cbIndices.first >= cbIndices.second, - std::logic_error, - "FactorFirstPass::factor: A_top is not empty, but " - "the cache block index range " << cbIndices.first - << "," << cbIndices.second << " is empty. 
Please " - "report this bug to the Kokkos developers."); + TEUCHOS_TEST_FOR_EXCEPTION + (cbIndices.first >= cbIndices.second, std::logic_error, + "FactorFirstPass::factor: A_top is not empty, but the " + "cache block index range " << cbIndices.first << "," + << cbIndices.second << " is empty." << suffix); // Current cache block index. LocalOrdinal curTauIdx = cbIndices.first; @@ -289,27 +250,22 @@ namespace TSQR { mat_view_type A_cur = *cbIter; // Iteration over cache blocks of a partition should // always result in nonempty cache blocks. - TEUCHOS_TEST_FOR_EXCEPTION( - A_cur.empty (), std::logic_error, "FactorFirstPass::factor: " - "The current cache block (the " << count << "-th to factor in the " - "range [" << cbIndices.first << "," << cbIndices.second << ") of " - "cache block indices) in partition " << (partitionIndex+1) << " " - "(out of " << numPartitions_ << " partitions) is empty. " - "Please report this bug to the Kokkos developers."); - TEUCHOS_TEST_FOR_EXCEPTION(static_cast(curTauIdx) >= tauArrays_.size(), - std::logic_error, - "FactorFirstPass::factor: curTauIdx (= " - << curTauIdx << ") >= tauArrays_.size() (= " - << tauArrays_.size() << "). Please report this " - "bug to the Kokkos developers."); + TEUCHOS_TEST_FOR_EXCEPTION + (A_cur.empty (), std::logic_error, "FactorFirstPass::factor: " + "The current cache block (the " << count << "-th to factor in the " + "range [" << cbIndices.first << "," << cbIndices.second << ") of " + "cache block indices) in partition " << (partitionIndex+1) << " " + "(out of " << numPartitions_ << " partitions) is empty." << suffix); + TEUCHOS_TEST_FOR_EXCEPTION + (static_cast(curTauIdx) >= tauArrays_.size(), + std::logic_error, "FactorFirstPass::factor: curTauIdx (= " + << curTauIdx << ") >= tauArrays_.size() (= " + << tauArrays_.size() << ")." 
<< suffix); tauArrays_[curTauIdx++] = factorCacheBlock (combine, A_top, A_cur, work); ++count; ++cbIter; } -#ifdef KNR_DEBUG - cerr << "Factored " << count << " cache blocks" << endl; -#endif // KNR_DEBUG return A_top; } @@ -375,26 +331,8 @@ namespace TSQR { /// \param partitionIndex [in] Zero-based index of the /// partition. If greater than or equal to the number of /// partitions, this routine does nothing. - /// - /// \warning This routine almost certainly won't work in CUDA. - /// If it does, it won't be efficient. If you are interested - /// in a GPU TSQR routine, please contact the author (Mark - /// Hoemmen ) of this code to discuss the - /// possibilities. For this reason, we have not added the - /// KERNEL_PREFIX method prefix. - /// - /// \note Unlike typical Kokkos work-data pairs (WDPs) passed - /// into parallel_for, this one is not declared inline. This - /// method is heavyweight enough that an inline declaration is - /// unlikely to improve performance. - void execute (const int partitionIndex) const + void operator() (const int partitionIndex) const { -#ifdef KNR_DEBUG - using std::cerr; - using std::endl; - // cerr << "FactorFirstPass::execute (" << partitionIndex << ")" << endl; -#endif // KNR_DEBUG - if (partitionIndex < 0 || partitionIndex >= numPartitions_ || A_.empty ()) { return; } @@ -402,12 +340,6 @@ namespace TSQR { const std::pair cbIndices = cacheBlockIndexRange (A_.nrows(), A_.ncols(), partitionIndex, numPartitions_, strategy_); -#ifdef KNR_DEBUG - cerr << "Partition " << partitionIndex - << ": Factoring cache block indices [" - << cbIndices.first << ", " << cbIndices.second << ")" - << endl; -#endif // KNR_DEBUG // It's legitimate, though suboptimal, for some partitions // not to get any work to do (in this case, not to get any // cache blocks to factor). @@ -420,7 +352,6 @@ namespace TSQR { } }; - /// \class ApplyFirstPass /// \brief "First" pass of applying KokkosNodeTsqr's implicit Q factor. 
/// \author Mark Hoemmen @@ -476,21 +407,21 @@ namespace TSQR { const mat_view_type& C_cur, std::vector& work) const { - TEUCHOS_TEST_FOR_EXCEPTION(tau.size() < static_cast (Q_cur.ncols()), - std::logic_error, - "ApplyFirstPass::applyCacheBlock: tau.size() " - "(= " << tau.size() << ") < number of columns " - << Q_cur.ncols() << " in the Q factor. Please " - "report this bug to the Kokkos developers."); + TEUCHOS_TEST_FOR_EXCEPTION + (tau.size() < static_cast (Q_cur.ncols()), + std::logic_error, "ApplyFirstPass::applyCacheBlock: tau.size() " + "(= " << tau.size() << ") < number of columns " + << Q_cur.ncols() << " in the Q factor." + " Please report this bug to the Tpetra developers."); // If we get this far, it's fair to assume that we have // checked whether tau and work have nonzero lengths. combine.apply_inner (applyType, C_cur.nrows(), C_cur.ncols(), Q_cur.ncols(), Q_cur.get(), Q_cur.lda(), - &tau[0], + tau.data(), C_top.get(), C_top.lda(), C_cur.get(), C_cur.lda(), - &work[0]); + work.data()); } /// \fn apply @@ -499,7 +430,7 @@ namespace TSQR { /// \param applyType [in] Whether we are applying Q, Q^T, or Q^H. /// \param cbIndices [in] Half-exclusive range of cache block /// indices. - /// \param partitionIndex [in] The argument to \c execute(); the + /// \param partitionIndex [in] The argument to \c operator(); the /// index of the partition which instance of ApplyFirstPass /// is currently processing. 
void @@ -507,12 +438,9 @@ namespace TSQR { const std::pair cbIndices, const int partitionIndex) const { -#ifdef KNR_DEBUG - using std::cerr; - using std::endl; -#endif // KNR_DEBUG typedef CacheBlockRange const_range_type; typedef CacheBlockRange range_type; + const char suffix[] = " Please report this bug to the Tpetra developers."; if (cbIndices.first >= cbIndices.second) { return; // My range of cache blocks is empty; nothing to do @@ -526,16 +454,16 @@ namespace TSQR { range_type C_range (C_, strategy_, cbIndices.first, cbIndices.second, contiguousCacheBlocks_); - TEUCHOS_TEST_FOR_EXCEPTION(Q_range.empty(), std::logic_error, - "Q_range is empty, but the range of cache block " - "indices [" << cbIndices.first << ", " - << cbIndices.second << ") is not empty. Please " - "report this bug to the Kokkos developers."); - TEUCHOS_TEST_FOR_EXCEPTION(C_range.empty(), std::logic_error, - "C_range is empty, but the range of cache block " - "indices [" << cbIndices.first << ", " - << cbIndices.second << ") is not empty. Please " - "report this bug to the Kokkos developers."); + TEUCHOS_TEST_FOR_EXCEPTION + (Q_range.empty(), std::logic_error, + "Q_range is empty, but the range of cache block " + "indices [" << cbIndices.first << ", " + << cbIndices.second << ") is not empty." << suffix); + TEUCHOS_TEST_FOR_EXCEPTION + (C_range.empty(), std::logic_error, + "C_range is empty, but the range of cache block " + "indices [" << cbIndices.first << ", " + << cbIndices.second << ") is not empty." << suffix); // Task-local workspace array of length C_.ncols(). 
Workspace // must be per task, else there will be race conditions as @@ -545,26 +473,26 @@ namespace TSQR { Combine combine; if (applyType.transposed ()) { - typename const_range_type::iterator Q_rangeIter = Q_range.begin(); - typename range_type::iterator C_rangeIter = C_range.begin(); - TEUCHOS_TEST_FOR_EXCEPTION(Q_rangeIter == Q_range.end(), std::logic_error, - "The Q cache block range claims to be nonempty, " - "but the iterator range is empty. Please report" - " this bug to the Kokkos developers."); - TEUCHOS_TEST_FOR_EXCEPTION(C_rangeIter == C_range.end(), std::logic_error, - "The C cache block range claims to be nonempty, " - "but the iterator range is empty. Please report" - " this bug to the Kokkos developers."); + auto Q_rangeIter = Q_range.begin(); + auto C_rangeIter = C_range.begin(); + TEUCHOS_TEST_FOR_EXCEPTION + (Q_rangeIter == Q_range.end(), std::logic_error, + "The Q cache block range claims to be nonempty, " + "but the iterator range is empty." << suffix); + TEUCHOS_TEST_FOR_EXCEPTION + (C_rangeIter == C_range.end(), std::logic_error, + "The C cache block range claims to be nonempty, " + "but the iterator range is empty." << suffix); // Q_top: Topmost cache block in the cache block range of Q. // C_top: Topmost cache block in the cache block range of C. const_mat_view_type Q_top = *Q_rangeIter; mat_view_type C_top = *C_rangeIter; if (explicitQ_) { - C_top.fill (Teuchos::ScalarTraits::zero ()); + C_top.fill (Scalar {}); if (partitionIndex == 0) { for (LocalOrdinal j = 0; j < C_top.ncols(); ++j) { - C_top(j,j) = Teuchos::ScalarTraits::one (); + C_top(j,j) = Scalar (1.0); } } } @@ -578,18 +506,18 @@ namespace TSQR { ++Q_rangeIter; ++C_rangeIter; while (Q_rangeIter != Q_range.end ()) { - TEUCHOS_TEST_FOR_EXCEPTION(C_rangeIter == C_range.end(), - std::logic_error, - "When applying Q^T or Q^H to C: The Q cache " - "block iterator is not yet at the end, but " - "the C cache block iterator is. 
Please " - "report this bug to the Kokkos developers."); + TEUCHOS_TEST_FOR_EXCEPTION + (C_rangeIter == C_range.end(), std::logic_error, + "When applying Q^T or Q^H to C: The Q cache " + "block iterator is not yet at the end, but " + "the C cache block iterator is." << suffix); const_mat_view_type Q_cur = *Q_rangeIter; mat_view_type C_cur = *C_rangeIter; ++Q_rangeIter; ++C_rangeIter; - if (explicitQ_) - C_cur.fill (Teuchos::ScalarTraits::zero()); + if (explicitQ_) { + C_cur.fill (Scalar {}); + } applyCacheBlock (combine, applyType, Q_cur, tauArrays_[curTauIndex++], C_top, C_cur, work); @@ -607,9 +535,11 @@ namespace TSQR { // internode part of the Q factor via DistTsqr). However, // we still need to fill the rest of C_top (everything but // the top ncols rows of C_top) with zeros. - mat_view_type C_top_rest (C_top.nrows() - C_top.ncols(), C_top.ncols(), - C_top.get() + C_top.ncols(), C_top.lda()); - C_top_rest.fill (Teuchos::ScalarTraits::zero()); + mat_view_type C_top_rest (C_top.nrows() - C_top.ncols(), + C_top.ncols(), + C_top.get() + C_top.ncols(), + C_top.lda()); + C_top_rest.fill (Scalar {}); } LocalOrdinal curTauIndex = cbIndices.second-1; @@ -617,14 +547,14 @@ namespace TSQR { // cache blocks in reverse order. typename const_range_type::iterator Q_rangeIter = Q_range.rbegin(); typename range_type::iterator C_rangeIter = C_range.rbegin(); - TEUCHOS_TEST_FOR_EXCEPTION(Q_rangeIter == Q_range.rend(), std::logic_error, - "The Q cache block range claims to be nonempty, " - "but the iterator range is empty. Please report" - " this bug to the Kokkos developers."); - TEUCHOS_TEST_FOR_EXCEPTION(C_rangeIter == C_range.rend(), std::logic_error, - "The C cache block range claims to be nonempty, " - "but the iterator range is empty. Please report" - " this bug to the Kokkos developers."); + TEUCHOS_TEST_FOR_EXCEPTION + (Q_rangeIter == Q_range.rend(), std::logic_error, + "The Q cache block range claims to be nonempty, " + "but the iterator range is empty." 
<< suffix); + TEUCHOS_TEST_FOR_EXCEPTION + (C_rangeIter == C_range.rend(), std::logic_error, + "The C cache block range claims to be nonempty, " + "but the iterator range is empty." << suffix); // Equality of cache block range iterators only tests the // cache block index, not reverse-ness. This means we can @@ -639,32 +569,24 @@ namespace TSQR { mat_view_type C_cur = *C_rangeIter; if (explicitQ_) { - C_cur.fill (Teuchos::ScalarTraits::zero()); + C_cur.fill (Scalar {}); } -#ifdef KNR_DEBUG - cerr << "tauArrays_[curTauIndex=" << curTauIndex << "].size() = " - << tauArrays_[curTauIndex].size() << endl; -#endif // KNR_DEBUG - TEUCHOS_TEST_FOR_EXCEPTION(curTauIndex < cbIndices.first, std::logic_error, - "curTauIndex=" << curTauIndex << " out of valid " - "range [" << cbIndices.first << "," - << cbIndices.second << "). Please report this " - "bug to the Kokkos developers."); + TEUCHOS_TEST_FOR_EXCEPTION + (curTauIndex < cbIndices.first, std::logic_error, + "curTauIndex=" << curTauIndex << " out of valid " + "range [" << cbIndices.first << "," + << cbIndices.second << ")." << suffix); applyCacheBlock (combine, applyType, Q_cur, tauArrays_[curTauIndex--], C_top, C_cur, work); ++Q_rangeIter; ++C_rangeIter; } - TEUCHOS_TEST_FOR_EXCEPTION(curTauIndex < cbIndices.first, std::logic_error, - "curTauIndex=" << curTauIndex << " out of valid " - "range [" << cbIndices.first << "," - << cbIndices.second << "). Please report this " - "bug to the Kokkos developers."); -#ifdef KNR_DEBUG - cerr << "tauArrays_[curTauIndex=" << curTauIndex << "].size() = " - << tauArrays_[curTauIndex].size() << endl; -#endif // KNR_DEBUG + TEUCHOS_TEST_FOR_EXCEPTION + (curTauIndex < cbIndices.first, std::logic_error, + "curTauIndex=" << curTauIndex << " out of valid range " + "[" << cbIndices.first << "," << cbIndices.second << ")." + << suffix); // Apply the first block. 
applyFirstCacheBlock (combine, applyType, Q_top, tauArrays_[curTauIndex--], C_top, work); @@ -695,7 +617,7 @@ namespace TSQR { /// of A are stored contiguously. ApplyFirstPass (const ApplyType& applyType, const const_mat_view_type& Q, - const std::vector >& tauArrays, + const std::vector>& tauArrays, const std::vector& topBlocks, const mat_view_type& C, const CacheBlockingStrategy& strategy, @@ -727,19 +649,11 @@ namespace TSQR { /// which this instance of ApplyFirstPass is currently /// processing. If greater than or equal to the number of /// partitions, this routine does nothing. - /// - /// \warning This routine almost certainly won't work in CUDA. - /// If it does, it won't be efficient. If you are interested - /// in a GPU TSQR routine, please contact the author (Mark - /// Hoemmen ) of this code to discuss the - /// possibilities. - /// - /// \note Unlike typical Kokkos work-data pairs (WDPs) passed - /// into parallel_for, this one is not declared inline. This - /// method is heavyweight enough that an inline declaration is - /// unlikely to improve performance. - void execute (const int partitionIndex) const + void operator() (const int partitionIndex) const { + const char prefix[] = "TSQR::ApplyFirstPass::operator(): "; + const char suffix[] = " Please report this bug to the Tpetra developers."; + if (partitionIndex < 0 || partitionIndex >= numPartitions_ || Q_.empty () || C_.empty ()) { return; @@ -752,36 +666,33 @@ namespace TSQR { if (cbIndices.second <= cbIndices.first) return; { - std::pair cbInds (static_cast (cbIndices.first), - static_cast (cbIndices.second)); - TEUCHOS_TEST_FOR_EXCEPTION( - cbIndices.first < static_cast(0), std::logic_error, - "TSQR::ApplyFirstPass::execute: cacheBlockIndexRange(" << - Q_.nrows () << ", " << Q_.ncols() << ", " << partitionIndex << ", " - << numPartitions_ << ", strategy) returned a cache block range " << - cbIndices.first << "," << cbIndices.second << " with negative start" - "ing index. 
Please report this bug to the Kokkos developers."); - TEUCHOS_TEST_FOR_EXCEPTION( - cbInds.second > tauArrays_.size (), std::logic_error, - "TSQR::ApplyFirstPass::execute: cacheBlockIndexRange(" << - Q_.nrows () << ", " << Q_.ncols() << ", " << partitionIndex << ", " - << numPartitions_ << ", strategy) returned a cache block range " - << cbIndices.first << "," << cbIndices.second << " with starting " - "index larger than the number of tau arrays " << tauArrays_.size () - << ". Please report this bug to the Kokkos developers."); + std::pair cbInds (size_t (cbIndices.first), + size_t (cbIndices.second)); + TEUCHOS_TEST_FOR_EXCEPTION + (cbIndices.first < LocalOrdinal(0), std::logic_error, + prefix << "cacheBlockIndexRange(" << Q_.nrows () << ", " + << Q_.ncols() << ", " << partitionIndex << ", " + << numPartitions_ << ", strategy) returned a cache block " + "range " << cbIndices.first << "," << cbIndices.second << + " with negative starting index." << suffix); + TEUCHOS_TEST_FOR_EXCEPTION + (cbInds.second > tauArrays_.size (), std::logic_error, + prefix << "cacheBlockIndexRange(" << Q_.nrows () << ", " + << Q_.ncols() << ", " << partitionIndex << ", " + << numPartitions_ << ", strategy) returned a cache block " + "range" << cbIndices.first << "," << cbIndices.second << + " with starting index larger than the number of tau " + "arrays " << tauArrays_.size () << "." << suffix); } - apply (applyType_, cbIndices, partitionIndex); } - }; - - /// \class CacheBlockWDP - /// \brief Kokkos work-data pair (WDP) for KokkosNodeTsqr's (un_)cache_block() methods. + /// \class CacheBlockFunctor + /// \brief Kokkos functor for KokkosNodeTsqr's (un_)cache_block() methods. /// \author Mark Hoemmen template - class CacheBlockWDP { + class CacheBlockFunctor { private: typedef ConstMatView const_mat_view_type; typedef MatView mat_view_type; @@ -798,7 +709,8 @@ namespace TSQR { /// /// \param cbInputRange [in] Range of input cache blocks. 
/// \param cbOutputRange [out] Range of output cache blocks. - void copyRange (const_range_type& cbInputRange, range_type& cbOutputRange) const + void copyRange (const_range_type& cbInputRange, + range_type& cbOutputRange) const { typedef typename const_range_type::iterator input_iter_type; typedef typename range_type::iterator output_iter_type; @@ -835,44 +747,45 @@ namespace TSQR { /// \param unblock [in] If false, cache-block A_in (a matrix in /// column-major order) into A_out. If true, un-cache-block /// A_in into A_out (a matrix in column-major order). - CacheBlockWDP (const const_mat_view_type A_in, - const mat_view_type A_out, - const CacheBlockingStrategy& strategy, - const int numPartitions, - const bool unblock) : + CacheBlockFunctor (const const_mat_view_type A_in, + const mat_view_type A_out, + const CacheBlockingStrategy& strategy, + const int numPartitions, + const bool unblock) : A_in_ (A_in), A_out_ (A_out), strategy_ (strategy), numPartitions_ (numPartitions), unblock_ (unblock) { - TEUCHOS_TEST_FOR_EXCEPTION(A_in_.nrows() != A_out_.nrows() || - A_in_.ncols() != A_out_.ncols(), - std::invalid_argument, - "A_in and A_out do not have the same dimensions: " - "A_in is " << A_in_.nrows() << " by " - << A_in_.ncols() << ", but A_out is " - << A_out_.nrows() << " by " - << A_out_.ncols() << "."); - TEUCHOS_TEST_FOR_EXCEPTION(numPartitions_ < 1, - std::invalid_argument, - "The number of partitions " << numPartitions_ - << " is not a positive integer."); + TEUCHOS_TEST_FOR_EXCEPTION + (A_in_.nrows() != A_out_.nrows() || + A_in_.ncols() != A_out_.ncols(), + std::invalid_argument, + "A_in and A_out do not have the same dimensions: " + "A_in is " << A_in_.nrows() << " by " + << A_in_.ncols() << ", but A_out is " + << A_out_.nrows() << " by " + << A_out_.ncols() << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (numPartitions_ < 1, std::invalid_argument, + "The number of partitions " << numPartitions_ + << " is not a positive integer."); } - /// \brief Method called by 
Kokkos' parallel_for. + /// \brief Method called by Kokkos::parallel_for. /// /// \param partitionIndex [in] Zero-based index of the partition /// of the matrix. We parallelize over partitions. /// Partitions respect cache blocks. - void execute (const int partitionIndex) const + void operator() (const int partitionIndex) const { if (partitionIndex < 0 || partitionIndex >= numPartitions_ || A_in_.empty()) { return; } else { - typedef std::pair index_range_type; + using index_range_type = std::pair; const index_range_type cbIndices = cacheBlockIndexRange (A_in_.nrows (), A_in_.ncols (), partitionIndex, numPartitions_, strategy_); @@ -898,11 +811,11 @@ namespace TSQR { } }; - /// \class MultWDP - /// \brief Kokkos work-data pair (WDP) for \c KokkosNodeTsqr::Q_times_B(). + /// \class MultFunctor + /// \brief Kokkos functor for \c KokkosNodeTsqr::Q_times_B(). /// \author Mark Hoemmen template - class MultWDP { + class MultFunctor { private: typedef ConstMatView const_mat_view_type; typedef MatView mat_view_type; @@ -930,7 +843,7 @@ namespace TSQR { // Q_cur := Q_temp * B. blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.nrows(), numCols, numCols, - Teuchos::ScalarTraits::one(), + Scalar (1.0), Q_temp.get(), Q_temp.lda(), B_.get(), B_.lda(), Scalar(0), Q_cur.get(), Q_cur.lda()); } @@ -966,11 +879,11 @@ namespace TSQR { /// Q; maximum available parallelism. /// \param contiguousCacheBlocks [in] Whether the cache blocks /// of Q are stored contiguously. - MultWDP (const mat_view_type Q, - const const_mat_view_type B, - const CacheBlockingStrategy& strategy, - const int numPartitions, - const bool contiguousCacheBlocks) : + MultFunctor (const mat_view_type Q, + const const_mat_view_type B, + const CacheBlockingStrategy& strategy, + const int numPartitions, + const bool contiguousCacheBlocks) : Q_ (Q), B_ (B), strategy_ (strategy), @@ -983,7 +896,7 @@ namespace TSQR { /// \param partitionIndex [in] Zero-based index of the partition /// of the matrix. We parallelize over partitions. 
/// Partitions respect cache blocks. - void execute (const int partitionIndex) const + void operator() (const int partitionIndex) const { if (partitionIndex < 0 || partitionIndex >= numPartitions_ || Q_.empty ()) { @@ -1006,15 +919,14 @@ namespace TSQR { } }; - - /// \class FillWDP - /// \brief Kokkos work-data pair (WDP) for \c KokkosNodeTsqr::fill_with_zeros(). + /// \class FillFunctor + /// \brief Kokkos functor for \c KokkosNodeTsqr::fill_with_zeros(). /// \author Mark Hoemmen template - class FillWDP { + class FillFunctor { private: - typedef MatView mat_view_type; - typedef CacheBlockRange range_type; + using mat_view_type = MatView; + using range_type = CacheBlockRange; mat_view_type A_; CacheBlockingStrategy strategy_; @@ -1045,11 +957,11 @@ namespace TSQR { /// the matrix A; maximum available parallelism. /// \param contiguousCacheBlocks [in] Whether the cache /// blocks of A are stored contiguously. - FillWDP (const mat_view_type A, - const CacheBlockingStrategy& strategy, - const Scalar value, - const int numPartitions, - const bool contiguousCacheBlocks) : + FillFunctor (const mat_view_type A, + const CacheBlockingStrategy& strategy, + const Scalar value, + const int numPartitions, + const bool contiguousCacheBlocks) : A_ (A), strategy_ (strategy), value_ (value), @@ -1062,7 +974,7 @@ namespace TSQR { /// \param partitionIndex [in] Zero-based index of the partition /// of the matrix. We parallelize over partitions. /// Partitions respect cache blocks. - void execute (const int partitionIndex) const + void operator() (const int partitionIndex) const { if (partitionIndex < 0 || partitionIndex >= numPartitions_ || A_.empty ()) { @@ -1121,8 +1033,8 @@ namespace TSQR { // If there's only one partition, we don't even need a second // pass (it's just sequential TSQR), and we don't need a TAU // array for the top partition. 
- secondPassTauArrays.resize (static_cast (theNumPartitions-1)); - topBlocks.resize (static_cast (theNumPartitions)); + secondPassTauArrays.resize (size_t (theNumPartitions-1)); + topBlocks.resize (size_t (theNumPartitions)); } //! Total number of cache blocks in the matrix (over all partitions). @@ -1132,7 +1044,7 @@ namespace TSQR { int numPartitions() const { return topBlocks.size(); } //! TAU arrays from the first pass; one per cache block. - std::vector > firstPassTauArrays; + std::vector> firstPassTauArrays; /// \brief TAU arrays from the second pass. /// @@ -1147,7 +1059,7 @@ namespace TSQR { /// will likely combine firstPassTauArrays and secondPassTauArrays /// into a single std::vector (variable number of passes) or /// Teuchos::Tuple (fixed number of passes). - std::vector > secondPassTauArrays; + std::vector> secondPassTauArrays; /// \brief Views of the topmost cache blocks in each partition. /// @@ -1156,7 +1068,8 @@ namespace TSQR { }; /// \class KokkosNodeTsqr - /// \brief Intranode TSQR parallelized using the Kokkos Node API. + /// \brief Intranode (within an MPI process) TSQR parallelized using + /// Kokkos::DefaultHostExecutionSpace. /// \author Mark Hoemmen /// /// \tparam LocalOrdinal The type of indices in the (node-local) @@ -1164,29 +1077,27 @@ namespace TSQR { /// /// \tparam Scalar The type of entries in the (node-local) matrix. /// - /// \tparam NodeType The Kokkos Node type. This currently must be a - /// CPU node; this algorithm is not (yet) appropriate for GPUs. - /// /// This implementation of the intranode part of TSQR factors the /// matrix in two passes. The first pass parallelizes over /// partitions, doing Sequential TSQR over each partition. The /// second pass combines the R factors from the partitions, and is /// not currently parallel. 
Thus, the overall algorithm is similar - /// to that of \c TbbTsqr, except that: - /// - TbbTsqr partitions differently; KokkosNodeTsqr's partitions - /// use the same layout of cache blocks as SequentialTsqr, whereas - /// TbbTsqr uses a different layout. - /// - TbbTsqr reduces the R factors in parallel; it only needs one - /// "pass." - template + /// to that of TbbTsqr, except that: + ///
<ul> + /// <li> TbbTsqr partitions differently; KokkosNodeTsqr's partitions + /// use the same layout of cache blocks as SequentialTsqr, + /// whereas TbbTsqr uses a different layout. </li> + /// <li> TbbTsqr reduces the R factors in parallel; it only needs + /// one "pass." </li> + /// </ul>
+ template class KokkosNodeTsqr : - public NodeTsqr >, + public NodeTsqr>, public Teuchos::ParameterListAcceptorDefaultBase { public: typedef LocalOrdinal local_ordinal_type; typedef Scalar scalar_type; - typedef NodeType node_type; typedef ConstMatView const_mat_view_type; typedef MatView mat_view_type; @@ -1197,85 +1108,16 @@ namespace TSQR { /// \brief Constructor (with user-specified parameters). /// - /// \param node [in] Kokkos Node instance. If you don't have this - /// yet, you can set it to null and call \c setNode() later once - /// you have the Node instance. (This is the typical case for - /// lazy initialization of a Belos or Anasazi (Mat)OrthoManager - /// subclass, where you need a vector before you can get a Node - /// instance.) - /// - /// \param params [in/out] List of parameters. Missing parameters - /// will be filled in with default values. - KokkosNodeTsqr (const Teuchos::RCP& node, - const Teuchos::RCP& params) : - node_ (node) - { - setParameterList (params); - } - - /// \brief Constructor (with user-specified parameters but no node). - /// - /// This version of the constructor sets the Kokkos Node instance - /// to null. You must call \c setNode() with a valid Kokkos Node - /// instance before you can invoke any methods that perform - /// computations. - /// /// \param params [in/out] List of parameters. Missing parameters /// will be filled in with default values. - KokkosNodeTsqr (const Teuchos::RCP& params) : - node_ (Teuchos::null) + KokkosNodeTsqr (const Teuchos::RCP& params = Teuchos::null) { setParameterList (params); } - /// \brief Constructor (sets default parameters). - /// - /// \param node [in] Kokkos Node instance. If you don't have this - /// yet, you can set it to null and call \c setNode() later once - /// you have the Node instance. - KokkosNodeTsqr (const Teuchos::RCP& node) : - node_ (node) - { - setParameterList (Teuchos::null); - } - - /// \brief Default constructor (sets default parameters). 
- /// - /// This version of the constructor sets the Kokkos Node instance - /// to null. You must call \c setNode() with a valid Kokkos Node - /// instance before you can invoke any methods that perform - /// computations. - KokkosNodeTsqr () : node_ (Teuchos::null) - { - setParameterList (Teuchos::null); - } - - /// \brief Set the Kokkos Node instance. - /// - /// You can't compute anything until you set the Kokkos Node - /// instance. - /// - /// \note The whole reason for allowing initialization of the - /// Kokkos Node instance after construction is so that this - /// class can implement \c Teuchos::ParameterListAcceptor. - /// ParameterListAcceptor's getValidParameters() is an instance - /// method, not a class method, so the object has to be - /// instantiated before getValidParameters() can be called. \c - /// NodeTsqrFactory in turn needs to call getValidParameters() - /// so that callers can get a default parameter list before - /// instantiating the NodeTsqr subclass instance. However, - /// NodeTsqrFactory doesn't have the Kokkos Node instance until - /// TSQR gets a multivector to factor. - void setNode (const Teuchos::RCP& node) { - node_ = node; - } - /// \brief Whether this object is ready to perform computations. - /// - /// It is not ready if the Kokkos Node instance has not yet - /// been set. bool ready() const { - return ! getNode().is_null(); + return true; } /// \brief One-line description of this object. 
@@ -1288,8 +1130,6 @@ namespace TSQR { << TypeNameTraits::name() << ", Scalar=" << TypeNameTraits::name() - << ", NodeType=" - << TypeNameTraits::name() << ">: \"Cache Size Hint\"=" << strategy_.cache_size_hint() << ", \"Size of Scalar\"=" << strategy_.size_of_scalar() << ", \"Num Tasks\"=" << numPartitions_; @@ -1314,7 +1154,8 @@ namespace TSQR { RCP plist; if (paramList.is_null()) { plist = rcp (new ParameterList (*getValidParameters ())); - } else { + } + else { plist = paramList; plist->validateParametersAndSetDefaults (*getValidParameters ()); } @@ -1329,12 +1170,13 @@ namespace TSQR { cacheSizeHint = plist->get ("Cache Size Hint"); sizeOfScalar = plist->get ("Size of Scalar"); numPartitions = plist->get ("Num Tasks"); - } catch (Teuchos::Exceptions::InvalidParameter& e) { + } + catch (Teuchos::Exceptions::InvalidParameter& e) { std::ostringstream os; os << "Failed to read default parameters after setting defaults. Pleas" "e report this bug to the Kokkos developers. Original exception mess" "age: " << e.what(); - throw std::logic_error (os.str()); + TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str()); } numPartitions_ = numPartitions; @@ -1452,19 +1294,16 @@ namespace TSQR { const LocalOrdinal lda, const bool contiguousCacheBlocks) const { - Teuchos::RCP node = getNode (); - TEUCHOS_TEST_FOR_EXCEPTION(node.is_null(), std::runtime_error, - "The Kokkos Node instance has not yet been set. 
" - "KokkosNodeTsqr needs a Kokkos Node instance in order " - "to perform computations."); - mat_view_type A_view (nrows, ncols, A, lda); - typedef details::FillWDP fill_wdp_type; - typedef Teuchos::ScalarTraits STS; - fill_wdp_type filler (A_view, strategy_, STS::zero(), - numPartitions_, contiguousCacheBlocks); - node->parallel_for (0, numPartitions_, filler); + using functor_type = details::FillFunctor; + const Scalar ZERO {}; + functor_type functor (A_view, strategy_, ZERO, numPartitions_, + contiguousCacheBlocks); + using execution_space = Kokkos::DefaultHostExecutionSpace; + Kokkos::RangePolicy> + range (0, numPartitions_); + Kokkos::parallel_for ("KokkosNodeTsqr::fill_with_zeros", range, functor); } void @@ -1474,12 +1313,6 @@ namespace TSQR { const Scalar A_in[], const LocalOrdinal lda_in) const { - Teuchos::RCP node = getNode (); - TEUCHOS_TEST_FOR_EXCEPTION(node.is_null(), std::runtime_error, - "The Kokkos Node instance has not yet been set. " - "KokkosNodeTsqr needs a Kokkos Node instance in order " - "to perform computations."); - const_mat_view_type A_in_view (nrows, ncols, A_in, lda_in); // The leading dimension of A_out doesn't matter here, since its @@ -1487,10 +1320,13 @@ namespace TSQR { // arbitrarily to a sensible value. 
mat_view_type A_out_view (nrows, ncols, A_out, nrows); - typedef details::CacheBlockWDP cb_wdp_type; - cb_wdp_type cacheBlocker (A_in_view, A_out_view, strategy_, - numPartitions_, false); - node->parallel_for (0, numPartitions_, cacheBlocker); + using functor_type = details::CacheBlockFunctor; + functor_type functor (A_in_view, A_out_view, strategy_, + numPartitions_, false); + using execution_space = Kokkos::DefaultHostExecutionSpace; + Kokkos::RangePolicy> + range (0, numPartitions_); + Kokkos::parallel_for ("KokkosNodeTsqr::cache_block", range, functor); } void @@ -1500,22 +1336,19 @@ namespace TSQR { const LocalOrdinal lda_out, const Scalar A_in[]) const { - Teuchos::RCP node = getNode (); - TEUCHOS_TEST_FOR_EXCEPTION(node.is_null(), std::runtime_error, - "The Kokkos Node instance has not yet been set. " - "KokkosNodeTsqr needs a Kokkos Node instance in order " - "to perform computations."); - // The leading dimension of A_in doesn't matter here, since its // cache blocks are contiguously stored. We set it arbitrarily // to a sensible value. const_mat_view_type A_in_view (nrows, ncols, A_in, nrows); mat_view_type A_out_view (nrows, ncols, A_out, lda_out); - typedef details::CacheBlockWDP cb_wdp_type; - cb_wdp_type cacheBlocker (A_in_view, A_out_view, strategy_, - numPartitions_, true); - node->parallel_for (0, numPartitions_, cacheBlocker); + using functor_type = details::CacheBlockFunctor; + functor_type functor (A_in_view, A_out_view, strategy_, + numPartitions_, true); + using execution_space = Kokkos::DefaultHostExecutionSpace; + Kokkos::RangePolicy> + range (0, numPartitions_); + Kokkos::parallel_for ("KokkosNodeTsqr::un_cache_block", range, functor); } void @@ -1527,35 +1360,25 @@ namespace TSQR { const LocalOrdinal ldb, const bool contiguousCacheBlocks) const { - Teuchos::RCP node = getNode (); - TEUCHOS_TEST_FOR_EXCEPTION(node.is_null(), std::runtime_error, - "The Kokkos Node instance has not yet been set. 
" - "KokkosNodeTsqr needs a Kokkos Node instance in order " - "to perform computations."); mat_view_type Q_view (nrows, ncols, Q, ldq); const_mat_view_type B_view (ncols, ncols, B, ldb); - typedef details::MultWDP mult_wdp_type; - mult_wdp_type mult (Q_view, B_view, strategy_, numPartitions_, - contiguousCacheBlocks); - node->parallel_for (0, numPartitions_, mult); + using functor_type = details::MultFunctor; + functor_type functor (Q_view, B_view, strategy_, numPartitions_, + contiguousCacheBlocks); + using execution_space = Kokkos::DefaultHostExecutionSpace; + Kokkos::RangePolicy> + range (0, numPartitions_); + Kokkos::parallel_for ("KokkosNodeTsqr::Q_times_B", range, functor); } private: - //! Get the Kokkos Node instance (may be null if it was not set). - Teuchos::RCP getNode () const { - return node_; - } - //! Implementation of fundamental TSQR kernels. Combine combine_; //! Workspace for Combine operations. mutable std::vector work_; - //! Pointer to the Kokkos Node object. - Teuchos::RCP node_; - //! Cache blocking strategy. CacheBlockingStrategy strategy_; @@ -1570,25 +1393,11 @@ namespace TSQR { //! Default parameter list (set by \c getValidParameters()). mutable Teuchos::RCP defaultParams_; - /// \brief Default number of partitions. - /// - /// This method may in the future try to "learn" the optimal - /// number of partitions. For now, it's a constant. Later, we - /// may even try to "learn" the best value, perhaps even at - /// runtime. As a result, this method may not necessarily return - /// the same value each time it is called. - /// - /// \warning We may change this method to take an RCP to a const - /// Kokkos node_type instance, if the Kokkos Node API later - /// supports queries for available computational resources - /// (e.g., number of CPU cores per node). + //! Default number of partitions. 
int defaultNumPartitions () const { - // Currently the Kokkos Node API does not give us access to the - // amount of available parallelism, so we return a constant. - // Mild oversubscription is OK. - return 16; + return Kokkos::DefaultHostExecutionSpace::concurrency (); } FactorOutput @@ -1596,33 +1405,33 @@ namespace TSQR { mat_view_type R, const bool contiguousCacheBlocks) const { - if (A.empty()) { - TEUCHOS_TEST_FOR_EXCEPTION(! R.empty(), std::logic_error, - "KokkosNodeTsqr::factorImpl: A is empty, but R " - "is not. Please report this bug to the Kokkos " - "developers."); + const char prefix[] = "KokkosNodeTsqr::factorImpl: "; + const char suffix[] = " Please report this bug to the Tpetra developers."; + using LO = LocalOrdinal; + using execution_space = Kokkos::DefaultHostExecutionSpace; + Kokkos::RangePolicy> + range (0, numPartitions_); + + if (A.empty ()) { + TEUCHOS_TEST_FOR_EXCEPTION + (! R.empty (), std::logic_error, prefix << "A is empty, " + "but R is not." << suffix); return FactorOutput (0, 0); } - Teuchos::RCP node = getNode (); - TEUCHOS_TEST_FOR_EXCEPTION(node.is_null(), std::runtime_error, - "The Kokkos Node instance has not yet been set. " - "KokkosNodeTsqr needs a Kokkos Node instance in order " - "to perform computations."); - - const LocalOrdinal numRowsPerCacheBlock = + const LO numRowsPerCacheBlock = strategy_.cache_block_num_rows (A.ncols()); - const LocalOrdinal numCacheBlocks = + const LO numCacheBlocks = strategy_.num_cache_blocks (A.nrows(), A.ncols(), numRowsPerCacheBlock); // // Compute the first factorization pass (over partitions). // FactorOutput result (numCacheBlocks, numPartitions_); - typedef details::FactorFirstPass first_pass_type; + using first_pass_type = details::FactorFirstPass; first_pass_type firstPass (A, result.firstPassTauArrays, result.topBlocks, strategy_, numPartitions_, contiguousCacheBlocks); - // parallel_for wants an exclusive range. 
- node->parallel_for (0, numPartitions_, firstPass); + Kokkos::parallel_for ("KokkosNodeTsqr::factorImpl::firstPass", + range, firstPass); // Each partition collected a view of its top block, where that // partition's R factor is stored. The second pass reduces @@ -1636,13 +1445,13 @@ namespace TSQR { // The "topmost top block" contains the resulting R factor. const mat_view_type& R_top = result.topBlocks[0]; - TEUCHOS_TEST_FOR_EXCEPTION(R_top.empty(), std::logic_error, - "After factorSecondPass: result.topBlocks[0] is an " - "empty view. Please report this bug to the Kokkos " - "developers."); + TEUCHOS_TEST_FOR_EXCEPTION + (R_top.empty (), std::logic_error, prefix << "After " + "factorSecondPass: result.topBlocks[0] is an empty view." + << suffix); mat_view_type R_top_square (R_top.ncols(), R_top.ncols(), - R_top.get(), R_top.lda()); - R.fill (Teuchos::ScalarTraits::zero()); + R_top.get(), R_top.lda()); + R.fill (Scalar {}); // Only copy the upper triangle of R_top into R. copy_upper_triangle (R.ncols(), R.ncols(), R.get(), R.lda(), R_top.get(), R_top.lda()); @@ -1657,32 +1466,32 @@ namespace TSQR { const bool explicitQ, const bool contiguousCacheBlocks) const { + const char prefix[] = "KokkosNodeTsqr::applyImpl: "; + const char suffix[] = " Please report this bug to the Tpetra developers."; + using LO = LocalOrdinal; using details::cacheBlockIndexRange; - typedef details::ApplyFirstPass first_pass_type; - - Teuchos::RCP node = getNode (); - TEUCHOS_TEST_FOR_EXCEPTION(node.is_null(), std::runtime_error, - "The Kokkos Node instance has not yet been set. " - "KokkosNodeTsqr needs a Kokkos Node instance in order " - "to perform computations."); - TEUCHOS_TEST_FOR_EXCEPTION(numPartitions_ != factorOutput.numPartitions(), - std::invalid_argument, - "applyImpl: KokkosNodeTsqr's number of partitions " - << numPartitions_ << " does not match the given " - "factorOutput's number of partitions " - << factorOutput.numPartitions() << ". 
This likely " - "means that the given factorOutput object comes from " - "a different instance of KokkosNodeTsqr. Please " - "report this bug to the Kokkos developers."); + using first_pass_type = details::ApplyFirstPass; + using execution_space = Kokkos::DefaultHostExecutionSpace; + + TEUCHOS_TEST_FOR_EXCEPTION + (numPartitions_ != factorOutput.numPartitions(), + std::invalid_argument, prefix << "KokkosNodeTsqr's number " + "of partitions " << numPartitions_ << " does not match the " + "given factorOutput's number of partitions " + << factorOutput.numPartitions() << ". This likely means " + "that the given factorOutput object comes from a different " + "instance of KokkosNodeTsqr." << suffix); const int numParts = numPartitions_; - first_pass_type firstPass (applyType, Q, factorOutput.firstPassTauArrays, + first_pass_type firstPass (applyType, Q, + factorOutput.firstPassTauArrays, factorOutput.topBlocks, C, strategy_, - numParts, explicitQ, contiguousCacheBlocks); + numParts, explicitQ, + contiguousCacheBlocks); // Get a view of each partition's top block of the C matrix. std::vector topBlocksOfC (numParts); { - typedef std::pair index_range_type; - typedef CacheBlocker blocker_type; + using index_range_type = std::pair; + using blocker_type = CacheBlocker; blocker_type C_blocker (C.nrows(), C.ncols(), strategy_); // For each partition, collect its top block of C. @@ -1691,7 +1500,7 @@ namespace TSQR { cacheBlockIndexRange (C.nrows(), C.ncols(), partIdx, numParts, strategy_); if (cbIndices.first >= cbIndices.second) { - topBlocksOfC[partIdx] = mat_view_type (0, 0, NULL, 0); + topBlocksOfC[partIdx] = mat_view_type (0, 0, nullptr, 0); } else { topBlocksOfC[partIdx] = C_blocker.get_cache_block (C, cbIndices.first, @@ -1700,16 +1509,19 @@ namespace TSQR { } } - if (applyType.transposed()) { - // parallel_for wants an exclusive range. 
- node->parallel_for (0, numPartitions_, firstPass); + Kokkos::RangePolicy> + range(0, numPartitions_); + if (applyType.transposed ()) { + Kokkos::parallel_for ("KokkosNodeTsqr::applyImpl::firstPass", + range, firstPass); applySecondPass (applyType, factorOutput, topBlocksOfC, strategy_, explicitQ); - } else { + } + else { applySecondPass (applyType, factorOutput, topBlocksOfC, strategy_, explicitQ); - // parallel_for wants an exclusive range. - node->parallel_for (0, numPartitions_, firstPass); + Kokkos::parallel_for ("KokkosNodeTsqr::applyImpl::firstPass", + range, firstPass); } } @@ -1717,19 +1529,19 @@ namespace TSQR { factorPair (const mat_view_type& R_top, const mat_view_type& R_bot) const { - TEUCHOS_TEST_FOR_EXCEPTION(R_top.empty(), std::logic_error, - "R_top is empty!"); - TEUCHOS_TEST_FOR_EXCEPTION(R_bot.empty(), std::logic_error, - "R_bot is empty!"); - TEUCHOS_TEST_FOR_EXCEPTION(work_.size() == 0, std::logic_error, - "Workspace array work_ has length zero."); - TEUCHOS_TEST_FOR_EXCEPTION(work_.size() < static_cast (R_top.ncols()), - std::logic_error, - "Workspace array work_ has length = " - << work_.size() << " < R_top.ncols() = " - << R_top.ncols() << "."); - - std::vector tau (R_top.ncols()); + TEUCHOS_TEST_FOR_EXCEPTION + (R_top.empty (), std::logic_error, "R_top is empty!"); + TEUCHOS_TEST_FOR_EXCEPTION + (R_bot.empty(), std::logic_error, "R_bot is empty!"); + TEUCHOS_TEST_FOR_EXCEPTION + (work_.size() == 0, std::logic_error, + "Workspace array work_ has length zero."); + TEUCHOS_TEST_FOR_EXCEPTION + (work_.size() < size_t (R_top.ncols()), std::logic_error, + "Workspace array work_ has length = " << work_.size() + << " < R_top.ncols() = " << R_top.ncols() << "."); + + std::vector tau (R_top.ncols ()); // Our convention for such helper methods is for the immediate // parent to allocate workspace (the work_ array in this case). 
@@ -1738,7 +1550,8 @@ namespace TSQR { // nonzero (and the same) number of columns, but we have already // checked that above. combine_.factor_pair (R_top.ncols(), R_top.get(), R_top.lda(), - R_bot.get(), R_bot.lda(), &tau[0], &work_[0]); + R_bot.get(), R_bot.lda(), tau.data(), + work_.data()); return tau; } @@ -1747,34 +1560,35 @@ namespace TSQR { std::vector >& tauArrays, const int numPartitions) const { + const char prefix[] = "KokkosNodeTsqr::factorSecondPass: "; + const char suffix[] = " Please report this bug to the Tpetra developers."; + if (numPartitions <= 1) return; // Done! - TEUCHOS_TEST_FOR_EXCEPTION (topBlocks.size() < static_cast(numPartitions), - std::logic_error, - "KokkosNodeTsqr::factorSecondPass: topBlocks.size() " - "(= " << topBlocks.size() << ") < numPartitions (= " - << numPartitions << "). Please report this bug to " - "the Kokkos developers."); - TEUCHOS_TEST_FOR_EXCEPTION (tauArrays.size() < static_cast(numPartitions-1), - std::logic_error, - "KokkosNodeTsqr::factorSecondPass: topBlocks.size() " - "(= " << topBlocks.size() << ") < numPartitions-1 (= " - << (numPartitions-1) << "). Please report this bug " - "to the Kokkos developers."); + TEUCHOS_TEST_FOR_EXCEPTION + (topBlocks.size () < size_t (numPartitions), std::logic_error, + prefix << "topBlocks.size() (= " << topBlocks.size() << ") " + "< numPartitions (= " << numPartitions << ")." << suffix); + TEUCHOS_TEST_FOR_EXCEPTION + (tauArrays.size () < size_t (numPartitions-1), + std::logic_error, prefix << "topBlocks.size() (= " + << topBlocks.size() << ") < numPartitions-1 (= " + << (numPartitions-1) << ")." << suffix); // The top partition (partition index zero) should always be // nonempty if we get this far, so its top block should also be // nonempty. - TEUCHOS_TEST_FOR_EXCEPTION(topBlocks[0].empty(), std::logic_error, - "KokkosNodeTsqr::factorSecondPass: topBlocks[0] is " - "empty. 
Please report this bug to the Kokkos " - "developers."); + TEUCHOS_TEST_FOR_EXCEPTION + (topBlocks[0].empty(), std::logic_error, + prefix << "topBlocks[0] is empty." << suffix); // However, other partitions besides the top one might be empty, // in which case their top blocks will be empty. We skip over // the empty partitions in the loop below. - work_.resize (static_cast (topBlocks[0].ncols())); - for (int partIdx = 1; partIdx < numPartitions; ++partIdx) - if (! topBlocks[partIdx].empty()) + work_.resize (size_t (topBlocks[0].ncols())); + for (int partIdx = 1; partIdx < numPartitions; ++partIdx) { + if (! topBlocks[partIdx].empty ()) { tauArrays[partIdx-1] = factorPair (topBlocks[0], topBlocks[partIdx]); + } + } } void @@ -1803,33 +1617,33 @@ namespace TSQR { const CacheBlockingStrategy& strategy, const bool explicitQ) const { + const char prefix[] = "KokkosNodeTsqr::applySecondPass: "; + const char suffix[] = " Please report this bug to the Tpetra developers."; + const int numParts = factorOutput.numPartitions(); if (numParts <= 1) return; // Done! - TEUCHOS_TEST_FOR_EXCEPTION(topBlocksOfC.size() != static_cast(numParts), - std::logic_error, - "KokkosNodeTsqr:applySecondPass: topBlocksOfC.size() (" - "= " << topBlocksOfC.size() << ") != number of partiti" - "ons (= " << numParts << "). Please report this bug t" - "o the Kokkos developers."); - TEUCHOS_TEST_FOR_EXCEPTION(factorOutput.secondPassTauArrays.size() != - static_cast(numParts-1), - std::logic_error, - "KokkosNodeTsqr:applySecondPass: factorOutput" - ".secondPassTauArrays.size() (= " - << factorOutput.secondPassTauArrays.size() - << ") != number of partitions minus 1 (= " - << (numParts-1) << "). Please report this bug" - " to the Kokkos developers."); + TEUCHOS_TEST_FOR_EXCEPTION + (topBlocksOfC.size () != size_t (numParts), std::logic_error, + prefix << "topBlocksOfC.size() (= " << topBlocksOfC.size() + << ") != number of partitions (= " << numParts << ")." 
+ << suffix); + TEUCHOS_TEST_FOR_EXCEPTION + (factorOutput.secondPassTauArrays.size () != size_t (numParts-1), + std::logic_error, prefix << + "factorOutput.secondPassTauArrays.size() (= " + << factorOutput.secondPassTauArrays.size() + << ") != number of partitions minus 1 (= " + << (numParts-1) << ")." << suffix); const LocalOrdinal numCols = topBlocksOfC[0].ncols(); - work_.resize (static_cast (numCols)); + work_.resize (size_t (numCols)); // Top blocks of C are the whole cache blocks. We only want to // affect the top ncols x ncols part of each of those blocks in // this method. mat_view_type C_top_square (numCols, numCols, topBlocksOfC[0].get(), - topBlocksOfC[0].lda()); - if (applyType.transposed()) { + topBlocksOfC[0].lda()); + if (applyType.transposed ()) { // Don't include the topmost (index 0) partition in the // iteration; that corresponds to C_top_square. for (int partIdx = 1; partIdx < numParts; ++partIdx) { @@ -1871,7 +1685,7 @@ namespace TSQR { // just fill the top n x n part of the top blocks // with zeros. if (explicitQ) { - C_cur_square.fill (Teuchos::ScalarTraits::zero()); + C_cur_square.fill (Scalar {}); } applyPair (applyType, factorOutput.topBlocks[partIdx], factorOutput.secondPassTauArrays[partIdx-1], diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp index 6c50d18fabea..26c5bffcc2af 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp @@ -62,8 +62,6 @@ namespace TSQR { /// Test the accuracy of KokkosNodeTsqr's QR factorization on a /// numRows by numCols matrix, and print results to stdout. /// - /// \param node [in] The Kokkos Node instance on which to execute - /// in parallel. /// \param gen [in/out] Pseudorandom number generator for the /// normal(0,1) distribution. /// \param numRows [in] Number of rows in the test matrix. @@ -82,10 +80,9 @@ namespace TSQR { /// for a script to parse. 
/// \param debug [in] Whether to print extra debugging output to /// stderr. - template + template void - verifyKokkosNodeTsqr (const Teuchos::RCP& node, - TSQR::Random::NormalGenerator& gen, + verifyKokkosNodeTsqr (TSQR::Random::NormalGenerator& gen, const Ordinal numRows, const Ordinal numCols, const int numPartitions, @@ -102,7 +99,7 @@ namespace TSQR { using std::cerr; using std::cout; using std::endl; - typedef TSQR::KokkosNodeTsqr node_tsqr_type; + using node_tsqr_type = TSQR::KokkosNodeTsqr; typedef typename node_tsqr_type::FactorOutput factor_output_type; typedef Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType magnitude_type; @@ -117,33 +114,30 @@ namespace TSQR { params->set ("Cache Size Hint", cacheSizeHint); params->set ("Num Tasks", numPartitions); node_tsqr_type actor (params); - actor.setNode (node); - if (debug) - { - cerr << actor.description() << endl; - if (contiguousCacheBlocks) - cerr << "-- Test with contiguous cache blocks" << endl; + if (debug) { + cerr << actor.description() << endl; + if (contiguousCacheBlocks) { + cerr << "-- Test with contiguous cache blocks" << endl; } + } // Allocate space for test problem. 
matrix_type A (numRows, numCols); matrix_type A_copy (numRows, numCols); matrix_type Q (numRows, numCols); matrix_type R (numCols, numCols); - if (std::numeric_limits::has_quiet_NaN) - { - A.fill (std::numeric_limits::quiet_NaN()); - A_copy.fill (std::numeric_limits::quiet_NaN()); - Q.fill (std::numeric_limits::quiet_NaN()); - R.fill (std::numeric_limits::quiet_NaN()); - } - else - { - A.fill (STS::zero()); - A_copy.fill (STS::zero()); - Q.fill (STS::zero()); - R.fill (STS::zero()); - } + if (std::numeric_limits::has_quiet_NaN) { + A.fill (std::numeric_limits::quiet_NaN()); + A_copy.fill (std::numeric_limits::quiet_NaN()); + Q.fill (std::numeric_limits::quiet_NaN()); + R.fill (std::numeric_limits::quiet_NaN()); + } + else { + A.fill (Scalar {}); + A_copy.fill (Scalar {}); + Q.fill (Scalar {}); + R.fill (Scalar {}); + } const Ordinal lda = numRows; const Ordinal ldq = numRows; const Ordinal ldr = numCols; @@ -151,18 +145,16 @@ namespace TSQR { // Create a test problem nodeTestProblem (gen, numRows, numCols, A.get(), A.lda(), true); - if (debug) - { - cerr << "-- Generated test problem" << endl; - // Don't print the matrix if it's too big. - if (A.nrows() <= 30) - { - cerr << "A = " << endl; - print_local_matrix (cerr, A.nrows(), A.ncols(), - A.get(), A.lda()); - cerr << endl << endl; - } + if (debug) { + cerr << "-- Generated test problem" << endl; + // Don't print the matrix if it's too big. + if (A.nrows() <= 30) { + cerr << "A = " << endl; + print_local_matrix (cerr, A.nrows(), A.ncols(), + A.get(), A.lda()); + cerr << endl << endl; } + } // Copy A into A_copy, since TSQR overwrites the input. 
If // specified, rearrange the data in A_copy so that the data in @@ -231,7 +223,7 @@ namespace TSQR { if (debug) { cerr << "-- Filling R with zeros" << endl; } - R.fill (STS::zero()); + R.fill (Scalar {}); if (debug) { cerr << "-- Calling factor()" << endl; @@ -254,9 +246,9 @@ namespace TSQR { actor.top_block (Q.view (), contiguousCacheBlocks); mat_view_type Q_top_square (Q_top.ncols(), Q_top.ncols(), Q_top.get(), Q_top.lda()); - Q_top_square.fill (STS::zero ()); + Q_top_square.fill (Scalar {}); for (Ordinal j = 0; j < Q_top_square.ncols(); ++j) { - Q_top_square(j,j) = STS::one (); + Q_top_square(j,j) = Scalar (1.0); } } actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(), @@ -355,8 +347,6 @@ namespace TSQR { /// to that of LAPACK's QR factorization. Print results to /// stdout. /// - /// \param node [in] The Kokkos Node instance on which to execute - /// in parallel. /// \param numTrials [in] Number of times to run the benchmark; /// the timing result is cumulative over all trials. Timing /// over larger numbers of trials improves certainty of the @@ -375,11 +365,9 @@ namespace TSQR { /// \param humanReadable [in] Whether to print output that is easy /// for humans to read, or instead to print output that is easy /// for a script to parse. 
- /// - template + template void - benchmarkKokkosNodeTsqr (const Teuchos::RCP& node, - const int numTrials, + benchmarkKokkosNodeTsqr (const int numTrials, const Ordinal numRows, const Ordinal numCols, const int numPartitions, @@ -395,10 +383,8 @@ namespace TSQR { using std::cerr; using std::cout; using std::endl; - typedef TSQR::KokkosNodeTsqr node_tsqr_type; + using node_tsqr_type = TSQR::KokkosNodeTsqr; typedef typename node_tsqr_type::FactorOutput factor_output_type; - typedef Teuchos::ScalarTraits STS; - // typedef typename STS::magnitudeType magnitude_type; typedef Teuchos::Time timer_type; typedef Matrix matrix_type; @@ -413,7 +399,6 @@ namespace TSQR { params->set ("Cache Size Hint", cacheSizeHint); params->set ("Num Tasks", numPartitions); node_tsqr_type actor (params); - actor.setNode (node); // Allocate space for test problem. matrix_type A (numRows, numCols); @@ -423,7 +408,7 @@ namespace TSQR { // Fill R with zeros, since the factorization may not overwrite // the strict lower triangle of R. - R.fill (STS::zero()); + R.fill (Scalar {}); // Create a test problem nodeTestProblem (gen, numRows, numCols, A.get(), A.lda(), false); diff --git a/packages/tpetra/tsqr/test/CMakeLists.txt b/packages/tpetra/tsqr/test/CMakeLists.txt index 4720a65884c2..efdd959bb082 100644 --- a/packages/tpetra/tsqr/test/CMakeLists.txt +++ b/packages/tpetra/tsqr/test/CMakeLists.txt @@ -27,6 +27,16 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( NUM_MPI_PROCS 1 ) +# Performance and accuracy test suite for TSQR::KokkosNodeTsqr +TRIBITS_ADD_EXECUTABLE_AND_TEST( + KokkosHostTsqr + SOURCES Tsqr_TestKokkosNodeTsqr.cpp + COMM serial mpi + ARGS "--verify --numRows=100000 --numCols=10" + STANDARD_PASS_OUTPUT + NUM_MPI_PROCS 1 + ) + # This test uses LAPACK's QR factorization to get a reference for # performance and accuracy. 
It doesn't run any parts of the TSQR # algorithm, but it does depend on some TSQR test code (for generating diff --git a/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp index 84e21e75b49f..fa24a5452737 100644 --- a/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp +++ b/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp @@ -52,248 +52,170 @@ #endif // HAVE_KOKKOSTSQR_COMPLEX namespace { - // - // Instantiate and return a Kokkos Node instance with the given - // parameters. - // - template - Teuchos::RCP - getNode (const Teuchos::RCP& plist, - const bool debug) - { - if (debug) { - std::cerr << "Instantiating a Kokkos Node of type " - << Teuchos::TypeNameTraits::name() << std::endl; - } - return Teuchos::rcp (new NodeType (*plist)); - } - // // The documentation string for this test executable to print out at // the command line on request. // const char docString[] = "This program tests TSQR::KokkosNodeTsqr, " - "which implements an intranode parallel version of TSQR for CPU-based " - "Kokkos Node types. Accuracy and performance tests are included."; + "which implements an intranode parallel version of TSQR for " + "Kokkos::DefaultHostExecutionSpace. Accuracy and performance " + "tests are included."; // // TestParameters encapsulates values of command-line parameters, as // well as state that may change from one benchmark / verify // invocation to the next. 
// - struct TestParameters { - TestParameters () : - verify (false), - benchmark (false), - numPartitions (1), - numRows (1000), - numCols (10), - numTrials (10), - testReal (true), -#ifdef HAVE_KOKKOSTSQR_COMPLEX - testComplex (false), -#endif // HAVE_KOKKOSTSQR_COMPLEX - cacheSizeHint (0), - contiguousCacheBlocks (false), - printFieldNames (true), - humanReadable (false), - debug (false) - {} - - TestParameters (const std::vector theSeed) : - verify (false), - benchmark (false), - numPartitions (1), - numRows (1000), - numCols (10), - numTrials (10), - testReal (true), -#ifdef HAVE_KOKKOSTSQR_COMPLEX - testComplex (false), -#endif // HAVE_KOKKOSTSQR_COMPLEX - cacheSizeHint (0), - contiguousCacheBlocks (false), - printFieldNames (true), - humanReadable (false), - debug (false) - {} - - bool verify, benchmark; - int numPartitions, numRows, numCols, numTrials; - bool testReal; + class TestParameters { + public: + TestParameters () = default; + TestParameters (const std::vector /* theSeed */); + + bool verify = true; + bool benchmark = false; + int numRows = 100000; + int numCols = 10; + int numTrials = 1; + bool testReal = true; #ifdef HAVE_KOKKOSTSQR_COMPLEX - bool testComplex; + bool testComplex = true; #endif // HAVE_KOKKOSTSQR_COMPLEX - size_t cacheSizeHint; - bool contiguousCacheBlocks, printFieldNames, humanReadable, debug; + int numPartitions = 16; + int cacheSizeHint = 0; + bool contiguousCacheBlocks = false; + bool printFieldNames = true; + bool humanReadable = true; + bool debug = false; }; - - // // Run the test(s) for a particular Scalar type T. // Used by Cons, which in turn is used by runTests(). 
- // - template - class Dispatcher { - public: - typedef T dispatch_type; - typedef NodeType node_type; - - static void - benchmark (const Teuchos::RCP& node, - std::vector&, - const TestParameters& params, - bool& printFieldNames) - { - using TSQR::Test::benchmarkKokkosNodeTsqr; - benchmarkKokkosNodeTsqr (node, - params.numTrials, - params.numRows, - params.numCols, - params.numPartitions, - params.cacheSizeHint, - params.contiguousCacheBlocks, - printFieldNames, - params.humanReadable); - printFieldNames = false; - } + template + class Dispatcher { + public: + typedef T dispatch_type; + + static void + benchmark (std::vector&, + const TestParameters& params, + bool& printFieldNames) + { + using TSQR::Test::benchmarkKokkosNodeTsqr; + benchmarkKokkosNodeTsqr (params.numTrials, + params.numRows, + params.numCols, + params.numPartitions, + params.cacheSizeHint, + params.contiguousCacheBlocks, + printFieldNames, + params.humanReadable); + printFieldNames = false; + } - static void - verify (const Teuchos::RCP& node, - std::vector& seed, - const TestParameters& params, - bool& printFieldNames) - { - TSQR::Random::NormalGenerator gen (seed); - using TSQR::Test::verifyKokkosNodeTsqr; - verifyKokkosNodeTsqr (node, - gen, - params.numRows, - params.numCols, - params.numPartitions, - params.cacheSizeHint, - params.contiguousCacheBlocks, - printFieldNames, - params.humanReadable, - params.debug); - printFieldNames = false; - // Save the seed for next time, since we can't use the same - // NormalGenerator for a different Scalar type T. 
- gen.getSeed (seed); - } - }; + static void + verify (std::vector& seed, + const TestParameters& params, + bool& printFieldNames) + { + TSQR::Random::NormalGenerator gen (seed); + using TSQR::Test::verifyKokkosNodeTsqr; + verifyKokkosNodeTsqr (gen, + params.numRows, + params.numCols, + params.numPartitions, + params.cacheSizeHint, + params.contiguousCacheBlocks, + printFieldNames, + params.humanReadable, + params.debug); + printFieldNames = false; + // Save the seed for next time, since we can't use the same + // NormalGenerator for a different Scalar type T. + gen.getSeed (seed); + } + }; // // Class for executing a template function over a compile-time // fixed-length list of types. See runTests() for an example. // template - class Cons { - public: - // Ultimately, this depends on NullCons' typedef of node_type. - // That is, NullCons gets to define node_type. We did it this way - // so that we don't have to make NodeType a template parameter for - // all the Cons elements of the compile-time type list. That - // makes the list long and hard to read, and is also prone to - // typos. - typedef typename CdrType::node_type node_type; - - static void - verify (const Teuchos::RCP& node, - std::vector& seed, - const TestParameters& params, - bool& printFieldNames) - { - Dispatcher::verify (node, seed, params, printFieldNames); - CdrType::verify (node, seed, params, printFieldNames); - } - - static void - benchmark (const Teuchos::RCP& node, - std::vector& seed, - const TestParameters& params, - bool& printFieldNames) - { - Dispatcher::benchmark (node, seed, params, printFieldNames); - CdrType::benchmark (node, seed, params, printFieldNames); - } - }; - - // - // Base case for Cons template recursion. This class also defines - // the NodeType, so that we don't have to write it multiple times in - // the compile-time type list. 
- // - template - class NullCons { - public: - typedef NodeType node_type; + class Cons { + public: + static void + verify (std::vector& seed, + const TestParameters& params, + bool& printFieldNames) + { + Dispatcher::verify (seed, params, printFieldNames); + CdrType::verify (seed, params, printFieldNames); + } - static void - verify (const Teuchos::RCP& node, - std::vector&, - const TestParameters&, - bool& printFieldNames) {} + static void + benchmark (std::vector& seed, + const TestParameters& params, + bool& printFieldNames) + { + Dispatcher::benchmark (seed, params, printFieldNames); + CdrType::benchmark (seed, params, printFieldNames); + } + }; - static void - benchmark (const Teuchos::RCP& node, - std::vector&, - const TestParameters&, - bool& printFieldNames) {} - }; + // Base case for Cons template recursion. + class NullCons { + public: + static void + verify (std::vector&, + const TestParameters&, + bool& printFieldNames) {} + + static void + benchmark (std::vector&, + const TestParameters&, + bool& printFieldNames) {} + }; - // // Run the tests for all types of interest. - // This routine will modify TestParameters. - // - template - void - runTests (const Teuchos::RCP& node, - const TestParameters& params) - { - // This screams for syntactic sugar, but welcome to C++, the land - // of verbose obscurity. NullCons gets to define NodeType for all - // the Cons elements "above" it in the recursion. - typedef Cons > > real_tests; + void + runTests (const TestParameters& params) + { + using real_tests = Cons>; #ifdef HAVE_KOKKOSTSQR_COMPLEX - typedef Cons, Cons, NullCons > > complex_tests; + using complex_tests = + Cons, Cons, NullCons>>; #endif // HAVE_KOKKOSTSQR_COMPLEX - // Length-4 seed for the pseudorandom number generator. The last - // entry must be an odd number. There are other restrictions on - // these values; see the LAPACK documentation for details. 
(0, 0, - // 0, 1) is a typical initial seed if you want reproducible - // results, but don't actually care much about randomness. - std::vector seed (4); - seed[0] = 0; - seed[1] = 0; - seed[2] = 0; - seed[3] = 1; - - bool printFieldNames = params.printFieldNames; - if (params.verify) { - if (params.testReal) { - real_tests::verify (node, seed, params, printFieldNames); - } + // Length-4 seed for the pseudorandom number generator. The last + // entry must be an odd number. There are other restrictions on + // these values; see the LAPACK documentation for details. (0, 0, + // 0, 1) is a typical initial seed if you want reproducible + // results, but don't actually care much about randomness. + std::vector seed {{0, 0, 0, 1}}; + + bool printFieldNames = params.printFieldNames; + if (params.verify) { + if (params.testReal) { + real_tests::verify (seed, params, printFieldNames); + } #ifdef HAVE_KOKKOSTSQR_COMPLEX - if (params.testComplex) { - complex_tests::verify (node, seed, params, printFieldNames); - } + if (params.testComplex) { + complex_tests::verify (seed, params, printFieldNames); + } #endif // HAVE_KOKKOSTSQR_COMPLEX + } + // Reset this, since the first call of verify() sets it to false. + printFieldNames = params.printFieldNames; + if (params.benchmark) { + if (params.testReal) { + real_tests::benchmark (seed, params, printFieldNames); } - // Reset this, since the first call of verify() sets it to false. - printFieldNames = params.printFieldNames; - if (params.benchmark) { - if (params.testReal) { - real_tests::benchmark (node, seed, params, printFieldNames); - } #ifdef HAVE_KOKKOSTSQR_COMPLEX - if (params.testComplex) { - complex_tests::benchmark (node, seed, params, printFieldNames); - } -#endif // HAVE_KOKKOSTSQR_COMPLEX + if (params.testComplex) { + complex_tests::benchmark (seed, params, printFieldNames); } +#endif // HAVE_KOKKOSTSQR_COMPLEX } + } // Parse command-line options for this test. 
// @@ -309,118 +231,114 @@ namespace { // // Return an encapsulation of the command-line options. TestParameters - parseOptions (int argc, - char* argv[], - const bool allowedToPrint, - bool& printedHelp) - { - using std::cerr; - using std::endl; - - printedHelp = false; - - // Command-line parameters, set to their default values. - TestParameters params; - /// We really want the cache size hint as a size_t, but - /// Teuchos::CommandLineProcessor doesn't offer that option. So - /// we read it in as an int, which means negative inputs are - /// possible. We check for those below in the input validation - /// phase. - // - // Fetch default value of cacheSizeHint. - int cacheSizeHintAsInt = static_cast (params.cacheSizeHint); - try { - using Teuchos::CommandLineProcessor; - - CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true, - /* recognizeAllOptions=*/ true); - cmdLineProc.setDocString (docString); - cmdLineProc.setOption ("verify", - "noverify", - &params.verify, - "Test accuracy"); - cmdLineProc.setOption ("benchmark", - "nobenchmark", - &params.benchmark, - "Test performance"); - cmdLineProc.setOption ("numRows", - &params.numRows, - "Number of rows in the test matrix"); - cmdLineProc.setOption ("numCols", - &params.numCols, - "Number of columns in the test matrix"); - cmdLineProc.setOption ("numTrials", - &params.numTrials, - "Number of trials (only used when \"--benchmark\""); - cmdLineProc.setOption ("testReal", - "noTestReal", - &params.testReal, - "Test real arithmetic"); + parseOptions (int argc, + char* argv[], + const bool allowedToPrint, + bool& printedHelp) + { + using std::cerr; + using std::endl; + + printedHelp = false; + + // Command-line parameters, set to their default values. + TestParameters params; + /// We really want the cache size hint as a size_t, but + /// Teuchos::CommandLineProcessor doesn't offer that option. So + /// we read it in as an int, which means negative inputs are + /// possible.
We check for those below in the input validation + /// phase. + // + // Fetch default value of cacheSizeHint. + int cacheSizeHint = params.cacheSizeHint; + try { + using Teuchos::CommandLineProcessor; + + CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true, + /* recognizeAllOptions=*/ true); + cmdLineProc.setDocString (docString); + cmdLineProc.setOption ("verify", + "noverify", + &params.verify, + "Test accuracy"); + cmdLineProc.setOption ("benchmark", + "nobenchmark", + &params.benchmark, + "Test performance"); + cmdLineProc.setOption ("numRows", + &params.numRows, + "Number of rows in the test matrix"); + cmdLineProc.setOption ("numCols", + &params.numCols, + "Number of columns in the test matrix"); + cmdLineProc.setOption ("numTrials", + &params.numTrials, + "Number of trials (only used when \"--benchmark\""); + cmdLineProc.setOption ("testReal", + "noTestReal", + &params.testReal, + "Test real arithmetic"); #ifdef HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("testComplex", - "noTestComplex", - &params.testComplex, - "Test complex arithmetic"); + cmdLineProc.setOption ("testComplex", + "noTestComplex", + &params.testComplex, + "Test complex arithmetic"); #endif // HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("numPartitions", - &params.numPartitions, - "Number of partitions to use (max available parallelism)"); - cmdLineProc.setOption ("cacheSizeHint", - &cacheSizeHintAsInt, - "Cache size hint in bytes (0 means pick a reasonable default)"); - cmdLineProc.setOption ("contiguousCacheBlocks", - "noncontiguousCacheBlocks", - &params.contiguousCacheBlocks, - "Whether cache blocks should be stored contiguously"); - cmdLineProc.setOption ("printFieldNames", - "noPrintFieldNames", - &params.printFieldNames, - "Print field names (for machine-readable output only)"); - cmdLineProc.setOption ("humanReadable", - "machineReadable", - &params.humanReadable, - "If set, make output easy to read by humans " - "(but hard to parse)"); - cmdLineProc.setOption ("debug", - "noDebug", - &params.debug, - "Print
debugging information"); - cmdLineProc.parse (argc, argv); - } - catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) { - if (allowedToPrint) - cerr << "Unrecognized command-line option: " << e.what() << endl; - throw e; - } - catch (Teuchos::CommandLineProcessor::HelpPrinted& e) { - printedHelp = true; - return params; // Don't verify parameters in this case - } + params.numPartitions = Kokkos::DefaultHostExecutionSpace::concurrency(); + cmdLineProc.setOption ("numPartitions", + &params.numPartitions, + "Number of partitions to use (max available parallelism)"); + cmdLineProc.setOption ("cacheSizeHint", + &cacheSizeHint, + "Cache size hint in bytes (0 means pick a reasonable default)"); + cmdLineProc.setOption ("contiguousCacheBlocks", + "noncontiguousCacheBlocks", + &params.contiguousCacheBlocks, + "Whether cache blocks should be stored contiguously"); + cmdLineProc.setOption ("printFieldNames", + "noPrintFieldNames", + &params.printFieldNames, + "Print field names (for machine-readable output only)"); + cmdLineProc.setOption ("humanReadable", + "machineReadable", + &params.humanReadable, + "If set, make output easy to read by humans " + "(but hard to parse)"); + cmdLineProc.setOption ("debug", + "noDebug", + &params.debug, + "Print debugging information"); + cmdLineProc.parse (argc, argv); + } + catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) { + if (allowedToPrint) + cerr << "Unrecognized command-line option: " << e.what() << endl; + throw e; + } + catch (Teuchos::CommandLineProcessor::HelpPrinted& e) { + printedHelp = true; + return params; // Don't verify parameters in this case + } - // Validate command-line options. We provide default values - // for unset options, so we don't have to validate those.
- if (params.numRows <= 0) { - throw std::invalid_argument ("Number of rows must be positive"); - } else if (params.numCols <= 0) { - throw std::invalid_argument ("Number of columns must be positive"); - } else if (params.numRows < params.numCols) { - throw std::invalid_argument ("Number of rows must be >= number of columns"); - } else if (params.benchmark && params.numTrials < 1) { - throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1"); - } else if (params.numPartitions < 1) { - throw std::invalid_argument ("\"--numPartitions\" option must be >= 1"); - } else { - if (cacheSizeHintAsInt < 0) { - throw std::invalid_argument ("Cache size hint must be nonnegative"); - } else { - params.cacheSizeHint = static_cast (cacheSizeHintAsInt); - } - } - return params; + // Validate command-line options. We provide default values + // for unset options, so we don't have to validate those. + if (params.numRows <= 0) { + throw std::invalid_argument ("Number of rows must be positive"); + } else if (params.numCols <= 0) { + throw std::invalid_argument ("Number of columns must be positive"); + } else if (params.numRows < params.numCols) { + throw std::invalid_argument ("Number of rows must be >= number of columns"); + } else if (params.benchmark && params.numTrials < 1) { + throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1"); + } else if (params.numPartitions < 1) { + throw std::invalid_argument ("\"--numPartitions\" option must be >= 1"); + } else if (params.cacheSizeHint < 0) { + throw std::invalid_argument ("Cache size hint must be nonnegative"); } + return params; + } } // namespace (anonymous) - // // The "main" test driver. 
// @@ -473,23 +391,12 @@ main (int argc, char *argv[]) bool verbose = false; try { if (performingTests) { - using std::endl; - typedef KokkosClassic::DefaultNode::DefaultNodeType node_type; - - RCP nodeParams = - rcp (new ParameterList (node_type::getDefaultParameters ())); - - // We allow the same run to do both benchmark and verify. - runTests (getNode (nodeParams, params.debug), params); - + runTests (params); success = true; - // The Trilinos test framework expects a message like this. - // Obviously we haven't tested anything, but eventually we - // will include accuracy integration tests. - out << "\nEnd Result: TEST PASSED" << endl; + out << "\nEnd Result: TEST PASSED" << std::endl; } } TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); - return ( success ? EXIT_SUCCESS : EXIT_FAILURE ); + return success ? EXIT_SUCCESS : EXIT_FAILURE; } From 184167176775dfe53d30890aed37adcf99ea3fc3 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 20 Nov 2019 18:14:15 -0700 Subject: [PATCH 04/50] TSQR: Remove unimplemented method from TsqrAdapter --- packages/tpetra/tsqr/src/TsqrAdaptor.hpp | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/packages/tpetra/tsqr/src/TsqrAdaptor.hpp b/packages/tpetra/tsqr/src/TsqrAdaptor.hpp index e8b3f23cf6fe..7ed5b992c0ec 100644 --- a/packages/tpetra/tsqr/src/TsqrAdaptor.hpp +++ b/packages/tpetra/tsqr/src/TsqrAdaptor.hpp @@ -140,7 +140,7 @@ namespace TSQR { typedef Teuchos::RCP< MessengerBase > ordinal_messenger_ptr; //! Virtual destructor ensures memory safety for derived classes. - virtual ~TsqrAdaptor() {} + virtual ~TsqrAdaptor() = default; /// \brief Compute explicit "thin" QR factorization of A. /// @@ -165,9 +165,6 @@ namespace TSQR { dense_matrix_type& R, const bool contiguousCacheBlocks = false) { - // Lazily init the intranode part of TSQR if necessary. 
- initNodeTsqr (A); - factor_output_type output = factor (A, R, contiguousCacheBlocks); explicitQ (A, output, Q, contiguousCacheBlocks); } @@ -213,9 +210,6 @@ namespace TSQR { dense_matrix_type& R, const bool contiguousCacheBlocks = false) { - // Lazily init the intranode part of TSQR if necessary. - initNodeTsqr (A); - local_ordinal_type nrowsLocal, ncols, LDA; fetchDims (A, nrowsLocal, ncols, LDA); // This is guaranteed to be _correct_ for any Node type, but @@ -268,9 +262,6 @@ namespace TSQR { { using Teuchos::ArrayRCP; - // Lazily init the intranode part of TSQR if necessary. - initNodeTsqr (Q_in); - local_ordinal_type nrowsLocal, ncols_in, LDQ_in; fetchDims (Q_in, nrowsLocal, ncols_in, LDQ_in); local_ordinal_type nrowsLocal_out, ncols_out, LDQ_out; @@ -324,9 +315,6 @@ namespace TSQR { { using Teuchos::ArrayRCP; - // Lazily init the intranode part of TSQR if necessary. - initNodeTsqr (Q); - local_ordinal_type nrowsLocal, ncols, ldqLocal; fetchDims (Q, nrowsLocal, ncols, ldqLocal); @@ -354,9 +342,6 @@ namespace TSQR { { using Teuchos::ArrayRCP; - // Lazily init the intranode part of TSQR if necessary. - initNodeTsqr (A_in); - local_ordinal_type nrowsLocal, ncols, LDA_in; fetchDims (A_in, nrowsLocal, ncols, LDA_in); local_ordinal_type nrowsLocal_out, ncols_out, LDA_out; @@ -393,9 +378,6 @@ namespace TSQR { { using Teuchos::ArrayRCP; - // Lazily init the intranode part of TSQR if necessary. - initNodeTsqr (A_in); - local_ordinal_type nrowsLocal, ncols, LDA_in; fetchDims (A_in, nrowsLocal, ncols, LDA_in); local_ordinal_type nrowsLocal_out, ncols_out, LDA_out; @@ -481,9 +463,6 @@ namespace TSQR { factory.makeTsqr (plist, pScalarMessenger_, pTsqr_); } - // Lazily init the intranode part of TSQR if necessary. - virtual void initNodeTsqr (const multivector_type& A); - private: /// \brief Return dimensions of a multivector object. 
/// From 7ba99cc826dc5abcaf7e412a358611df69b63757 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 20 Nov 2019 18:17:21 -0700 Subject: [PATCH 05/50] TSQR: Fix KokkosNodeTsqr test to call Kokkos::initialize and Kokkos::finalize --- .../tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp | 33 ++----------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp index fa24a5452737..d47000f68846 100644 --- a/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp +++ b/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp @@ -37,15 +37,11 @@ // ************************************************************************ //@HEADER -#include "Teuchos_ConfigDefs.hpp" // HAVE_MPI -#ifdef HAVE_MPI -# include "Teuchos_GlobalMPISession.hpp" -# include "Teuchos_oblackholestream.hpp" -#endif // HAVE_MPI #include "Teuchos_CommandLineProcessor.hpp" #include "Teuchos_DefaultComm.hpp" #include "Teuchos_StandardCatchMacros.hpp" #include "Tsqr_KokkosNodeTsqrTest.hpp" +#include "Kokkos_Core.hpp" #ifdef HAVE_KOKKOSTSQR_COMPLEX # include @@ -345,39 +341,13 @@ namespace { int main (int argc, char *argv[]) { -#ifdef HAVE_MPI - using Teuchos::Comm; -#endif // HAVE_MPI using Teuchos::ParameterList; using Teuchos::RCP; using Teuchos::rcp; -#ifdef HAVE_MPI - Teuchos::oblackholestream blackhole; - Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole); - RCP > comm = Teuchos::DefaultComm::getComm (); - const int myRank = comm->getRank(); - // Only Rank 0 gets to write to stdout. The other MPI process ranks - // send their output to something that looks like /dev/null (and - // likely is, on Unix-y operating systems). - std::ostream& out = (myRank == 0) ? std::cout : blackhole; - // Only Rank 0 performs the tests. 
- bool performingTests = (myRank == 0); - const bool allowedToPrint = (myRank == 0); -#else // Don't HAVE_MPI: single-node test bool performingTests = true; const bool allowedToPrint = true; std::ostream& out = std::cout; -#endif // HAVE_MPI - - // FIXME (mfh 02 Jul 2013) This test immediately segfaults on Linux - // PGI 11.1. I have no idea why and I don't have time to fix it - // now. The issue might go away if I get time to rewrite TSQR using - // Kokkos Array, so for now, I'm just going to make the test pass - // trivially when building with the PGI compiler. -#ifdef __PGI - performingTests = false; -#endif // __PGI // Fetch command-line parameters. bool printedHelp = false; @@ -391,6 +361,7 @@ main (int argc, char *argv[]) bool verbose = false; try { if (performingTests) { + Kokkos::ScopeGuard kokkosScope (argc, argv); runTests (params); success = true; // The Trilinos test framework expects a message like this. From 49a66c5267918af1d2a4cffa7996e477b5126994 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Thu, 21 Nov 2019 15:22:40 -0700 Subject: [PATCH 06/50] TSQR: Hide Teuchos::LAPACK behind opaque "raw" interface Instead of using Teuchos::LAPACK directly, TSQR now uses it through a lowest-level interface TSQR::Impl::Lapack. This is a subclass of a new interface TSQR::Impl::RawQR. The point of this interface is to encapsulate any third-party local QR implementations that run on host, with either host or device data represented as raw pointers. We have cuSOLVER specifically in mind, but it might also help build times to hide large header files like Teuchos_LAPACK.hpp. 
--- packages/tpetra/tsqr/src/CMakeLists.txt | 7 + .../tsqr/src/TbbTsqr_TbbParallelTsqr.hpp | 9 +- packages/tpetra/tsqr/src/Tsqr_Combine.hpp | 6 +- .../tpetra/tsqr/src/Tsqr_CombineDefault.hpp | 109 ++++--------- .../tpetra/tsqr/src/Tsqr_CombineNative.hpp | 34 ++-- packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp | 9 +- .../tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp | 7 - .../tpetra/tsqr/src/Tsqr_GlobalVerify.hpp | 26 ++-- packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp | 121 +++++++++++++++ packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp | 78 ++++++++++ packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp | 74 +++++++++ packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp | 59 +++---- .../tsqr/src/Tsqr_Random_MatrixGenerator.hpp | 146 +++++++----------- .../tsqr/src/Tsqr_Random_NormalGenerator.hpp | 19 ++- packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp | 132 ++++++---------- .../tsqr/src/Tsqr_SequentialCholeskyQR.hpp | 21 +-- .../tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp | 13 +- 17 files changed, 486 insertions(+), 384 deletions(-) create mode 100644 packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp create mode 100644 packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp create mode 100644 packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp diff --git a/packages/tpetra/tsqr/src/CMakeLists.txt b/packages/tpetra/tsqr/src/CMakeLists.txt index 776fb93c95cc..9295e27c1691 100644 --- a/packages/tpetra/tsqr/src/CMakeLists.txt +++ b/packages/tpetra/tsqr/src/CMakeLists.txt @@ -32,3 +32,10 @@ TRIBITS_ADD_LIBRARY( SOURCES ${SOURCES} ) +# +# Make a trivial change to this comment if you add / remove a file either to +# / from this directory, or to / from the 'impl' subdirectory. That ensures +# that running "make" will also rerun CMake in order to regenerate Makefiles. +# +# Here is another such change, and here is another. 
+# diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp index 7fdca2200dcc..e67ef077ef80 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp @@ -122,14 +122,7 @@ namespace TSQR { /// with all nonnegative diagonal entries. static bool QR_produces_R_factor_with_nonnegative_diagonal() { typedef Combine combine_type; - //typedef LAPACK lapack_type; - - const bool combineMakesNonnegDiag = - combine_type::QR_produces_R_factor_with_nonnegative_diagonal (); - //const bool lapackMakesNonnegDiag = - // lapack_type::QR_produces_R_factor_with_nonnegative_diagonal (); - const bool lapackMakesNonnegDiag = false; - return combineMakesNonnegDiag && lapackMakesNonnegDiag; + return combine_type::QR_produces_R_factor_with_nonnegative_diagonal (); } /// \typedef SeqOutput diff --git a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp index 925d900b1f93..42857f63e704 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp @@ -85,10 +85,6 @@ namespace TSQR { ///
  • CombineFortran, a Fortran 9x in-place implementation for /// LAPACK's four data types (S, D, C, and Z).
  • /// - /// - /// The default CombineImpl is CombineNative, since that should work - /// for any Ordinal and Scalar types for which LAPACK and BLAS are implemented. template< class Ordinal, class Scalar, class CombineImpl = CombineNative::isComplex> > @@ -105,7 +101,7 @@ namespace TSQR { typedef CombineImpl combine_impl_type; //! Constructor. - Combine () {} + Combine () = default; /// Whether or not the QR factorizations computed by methods of /// this class produce an R factor with all nonnegative diagonal diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp index b81b4b39f5fe..07cacc0a6941 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp @@ -45,14 +45,9 @@ #include "Teuchos_ScalarTraits.hpp" #include "Tsqr_ApplyType.hpp" -#include "Teuchos_LAPACK.hpp" +#include "Tsqr_Impl_Lapack.hpp" #include "Tsqr_Matrix.hpp" -#include -#include -#include - - namespace TSQR { /// \class CombineDefault @@ -68,9 +63,6 @@ namespace TSQR { /// upper triangular). template class CombineDefault { - private: - typedef Teuchos::LAPACK lapack_type; - public: typedef Ordinal ordinal_type; typedef Scalar scalar_type; @@ -78,8 +70,6 @@ namespace TSQR { typedef ConstMatView const_mat_view_type; typedef MatView mat_view_type; - CombineDefault () {} - /// \brief Does the R factor have a nonnegative diagonal? /// /// CombineDefault implements a QR factorization (of a matrix with @@ -101,19 +91,8 @@ namespace TSQR { Scalar tau[], Scalar work[]) { - // info must be an int, not a LocalOrdinal, since LAPACK - // routines always (???) use int for the INFO output argument, - // whether or not they were built with 64-bit integer index - // support. 
- int info = 0; - lapack_.GEQR2 (nrows, ncols, A, lda, tau, work, &info); - if (info != 0) - { - std::ostringstream os; - os << "TSQR::CombineDefault::factor_first(): LAPACK\'s " - << "GEQR2 failed with INFO = " << info; - throw std::logic_error (os.str()); - } + const int lwork = ncols; + lapack_.compute_QR (nrows, ncols, A, lda, tau, work, lwork); } void @@ -128,22 +107,15 @@ namespace TSQR { const Ordinal ldc, Scalar work[]) { - int info = 0; // LAPACK has the nice feature that it only reads the first // letter of input strings that specify things like which side // to which to apply the operator, or whether to apply the // transpose. That means we can make the strings more verbose, // as in "Left" here for the SIDE parameter. - lapack_.UNM2R ('L', (applyType.toString ().c_str ())[0], - nrows, ncols_C, ncols_A, - A, lda, tau, - C, ldc, work, &info); - if (info != 0) { - std::ostringstream os; - os << "TSQR::CombineDefault::apply_first(): LAPACK\'s " - << "UNM2R failed with INFO = " << info; - throw std::logic_error (os.str()); - } + const std::string trans = applyType.toString (); + const int lwork = ncols_C; + lapack_.apply_Q_factor ('L', trans[0], nrows, ncols_C, ncols_A, + A, lda, tau, C, ldc, work, lwork); } void @@ -177,18 +149,12 @@ namespace TSQR { deep_copy (C_buf_top, C_top_view); deep_copy (C_buf_bot, C_bot_view); - int info = 0; - lapack_.UNM2R ('L', (apply_type.toString ().c_str ())[0], - numRows, ncols_C, ncols_Q, - A_buf_.get(), A_buf_.lda(), tau, - C_buf_.get(), C_buf_.lda(), - work, &info); - if (info != 0) { - std::ostringstream os; - os << "TSQR::CombineDefault::apply_inner(): LAPACK\'s " - << "UNM2R failed with INFO = " << info; - throw std::logic_error (os.str()); - } + const std::string trans = apply_type.toString (); + const int lwork = ncols_C; + lapack_.apply_Q_factor ('L', trans[0], numRows, ncols_C, ncols_Q, + A_buf_.get(), A_buf_.lda(), tau, + C_buf_.get(), C_buf_.lda(), + work, lwork); // Copy back the results. 
deep_copy (C_top_view, C_buf_top); deep_copy (C_bot_view, C_buf_bot); @@ -207,7 +173,7 @@ namespace TSQR { const Ordinal numRows = m + n; A_buf_.reshape (numRows, n); - A_buf_.fill (Scalar(0)); + A_buf_.fill (Scalar {}); // R might be a view of the upper triangle of a cache block, but // we only want to include the upper triangle in the // factorization. Thus, only copy the upper triangle of R into @@ -215,11 +181,9 @@ namespace TSQR { copy_upper_triangle (n, n, &A_buf_(0, 0), A_buf_.lda(), R, ldr); copy_matrix (m, n, &A_buf_(n, 0), A_buf_.lda(), A, lda); - int info = 0; - lapack_.GEQR2 (numRows, n, A_buf_.get(), A_buf_.lda(), tau, work, &info); - if (info != 0) - throw std::logic_error ("TSQR::CombineDefault: GEQR2 failed"); - + const int lwork = n; + lapack_.compute_QR (numRows, n, A_buf_.get(), A_buf_.lda(), + tau, work, lwork); // Copy back the results. R might be a view of the upper // triangle of a cache block, so only copy into the upper // triangle of R. @@ -239,7 +203,7 @@ namespace TSQR { const Ordinal numRows = Ordinal(2) * n; A_buf_.reshape (numRows, n); - A_buf_.fill (Scalar(0)); + A_buf_.fill (Scalar {}); // Copy the inputs into the compute buffer. Only touch the // upper triangles of R_top and R_bot, since they each may be // views of some cache block (where the strict lower triangle @@ -248,16 +212,9 @@ namespace TSQR { copy_upper_triangle (n, n, &A_buf_(0, 0), A_buf_.lda(), R_top, ldr_top); copy_upper_triangle (n, n, &A_buf_(n, 0), A_buf_.lda(), R_bot, ldr_bot); - int info = 0; - lapack_.GEQR2 (numRows, n, A_buf_.get(), A_buf_.lda(), tau, work, &info); - if (info != 0) - { - std::ostringstream os; - os << "TSQR::CombineDefault::factor_pair(): " - << "GEQR2 failed with INFO = " << info; - throw std::logic_error (os.str()); - } - + const int lwork = n; + lapack_.compute_QR (numRows, n, A_buf_.get(), A_buf_.lda(), + tau, work, lwork); // Copy back the results. 
Only read the upper triangles of the // two n by n row blocks of A_buf_ (this means we don't have to // zero out the strict lower triangles), and only touch the @@ -282,7 +239,7 @@ namespace TSQR { const Ordinal numRows = Ordinal(2) * ncols_Q; A_buf_.reshape (numRows, ncols_Q); - A_buf_.fill (Scalar(0)); + A_buf_.fill (Scalar {}); copy_upper_triangle (ncols_Q, ncols_Q, &A_buf_(ncols_Q, 0), A_buf_.lda(), R_bot, ldr_bot); @@ -290,30 +247,22 @@ namespace TSQR { copy_matrix (ncols_Q, ncols_C, &C_buf_(0, 0), C_buf_.lda(), C_top, ldc_top); copy_matrix (ncols_Q, ncols_C, &C_buf_(ncols_Q, 0), C_buf_.lda(), C_bot, ldc_bot); - int info = 0; - lapack_.UNM2R ('L', (apply_type.toString ().c_str ())[0], - numRows, ncols_C, ncols_Q, - A_buf_.get(), A_buf_.lda(), tau, - C_buf_.get(), C_buf_.lda(), - work, &info); - if (info != 0) { - std::ostringstream os; - os << "TSQR::CombineDefault: UNM2R failed with INFO = " << info; - throw std::logic_error (os.str ()); - } - + const int lwork = ncols_Q; + const std::string trans = apply_type.toString (); + lapack_.apply_Q_factor ('L', trans[0], numRows, ncols_C, ncols_Q, + A_buf_.get(), A_buf_.lda(), tau, + C_buf_.get(), C_buf_.lda(), + work, lwork); // Copy back the results. 
copy_matrix (ncols_Q, ncols_C, C_top, ldc_top, &C_buf_(0, 0), C_buf_.lda()); copy_matrix (ncols_Q, ncols_C, C_bot, ldc_bot, &C_buf_(ncols_Q, 0), C_buf_.lda()); } private: - lapack_type lapack_; + Impl::Lapack lapack_; Matrix A_buf_; Matrix C_buf_; }; - - } // namespace TSQR #endif // __TSQR_CombineDefault_hpp diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp index 4b7e0db3e138..56d402d19368 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp @@ -43,13 +43,13 @@ #ifndef __TSQR_CombineNative_hpp #define __TSQR_CombineNative_hpp -#include "Teuchos_LAPACK.hpp" #include "Teuchos_ScalarTraits.hpp" #include "Tsqr_ApplyType.hpp" #include "Tsqr_CombineDefault.hpp" #include "Kokkos_Core.hpp" #include "KokkosBlas2_gemv.hpp" #include "Kokkos_ArithTraits.hpp" +#include "Tsqr_Impl_Lapack.hpp" namespace TSQR { @@ -77,13 +77,9 @@ namespace TSQR { typedef Ordinal ordinal_type; private: - typedef Teuchos::LAPACK lapack_type; typedef CombineDefault combine_default_type; public: - - CombineNative () {} - /// Whether or not the QR factorizations computed by methods of /// this class produce an R factor with all nonnegative diagonal /// entries. It depends on LAPACK because this implementation @@ -92,8 +88,7 @@ namespace TSQR { /// of {LARFGP, LARFP}, which is necessary to ensure that the BETA /// output of the function is always nonnegative. 
static bool QR_produces_R_factor_with_nonnegative_diagonal() { - return /* lapack_type::QR_produces_R_factor_with_nonnegative_diagonal() */ false && - combine_default_type::QR_produces_R_factor_with_nonnegative_diagonal(); + return combine_default_type::QR_produces_R_factor_with_nonnegative_diagonal(); } void @@ -193,7 +188,6 @@ namespace TSQR { using device_type = Kokkos::Device; private: - typedef Teuchos::LAPACK lapack_type; typedef CombineDefault combine_default_type; void @@ -204,12 +198,13 @@ namespace TSQR { void LARFG (const Ordinal n, - scalar_type* const alpha, + scalar_type& alpha, const Kokkos::View& x, - scalar_type* const tau) const + scalar_type& tau) const { constexpr Ordinal incx {1}; - lapack_type ().LARFG (n, alpha, x.data (), incx, tau); + Impl::Lapack lapack; + lapack.LARFG (n, alpha, x.data (), incx, tau); } magnitude_type @@ -278,8 +273,7 @@ namespace TSQR { CombineNative () = default; static bool QR_produces_R_factor_with_nonnegative_diagonal() { - return /* lapack_type::QR_produces_R_factor_with_nonnegative_diagonal() */ false && - combine_default_type::QR_produces_R_factor_with_nonnegative_diagonal(); + return combine_default_type::QR_produces_R_factor_with_nonnegative_diagonal(); } void @@ -371,15 +365,11 @@ namespace TSQR { typedef Ordinal ordinal_type; private: - typedef Teuchos::LAPACK lapack_type; typedef CombineDefault combine_default_type; public: - CombineNative () {} - static bool QR_produces_R_factor_with_nonnegative_diagonal() { - return /* lapack_type::QR_produces_R_factor_with_nonnegative_diagonal() */ false && - combine_default_type::QR_produces_R_factor_with_nonnegative_diagonal(); + return combine_default_type::QR_produces_R_factor_with_nonnegative_diagonal(); } void @@ -556,7 +546,7 @@ namespace TSQR { auto A_1k = subview (A_view, ALL (), k); auto A_1kp1 = subview (A_view, range_type (0, m), range_type (k+1, n)); - this->LARFG (m + 1, &R_kk, A_1k, &tau_view[k]); + this->LARFG (m + 1, R_kk, A_1k, tau_view[k]); this->GEMV 
("T", ONE, A_1kp1, A_1k, ZERO, work_view); for (Ordinal j = k+1; j < n; ++j) { @@ -570,7 +560,7 @@ namespace TSQR { Scalar& R_nn = R_view(n-1, n-1); auto A_1n = subview (A_view, ALL (), n-1); - this->LARFG (m+1, &R_nn, A_1n, &tau_view[n-1]); + this->LARFG (m+1, R_nn, A_1n, tau_view[n-1]); } @@ -725,7 +715,7 @@ namespace TSQR { // k+2: 1 element in R_top (R_top(k,k)), and k+1 elements in // R_bot (R_bot(1:k,k), in 1-based indexing notation). - this->LARFG (k+2, &R_top_kk, R_bot_1k, &tau_view[k]); + this->LARFG (k+2, R_top_kk, R_bot_1k, tau_view[k]); // One-based indexing, Matlab version of the GEMV call below: // work(1:k) := R_bot(1:k,k+1:n)' * R_bot(1:k,k) @@ -743,7 +733,7 @@ namespace TSQR { // n+1: 1 element in R_top (n,n), and n elements in R_bot (the // whole last column). - this->LARFG (n+1, &R_top_nn, R_bot_1n, &tau_view[n-1]); + this->LARFG (n+1, R_top_nn, R_bot_1n, tau_view[n-1]); } diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp index 502d18bd96e2..5f37e9b65a37 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp @@ -60,13 +60,8 @@ namespace TSQR { /// \tparam Scalar Value type for matrices to factor. /// /// This class combines the square R factors computed by the - /// intranode TSQR factorization (\c NodeTsqr subclass) on - /// individual MPI processes. - /// - /// It should be possible to instantiate - /// DistTsqr for any LocalOrdinal and Scalar - /// types for which \c Combine and \c - /// LAPACK can be instantiated. + /// intranode TSQR factorization (NodeTsqr subclass) on individual + /// MPI processes. 
template class DistTsqr : public Teuchos::ParameterListAcceptorDefaultBase { public: diff --git a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp index ede5bb95dc35..94dfdac67005 100644 --- a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp @@ -541,13 +541,6 @@ namespace TSQR { public: /// \typedef ordinal_type /// \brief The (local) Ordinal type to use for TSQR. - /// - /// This must be a type for which Teuchos::BLAS and Teuchos::LAPACK each have - /// an instantiation. That means a signed integer type. LAPACK - /// and the BLAS can be built with signed 64-bit integers - /// (int64_t), but usually they are only built with signed - /// 32-bit integers (int). typedef int ordinal_type; /// \brief Return a valid parameter list for verifying Tsqr. diff --git a/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp b/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp index 9ff5b04ffd24..636e73b19708 100644 --- a/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp @@ -144,22 +144,14 @@ namespace TSQR { typedef Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType magnitude_type; - // FIXME (mfh 20 Apr 2010) This is currently implemented using an - // all-reduction. This may result in different processors getting - // slightly different answers, due to floating-point arithmetic - // roundoff. We might not want this if we are using this function - // to test a routine. 
- - magnitude_type localResult (0); - for (LocalOrdinal j = 0; j < ncols; j++) - { - const Scalar* const cur_col = &A_local[j*lda_local]; - for (LocalOrdinal i = 0; i < nrows_local; ++i) - { - const magnitude_type abs_xi = STS::magnitude (cur_col[i]); - localResult = localResult + abs_xi * abs_xi; - } + magnitude_type localResult {}; + for (LocalOrdinal j = 0; j < ncols; j++) { + const Scalar* const cur_col = &A_local[j*lda_local]; + for (LocalOrdinal i = 0; i < nrows_local; ++i) { + const magnitude_type abs_xi = STS::magnitude (cur_col[i]); + localResult = localResult + abs_xi * abs_xi; } + } // GlobalSummmer() is a hack to let us use a Scalar - type // MessengerBase with magnitude_type inputs and outputs. // Otherwise we would need to carry around a @@ -190,8 +182,8 @@ namespace TSQR { using std::pair; using std::vector; - const magnitude_type ZERO (0); - const magnitude_type ONE (1); + const magnitude_type ZERO {}; + const magnitude_type ONE (1.0); Teuchos::BLAS blas; // diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp new file mode 100644 index 000000000000..3bee59a96325 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp @@ -0,0 +1,121 @@ +#include "Tsqr_Impl_Lapack.hpp" +#include "Teuchos_LAPACK.hpp" +#include +#include + +namespace TSQR { +namespace Impl { + +#define TSQR_IMPL_LAPACK_IMPL( Scalar ) \ +void Lapack:: \ +LARNV(const int idist, int seed[], const int n, \ + value_type v[]) const \ +{ \ + Teuchos::LAPACK lapack; \ + lapack.LARNV(idist, seed, n, v); \ +} \ + \ +void Lapack:: \ +POTRF(const char UPLO, const int n, \ + value_type A[], const int lda) const \ +{ \ + Teuchos::LAPACK lapack; \ + int info = 0; \ + lapack.POTRF(UPLO, n, A, lda, &info); \ + if (info != 0) { \ + std::ostringstream os; \ + os << "LAPACK POTRF (Cholesky factorization) " \ + << "failed with INFO = " << info << "."; \ + throw std::logic_error (os.str ()); \ + } \ +} \ + \ +void Lapack:: \ +GESVD(const char 
JOBU, const char JOBVT, \ + const int m, const int n, \ + value_type A[], const int lda, \ + magnitude_type S[], value_type U[], const int ldu, \ + value_type V[], const int ldv, \ + value_type WORK[], const int lwork, \ + magnitude_type RWORK[]) const \ +{ \ + Teuchos::LAPACK lapack; \ + int info = 0; \ + lapack.GESVD(JOBU, JOBVT, m, n, A, lda, S, \ + U, ldu, V, ldv, WORK, lwork, RWORK, &info); \ + if (info != 0) { \ + std::ostringstream os; \ + os << "LAPACK GESVD (singular value decomposition) " \ + << "failed with INFO = " << info << "."; \ + throw std::logic_error (os.str ()); \ + } \ +} \ + \ +void Lapack:: \ +LARFG(const int n, value_type& alpha, value_type x[], \ + const int incx, value_type& tau) const \ +{ \ + Teuchos::LAPACK lapack; \ + lapack.LARFG(n, &alpha, x, incx, &tau); \ +} \ + \ +void Lapack:: \ +compute_QR(const int m, const int n, value_type A[], const int lda, \ + value_type TAU[], value_type WORK[], const int lwork) const \ +{ \ + Teuchos::LAPACK lapack; \ + int info = 0; \ + lapack.GEQRF(m, n, A, lda, TAU, WORK, lwork, &info); \ + if (info != 0) { \ + std::ostringstream os; \ + os << "LAPACK GEQRF (QR factorization) failed with INFO = " \ + << info << "."; \ + throw std::logic_error (os.str()); \ + } \ +} \ + \ +void Lapack:: \ +apply_Q_factor(const char SIDE, const char TRANS, \ + const int m, const int n, const int k, \ + const value_type A[], const int lda, \ + const value_type TAU[], \ + value_type C[], const int ldc, \ + value_type WORK[], const int lwork) const \ +{ \ + Teuchos::LAPACK lapack; \ + int info = 0; \ + value_type* A_nc = const_cast(A); \ + lapack.UNMQR(SIDE, TRANS, m, n, k, A_nc, lda, TAU, C, ldc, WORK, \ + lwork, &info); \ + if (info != 0) { \ + std::ostringstream os; \ + os << "LAPACK UNMQR (apply Q factor from GEQRF) failed with " \ + "INFO = " << info << "."; \ + throw std::logic_error (os.str()); \ + } \ +} \ + \ +void Lapack:: \ +compute_explicit_Q(const int m, const int n, const int k, \ + value_type A[], const int 
lda, \ + const value_type TAU[], value_type WORK[], \ + const int lwork) const \ +{ \ + Teuchos::LAPACK lapack; \ + int info = 0; \ + lapack.UNGQR(m, n, k, A, lda, TAU, WORK, lwork, &info); \ + if (info != 0) { \ + std::ostringstream os; \ + os << "LAPACK UNGQR (compute explicit Q factor from GEQRF) " \ + "failed with INFO = " << info << "."; \ + throw std::logic_error (os.str()); \ + } \ +} + +TSQR_IMPL_LAPACK_IMPL( float ) +TSQR_IMPL_LAPACK_IMPL( double ) +TSQR_IMPL_LAPACK_IMPL( std::complex ) +TSQR_IMPL_LAPACK_IMPL( std::complex ) + +} // namespace Impl +} // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp new file mode 100644 index 000000000000..88e615ab637d --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp @@ -0,0 +1,78 @@ +#ifndef TSQR_IMPL_LAPACK_HPP +#define TSQR_IMPL_LAPACK_HPP + +#include "Tsqr_Impl_RawQR.hpp" +#include + +namespace TSQR { +namespace Impl { + +template +class Lapack {}; + +// CombineNative needs LARFG, but it's not properly part of RawQR. +// RawQR needs to be able to wrap lots of different functions, +// including whatever cuSOLVER provides. It doesn't make sense to +// launch a device kernel from host for ever column of the matrix, +// especially not when cuSOLVER already has all the needed QR +// factorization and apply Q factor functions. 
+ +#define TSQR_IMPL_LAPACK_DECL( Scalar ) \ +template<> \ +class Lapack : public RawQR { \ +public: \ + using value_type = Scalar; \ + using magnitude_type = decltype(std::abs(Scalar{})); \ + \ + ~Lapack() = default; \ + \ + void \ + compute_QR(const int m, const int n, value_type A[], \ + const int lda, value_type TAU[], value_type WORK[], \ + const int lwork) const override; \ + \ + void \ + apply_Q_factor(const char SIDE, const char TRANS, \ + const int m, const int n, const int k, \ + const value_type A[], const int lda, \ + const value_type TAU[], \ + value_type C[], const int ldc, \ + value_type WORK[], const int lwork) const override; \ + \ + void \ + compute_explicit_Q(const int m, const int n, const int k, \ + value_type A[], const int lda, \ + const value_type TAU[], value_type WORK[], \ + const int lwork) const override; \ + \ + void \ + GESVD(const char JOBU, const char JOBVT, \ + const int m, const int n, \ + value_type A[], const int lda, \ + magnitude_type S[], value_type U[], const int ldu, \ + value_type V[], const int ldv, \ + value_type WORK[], const int lwork, \ + magnitude_type RWORK[]) const; \ + \ + void \ + LARFG(const int n, value_type& alpha, value_type x[], \ + const int incx, value_type& tau) const; \ + \ + void \ + POTRF(const char UPLO, const int n, \ + value_type A[], const int lda) const; \ + \ + void \ + LARNV(const int idist, int seed[], const int n, \ + value_type v[]) const; \ +}; + +TSQR_IMPL_LAPACK_DECL( float ) +TSQR_IMPL_LAPACK_DECL( double ) +TSQR_IMPL_LAPACK_DECL( std::complex ) +TSQR_IMPL_LAPACK_DECL( std::complex ) + +} // namespace Impl +} // namespace TSQR + +#endif // TSQR_IMPL_LAPACK_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp new file mode 100644 index 000000000000..307aa103e9a9 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp @@ -0,0 +1,74 @@ +#ifndef TSQR_IMPL_RAWQR_HPP +#define TSQR_IMPL_RAWQR_HPP + +namespace TSQR { +namespace Impl 
{ + +/// \brief "Raw" local QR factorization interface. +/// +/// Subclass and specialize this interface as needed. +/// +/// The methods are instance methods so that subclass instances may +/// have state. For example, a cuSOLVER implementation would have a +/// CUDA stream instance (cudaStream_t) and a cuSOLVER handle +/// (cusolverDnHandle_t). +/// +/// WORK size query ("LWORK query") happens as in LAPACK, by passing +/// in lwork = -1. A cuSOLVER Implementation would just check if +/// lwork is -1, and call cusolverDn?geqrf_bufferSize in that case +/// (replace the question mark with S, D, C, or Z as appropriate for +/// the Scalar type). +/// +/// Methods are virtual because they are meant to be called from host. +/// (For the CUDA case, we plan to make cuSOLVER calls from host; we +/// don't need to call QR from device.) +template +class RawQR { +public: + using value_type = Scalar; + + virtual ~RawQR() = default; + + //! Compute QR factorization of a general m by n matrix A. + virtual void + compute_QR(const int m, const int n, + value_type A[], const int lda, + value_type TAU[], + value_type WORK[], const int lwork) const = 0; + + /// \brief Apply Householder reflectors. + /// + /// Overwrite the general complex m by n matrix C with the product + /// of Q and C, where Q is the product of k elementary (Householder) + /// reflectors as returned by GEQRF. + /// + /// This corresponds to LAPACK's _UNMQR for complex value_type types, + /// and to LAPACK's _ORMQR for real value_type types. + virtual void + apply_Q_factor(const char SIDE, const char TRANS, + const int m, const int n, const int k, + const value_type A[], const int lda, + const value_type TAU[], + value_type C[], const int ldc, + value_type WORK[], const int lwork) const = 0; + + /// \brief Compute explicit QR factor from QR factorization (GEQRF). 
+ /// + /// Generate the m by n matrix Q with orthonormal (or unitary, if + /// value_type is complex) columns corresponding to the first n columns + /// of a product of k elementary reflectors of order m, as returned + /// by GEQRF. + /// + /// This corresponds to LAPACK's _UNGQR for complex value_type types, + /// and to LAPACK's _ORGQR for real value_type types. + virtual void + compute_explicit_Q(const int m, const int n, const int k, + value_type A[], const int lda, + const value_type TAU[], + value_type WORK[], const int lwork) const = 0; +}; + +} // namespace Impl +} // namespace TSQR + +#endif // TSQR_IMPL_RAWQR_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp index 11af6c10ff09..21077b665726 100644 --- a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp @@ -47,7 +47,7 @@ #include "Tsqr_Matrix.hpp" #include "Teuchos_as.hpp" #include "Teuchos_Describable.hpp" -#include "Teuchos_LAPACK.hpp" +#include "Tsqr_Impl_Lapack.hpp" #include "Teuchos_ScalarTraits.hpp" #include "Teuchos_TypeNameTraits.hpp" #include @@ -321,7 +321,6 @@ namespace TSQR { const bool contiguousCacheBlocks) const = 0; protected: - /// \brief Return view of topmost cache block of C /// /// \param C [in] Matrix (view), supporting the usual nrows(), @@ -343,7 +342,6 @@ namespace TSQR { const bool contiguousCacheBlocks) const = 0; public: - /// \brief Return view of topmost cache block of C. /// /// \param C [in] View of a matrix C. @@ -498,7 +496,7 @@ namespace TSQR { // factor is full rank (expected to be the common case), we need // to leave it alone (so that it stays upper triangular). 
// - Teuchos::LAPACK lapack; + Impl::Lapack lapack; mat_view_type R_view (ncols, ncols, R, ldr); Matrix B (R_view); // B := R (deep copy) mat_view_type U_view (ncols, ncols, U, ldu); @@ -508,30 +506,23 @@ namespace TSQR { std::vector svd_rwork (5*ncols); std::vector singular_values (ncols); Ordinal svd_lwork = -1; // -1 for LWORK query; will be changed - int svd_info = 0; // LAPACK workspace ("LWORK") query for SVD. The workspace // ("WORK") array is always of Scalar type, even in the complex // case. { // Exception messages in this scope all start with this. - const char prefix[] = "In NodeTsqr::reveal_R_rank: LAPACK SVD (_GESVD) " - "workspace query returned "; + const char prefix[] = "In NodeTsqr::reveal_R_rank: LAPACK SVD " + "(_GESVD) workspace query returned "; // std::logic_error messages in this scope all end with this. const char postfix[] = ". Please report this bug to the Kokkos " "developers."; - Scalar svd_lwork_scalar = STS::zero (); + Scalar svd_lwork_scalar {}; lapack.GESVD ('A', 'A', ncols, ncols, B.get(), B.lda(), - &singular_values[0], U_view.get(), U_view.lda(), + singular_values.data(), U_view.get(), U_view.lda(), VT.get(), VT.lda(), &svd_lwork_scalar, svd_lwork, - &svd_rwork[0], &svd_info); - // Failure of the LAPACK workspace query is a logic error (a - // bug) because we have already validated the matrix - // dimensions above. - TEUCHOS_TEST_FOR_EXCEPTION(svd_info != 0, std::logic_error, - prefix << "a nonzero INFO = " << svd_info - << postfix); + svd_rwork.data()); // LAPACK returns the workspace array length as a Scalar. We // have to convert it back to an Ordinal in order to allocate // the workspace array and pass it in to LAPACK as the LWORK @@ -549,22 +540,20 @@ namespace TSQR { // original Scalar result. This should work unless Scalar and // Ordinal are user-defined types with weird definitions of // the type casts. 
- TEUCHOS_TEST_FOR_EXCEPTION(as (svd_lwork) != svd_lwork_scalar, - std::logic_error, - prefix << "a workspace array length (LWORK) of type " - "Scalar=" << TypeNameTraits::name() - << " that does not fit in an Ordinal=" - << TypeNameTraits::name() << " type. " - "As a Scalar, LWORK=" << svd_lwork_scalar - << ", but cast to Ordinal, LWORK=" << svd_lwork - << postfix); + TEUCHOS_TEST_FOR_EXCEPTION + (as (svd_lwork) != svd_lwork_scalar, std::logic_error, + prefix << "a workspace array length (LWORK) of type Scalar=" + << TypeNameTraits::name() << " that does not fit in " + << "Ordinal=" << TypeNameTraits::name() << " type." + " As a Scalar, LWORK=" << svd_lwork_scalar << ", but cast " + << "to Ordinal, LWORK=" << svd_lwork << postfix); // Make sure svd_lwork is nonnegative. (Ordinal must be a // signed type, as we explain above, so this test should never // signal any unsigned-to-signed conversions from the compiler. // If it does, you're probably using the wrong Ordinal type. - TEUCHOS_TEST_FOR_EXCEPTION(svd_lwork < 0, std::logic_error, - prefix << "a negative workspace array length (LWORK)" - " = " << svd_lwork << postfix); + TEUCHOS_TEST_FOR_EXCEPTION + (svd_lwork < 0, std::logic_error, prefix << "a negative " + "workspace array length (LWORK) = " << svd_lwork << postfix); } // Allocate workspace for LAPACK's SVD routine. std::vector svd_work (svd_lwork); @@ -573,9 +562,9 @@ namespace TSQR { // why we copied R into B (so that we don't overwrite R if R is // full rank). lapack.GESVD ('A', 'A', ncols, ncols, B.get(), B.lda(), - &singular_values[0], U_view.get(), U_view.lda(), - VT.get(), VT.lda(), &svd_work[0], svd_lwork, - &svd_rwork[0], &svd_info); + singular_values.data(), U_view.get(), U_view.lda(), + VT.get(), VT.lda(), svd_work.data(), svd_lwork, + svd_rwork.data()); // // Compute the numerical rank of B, using the given relative // tolerance and the computed singular values. 
GESVD computes @@ -636,10 +625,11 @@ namespace TSQR { const bool contiguousCacheBlocks) const { // Take the easy exit if available. - if (ncols == 0) + if (ncols == 0) { return 0; + } // Matrix to hold the left singular vectors of the R factor. - Matrix U (ncols, ncols, Scalar(0)); + Matrix U (ncols, ncols, Scalar {}); // Compute numerical rank of the R factor using the SVD. // Store the left singular vectors in U. const Ordinal rank = @@ -649,9 +639,10 @@ namespace TSQR { // already computed the SVD \f$R = U \Sigma V^*\f$ of (the // input) R, and overwrote R with \f$\Sigma V^*\f$. Now, we // compute \f$Q := Q \cdot U\f$, respecting cache blocks of Q. - if (rank < ncols) + if (rank < ncols) { Q_times_B (nrows, ncols, Q, ldq, U.get(), U.lda(), contiguousCacheBlocks); + } return rank; } diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp index e2ac67fc3719..edd294b4e85d 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp @@ -41,7 +41,7 @@ #define __TSQR_Random_MatrixGenerator_hpp #include "Tsqr_Matrix.hpp" -#include "Teuchos_LAPACK.hpp" +#include "Tsqr_Impl_Lapack.hpp" #include "Teuchos_ScalarTraits.hpp" #include #include @@ -96,19 +96,15 @@ namespace TSQR { this->fill_random (nrows, ncols, Q, ldq); // Get ready for QR factorization - Teuchos::LAPACK lapack; + Impl::Lapack lapack; std::vector tau (std::min(nrows, ncols)); // Workspace query Scalar _lwork1, _lwork2; - int info = 0; - lapack.GEQRF (nrows, ncols, Q, ldq, &tau[0], &_lwork1, -1, &info); - if (info != 0) - throw std::logic_error("LAPACK GEQRF LWORK query failed"); - - lapack.UNGQR (nrows, ncols, ncols, Q, ldq, &tau[0], &_lwork2, -1, &info); - if (info != 0) - throw std::logic_error("LAPACK UNGQR LWORK query failed"); + lapack.compute_QR (nrows, ncols, Q, ldq, tau.data(), &_lwork1, -1); + lapack.compute_explicit_Q (nrows, ncols, ncols, + Q, ldq, 
tau.data(), + &_lwork2, -1); // Allocate workspace. abs() returns a magnitude_type, and we // can compare those using std::max. If Scalar is complex, @@ -117,24 +113,15 @@ namespace TSQR { STS::magnitude (_lwork2))); std::vector work (lwork); - // Factor the input matrix - lapack.GEQRF (nrows, ncols, Q, ldq, &tau[0], &work[0], lwork, &info); - if (info != 0) { - std::ostringstream os; - os << "LAPACK GEQRF failed with INFO = " << info; - throw std::runtime_error (os.str ()); - } + lapack.compute_QR (nrows, ncols, Q, ldq, tau.data(), + work.data(), lwork); // Compute explicit Q factor in place - lapack.UNGQR (nrows, ncols, ncols, Q, ldq, &tau[0], &work[0], lwork, &info); - if (info != 0) { - std::ostringstream os; - os << "LAPACK UNGQR failed with INFO = " << info; - throw std::runtime_error (os.str ()); - } + lapack.compute_explicit_Q (nrows, ncols, ncols, + Q, ldq, tau.data(), + work.data(), lwork); } - /// Fill the nrows by ncols matrix Q (in column-major order, with /// leading dimension ldq >= nrows) with a random orthogonal /// matrix, stored implicitly. tau (of length min(nrows,ncols)) @@ -154,23 +141,18 @@ namespace TSQR { this->fill_random (nrows, ncols, Q, ldq); // Get ready for QR factorization - Teuchos::LAPACK lapack; + Impl::Lapack lapack; // Workspace query Scalar _lwork1; - int info = 0; - lapack.GEQRF (nrows, ncols, Q, ldq, tau, &_lwork1, -1, &info); - if (info != 0) - throw std::logic_error("LAPACK GEQRF LWORK query failed"); + lapack.compute_QR (nrows, ncols, Q, ldq, tau, &_lwork1, -1); // Allocate workspace. 
const Ordinal lwork = checkedCast (STS::magnitude (_lwork1)); - std::vector< Scalar > work (lwork); + std::vector work (lwork); - // Factor the input matrix - lapack.GEQRF (nrows, ncols, Q, ldq, tau, &work[0], lwork, &info); - if (info != 0) - throw std::runtime_error("LAPACK GEQRF failed"); + lapack.compute_QR (nrows, ncols, Q, ldq, tau, + work.data(), lwork); } template< class MatrixViewType > @@ -199,66 +181,54 @@ namespace TSQR { // Fill A with zeros, and then make its diagonal the given set // of singular values. mat_view_type A_view (nrows, ncols, A, lda); - A_view.fill (Scalar (0)); - for (Ordinal j = 0; j < ncols; ++j) - // Promote magnitude_type to Scalar here. + A_view.fill (Scalar {}); + for (Ordinal j = 0; j < ncols; ++j) { A_view(j,j) = Scalar (singular_values[j]); + } // Generate random orthogonal U (nrows by ncols) and V (ncols by // ncols). Keep them stored implicitly. - implicit_Q (U, &tau_U[0]); - implicit_Q (V, &tau_V[0]); + implicit_Q (U, tau_U.data()); + implicit_Q (V, tau_V.data()); // Workspace query for ORMQR. 
Scalar _lwork1, _lwork2; - int info = 0; - Teuchos::LAPACK< Ordinal, Scalar > lapack; - lapack.UNMQR ('L', 'N', nrows, ncols, ncols, U.get(), U.lda(), &tau_U[0], - A, lda, &_lwork1, -1, &info); - if (info != 0) { - std::ostringstream os; - os << "LAPACK ORMQR LWORK query failed with INFO = " << info - << ": called ORMQR(\"L\", \"N\", " << nrows << ", " << ncols - << ", " << ncols << ", NULL, " << U.lda() << ", NULL, NULL, " - << lda << ", WORK, -1, &INFO)"; - throw std::logic_error(os.str()); - } + Impl::Lapack lapack; + lapack.apply_Q_factor ('L', 'N', nrows, ncols, ncols, + U.get(), U.lda(), tau_U.data(), + A, lda, &_lwork1, -1); if (STS::isComplex) { - lapack.UNMQR ('R', 'C', nrows, ncols, ncols, V.get(), V.lda(), &tau_V[0], - A, lda, &_lwork2, -1, &info); + lapack.apply_Q_factor ('R', 'C', nrows, ncols, ncols, + V.get(), V.lda(), tau_V.data(), + A, lda, &_lwork2, -1); } else { - lapack.UNMQR ('R', 'T', nrows, ncols, ncols, V.get(), V.lda(), &tau_V[0], - A, lda, &_lwork2, -1, &info); - } - if (info != 0) { - throw std::logic_error("LAPACK ORMQR LWORK query failed"); + lapack.apply_Q_factor ('R', 'T', nrows, ncols, ncols, + V.get(), V.lda(), tau_V.data(), + A, lda, &_lwork2, -1); } // Allocate workspace. Ordinal lwork = checkedCast (std::max (STS::magnitude (_lwork1), STS::magnitude (_lwork2))); - std::vector< Scalar > work (lwork); + std::vector work (lwork); // Apply U to the left side of A, and V^H to the right side of A. 
- lapack.UNMQR ('L', 'N', nrows, ncols, ncols, U.get(), U.lda(), &tau_U[0], - A, lda, &work[0], lwork, &info); - if (info != 0) - throw std::runtime_error("LAPACK ORMQR failed (first time)"); + lapack.apply_Q_factor ('L', 'N', nrows, ncols, ncols, + U.get(), U.lda(), tau_U.data(), + A, lda, work.data(), lwork); if (STS::isComplex) { - lapack.UNMQR ('R', 'C', nrows, ncols, ncols, V.get(), V.lda(), &tau_V[0], - A, lda, &work[0], lwork, &info); + lapack.apply_Q_factor ('R', 'C', nrows, ncols, ncols, + V.get(), V.lda(), tau_V.data(), + A, lda, work.data(), lwork); } else { - lapack.UNMQR ('R', 'T', nrows, ncols, ncols, V.get(), V.lda(), &tau_V[0], - A, lda, &work[0], lwork, &info); - } - if (info != 0) { - throw std::runtime_error("LAPACK ORMQR failed (second time)"); + lapack.apply_Q_factor ('R', 'T', nrows, ncols, ncols, + V.get(), V.lda(), tau_V.data(), + A, lda, work.data(), lwork); } } - /// \brief Fill in a random upper triangular matrix /// /// Fill R with a random n by n upper triangular matrix, with @@ -285,43 +255,43 @@ namespace TSQR { fill_random_svd (n, n, R, ldr, singular_values); // Compute the QR factorization in place of R (which isn't upper triangular yet). - std::vector< Scalar > tau (n); + std::vector tau (n); // Workspace size query for QR factorization. Scalar _lwork1; - int info = 0; - Teuchos::LAPACK< Ordinal, Scalar > lapack; - lapack.GEQRF (n, n, R, ldr, &tau[0], &_lwork1, -1, &info); - if (info != 0) - throw std::logic_error("LAPACK GEQRF LWORK query failed"); + Impl::Lapack lapack; + lapack.compute_QR (n, n, R, ldr, tau.data(), &_lwork1, -1); // Allocate workspace Ordinal lwork = checkedCast (STS::magnitude (_lwork1)); - std::vector< Scalar > work (lwork); + std::vector work (lwork); // Compute QR factorization (implicit representation in place). 
- lapack.GEQRF (n, n, R, ldr, &tau[0], &work[0], lwork, &info); - if (info != 0) - throw std::runtime_error("LAPACK GEQRF failed"); + lapack.compute_QR (n, n, R, ldr, tau.data(), work.data(), lwork); // Zero out the stuff below the diagonal of R, leaving just the R factor. - for (Ordinal j = 0; j < n; ++j) - for (Ordinal i = j+1; i < n; ++i) - R[i + j*ldr] = Scalar(0); + for (Ordinal j = 0; j < n; ++j) { + for (Ordinal i = j+1; i < n; ++i) { + R[i + j*ldr] = Scalar {}; + } + } } private: static Ordinal checkedCast (const magnitude_type& x) { - if (x < std::numeric_limits< Ordinal >::min() || x > std::numeric_limits< Ordinal >::max()) + if (x < std::numeric_limits::min() || x > std::numeric_limits::max()) { throw std::range_error("Scalar input cannot be safely cast to an Ordinal"); + } else if (std::numeric_limits< magnitude_type >::is_signed && x < magnitude_type(0) && - ! std::numeric_limits< Ordinal >::is_signed) + ! std::numeric_limits::is_signed) { throw std::range_error("Scalar input is negative, but Ordinal is unsigned"); - else - return static_cast< Ordinal > (x); + } + else { + return static_cast (x); + } } Generator& gen_; diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_NormalGenerator.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_NormalGenerator.hpp index 1a862aa7d951..8e7dfb54fd8a 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_NormalGenerator.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_NormalGenerator.hpp @@ -40,7 +40,7 @@ #ifndef __TSQR_Random_NormalGenerator_hpp #define __TSQR_Random_NormalGenerator_hpp -#include "Teuchos_LAPACK.hpp" +#include "Tsqr_Impl_Lapack.hpp" #include #include @@ -54,11 +54,11 @@ namespace TSQR { /// routines. These are test routines and are not guaranteed to /// be in the LAPACK library. They will be if you build LAPACK /// from source. - template< class Ordinal, class Scalar > + template class NormalGenerator { private: //! Default buffer length. 
- static int defaultBufferLength() { return 100; } + static constexpr int defaultBufferLength() { return 100; } public: typedef Ordinal ordinal_type; @@ -144,7 +144,7 @@ namespace TSQR { void fill_buffer () { - Teuchos::LAPACK lapack; + Impl::Lapack lapack; // LAPACK's _LARNV routine defines this "enum" (just an // integer, because it's Fortran) that lets users choose from @@ -155,7 +155,7 @@ namespace TSQR { uniform_m1_1 = 2, normal_0_1 = 3 }; - lapack.LARNV (normal_0_1, &iseed_[0], buffer_length_, &buffer_[0]); + lapack.LARNV (normal_0_1, iseed_.data(), buffer_length_, buffer_.data()); } Scalar @@ -163,11 +163,10 @@ namespace TSQR { { // It's impossible to take the greater-than branch, but we // check for robustness' sake. - if (cur_pos_ >= buffer_length_) - { - fill_buffer (); - cur_pos_ = 0; - } + if (cur_pos_ >= buffer_length_) { + fill_buffer (); + cur_pos_ = 0; + } return buffer_[cur_pos_++]; } }; diff --git a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp b/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp index a2fa8436f5a2..f81a75573e9e 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp @@ -45,7 +45,7 @@ #include "Tsqr_Matrix.hpp" #include "Tsqr_SequentialTsqr.hpp" #include "Tsqr_Util.hpp" -#include "Teuchos_LAPACK.hpp" +#include "Tsqr_Impl_Lapack.hpp" #include "Teuchos_Time.hpp" #include #include // size_t definition @@ -63,42 +63,26 @@ namespace TSQR { template static Ordinal - lworkQueryLapackQr (Teuchos::LAPACK& lapack, + lworkQueryLapackQr (Impl::Lapack& lapack, const Ordinal nrows, const Ordinal ncols, const Ordinal lda) { - typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; using std::ostringstream; using std::endl; + using STS = Teuchos::ScalarTraits; + using mag_type = typename STS::magnitudeType; - Scalar d_lwork_geqrf = Scalar (0); - int INFO = 0; - lapack.GEQRF (nrows, ncols, NULL, lda, NULL, &d_lwork_geqrf, -1, &INFO); - if (INFO != 0) { - ostringstream os; - os << 
"LAPACK _GEQRF workspace size query failed: INFO = " << INFO; - // It's a logic error and not a runtime error, because the - // LWORK query should only fail if the input parameters have - // invalid (e.g., out of range) values. - throw std::logic_error (os.str ()); - } + Scalar d_lwork_geqrf {}; + lapack.compute_QR (nrows, ncols, nullptr, lda, nullptr, + &d_lwork_geqrf, -1); - Scalar d_lwork_orgqr = Scalar (0); + Scalar d_lwork_orgqr {}; // A workspace query appropriate for computing the explicit Q // factor (nrows x ncols) in place, from the QR factorization of // an nrows x ncols matrix with leading dimension lda. - lapack.UNGQR (nrows, ncols, ncols, NULL, lda, NULL, &d_lwork_orgqr, - -1, &INFO); - if (INFO != 0) { - ostringstream os; - os << "LAPACK _UNGQR workspace size query failed: INFO = " << INFO; - // It's a logic error and not a runtime error, because the - // LWORK query should only fail if the input parameters have - // invalid (e.g., out of range) values. - throw std::logic_error (os.str()); - } + lapack.compute_explicit_Q (nrows, ncols, ncols, nullptr, lda, + nullptr, &d_lwork_orgqr, -1); // LAPACK workspace queries do return their results as a // double-precision floating-point value, but LAPACK promises @@ -106,8 +90,8 @@ namespace TSQR { // check for valid casts to int below. I include the checks // just to be "bulletproof" and also to show how to do the // checks for later reference. 
- const magnitude_type lwork_geqrf_test = - static_cast< magnitude_type > (static_cast (STS::magnitude (d_lwork_geqrf))); + const mag_type lwork_geqrf_test = + static_cast (static_cast (STS::magnitude (d_lwork_geqrf))); if (lwork_geqrf_test != STS::magnitude (d_lwork_geqrf)) { ostringstream os; os << "LAPACK _GEQRF workspace query returned a result, " @@ -116,7 +100,7 @@ namespace TSQR { throw std::range_error (os.str ()); } const Scalar lwork_orgqr_test = - static_cast (static_cast (STS::magnitude ((d_lwork_orgqr)))); + static_cast (static_cast (STS::magnitude ((d_lwork_orgqr)))); if (lwork_orgqr_test != STS::magnitude (d_lwork_orgqr)) { ostringstream os; os << "LAPACK _UNGQR workspace query returned a result, " @@ -156,7 +140,7 @@ namespace TSQR { using std::string; using std::vector; - SequentialTsqr< Ordinal, Scalar > actor (cache_size_hint); + SequentialTsqr actor (cache_size_hint); Ordinal numCacheBlocks; if (b_debug) { @@ -168,10 +152,10 @@ namespace TSQR { } } - Matrix< Ordinal, Scalar > A (nrows, ncols); - Matrix< Ordinal, Scalar > A_copy (nrows, ncols); - Matrix< Ordinal, Scalar > Q (nrows, ncols); - Matrix< Ordinal, Scalar > R (ncols, ncols); + Matrix A (nrows, ncols); + Matrix A_copy (nrows, ncols); + Matrix Q (nrows, ncols); + Matrix R (ncols, ncols); if (std::numeric_limits::has_quiet_NaN) { A.fill (std::numeric_limits< Scalar>::quiet_NaN()); A_copy.fill (std::numeric_limits::quiet_NaN()); @@ -217,7 +201,7 @@ namespace TSQR { // Verify cache blocking, when in debug mode. 
if (b_debug) { - Matrix< Ordinal, Scalar > A2 (nrows, ncols); + Matrix A2 (nrows, ncols); if (std::numeric_limits::has_quiet_NaN) { A2.fill (std::numeric_limits::quiet_NaN ()); } @@ -250,7 +234,7 @@ namespace TSQR { } // Factor the matrix and compute the explicit Q factor - typedef typename SequentialTsqr< Ordinal, Scalar >::FactorOutput + typedef typename SequentialTsqr::FactorOutput factor_output_type; factor_output_type factorOutput = actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), @@ -444,7 +428,7 @@ namespace TSQR { template< class Ordinal, class Scalar > static void verifyLapackTemplate (std::ostream& out, - TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator, + TSQR::Random::NormalGenerator& generator, const std::string& datatype, const Ordinal nrows, const Ordinal ncols, @@ -460,18 +444,17 @@ namespace TSQR { using std::cerr; using std::endl; - // Initialize LAPACK. - Teuchos::LAPACK< Ordinal, Scalar > lapack; + Impl::Lapack lapack; if (b_debug) { cerr << "LAPACK test problem:" << endl << "* " << nrows << " x " << ncols << endl; } - Matrix< Ordinal, Scalar > A (nrows, ncols); - Matrix< Ordinal, Scalar > A_copy (nrows, ncols); - Matrix< Ordinal, Scalar > Q (nrows, ncols); - Matrix< Ordinal, Scalar > R (ncols, ncols); + Matrix A (nrows, ncols); + Matrix A_copy (nrows, ncols); + Matrix Q (nrows, ncols); + Matrix R (ncols, ncols); if (std::numeric_limits::has_quiet_NaN) { A.fill (std::numeric_limits< Scalar>::quiet_NaN()); A_copy.fill (std::numeric_limits::quiet_NaN()); @@ -503,18 +486,10 @@ namespace TSQR { // Fill R with zeros, since the factorization may not overwrite // the strict lower triangle of R. 
- R.fill (Scalar (0)); - - // Compute the QR factorization - int info = 0; // INFO is always an int - lapack.GEQRF (nrows, ncols, A_copy.get(), A_copy.lda(), - &tau[0], &work[0], lwork, &info); - if (info != 0) { - ostringstream os; - os << "LAPACK QR factorization (_GEQRF) failed: INFO = " << info; - throw std::runtime_error (os.str()); - } + R.fill (Scalar {}); + lapack.compute_QR (nrows, ncols, A_copy.get(), A_copy.lda(), + tau.data(), work.data(), lwork); // Copy out the R factor from A_copy (where we computed the QR // factorization in place) into R. copy_upper_triangle (ncols, ncols, R.get(), ldr, A_copy.get(), lda); @@ -529,17 +504,13 @@ namespace TSQR { // result of the factorization into Q. deep_copy (Q, A_copy); - // Compute the explicit Q factor - lapack.UNGQR (nrows, ncols, ncols, Q.get(), ldq, &tau[0], &work[0], lwork, &info); - if (info != 0) { - ostringstream os; - os << "LAPACK explicit Q computation (_UNGQR) failed: INFO = " << info; - throw std::runtime_error (os.str()); - } + lapack.compute_explicit_Q (nrows, ncols, ncols, Q.get(), ldq, + tau.data(), work.data(), lwork); // Validate the factorization - std::vector< magnitude_type > results = - local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, R.get(), ldr); + std::vector results = + local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, + R.get(), ldr); // Print the results if (human_readable) { @@ -692,9 +663,9 @@ namespace TSQR { const std::string& additionalData, const bool printFieldNames) { - Matrix< Ordinal, Scalar > A (numRows, numCols); - Matrix< Ordinal, Scalar > Q (numRows, numCols); - Matrix< Ordinal, Scalar > R (numCols, numCols); + Matrix A (numRows, numCols); + Matrix Q (numRows, numCols); + Matrix R (numCols, numCols); const Ordinal lda = numRows; const Ordinal ldq = numRows; const Ordinal ldr = numCols; @@ -720,28 +691,17 @@ namespace TSQR { TimerType timer("LAPACK"); timer.start(); for (int trialNum = 0; trialNum < numTrials; ++trialNum) { - // Compute the QR factorization 
- int info = 0; // INFO is always an int - lapack_.GEQRF (numRows, numCols, Q.get(), ldq, &tau[0], &work[0], lwork, &info); - if (info != 0) { - std::ostringstream os; - os << "LAPACK QR factorization (_GEQRF) failed: INFO = " << info; - throw std::runtime_error (os.str()); - } - + lapack_.compute_QR (numRows, numCols, + Q.get(), ldq, tau.data(), + work.data(), lwork); // Extract the upper triangular factor R from Q (where it // was computed in place by GEQRF), since UNGQR will // overwrite all of Q with the explicit Q factor. - copy_upper_triangle (numRows, numCols, R.get(), ldr, Q.get(), ldq); - - // Compute the explicit Q factor - lapack_.UNGQR (numRows, numCols, numCols, Q.get(), ldq, - &tau[0], &work[0], lwork, &info); - if (info != 0) { - std::ostringstream os; - os << "LAPACK explicit Q computation (_UNGQR) failed: INFO = " << info; - throw std::runtime_error (os.str ()); - } + copy_upper_triangle (numRows, numCols, R.get(), ldr, + Q.get(), ldq); + lapack_.compute_explicit_Q (numRows, numCols, numCols, + Q.get(), ldq, tau.data(), + work.data(), lwork); } const double lapackTiming = timer.stop(); reportResults (numTrials, numRows, numCols, lapackTiming, @@ -751,7 +711,7 @@ namespace TSQR { private: //! Wrapper around LAPACK routines. - Teuchos::LAPACK< Ordinal, Scalar > lapack_; + Impl::Lapack lapack_; /// \brief Pseudorandom normal(0,1) generator. 
/// diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp index ebd23b6b2850..c98c2a34ec03 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp @@ -45,7 +45,7 @@ #include "Tsqr_CacheBlocker.hpp" #include "Tsqr_Util.hpp" #include "Teuchos_BLAS.hpp" -#include "Teuchos_LAPACK.hpp" +#include "Tsqr_Impl_Lapack.hpp" #include #include #include @@ -67,7 +67,6 @@ namespace TSQR { typedef MatView< LocalOrdinal, Scalar > mat_view_type; typedef ConstMatView< LocalOrdinal, Scalar > const_mat_view_type; typedef Teuchos::BLAS blas_type; - typedef Teuchos::LAPACK lapack_type; public: typedef Scalar scalar_type; @@ -122,7 +121,7 @@ namespace TSQR { using Teuchos::NO_TRANS; CacheBlocker blocker (nrows, ncols, strategy_); blas_type blas; - lapack_type lapack; + Impl::Lapack lapack; std::vector work (ncols); Matrix ATA (ncols, ncols, Scalar(0)); @@ -168,16 +167,12 @@ namespace TSQR { } // Compute the Cholesky factorization of ATA in place, so that - // A^T * A = R^T * R, where R is ncols by ncols upper - // triangular. - int info = 0; - lapack.POTRF ('U', ncols, ATA.get(), ATA.lda(), &info); - // FIXME (mfh 22 June 2010) The right thing to do here would be - // to resort to a rank-revealing factorization, as Stathopoulos - // and Wu (2002) do with their CholeskyQR + symmetric - // eigensolver factorization. - if (info != 0) - throw std::runtime_error("Cholesky factorization failed"); + // A^T * A = R^T * R, where R is ncols x ncols upper triangular. + lapack.POTRF ('U', ncols, ATA.get(), ATA.lda()); + // FIXME (mfh 22 June 2010, mfh 21 Nov 2019) The right thing to + // do on failure of above would be to resort to a rank-revealing + // factorization, as Stathopoulos and Wu (2002) do with their + // CholeskyQR + symmetric eigensolver factorization. 
// Copy out the R factor fill_matrix (ncols, ncols, R, ldr, Scalar(0)); diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp index 707d51641707..de6fd7e84737 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp @@ -159,7 +159,7 @@ namespace TSQR { { const LocalOrdinal ncols = A_top.ncols(); combine.factor_first (A_top.nrows(), ncols, A_top.get(), A_top.lda(), - &tau[0], &work[0]); + tau.data(), work.data()); return mat_view_type(ncols, ncols, A_top.get(), A_top.lda()); } @@ -178,7 +178,7 @@ namespace TSQR { const LocalOrdinal nrowsLocal = Q_first.nrows(); combine.apply_first (applyType, nrowsLocal, C_first.ncols(), Q_first.ncols(), Q_first.get(), Q_first.lda(), - &tau[0], C_first.get(), C_first.lda(), &work[0]); + tau.data(), C_first.get(), C_first.lda(), work.data()); } void @@ -196,9 +196,9 @@ namespace TSQR { combine.apply_inner (apply_type, nrows_local, ncols_C, ncols_Q, - Q_cur.get(), C_cur.lda(), &tau[0], + Q_cur.get(), C_cur.lda(), tau.data(), C_top.get(), C_top.lda(), - C_cur.get(), C_cur.lda(), &work[0]); + C_cur.get(), C_cur.lda(), work.data()); } void @@ -212,8 +212,8 @@ namespace TSQR { const LocalOrdinal ncols = A_cur.ncols(); combine.factor_inner (nrows_local, ncols, R.get(), R.lda(), - A_cur.get(), A_cur.lda(), &tau[0], - &work[0]); + A_cur.get(), A_cur.lda(), tau.data(), + work.data()); } public: @@ -624,7 +624,6 @@ namespace TSQR { // free to choose the cache block dimensions as we wish in // apply(), independently of what we did in factor(). CacheBlocker blocker (nrows, ncols_Q, strategy_); - Teuchos::LAPACK lapack; Combine combine; const bool transposed = apply_type.transposed(); From 6d5a9bfd06bae18490a2f26932c45978b97b36e2 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 22 Nov 2019 12:04:14 -0700 Subject: [PATCH 07/50] TSQR: Hide Teuchos::BLAS behind opaque interface The goal is to decouple TSQR from Teuchos::BLAS. 
There's nothing wrong with Teuchos::BLAS, but we want options to call other TPLs besides the system BLAS. --- packages/tpetra/tsqr/src/CMakeLists.txt | 2 +- .../tpetra/tsqr/src/Tsqr_GlobalVerify.hpp | 4 +- .../tpetra/tsqr/src/Tsqr_Impl_RawBlas.hpp | 47 ++++ .../tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp | 96 ++++++++ .../tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp | 65 ++++++ .../tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp | 8 +- packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp | 67 ++---- .../tsqr/src/Tsqr_Random_GlobalMatrix.hpp | 205 ++++++++---------- .../tsqr/src/Tsqr_SequentialCholeskyQR.hpp | 26 ++- .../tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp | 4 +- 10 files changed, 350 insertions(+), 174 deletions(-) create mode 100644 packages/tpetra/tsqr/src/Tsqr_Impl_RawBlas.hpp create mode 100644 packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp create mode 100644 packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp diff --git a/packages/tpetra/tsqr/src/CMakeLists.txt b/packages/tpetra/tsqr/src/CMakeLists.txt index 9295e27c1691..91cca32b7ec1 100644 --- a/packages/tpetra/tsqr/src/CMakeLists.txt +++ b/packages/tpetra/tsqr/src/CMakeLists.txt @@ -37,5 +37,5 @@ TRIBITS_ADD_LIBRARY( # / from this directory, or to / from the 'impl' subdirectory. That ensures # that running "make" will also rerun CMake in order to regenerate Makefiles. # -# Here is another such change, and here is another. +# Here is another such change, and here is another. Another! 
# diff --git a/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp b/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp index 636e73b19708..f666e6ce70f8 100644 --- a/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp @@ -43,7 +43,7 @@ #include "Tsqr_LocalVerify.hpp" #include "Tsqr_MessengerBase.hpp" #include "Tsqr_Util.hpp" -#include "Teuchos_BLAS.hpp" +#include "Tsqr_Impl_SystemBlas.hpp" #include "Teuchos_ScalarTraits.hpp" #include // std::pair #include @@ -184,7 +184,7 @@ namespace TSQR { const magnitude_type ZERO {}; const magnitude_type ONE (1.0); - Teuchos::BLAS blas; + Impl::SystemBlas blas; // // Compute $\| I - Q^T * Q \|_F$ diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_RawBlas.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_RawBlas.hpp new file mode 100644 index 000000000000..5c4a2ddb6f59 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_RawBlas.hpp @@ -0,0 +1,47 @@ +#ifndef TSQR_IMPL_RAWBLAS_HPP +#define TSQR_IMPL_RAWBLAS_HPP + +namespace TSQR { +namespace Impl { + +/// \brief "Raw" local BLAS interface. +/// +/// Subclass and specialize this interface as needed. +/// +/// The methods are instance methods so that subclass instances may +/// have state. For example, a cuBLAS implementation would have a +/// CUDA stream instance (cudaStream_t) and some kind of handle. +/// +/// Methods are virtual because they are meant to be called from host, +/// even if they run on device with pointers to device data. +template +class RawBlas { +public: + using value_type = Scalar; + + virtual ~RawBlas() = default; + + //! Corresponds to BLAS _GEMM. + virtual void + matrix_matrix_product(const char transa, const char transb, + const int m, const int n, const int k, + const value_type& alpha, + const value_type A[], const int lda, + const value_type B[], const int ldb, + const value_type& beta, + value_type C[], const int ldc) const = 0; + + //! Corresponds to BLAS _TRSM. 
+ virtual void + triangular_matrix_matrix_solve(const char side, const char uplo, + const char transa, const char diag, + const int m, const int n, + const value_type& alpha, + const value_type A[], const int lda, + value_type B[], const int ldb) const = 0; +}; + +} // namespace Impl +} // namespace TSQR + +#endif // TSQR_IMPL_RAWBLAS_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp new file mode 100644 index 000000000000..c93b6e53c219 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp @@ -0,0 +1,96 @@ +#include "Tsqr_Impl_SystemBlas.hpp" +#include "Teuchos_BLAS.hpp" + +namespace TSQR { +namespace Impl { + +#define TSQR_IMPL_SYSTEMBLAS_IMPL( Scalar ) \ +void SystemBlas:: \ +matrix_matrix_product(const char transa, const char transb, \ + const int m, const int n, const int k, \ + const value_type& alpha, \ + const value_type A[], const int lda, \ + const value_type B[], const int ldb, \ + const value_type& beta, \ + value_type C[], const int ldc) const \ +{ \ + const Teuchos::ETransp transa_enum = \ + (transa == 'C' || transa == 'c') ? \ + Teuchos::CONJ_TRANS : \ + ((transa == 'T' || transa == 't') ? \ + Teuchos::TRANS : \ + Teuchos::NO_TRANS); \ + const Teuchos::ETransp transb_enum = \ + (transb == 'C' || transb == 'c') ? \ + Teuchos::CONJ_TRANS : \ + ((transb == 'T' || transb == 't') ? 
\ + Teuchos::TRANS : \ + Teuchos::NO_TRANS); \ + GEMM(transa_enum, transb_enum, m, n, k, \ + alpha, A, lda, B, ldb, beta, C, ldc); \ +} \ + \ +void SystemBlas:: \ +GEMM(const Teuchos::ETransp transa, const Teuchos::ETransp transb, \ + const int m, const int n, const int k, \ + const value_type& alpha, const value_type A[], const int lda, \ + const value_type B[], const int ldb, \ + const value_type& beta, value_type C[], const int ldc) const \ +{ \ + Teuchos::BLAS blas; \ + blas.GEMM(transa, transb, m, n, k, \ + alpha, A, lda, B, ldb, beta, C, ldc); \ +} \ + \ +void SystemBlas:: \ +triangular_matrix_matrix_solve(const char side, const char uplo, \ + const char transa, const char diag, \ + const int m, const int n, \ + const value_type& alpha, \ + const value_type A[], const int lda, \ + value_type B[], const int ldb) const \ +{ \ + const Teuchos::ESide side_enum = \ + (side == 'L' || side == 'l') ? \ + Teuchos::LEFT_SIDE : \ + Teuchos::RIGHT_SIDE; \ + const Teuchos::EUplo uplo_enum = \ + (uplo == 'U' || uplo == 'u') ? \ + Teuchos::UPPER_TRI : \ + ((uplo == 'L' || uplo == 'l') ? \ + Teuchos::LOWER_TRI : \ + Teuchos::UNDEF_TRI); \ + const Teuchos::ETransp transa_enum = \ + (transa == 'C' || transa == 'c') ? \ + Teuchos::CONJ_TRANS : \ + ((transa == 'T' || transa == 't') ? \ + Teuchos::TRANS : \ + Teuchos::NO_TRANS); \ + const Teuchos::EDiag diag_enum = \ + (diag == 'U' || diag == 'u') ? 
\ + Teuchos::UNIT_DIAG : \ + Teuchos::NON_UNIT_DIAG; \ + TRSM(side_enum, uplo_enum, transa_enum, diag_enum, \ + m, n, alpha, A, lda, B, ldb); \ +} \ + \ +void SystemBlas:: \ +TRSM(const Teuchos::ESide side, const Teuchos::EUplo uplo, \ + const Teuchos::ETransp transa, const Teuchos::EDiag diag, \ + const int m, const int n, \ + const value_type& alpha, \ + const value_type A[], const int lda, \ + value_type B[], const int ldb) const \ +{ \ + Teuchos::BLAS blas; \ + blas.TRSM(side, uplo, transa, diag, \ + m, n, alpha, A, lda, B, ldb); \ +} + +TSQR_IMPL_SYSTEMBLAS_IMPL( float ) +TSQR_IMPL_SYSTEMBLAS_IMPL( double ) +TSQR_IMPL_SYSTEMBLAS_IMPL( std::complex ) +TSQR_IMPL_SYSTEMBLAS_IMPL( std::complex ) + +} // namespace Impl +} // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp new file mode 100644 index 000000000000..b4156adf9e79 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp @@ -0,0 +1,65 @@ +#ifndef TSQR_IMPL_SYSTEMBLAS_HPP +#define TSQR_IMPL_SYSTEMBLAS_HPP + +#include "Tsqr_Impl_RawBlas.hpp" +#include "Teuchos_BLAS_types.hpp" +#include + +namespace TSQR { +namespace Impl { + +template +class SystemBlas {}; + +#define TSQR_IMPL_SYSTEMBLAS_DECL( Scalar ) \ +template<> \ +class SystemBlas : public RawBlas { \ +public: \ + using value_type = Scalar; \ + \ + ~SystemBlas() = default; \ + \ + void \ + matrix_matrix_product(const char transa, const char transb, \ + const int m, const int n, const int k, \ + const value_type& alpha, \ + const value_type A[], const int lda, \ + const value_type B[], const int ldb, \ + const value_type& beta, \ + value_type C[], const int ldc) const override; \ + \ + void \ + GEMM(const Teuchos::ETransp transa, const Teuchos::ETransp transb, \ + const int m, const int n, const int k, \ + const value_type& alpha, \ + const value_type A[], const int lda, \ + const value_type B[], const int ldb, \ + const value_type& beta, \ + value_type C[], 
const int ldc) const; \ + \ + void \ + triangular_matrix_matrix_solve(const char side, const char uplo, \ + const char transa, const char diag, \ + const int m, const int n, \ + const value_type& alpha, \ + const value_type A[], const int lda, \ + value_type B[], const int ldb) const override; \ + \ + void \ + TRSM(const Teuchos::ESide side, const Teuchos::EUplo uplo, \ + const Teuchos::ETransp transa, const Teuchos::EDiag diag, \ + const int m, const int n, \ + const value_type& alpha, \ + const value_type A[], const int lda, \ + value_type B[], const int ldb) const; \ +}; + +TSQR_IMPL_SYSTEMBLAS_DECL( float ) +TSQR_IMPL_SYSTEMBLAS_DECL( double ) +TSQR_IMPL_SYSTEMBLAS_DECL( std::complex ) +TSQR_IMPL_SYSTEMBLAS_DECL( std::complex ) + +} // namespace Impl +} // namespace TSQR + +#endif // TSQR_IMPL_SYSTEMBLAS_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp index c16904f1c99c..2e5082b2ed93 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp @@ -46,6 +46,7 @@ #include "Tsqr_CacheBlocker.hpp" #include "Tsqr_Combine.hpp" #include "Tsqr_NodeTsqr.hpp" +#include "Tsqr_Impl_SystemBlas.hpp" #include "Teuchos_ParameterListAcceptorDefaultBase.hpp" #include "Kokkos_Core.hpp" @@ -827,8 +828,11 @@ namespace TSQR { int numPartitions_; bool contiguousCacheBlocks_; + // This uses SystemBlas for now. + // In the future, we may want to use a TPL. + // That means we could switch to RawBlas. void - multBlock (Teuchos::BLAS& blas, + multBlock (Impl::SystemBlas& blas, const mat_view_type& Q_cur, Matrix& Q_temp) const { @@ -861,7 +865,7 @@ namespace TSQR { // routine (which forbids aliasing of any input argument and // the output argument). 
Matrix Q_temp; - Teuchos::BLAS blas; + Impl::SystemBlas blas; while (iter != end) { mat_view_type Q_cur = *iter; multBlock (blas, Q_cur, Q_temp); diff --git a/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp b/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp index 8e04d6ed85ba..93e0cdeff89d 100644 --- a/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp @@ -41,26 +41,22 @@ #define __TSQR_Tsqr_LocalVerify_hpp #include "Tsqr_Util.hpp" -#include "Teuchos_BLAS.hpp" +#include "Tsqr_Impl_SystemBlas.hpp" #include #include #include // std::pair, std::make_pair #include -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - namespace TSQR { - - template< class Ordinal, class Scalar > + template typename Teuchos::ScalarTraits::magnitudeType local_frobenius_norm (const Ordinal nrows_local, const Ordinal ncols, - const Scalar A_local[], + const Scalar A_local[], const Ordinal lda_local) { - typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; + using STS = Teuchos::ScalarTraits; + using magnitude_type = typename STS::magnitudeType; // FIXME (mfh 22 Apr 2010) This function does no scaling of // intermediate quantities, so it might overflow unnecessarily. 
@@ -68,7 +64,7 @@ namespace TSQR { for (Ordinal j = 0; j < ncols; ++j) { const Scalar* const cur_col = &A_local[j*lda_local]; for (Ordinal i = 0; i < nrows_local; ++i) { - const magnitude_type abs_xi = STS::magnitude (cur_col[i]); + const auto abs_xi = STS::magnitude (cur_col[i]); result = result + abs_xi * abs_xi; } } @@ -131,10 +127,10 @@ namespace TSQR { const Ordinal ldq) { typedef Teuchos::ScalarTraits STS; - const Scalar ZERO (0); - const Scalar ONE (1); + const Scalar ZERO {}; + const Scalar ONE (1.0); - Teuchos::BLAS blas; + Impl::SystemBlas blas; std::vector AbsOrthog (ncols * ncols, std::numeric_limits::quiet_NaN()); const Ordinal AbsOrthog_stride = ncols; @@ -170,11 +166,11 @@ namespace TSQR { { typedef Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType magnitude_type; - const Scalar ZERO (0); - const Scalar ONE (1); + const Scalar ZERO {}; + const Scalar ONE (1.0); const bool relative = false; // whether to scale $\|I-Q^T*Q\|_F$ by $\|A\|_F$ - Teuchos::BLAS blas; + Impl::SystemBlas blas; std::vector AbsOrthog (ncols * ncols, std::numeric_limits::quiet_NaN()); const Ordinal AbsOrthog_stride = ncols; @@ -221,7 +217,7 @@ namespace TSQR { std::vector AbsResid (nrows * ncols, std::numeric_limits::quiet_NaN ()); const Ordinal AbsResid_stride = nrows; - Teuchos::BLAS blas; + Impl::SystemBlas blas; const magnitude_type ONE (1); // A_copy := A_copy - Q * R @@ -251,41 +247,18 @@ namespace TSQR { std::vector AbsResid (nrows * ncols, std::numeric_limits::quiet_NaN ()); const Ordinal AbsResid_stride = nrows; - Teuchos::BLAS blas; - const magnitude_type ONE (1); - - // if (b_debug) - // cerr << "relative_residual:" << endl; - // if (matrix_contains_nan (nrows, ncols, A, lda)) - // cerr << "relative_residual: matrix A contains a NaN" << endl; - // if (matrix_contains_nan (nrows, ncols, Q, ldq)) - // cerr << "relative_residual: matrix Q contains a NaN" << endl; - // if (matrix_contains_nan (ncols, ncols, R, ldr)) - // cerr << "relative_residual: matrix R 
contains a NaN" << endl; + Impl::SystemBlas blas; + const magnitude_type ONE (1.0); // A_copy := A_copy - Q * R - copy_matrix (nrows, ncols, &AbsResid[0], AbsResid_stride, A, lda); - - // if (NaN_in_matrix (nrows, ncols, AbsResid, AbsResid_stride)) - // cerr << "relative_residual: matrix AbsResid := A contains a NaN" << endl; - - blas.GEMM (NO_TRANS, NO_TRANS, nrows, ncols, ncols, -ONE, Q, ldq, R, ldr, - ONE, &AbsResid[0], AbsResid_stride); - - // if (NaN_in_matrix (nrows, ncols, AbsResid, AbsResid_stride)) - // cerr << "relative_residual: matrix AbsResid := A - Q*R contains a NaN" << endl; + copy_matrix (nrows, ncols, AbsResid.data(), + AbsResid_stride, A, lda); + blas.GEMM (NO_TRANS, NO_TRANS, nrows, ncols, ncols, + -ONE, Q, ldq, R, ldr, + ONE, AbsResid.data(), AbsResid_stride); const magnitude_type absolute_residual = local_frobenius_norm (nrows, ncols, &AbsResid[0], AbsResid_stride); - - // if (b_debug) - // { - // cerr << "In relative_residual:" << endl; - // cerr << "||Q||_2 = " << matrix_2norm(nrows, ncols, Q, ldq) << endl; - // cerr << "||R||_2 = " << matrix_2norm(ncols, ncols, R, ldr) << endl; - // cerr << "||A - QR||_2 = " << absolute_residual << endl; - // } - return absolute_residual / A_norm_F; } diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp index 2e1bb2a9198e..44e6a243954d 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp @@ -43,14 +43,12 @@ #include "Tsqr_Matrix.hpp" #include "Tsqr_Random_MatrixGenerator.hpp" #include "Tsqr_RMessenger.hpp" -#include "Teuchos_BLAS.hpp" +#include "Tsqr_Impl_SystemBlas.hpp" #include "Teuchos_ScalarTraits.hpp" -#include #include #include #include #include -#include namespace TSQR { namespace Random { @@ -93,17 +91,14 @@ namespace TSQR { MessengerBase< typename MatrixViewType::scalar_type >* const scalarMessenger) { using Teuchos::NO_TRANS; - using std::vector; - typedef 
typename MatrixViewType::ordinal_type ordinal_type; - typedef typename MatrixViewType::scalar_type scalar_type; - - - const bool b_local_debug = false; + using ordinal_type = typename MatrixViewType::ordinal_type; + using scalar_type = typename MatrixViewType::scalar_type; + using STS = Teuchos::ScalarTraits; const int rootProc = 0; const int nprocs = ordinalMessenger->size(); const int myRank = ordinalMessenger->rank(); - Teuchos::BLAS blas; + Impl::SystemBlas blas; const ordinal_type nrowsLocal = A_local.nrows(); const ordinal_type ncols = A_local.ncols(); @@ -117,107 +112,99 @@ namespace TSQR { // // \sum_{q = 0}^{P-1} (Q_q^T * Q_q) / P = I. - if (myRank == rootProc) - { - typedef Random::MatrixGenerator< ordinal_type, scalar_type, Generator > matgen_type; - matgen_type matGen (*pGenerator); - - // Generate a random ncols by ncols upper triangular matrix - // R with the given singular values. - Matrix< ordinal_type, scalar_type > R (ncols, ncols, scalar_type(0)); - matGen.fill_random_R (ncols, R.get(), R.lda(), singular_values); - - // Broadcast R to all the processors. - scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc); - - // Generate (for myself) a random nrowsLocal x ncols - // orthogonal matrix, stored in explicit form. - Matrix< ordinal_type, scalar_type > Q_local (nrowsLocal, ncols); - matGen.explicit_Q (nrowsLocal, ncols, Q_local.get(), Q_local.lda()); - - // Scale the (local) orthogonal matrix by the number of - // processors P, to make the columns of the global matrix Q - // orthogonal. (Otherwise the norm of each column will be P - // instead of 1.) - const scalar_type P = static_cast< scalar_type > (nprocs); - // Do overflow check. If casting P back to scalar_type - // doesn't produce the same value as nprocs, the cast - // overflowed. We take the real part, because scalar_type - // might be complex. 
- if (nprocs != static_cast (Teuchos::ScalarTraits::real (P))) - throw std::runtime_error ("Casting nprocs to Scalar failed"); - - scaleMatrix (Q_local, P); - - // A_local := Q_local * R - blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols, - scalar_type(1), Q_local.get(), Q_local.lda(), - R.get(), R.lda(), - scalar_type(0), A_local.get(), A_local.lda()); - - for (int recvProc = 1; recvProc < nprocs; ++recvProc) - { - // Ask the receiving processor how big (i.e., how many rows) - // its local component of the matrix is. - ordinal_type nrowsRemote = 0; - ordinalMessenger->recv (&nrowsRemote, 1, recvProc, 0); - - if (b_local_debug) - { - std::ostringstream os; - os << "For Proc " << recvProc << ": local block is " - << nrowsRemote << " by " << ncols << std::endl; - std::cerr << os.str(); - } - - // Make sure Q_local is big enough to hold the data for - // the current receiver proc. - Q_local.reshape (nrowsRemote, ncols); - - // Compute a random nrowsRemote * ncols orthogonal - // matrix Q_local, for the current receiving processor. - matGen.explicit_Q (nrowsRemote, ncols, Q_local.get(), Q_local.lda()); - - // Send Q_local to the current receiving processor. - scalarMessenger->send (Q_local.get(), nrowsRemote*ncols, recvProc, 0); - } + if (myRank == rootProc) { + using matgen_type = Random::MatrixGenerator; + matgen_type matGen (*pGenerator); + + // Generate a random ncols by ncols upper triangular matrix R + // with the given singular values. + Matrix R (ncols, ncols, scalar_type {}); + matGen.fill_random_R (ncols, R.get(), R.lda(), singular_values); + + // Broadcast R to all the processors. + scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc); + + // Generate (for myself) a random nrowsLocal x ncols + // orthogonal matrix, stored in explicit form. 
+ Matrix Q_local (nrowsLocal, ncols); + matGen.explicit_Q (nrowsLocal, ncols, Q_local.get(), Q_local.lda()); + + // Scale the (local) orthogonal matrix by the number of + // processors P, to make the columns of the global matrix Q + // orthogonal. (Otherwise the norm of each column will be P + // instead of 1.) + const scalar_type P (static_cast (nprocs)); + // Do overflow check. If casting P back to scalar_type + // doesn't produce the same value as nprocs, the cast + // overflowed. We take the real part, because scalar_type + // might be complex. + if (nprocs != static_cast (STS::real (P))) { + throw std::runtime_error ("Casting nprocs to Scalar failed"); + } + + scaleMatrix (Q_local, P); + + // A_local := Q_local * R + blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols, + scalar_type(1), Q_local.get(), Q_local.lda(), + R.get(), R.lda(), + scalar_type(0), A_local.get(), A_local.lda()); + + for (int recvProc = 1; recvProc < nprocs; ++recvProc) { + // Ask the receiving processor how big (i.e., how many rows) + // its local component of the matrix is. + ordinal_type nrowsRemote = 0; + ordinalMessenger->recv (&nrowsRemote, 1, recvProc, 0); + + // Make sure Q_local is big enough to hold the data for + // the current receiver proc. + Q_local.reshape (nrowsRemote, ncols); + + // Compute a random nrowsRemote * ncols orthogonal + // matrix Q_local, for the current receiving processor. + matGen.explicit_Q (nrowsRemote, ncols, Q_local.get(), Q_local.lda()); + + // Send Q_local to the current receiving processor. + scalarMessenger->send (Q_local.get(), nrowsRemote*ncols, recvProc, 0); } - else - { - // Receive the R factor from Proc 0. There's only 1 R - // factor for all the processes. - Matrix< ordinal_type, scalar_type > R (ncols, ncols, scalar_type (0)); - scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc); - - // Q_local (nrows_local by ncols, random orthogonal matrix) - // will be received from Proc 0, where it was generated. 
- const ordinal_type recvSize = nrowsLocal * ncols; - Matrix< ordinal_type, scalar_type > Q_local (nrowsLocal, ncols); - - // Tell Proc 0 how many rows there are in the random orthogonal - // matrix I want to receive from Proc 0. - ordinalMessenger->send (&nrowsLocal, 1, rootProc, 0); - - // Receive the orthogonal matrix from Proc 0. - scalarMessenger->recv (Q_local.get(), recvSize, rootProc, 0); - - // Scale the (local) orthogonal matrix by the number of - // processors, to make the global matrix Q orthogonal. - const scalar_type P = static_cast< scalar_type > (nprocs); - // Do overflow check. If casting P back to scalar_type - // doesn't produce the same value as nprocs, the cast - // overflowed. We take the real part, because scalar_type - // might be complex. - if (nprocs != static_cast (Teuchos::ScalarTraits::real (P))) - throw std::runtime_error ("Casting nprocs to Scalar failed"); - scaleMatrix (Q_local, P); - - // A_local := Q_local * R - blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols, - scalar_type(1), Q_local.get(), Q_local.lda(), - R.get(), R.lda(), - scalar_type(0), A_local.get(), A_local.lda()); + } + else { + // Receive the R factor from Proc 0. There's only 1 R + // factor for all the processes. + Matrix R (ncols, ncols, scalar_type {}); + scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc); + + // Q_local (nrows_local by ncols, random orthogonal matrix) + // will be received from Proc 0, where it was generated. + const ordinal_type recvSize = nrowsLocal * ncols; + Matrix Q_local (nrowsLocal, ncols); + + // Tell Proc 0 how many rows there are in the random orthogonal + // matrix I want to receive from Proc 0. + ordinalMessenger->send (&nrowsLocal, 1, rootProc, 0); + + // Receive the orthogonal matrix from Proc 0. + scalarMessenger->recv (Q_local.get(), recvSize, rootProc, 0); + + // Scale the (local) orthogonal matrix by the number of + // processors, to make the global matrix Q orthogonal. 
+ const scalar_type P (static_cast (nprocs)); + // Do overflow check. If casting P back to scalar_type + // doesn't produce the same value as nprocs, the cast + // overflowed. We take the real part, because scalar_type + // might be complex. + if (nprocs != static_cast (STS::real (P))) { + throw std::runtime_error ("Casting nprocs to Scalar failed"); } + scaleMatrix (Q_local, P); + + // A_local := Q_local * R + blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols, + scalar_type(1), Q_local.get(), Q_local.lda(), + R.get(), R.lda(), + scalar_type(0), A_local.get(), A_local.lda()); + } } } // namespace Random } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp index c98c2a34ec03..fea4c49094c4 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp @@ -64,13 +64,13 @@ namespace TSQR { template class SequentialCholeskyQR { private: - typedef MatView< LocalOrdinal, Scalar > mat_view_type; - typedef ConstMatView< LocalOrdinal, Scalar > const_mat_view_type; - typedef Teuchos::BLAS blas_type; + using mat_view_type = MatView; + using const_mat_view_type = ConstMatView; + using blas_type = Impl::SystemBlas; public: - typedef Scalar scalar_type; - typedef LocalOrdinal ordinal_type; + using scalar_type = Scalar; + using ordinal_type = LocalOrdinal; /// \typedef FactorOutput /// \brief Return value of \c factor(). @@ -124,7 +124,7 @@ namespace TSQR { Impl::Lapack lapack; std::vector work (ncols); - Matrix ATA (ncols, ncols, Scalar(0)); + Matrix ATA (ncols, ncols, Scalar {}); FactorOutput retval (0); if (contiguous_cache_blocks) @@ -175,7 +175,7 @@ namespace TSQR { // CholeskyQR + symmetric eigensolver factorization. 
// Copy out the R factor - fill_matrix (ncols, ncols, R, ldr, Scalar(0)); + fill_matrix (ncols, ncols, R, ldr, Scalar {}); copy_upper_triangle (ncols, ncols, R, ldr, ATA.get(), ATA.lda()); // Compute A := A * R^{-1}. We do this in place in A, using @@ -189,7 +189,8 @@ namespace TSQR { mat_view_type A_rest (nrows, ncols, A, lda); // This call modifies A_rest. - mat_view_type A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); + mat_view_type A_cur = + blocker.split_top_block (A_rest, contiguous_cache_blocks); // Compute A_cur / R (Matlab notation for A_cur * R^{-1}) in place. blas.TRSM (RIGHT_SIDE, UPPER_TRI, NO_TRANS, NON_UNIT_DIAG, @@ -227,12 +228,15 @@ namespace TSQR { const LocalOrdinal ncols = ncols_Q; if (contiguous_cache_blocks) { - CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); + CacheBlocker blocker (nrows, ncols, + strategy_); mat_view_type C_rest (nrows, ncols, C, ldc); const_mat_view_type Q_rest (nrows, ncols, Q, ldq); - mat_view_type C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks); - const_mat_view_type Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks); + mat_view_type C_cur = + blocker.split_top_block (C_rest, contiguous_cache_blocks); + const_mat_view_type Q_cur = + blocker.split_top_block (Q_rest, contiguous_cache_blocks); while (! 
C_rest.empty ()) { deep_copy (Q_cur, C_cur); diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp index de6fd7e84737..eefa5cb1b0cd 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp @@ -51,6 +51,7 @@ #include "Tsqr_LocalVerify.hpp" #include "Tsqr_NodeTsqr.hpp" #include "Tsqr_Util.hpp" +#include "Tsqr_Impl_SystemBlas.hpp" #include "Teuchos_Describable.hpp" #include "Teuchos_ParameterList.hpp" #include "Teuchos_ParameterListExceptions.hpp" @@ -746,8 +747,7 @@ namespace TSQR { // restructuring of this code would parallelize nicely using // OpenMP. CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); - - Teuchos::BLAS blas; + Impl::SystemBlas blas; mat_view_type Q_rest (nrows, ncols, Q, ldq); Matrix Q_cur_copy (LocalOrdinal(0), LocalOrdinal(0)); // will be resized From c35c8cbb297274764d8923549404b85295fcd800 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 22 Nov 2019 13:52:59 -0700 Subject: [PATCH 08/50] TSQR: Matrix & MatView now have .data() instead of .get() This should help with porting to use Kokkos::View (and mdspan and mdarray in the future). 
--- .../tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp | 10 +- .../tsqr/src/TbbTsqr_CacheBlockTask.hpp | 2 +- .../tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp | 2 +- .../tpetra/tsqr/src/TbbTsqr_FactorTask.hpp | 6 +- .../tsqr/src/TbbTsqr_FillWithZerosTask.hpp | 2 +- .../tpetra/tsqr/src/TbbTsqr_Partitioner.hpp | 8 +- .../tsqr/src/TbbTsqr_RevealRankTask.hpp | 4 +- .../tsqr/src/TbbTsqr_TbbParallelTsqr.hpp | 14 +- .../tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp | 34 ++--- .../tsqr/src/TbbTsqr_UnCacheBlockTask.hpp | 2 +- packages/tpetra/tsqr/src/Tsqr.hpp | 16 +-- packages/tpetra/tsqr/src/TsqrAdaptor.hpp | 20 +-- .../tpetra/tsqr/src/Tsqr_CacheBlocker.hpp | 12 +- .../tsqr/src/Tsqr_CombineBenchmarker.hpp | 124 +++++++++--------- .../tpetra/tsqr/src/Tsqr_CombineDefault.hpp | 12 +- packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 119 +++++++++-------- packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp | 6 +- packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp | 10 +- .../tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp | 32 ++--- .../tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp | 48 +++---- .../tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp | 58 ++++---- packages/tpetra/tsqr/src/Tsqr_MatView.hpp | 38 +++--- packages/tpetra/tsqr/src/Tsqr_Matrix.hpp | 18 +-- packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp | 14 +- packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp | 28 ++-- packages/tpetra/tsqr/src/Tsqr_ParTest.hpp | 80 ++++++----- packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp | 8 +- .../tsqr/src/Tsqr_Random_GlobalMatrix.hpp | 28 ++-- .../tsqr/src/Tsqr_Random_MatrixGenerator.hpp | 14 +- packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp | 64 ++++----- .../tsqr/src/Tsqr_SequentialCholeskyQR.hpp | 24 ++-- .../tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp | 26 ++-- packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp | 38 +++--- packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp | 34 ++--- .../tpetra/tsqr/src/Tsqr_generateStack.hpp | 13 +- .../tsqr/src/Tsqr_printGlobalMatrix.hpp | 8 +- 36 files changed, 486 insertions(+), 490 deletions(-) diff --git 
a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp index 62609a868499..1fc01b237d29 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp @@ -192,8 +192,8 @@ namespace TSQR { timer.start(); const std::vector& seq_outputs = factor_output_.first; seq_.apply (apply_type_, Q_.nrows(), Q_.ncols(), - Q_.get(), Q_.lda(), seq_outputs[P_first_], - C_.ncols(), C_.get(), C_.lda(), + Q_.data(), Q_.lda(), seq_outputs[P_first_], + C_.ncols(), C_.data(), C_.lda(), contiguous_cache_blocks_); my_seq_timing_ = timer.stop(); } @@ -213,9 +213,9 @@ namespace TSQR { const std::vector& tau = par_output[P_bot]; std::vector work (C_top.ncols()); combine_.apply_pair (apply_type_, C_top.ncols(), Q_bot.ncols(), - Q_bot.get(), Q_bot.lda(), &tau[0], - C_top.get(), C_top.lda(), - C_bot.get(), C_bot.lda(), &work[0]); + Q_bot.data(), Q_bot.lda(), tau.data(), + C_top.data(), C_top.lda(), + C_bot.data(), C_bot.lda(), work.data()); } }; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp index 602e5c275777..38e7aea13629 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp @@ -135,7 +135,7 @@ namespace TSQR { execute_base_case () { seq_.cache_block (A_out_.nrows(), A_out_.ncols(), - A_out_.get(), A_in_.get(), A_in_.lda()); + A_out_.data(), A_in_.data(), A_in_.lda()); } }; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp index 377e3c16495f..e204cb99daab 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp @@ -122,7 +122,7 @@ namespace TSQR { execute_base_case () { // Fill my partition with zeros. 
- seq_.fill_with_zeros (Q_out_.nrows(), Q_out_.ncols(), Q_out_.get(), + seq_.fill_with_zeros (Q_out_.nrows(), Q_out_.ncols(), Q_out_.data(), Q_out_.lda(), contiguous_cache_blocks_); // If our partition is the first (topmost), fill it with // the first Q_out.ncols() columns of the identity matrix. diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp index 876cfa876cec..0fc2d2df2e11 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp @@ -204,8 +204,8 @@ namespace TSQR { const LocalOrdinal ncols = A_top.ncols(); std::vector& tau = par_output_[P_bot]; std::vector work (ncols); - combine_.factor_pair (ncols, A_top.get(), A_top.lda(), - A_bot.get(), A_bot.lda(), &tau[0], &work[0]); + combine_.factor_pair (ncols, A_top.data(), A_top.lda(), + A_bot.data(), A_bot.lda(), &tau[0], &work[0]); } void @@ -214,7 +214,7 @@ namespace TSQR { TimerType timer(""); timer.start(); seq_outputs_[P_first_] = - seq_.factor (A_.nrows(), A_.ncols(), A_.get(), + seq_.factor (A_.nrows(), A_.ncols(), A_.data(), A_.lda(), contiguous_cache_blocks_); // Assign the topmost cache block of the current partition to // *A_top_ptr_. Every base case invocation does this, so that diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp index b8e2e21651cc..e6f040b9b7e8 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp @@ -124,7 +124,7 @@ namespace TSQR { execute_base_case () { // Fill my partition with zeros. 
- seq_.fill_with_zeros (C_.nrows(), C_.ncols(), C_.get(), + seq_.fill_with_zeros (C_.nrows(), C_.ncols(), C_.data(), C_.lda(), contiguous_cache_blocks_); } }; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp b/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp index a1aa38f7e4a9..a1087e0142cd 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp @@ -117,14 +117,14 @@ namespace TSQR { // is for splitting off a single cache block. Each half // of the split may contain more than one cache block. if (contiguous_cache_blocks) { - pointer_type A_bot_ptr = A.get() + num_rows_top * ncols; - MatrixViewType A_top (num_rows_top, ncols, A.get(), num_rows_top); + pointer_type A_bot_ptr = A.data() + num_rows_top * ncols; + MatrixViewType A_top (num_rows_top, ncols, A.data(), num_rows_top); MatrixViewType A_bot (num_rows_bot, ncols, A_bot_ptr, num_rows_bot); return std::make_pair (A_top, A_bot); } else { - pointer_type A_bot_ptr = A.get() + num_rows_top; - MatrixViewType A_top (num_rows_top, ncols, A.get(), A.lda()); + pointer_type A_bot_ptr = A.data() + num_rows_top; + MatrixViewType A_top (num_rows_top, ncols, A.data(), A.lda()); MatrixViewType A_bot (num_rows_bot, ncols, A_bot_ptr, A.lda()); return std::make_pair (A_top, A_bot); } diff --git a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp index 7cc815a330d9..33fa6071fb2b 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp @@ -85,8 +85,8 @@ namespace TSQR { // doesn't suggest any orthogonality of the B input matrix, // though in this case B is U and U is orthogonal // (resp. unitary if Scalar is complex). 
- seq_.Q_times_B (Q_.nrows(), Q_.ncols(), Q_.get(), Q_.lda(), - U_.get(), U_.lda(), contiguous_cache_blocks_); + seq_.Q_times_B (Q_.nrows(), Q_.ncols(), Q_.data(), Q_.lda(), + U_.data(), U_.lda(), contiguous_cache_blocks_); } tbb::task* execute () diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp index e67ef077ef80..26c7261dfe25 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp @@ -326,7 +326,7 @@ namespace TSQR { } // Copy the R factor out of A_top into R. - seq_.extract_R (A_top.nrows(), A_top.ncols(), A_top.get(), + seq_.extract_R (A_top.nrows(), A_top.ncols(), A_top.data(), A_top.lda(), R, ldr, contiguous_cache_blocks); // Save the timings for future reference @@ -512,7 +512,7 @@ namespace TSQR { Matrix U (ncols, ncols, Scalar(0)); const LocalOrdinal rank = - reveal_R_rank (ncols, R, ldr, U.get(), U.ldu(), tol); + reveal_R_rank (ncols, R, ldr, U.data(), U.ldu(), tol); if (rank < ncols) { @@ -520,7 +520,7 @@ namespace TSQR { // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and // overwrote R with \f$\Sigma V^*\f$. Now, we compute \f$Q // := Q \cdot U\f$, respecting cache blocks of Q. 
- Q_times_B (nrows, ncols, Q, ldq, U.get(), U.lda(), + Q_times_B (nrows, ncols, Q, ldq, U.data(), U.lda(), contiguous_cache_blocks); } return rank; @@ -644,9 +644,9 @@ namespace TSQR { mat_view_type C_top = seq_.top_block (C, contiguous_cache_blocks); top_blocks[P_first] = std::make_pair (const_mat_view_type (Q_top.ncols(), Q_top.ncols(), - Q_top.get(), Q_top.lda()), + Q_top.data(), Q_top.lda()), mat_view_type (C_top.ncols(), C_top.ncols(), - C_top.get(), C_top.lda())); + C_top.data(), C_top.lda())); } else { // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] @@ -670,9 +670,9 @@ namespace TSQR { mat_view_type C_top = seq_.top_block (C, contiguous_cache_blocks); top_blocks[P_first] = std::make_pair (const_mat_view_type (Q_top.ncols(), Q_top.ncols(), - Q_top.get(), Q_top.lda()), + Q_top.data(), Q_top.lda()), mat_view_type (C_top.ncols(), C_top.ncols(), - C_top.get(), C_top.lda())); + C_top.data(), C_top.lda())); } else { build_partition_array (P_first, P_mid, top_blocks, diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp index 32ed5dbdc9fd..c4f4e6a09588 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp @@ -110,7 +110,7 @@ namespace TSQR { } else if (P_first == P_last) { std::pair results = - seq_.factor (A.nrows(), A.ncols(), A.get(), A.lda(), + seq_.factor (A.nrows(), A.ncols(), A.data(), A.lda(), contiguous_cache_blocks); seq_outputs[P_first] = results.first; A_top = A; @@ -136,7 +136,7 @@ namespace TSQR { // If we're completely done, extract the final R factor from // the topmost partition. 
if (depth == 0) { - seq_.extract_R (A_top.nrows(), A_top.ncols(), A_top.get(), + seq_.extract_R (A_top.nrows(), A_top.ncols(), A_top.data(), A_top.lda(), R, ldr, contiguous_cache_blocks); } return A_top; @@ -188,8 +188,8 @@ namespace TSQR { const_mat_view Q_top = blocker.top_block (Q, contiguous_cache_blocks); mat_view C_top = blocker.top_block (C, contiguous_cache_blocks); top_blocks[P_first] = - std::make_pair (const_mat_view (Q_top.ncols(), Q_top.ncols(), Q_top.get(), Q_top.lda()), - mat_view (C_top.ncols(), C_top.ncols(), C_top.get(), C_top.lda())); + std::make_pair (const_mat_view (Q_top.ncols(), Q_top.ncols(), Q_top.data(), Q_top.lda()), + mat_view (C_top.ncols(), C_top.ncols(), C_top.data(), C_top.lda())); } else { @@ -227,8 +227,8 @@ namespace TSQR { else if (P_first == P_last) { const std::vector< SeqOutput >& seq_outputs = factor_output.first; - seq_.apply ("N", Q.nrows(), Q.ncols(), Q.get(), Q.lda(), - seq_outputs[P_first], C.ncols(), C.get(), + seq_.apply ("N", Q.nrows(), Q.ncols(), Q.data(), Q.lda(), + seq_outputs[P_first], C.ncols(), C.data(), C.lda(), contiguous_cache_blocks); } else @@ -270,8 +270,8 @@ namespace TSQR { } else if (P_first == P_last) { const std::vector& seq_outputs = factor_output.first; - seq_.apply (op, Q.nrows(), Q.ncols(), Q.get(), Q.lda(), - seq_outputs[P_first], C.ncols(), C.get(), + seq_.apply (op, Q.nrows(), Q.ncols(), Q.data(), Q.lda(), + seq_outputs[P_first], C.ncols(), C.data(), C.lda(), contiguous_cache_blocks); return std::make_pair (Q, C); } @@ -327,8 +327,8 @@ namespace TSQR { std::vector< Scalar > work (ncols); TSQR::Combine< LocalOrdinal, Scalar > combine_; - combine_.factor_pair (ncols, A_top.get(), A_top.lda(), - A_bot.get(), A_bot.lda(), &tau[0], &work[0]); + combine_.factor_pair (ncols, A_top.data(), A_top.lda(), + A_bot.data(), A_bot.lda(), &tau[0], &work[0]); } template< class LocalOrdinal, class Scalar > @@ -351,9 +351,9 @@ namespace TSQR { TSQR::Combine combine_; combine_.apply_pair (trans.c_str(), 
C_top.ncols(), Q_bot.ncols(), - Q_bot.get(), Q_bot.lda(), &tau[0], - C_top.get(), C_top.lda(), - C_bot.get(), C_bot.lda(), &work[0]); + Q_bot.data(), Q_bot.lda(), &tau[0], + C_top.data(), C_top.lda(), + C_bot.data(), C_bot.lda(), &work[0]); } template< class LocalOrdinal, class Scalar > @@ -367,8 +367,8 @@ namespace TSQR { if (P_first > P_last) return; else if (P_first == P_last) - seq_.cache_block (A_out.nrows(), A_out.ncols(), A_out.get(), - A_in.get(), A_in.lda()); + seq_.cache_block (A_out.nrows(), A_out.ncols(), A_out.data(), + A_in.data(), A_in.lda()); else { const size_t P_mid = (P_first + P_last) / 2; @@ -395,8 +395,8 @@ namespace TSQR { return; } else if (P_first == P_last) { - seq_.un_cache_block (A_out.nrows(), A_out.ncols(), A_out.get(), - A_out.lda(), A_in.get()); + seq_.un_cache_block (A_out.nrows(), A_out.ncols(), A_out.data(), + A_out.lda(), A_in.data()); } else { const size_t P_mid = (P_first + P_last) / 2; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp index 351228c64f22..287a238af9f6 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp @@ -135,7 +135,7 @@ namespace TSQR { execute_base_case () { seq_.un_cache_block (A_out_.nrows(), A_out_.ncols(), - A_out_.get(), A_out_.lda(), A_in_.get()); + A_out_.data(), A_out_.lda(), A_in_.data()); } }; diff --git a/packages/tpetra/tsqr/src/Tsqr.hpp b/packages/tpetra/tsqr/src/Tsqr.hpp index f0fa8051b0a8..0491249b8117 100644 --- a/packages/tpetra/tsqr/src/Tsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr.hpp @@ -283,7 +283,7 @@ namespace TSQR { // factor (computed above) to compute the distributed-memory // part of the QR factorization. 
{ - mat_view_type Q_top (numCols, numCols, Q_top_block.get(), + mat_view_type Q_top (numCols, numCols, Q_top_block.data(), Q_top_block.lda()); mat_view_type R_view (numCols, numCols, R, LDR); distTsqr_->factorExplicit (R_view, Q_top, forceNonnegativeDiagonal); @@ -373,7 +373,7 @@ namespace TSQR { // factor (computed above) to compute the distributed-memory // part of the QR factorization. { - mat_view_type Q_top (numCols, numCols, Q_top_block.get(), + mat_view_type Q_top (numCols, numCols, Q_top_block.data(), Q_top_block.lda()); mat_view_type R_view (numCols, numCols, R, LDR); distTsqr_->factorExplicit (R_view, Q_top, forceNonnegativeDiagonal); @@ -453,7 +453,7 @@ namespace TSQR { R_view.fill (STS::zero()); NodeOutput nodeResults = nodeTsqr_->factor (nrows_local, ncols, A_local, lda_local, - R_view.get(), R_view.lda(), + R_view.data(), R_view.lda(), contiguousCacheBlocks); DistOutput distResults = distTsqr_->factor (R_view); return std::make_pair (nodeResults, distResults); @@ -530,7 +530,7 @@ namespace TSQR { nodeTsqr_->top_block (C_view, contiguousCacheBlocks); // View of the topmost ncols_C by ncols_C block of C. - mat_view_type C_top_view (ncols_C, ncols_C, C_view_top_block.get(), + mat_view_type C_top_view (ncols_C, ncols_C, C_view_top_block.data(), C_view_top_block.lda()); if (! transposed) { @@ -539,7 +539,7 @@ namespace TSQR { matrix_type C_top (C_top_view); // Compute in place on all processors' C_top blocks. - distTsqr_->apply (applyType, C_top.ncols(), ncols_Q, C_top.get(), + distTsqr_->apply (applyType, C_top.ncols(), ncols_Q, C_top.data(), C_top.lda(), factor_output.second); // Copy the result from C_top back into the top ncols_C by @@ -566,7 +566,7 @@ namespace TSQR { matrix_type C_top (C_top_view); // Compute in place on all processors' C_top blocks. 
- distTsqr_->apply (applyType, ncols_C, ncols_Q, C_top.get(), + distTsqr_->apply (applyType, ncols_C, ncols_Q, C_top.data(), C_top.lda(), factor_output.second); // Copy the result from C_top back into the top ncols_C by @@ -764,13 +764,13 @@ namespace TSQR { // matrix_type U (ncols, ncols, STS::zero()); const ordinal_type rank = - reveal_R_rank (ncols, R, ldr, U.get(), U.lda(), tol); + reveal_R_rank (ncols, R, ldr, U.data(), U.lda(), tol); if (rank < ncols) { // If R is not full rank: reveal_R_rank() already computed // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and // overwrote R with \f$\Sigma V^*\f$. Now, we compute \f$Q // := Q \cdot U\f$, respecting cache blocks of Q. - Q_times_B (nrows, ncols, Q, ldq, U.get(), U.lda(), + Q_times_B (nrows, ncols, Q, ldq, U.data(), U.lda(), contiguousCacheBlocks); } return rank; diff --git a/packages/tpetra/tsqr/src/TsqrAdaptor.hpp b/packages/tpetra/tsqr/src/TsqrAdaptor.hpp index 7ed5b992c0ec..89236be2068c 100644 --- a/packages/tpetra/tsqr/src/TsqrAdaptor.hpp +++ b/packages/tpetra/tsqr/src/TsqrAdaptor.hpp @@ -224,7 +224,7 @@ namespace TSQR { throw std::runtime_error ("Failed to reshape matrix R"); } } - return pTsqr_->factor (nrowsLocal, ncols, A_local.get(), LDA, + return pTsqr_->factor (nrowsLocal, ncols, A_local.data(), LDA, R.values(), R.stride(), contiguousCacheBlocks); } @@ -277,9 +277,9 @@ namespace TSQR { ArrayRCP pQin = fetchConstView (Q_in); ArrayRCP pQout = fetchNonConstView (Q_out); pTsqr_->explicit_Q (nrowsLocal, - ncols_in, pQin.get(), LDQ_in, + ncols_in, pQin.data(), LDQ_in, factorOutput, - ncols_out, pQout.get(), LDQ_out, + ncols_out, pQout.data(), LDQ_out, contiguousCacheBlocks); } @@ -320,7 +320,7 @@ namespace TSQR { ArrayRCP< scalar_type > Q_ptr = fetchNonConstView (Q); return pTsqr_->reveal_rank (nrowsLocal, ncols, - Q_ptr.get(), ldqLocal, + Q_ptr.data(), ldqLocal, R.values(), R.stride(), relativeTolerance, contiguousCacheBlocks); @@ -363,8 +363,8 @@ namespace TSQR { } ArrayRCP pA_in = fetchConstView 
(A_in); ArrayRCP pA_out = fetchNonConstView (A_out); - pTsqr_->cache_block (nrowsLocal, ncols, pA_out.get(), - pA_in.get(), LDA_in); + pTsqr_->cache_block (nrowsLocal, ncols, pA_out.data(), + pA_in.data(), LDA_in); } /// \brief Un-cache-block A_in into A_out. @@ -399,8 +399,8 @@ namespace TSQR { } ArrayRCP pA_in = fetchConstView (A_in); ArrayRCP pA_out = fetchNonConstView (A_out); - pTsqr_->un_cache_block (nrowsLocal, ncols, pA_out.get(), - LDA_out, pA_in.get()); + pTsqr_->un_cache_block (nrowsLocal, ncols, pA_out.data(), + LDA_out, pA_in.data()); } /// \brief Verify the result of the "thin" QR factorization \f$A = QR\f$. @@ -437,8 +437,8 @@ namespace TSQR { // Const views suffice for verification ArrayRCP A_ptr = fetchConstView (A); ArrayRCP Q_ptr = fetchConstView (Q); - return global_verify (nrowsLocal_A, ncols_A, A_ptr.get(), LDA, - Q_ptr.get(), LDQ, R.values(), R.stride(), + return global_verify (nrowsLocal_A, ncols_A, A_ptr.data(), LDA, + Q_ptr.data(), LDQ, R.values(), R.stride(), pScalarMessenger_.get()); } diff --git a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp index 35d216262bc9..18bd3d83717b 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp @@ -342,8 +342,8 @@ namespace TSQR { // This call modifies A_out_rest. mat_view_type A_out_cur = split_top_block (A_out_rest, true); - copy_matrix (A_in_cur.nrows(), num_cols, A_out_cur.get(), - A_out_cur.lda(), A_in_cur.get(), A_in_cur.lda()); + copy_matrix (A_in_cur.nrows(), num_cols, A_out_cur.data(), + A_out_cur.lda(), A_in_cur.data(), A_in_cur.lda()); } } @@ -375,8 +375,8 @@ namespace TSQR { // This call modifies A_out_rest. 
mat_view_type A_out_cur = split_top_block (A_out_rest, false); - copy_matrix (A_in_cur.nrows(), num_cols, A_out_cur.get(), - A_out_cur.lda(), A_in_cur.get(), A_in_cur.lda()); + copy_matrix (A_in_cur.nrows(), num_cols, A_out_cur.data(), + A_out_cur.lda(), A_in_cur.data(), A_in_cur.lda()); } } @@ -411,7 +411,7 @@ namespace TSQR { // result[0] = starting row index of the cache block // result[1] = number of rows in the cache block - // result[2] = pointer offset (A.get() + result[2]) + // result[2] = pointer offset (A.data() + result[2]) // result[3] = leading dimension (a.k.a. stride) of the cache block std::vector result = strategy_.cache_block_details (cache_block_index, A.nrows(), A.ncols(), @@ -425,7 +425,7 @@ namespace TSQR { // (ordinal_type) to unsigned (pointer) may raise compiler // warnings. return MatrixViewType (result[1], A.ncols(), - A.get() + static_cast(result[2]), + A.data() + static_cast(result[2]), result[3]); } diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp index 546c6097efd5..7ba5e87d0dc1 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp @@ -293,7 +293,7 @@ namespace TSQR { matrix_type A (numRows, numCols); std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigmas[0]); + matGen.fill_random_svd (numRows, numCols, A.data(), A.lda(), sigmas.data()); // A place to put the Q factor. 
matrix_type Q (numRows, numCols); @@ -313,11 +313,11 @@ namespace TSQR { const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_first (numRows, numCols, A.get(), A.lda(), - &tau[0], &work[0]); + combiner.factor_first (numRows, numCols, A.data(), A.lda(), + tau.data(), work.data()); combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.get(), A.lda(), &tau[0], - Q.get(), Q.lda(), &work[0]); + A.data(), A.lda(), tau.data(), + Q.data(), Q.lda(), work.data()); } // How much time numTrials runs must take in order for @@ -343,11 +343,11 @@ namespace TSQR { timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_first (numRows, numCols, A.get(), A.lda(), - &tau[0], &work[0]); + combiner.factor_first (numRows, numCols, A.data(), A.lda(), + tau.data(), work.data()); combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.get(), A.lda(), &tau[0], - Q.get(), Q.lda(), &work[0]); + A.data(), A.lda(), tau.data(), + Q.data(), Q.lda(), work.data()); } theTime = timer.stop(); } while (theTime < minAcceptableTime && numTrials < maxNumTrials); @@ -391,7 +391,7 @@ namespace TSQR { matrix_type A (numRows, numCols); std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigmas[0]); + matGen.fill_random_svd (numRows, numCols, A.data(), A.lda(), sigmas.data()); // A place to put the Q factor. 
matrix_type Q (numRows, numCols); @@ -411,11 +411,11 @@ namespace TSQR { const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_first (numRows, numCols, A.get(), A.lda(), - &tau[0], &work[0]); + combiner.factor_first (numRows, numCols, A.data(), A.lda(), + tau.data(), work.data()); combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.get(), A.lda(), &tau[0], - Q.get(), Q.lda(), &work[0]); + A.data(), A.lda(), tau.data(), + Q.data(), Q.lda(), work.data()); } // // The actual timing runs. @@ -424,11 +424,11 @@ namespace TSQR { timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_first (numRows, numCols, A.get(), A.lda(), - &tau[0], &work[0]); + combiner.factor_first (numRows, numCols, A.data(), A.lda(), + tau.data(), work.data()); combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.get(), A.lda(), &tau[0], - Q.get(), Q.lda(), &work[0]); + A.data(), A.lda(), tau.data(), + Q.data(), Q.lda(), work.data()); } return timer.stop(); } @@ -477,12 +477,12 @@ namespace TSQR { matrix_type R (numCols, numCols); std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R.get(), R.lda(), &sigmas[0]); + matGen.fill_random_R (numCols, R.data(), R.lda(), sigmas.data()); // Now generate a random cache block. matrix_type A (numRows, numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigmas[0]); + matGen.fill_random_svd (numRows, numCols, A.data(), A.lda(), sigmas.data()); // A place to put the Q factor. 
matrix_type Q (numRows + numCols, numCols); @@ -502,13 +502,13 @@ namespace TSQR { const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_inner (numRows, numCols, R.get(), R.lda(), - A.get(), A.lda(), &tau[0], &work[0]); + combiner.factor_inner (numRows, numCols, R.data(), R.lda(), + A.data(), A.lda(), tau.data(), work.data()); combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.get(), A.lda(), &tau[0], + A.data(), A.lda(), tau.data(), &Q(0, 0), Q.lda(), &Q(numCols, 0), Q.lda(), - &work[0]); + work.data()); } // How much time numTrials runs must take in order for @@ -534,13 +534,13 @@ namespace TSQR { timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_inner (numRows, numCols, R.get(), R.lda(), - A.get(), A.lda(), &tau[0], &work[0]); + combiner.factor_inner (numRows, numCols, R.data(), R.lda(), + A.data(), A.lda(), tau.data(), work.data()); combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.get(), A.lda(), &tau[0], + A.data(), A.lda(), tau.data(), &Q(0, 0), Q.lda(), &Q(numCols, 0), Q.lda(), - &work[0]); + work.data()); } theTime = timer.stop(); } while (theTime < minAcceptableTime && numTrials < maxNumTrials); @@ -587,12 +587,12 @@ namespace TSQR { matrix_type R (numCols, numCols); std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R.get(), R.lda(), &sigmas[0]); + matGen.fill_random_R (numCols, R.data(), R.lda(), sigmas.data()); // Now generate a random cache block. matrix_type A (numRows, numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigmas[0]); + matGen.fill_random_svd (numRows, numCols, A.data(), A.lda(), sigmas.data()); // A place to put the Q factor. 
matrix_type Q (numRows + numCols, numCols); @@ -612,13 +612,13 @@ namespace TSQR { const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_inner (numRows, numCols, R.get(), R.lda(), - A.get(), A.lda(), &tau[0], &work[0]); + combiner.factor_inner (numRows, numCols, R.data(), R.lda(), + A.data(), A.lda(), tau.data(), work.data()); combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.get(), A.lda(), &tau[0], + A.data(), A.lda(), tau.data(), &Q(0, 0), Q.lda(), &Q(numCols, 0), Q.lda(), - &work[0]); + work.data()); } // // The actual timing runs. @@ -627,13 +627,13 @@ namespace TSQR { timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_inner (numRows, numCols, R.get(), R.lda(), - A.get(), A.lda(), &tau[0], &work[0]); + combiner.factor_inner (numRows, numCols, R.data(), R.lda(), + A.data(), A.lda(), tau.data(), work.data()); combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.get(), A.lda(), &tau[0], + A.data(), A.lda(), tau.data(), &Q(0, 0), Q.lda(), &Q(numCols, 0), Q.lda(), - &work[0]); + work.data()); } return timer.stop(); } @@ -678,12 +678,12 @@ namespace TSQR { matrix_type R1 (numCols, numCols); std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R1.get(), R1.lda(), &sigmas[0]); + matGen.fill_random_R (numCols, R1.data(), R1.lda(), sigmas.data()); // Now generate R2. matrix_type R2 (numCols, numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R2.get(), R2.lda(), &sigmas[0]); + matGen.fill_random_R (numCols, R2.data(), R2.lda(), sigmas.data()); // A place to put the Q factor of [R1; R2]. 
matrix_type Q (2*numCols, numCols); @@ -703,14 +703,14 @@ namespace TSQR { const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_pair (numCols, R1.get(), R1.lda(), - R2.get(), R2.lda(), - &tau[0], &work[0]); + combiner.factor_pair (numCols, R1.data(), R1.lda(), + R2.data(), R2.lda(), + tau.data(), work.data()); combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.get(), R2.lda(), &tau[0], + R2.data(), R2.lda(), tau.data(), &Q(0, 0), Q.lda(), &Q(numCols, 0), Q.lda(), - &work[0]); + work.data()); } // How much time numTrials runs must take in order for @@ -736,14 +736,14 @@ namespace TSQR { timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_pair (numCols, R1.get(), R1.lda(), - R2.get(), R2.lda(), - &tau[0], &work[0]); + combiner.factor_pair (numCols, R1.data(), R1.lda(), + R2.data(), R2.lda(), + tau.data(), work.data()); combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.get(), R2.lda(), &tau[0], + R2.data(), R2.lda(), tau.data(), &Q(0, 0), Q.lda(), &Q(numCols, 0), Q.lda(), - &work[0]); + work.data()); } theTime = timer.stop(); } while (theTime < minAcceptableTime && numTrials < maxNumTrials); @@ -786,12 +786,12 @@ namespace TSQR { matrix_type R1 (numCols, numCols); std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R1.get(), R1.lda(), &sigmas[0]); + matGen.fill_random_R (numCols, R1.data(), R1.lda(), sigmas.data()); // Now generate R2. matrix_type R2 (numCols, numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R2.get(), R2.lda(), &sigmas[0]); + matGen.fill_random_R (numCols, R2.data(), R2.lda(), sigmas.data()); // A place to put the Q factor of [R1; R2]. 
matrix_type Q (2*numCols, numCols); @@ -811,14 +811,14 @@ namespace TSQR { const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_pair (numCols, R1.get(), R1.lda(), - R2.get(), R2.lda(), - &tau[0], &work[0]); + combiner.factor_pair (numCols, R1.data(), R1.lda(), + R2.data(), R2.lda(), + tau.data(), work.data()); combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.get(), R2.lda(), &tau[0], + R2.data(), R2.lda(), tau.data(), &Q(0, 0), Q.lda(), &Q(numCols, 0), Q.lda(), - &work[0]); + work.data()); } // // The actual timing runs. @@ -827,14 +827,14 @@ namespace TSQR { timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_pair (numCols, R1.get(), R1.lda(), - R2.get(), R2.lda(), - &tau[0], &work[0]); + combiner.factor_pair (numCols, R1.data(), R1.lda(), + R2.data(), R2.lda(), + tau.data(), work.data()); combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.get(), R2.lda(), &tau[0], + R2.data(), R2.lda(), tau.data(), &Q(0, 0), Q.lda(), &Q(numCols, 0), Q.lda(), - &work[0]); + work.data()); } return timer.stop(); } diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp index 07cacc0a6941..c19404fb2c9b 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp @@ -152,8 +152,8 @@ namespace TSQR { const std::string trans = apply_type.toString (); const int lwork = ncols_C; lapack_.apply_Q_factor ('L', trans[0], numRows, ncols_C, ncols_Q, - A_buf_.get(), A_buf_.lda(), tau, - C_buf_.get(), C_buf_.lda(), + A_buf_.data(), A_buf_.lda(), tau, + C_buf_.data(), C_buf_.lda(), work, lwork); // Copy back the results. 
deep_copy (C_top_view, C_buf_top); @@ -182,7 +182,7 @@ namespace TSQR { copy_matrix (m, n, &A_buf_(n, 0), A_buf_.lda(), A, lda); const int lwork = n; - lapack_.compute_QR (numRows, n, A_buf_.get(), A_buf_.lda(), + lapack_.compute_QR (numRows, n, A_buf_.data(), A_buf_.lda(), tau, work, lwork); // Copy back the results. R might be a view of the upper // triangle of a cache block, so only copy into the upper @@ -213,7 +213,7 @@ namespace TSQR { copy_upper_triangle (n, n, &A_buf_(n, 0), A_buf_.lda(), R_bot, ldr_bot); const int lwork = n; - lapack_.compute_QR (numRows, n, A_buf_.get(), A_buf_.lda(), + lapack_.compute_QR (numRows, n, A_buf_.data(), A_buf_.lda(), tau, work, lwork); // Copy back the results. Only read the upper triangles of the // two n by n row blocks of A_buf_ (this means we don't have to @@ -250,8 +250,8 @@ namespace TSQR { const int lwork = ncols_Q; const std::string trans = apply_type.toString (); lapack_.apply_Q_factor ('L', trans[0], numRows, ncols_C, ncols_Q, - A_buf_.get(), A_buf_.lda(), tau, - C_buf_.get(), C_buf_.lda(), + A_buf_.data(), A_buf_.lda(), tau, + C_buf_.data(), C_buf_.lda(), work, lwork); // Copy back the results. 
copy_matrix (ncols_Q, ncols_C, C_top, ldc_top, &C_buf_(0, 0), C_buf_.lda()); diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index 5a3e52afb231..b02c267f91a7 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -205,7 +205,7 @@ namespace TSQR { printMatrix (std::ostream& out, const MatrixViewType& A) { - print_local_matrix (out, A.nrows(), A.ncols(), A.get(), A.lda()); + print_local_matrix (out, A.nrows(), A.ncols(), A.data(), A.lda()); } template @@ -215,8 +215,8 @@ namespace TSQR { const MatrixViewType& Q, const MatrixViewType& R) { - return local_verify (A.nrows(), A.ncols(), A.get(), A.lda(), - Q.get(), Q.lda(), R.get(), R.lda()); + return local_verify (A.nrows(), A.ncols(), A.data(), A.lda(), + Q.data(), Q.lda(), R.data(), R.lda()); } /// \brief Test accuracy of TSQR::Combine @@ -282,21 +282,21 @@ namespace TSQR { matrix_type R3 (numCols, numCols, Scalar(0)); matrix_type A (numRows, numCols, Scalar(0)); matgen_type matgen (gen); - matgen.fill_random_R (numCols, R1.get(), R1.lda(), &sigma_R1[0]); - matgen.fill_random_R (numCols, R2.get(), R2.lda(), &sigma_R2[0]); - matgen.fill_random_R (numCols, R3.get(), R3.lda(), &sigma_R3[0]); - matgen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigma_A[0]); + matgen.fill_random_R (numCols, R1.data(), R1.lda(), &sigma_R1[0]); + matgen.fill_random_R (numCols, R2.data(), R2.lda(), &sigma_R2[0]); + matgen.fill_random_R (numCols, R3.data(), R3.lda(), &sigma_R3[0]); + matgen.fill_random_svd (numRows, numCols, A.data(), A.lda(), &sigma_A[0]); if (false && debug) { cerr << endl << "First test problem:" << endl; - print_local_matrix (cerr, numCols, numCols, R1.get(), R1.lda()); - print_local_matrix (cerr, numCols, numCols, R2.get(), R2.lda()); + print_local_matrix (cerr, numCols, numCols, R1.data(), R1.lda()); + print_local_matrix (cerr, numCols, numCols, R2.data(), R2.lda()); cerr << endl; cerr << endl 
<< "Second test problem:" << endl; - print_local_matrix (cerr, numCols, numCols, R3.get(), R3.lda()); - print_local_matrix (cerr, numRows, numCols, A.get(), A.lda()); + print_local_matrix (cerr, numCols, numCols, R3.data(), R3.lda()); + print_local_matrix (cerr, numRows, numCols, A.data(), A.lda()); cerr << endl; } @@ -309,15 +309,15 @@ namespace TSQR { // Copy [R1; R2] into A_R1R2. copy_matrix (numCols, numCols, &A_R1R2(0, 0), A_R1R2.lda(), - R1.get(), R1.lda()); + R1.data(), R1.lda()); copy_matrix (numCols, numCols, &A_R1R2(numCols, 0), A_R1R2.lda(), - R2.get(), R2.lda()); + R2.data(), R2.lda()); // Copy [R3; A] into A_R3A. copy_matrix (numCols, numCols, &A_R3A(0, 0), A_R3A.lda(), - R3.get(), R3.lda()); + R3.data(), R3.lda()); copy_matrix (numRows, numCols, &A_R3A(numCols, 0), A_R3A.lda(), - A.get(), A.lda()); + A.data(), A.lda()); // Space to put the explicit Q factors. matrix_type Q_R1R2 (Ordinal(2) * numCols, numCols, Scalar(0)); @@ -346,32 +346,32 @@ namespace TSQR { << " by " << numCols << endl << endl; Combine< Ordinal, Scalar > combiner; - combiner.factor_pair (numCols, R1.get(), R1.lda(), R2.get(), R2.lda(), - &tau_R1R2[0], &work[0]); + combiner.factor_pair (numCols, R1.data(), R1.lda(), R2.data(), R2.lda(), + &tau_R1R2[0], work.data()); combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.get(), R2.lda(), &tau_R1R2[0], + R2.data(), R2.lda(), &tau_R1R2[0], &Q_R1R2(0, 0), Q_R1R2.lda(), &Q_R1R2(numCols, 0), Q_R1R2.lda(), - &work[0]); + work.data()); if (debug) { cerr << "Results of first test problem:" << endl; cerr << "-- Copy of test problem:" << endl; print_local_matrix (cerr, A_R1R2.nrows(), A_R1R2.ncols(), - A_R1R2.get(), A_R1R2.lda()); + A_R1R2.data(), A_R1R2.lda()); cerr << endl << "-- Q factor:" << endl; print_local_matrix (cerr, Q_R1R2.nrows(), Q_R1R2.ncols(), - Q_R1R2.get(), Q_R1R2.lda()); + Q_R1R2.data(), Q_R1R2.lda()); cerr << endl << "-- R factor:" << endl; print_local_matrix (cerr, R1.nrows(), R1.ncols(), - R1.get(), R1.lda()); + 
R1.data(), R1.lda()); cerr << endl; } const results_type firstResults = local_verify (A_R1R2.nrows(), A_R1R2.ncols(), - A_R1R2.get(), A_R1R2.lda(), - Q_R1R2.get(), Q_R1R2.lda(), - R1.get(), R1.lda()); + A_R1R2.data(), A_R1R2.lda(), + Q_R1R2.data(), Q_R1R2.lda(), + R1.data(), R1.lda()); if (debug) cerr << "\\| A - Q*R \\|_F = " << firstResults[0] << endl << "\\| I - Q'*Q \\|_F = " << firstResults[1] << endl @@ -383,32 +383,32 @@ namespace TSQR { << "qr( [R3; A] ), with R3 " << numCols << " by " << numCols << " and A " << numRows << " by " << numCols << endl << endl; - combiner.factor_inner (numRows, numCols, R3.get(), R3.lda(), - A.get(), A.lda(), &tau_R3A[0], &work[0]); + combiner.factor_inner (numRows, numCols, R3.data(), R3.lda(), + A.data(), A.lda(), &tau_R3A[0], work.data()); combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.get(), A.lda(), &tau_R3A[0], + A.data(), A.lda(), &tau_R3A[0], &Q_R3A(0, 0), Q_R3A.lda(), &Q_R3A(numCols, 0), Q_R3A.lda(), - &work[0]); + work.data()); if (debug) { cerr << "Results of second test problem:" << endl; cerr << "-- Copy of test problem:" << endl; print_local_matrix (cerr, A_R3A.nrows(), A_R3A.ncols(), - A_R3A.get(), A_R3A.lda()); + A_R3A.data(), A_R3A.lda()); cerr << endl << "-- Q factor:" << endl; print_local_matrix (cerr, Q_R3A.nrows(), Q_R3A.ncols(), - Q_R3A.get(), Q_R3A.lda()); + Q_R3A.data(), Q_R3A.lda()); cerr << endl << "-- R factor:" << endl; print_local_matrix (cerr, R3.nrows(), R3.ncols(), - R3.get(), R3.lda()); + R3.data(), R3.lda()); cerr << endl; } const results_type secondResults = local_verify (A_R3A.nrows(), A_R3A.ncols(), - A_R3A.get(), A_R3A.lda(), - Q_R3A.get(), Q_R3A.lda(), - R3.get(), R3.lda()); + A_R3A.data(), A_R3A.lda(), + Q_R3A.data(), Q_R3A.lda(), + R3.data(), R3.lda()); if (debug) cerr << "\\| A - Q*R \\|_F = " << secondResults[0] << endl << "\\| I - Q'*Q \\|_F = " << secondResults[1] << endl @@ -479,8 +479,8 @@ namespace TSQR { // Fill the two cache blocks with random test 
problems. matgen_type matgen (gen); - matgen.fill_random_svd (numRows, numCols, A1.get(), A1.lda(), &sigma_A1[0]); - matgen.fill_random_svd (numRows, numCols, A2.get(), A2.lda(), &sigma_A2[0]); + matgen.fill_random_svd (numRows, numCols, A1.data(), A1.lda(), &sigma_A1[0]); + matgen.fill_random_svd (numRows, numCols, A2.data(), A2.lda(), &sigma_A2[0]); if (false && debug) { @@ -527,13 +527,13 @@ namespace TSQR { Combine< Ordinal, Scalar > combiner; // qr( A1 ) - combiner.factor_first (numRows, numCols, A1.get(), A1.lda(), - &tau1[0], &work[0]); + combiner.factor_first (numRows, numCols, A1.data(), A1.lda(), + &tau1[0], work.data()); // View of numCols by numCols upper triangle of A1. - mat_view_type R1 (numCols, numCols, A1.get(), A1.lda()); + mat_view_type R1 (numCols, numCols, A1.data(), A1.lda()); // qr( [R1; A2] ) - combiner.factor_inner (numRows, numCols, R1.get(), R1.lda(), - A2.get(), A2.lda(), &tau2[0], &work[0]); + combiner.factor_inner (numRows, numCols, R1.data(), R1.lda(), + A2.data(), A2.lda(), &tau2[0], work.data()); // Extract (a deep copy of) the R factor. matrix_type R (R1); // Zero out everything below the diagonal of R. @@ -545,31 +545,30 @@ namespace TSQR { // (working up the matrix A,) finishing with A1. 
combiner.apply_inner (ApplyType::NoTranspose, numRows, numCols, numCols, - A2.get(), A2.lda(), &tau2[0], - Q1.get(), Q1.lda(), - Q2.get(), Q2.lda(), &work[0]); + A2.data(), A2.lda(), tau2.data(), + Q1.data(), Q1.lda(), + Q2.data(), Q2.lda(), work.data()); combiner.apply_first (ApplyType::NoTranspose, numRows, numCols, numCols, - A1.get(), A.lda(), &tau1[0], - Q1.get(), Q1.lda(), &work[0]); - if (debug) - { - cerr << "Results of first test problem:" << endl; - cerr << "-- Test matrix A:" << endl; - printMatrix (cerr, A_copy); - cerr << endl << "-- Q factor:" << endl; - printMatrix (cerr, Q); - cerr << endl << "-- R factor:" << endl; - printMatrix (cerr, R); - cerr << endl; - } + A1.data(), A.lda(), tau1.data(), + Q1.data(), Q1.lda(), work.data()); + if (debug) { + cerr << "Results of first test problem:" << endl; + cerr << "-- Test matrix A:" << endl; + printMatrix (cerr, A_copy); + cerr << endl << "-- Q factor:" << endl; + printMatrix (cerr, Q); + cerr << endl << "-- R factor:" << endl; + printMatrix (cerr, R); + cerr << endl; + } const results_type results = localVerify (A_copy, Q, R); - if (debug) + if (debug) { cerr << "\\| A - Q*R \\|_F = " << results[0] << endl << "\\| I - Q'*Q \\|_F = " << results[1] << endl << "\\| A \\|_F = " << results[2] << endl; - + } return results; } diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp index 5f37e9b65a37..19f9fb7e9420 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp @@ -277,7 +277,7 @@ namespace TSQR { const ordinal_type ncols = R_mine.ncols(); std::vector< scalar_type > R_local (ncols*ncols); - copy_matrix (ncols, ncols, &R_local[0], ncols, R_mine.get(), R_mine.lda()); + copy_matrix (ncols, ncols, R_local.data(), ncols, R_mine.data(), R_mine.lda()); const int P = messenger_->size(); const int my_rank = messenger_->rank(); @@ -285,7 +285,7 @@ namespace TSQR { std::vector work (ncols); helper.factor_helper (ncols, 
R_local, my_rank, 0, P-1, first_tag, messenger_.get(), Q_factors, tau_arrays, work); - copy_matrix (ncols, ncols, R_mine.get(), R_mine.lda(), &R_local[0], ncols); + copy_matrix (ncols, ncols, R_mine.data(), R_mine.lda(), R_local.data(), ncols); return std::make_pair (Q_factors, tau_arrays); } @@ -319,7 +319,7 @@ namespace TSQR { const int cur_pos = Q_factors.size() - 1; DistTsqrHelper helper; helper.apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, - &C_other[0], my_rank, 0, P-1, first_tag, + C_other.data(), my_rank, 0, P-1, first_tag, messenger_.get(), Q_factors, tau_arrays, cur_pos, work); } diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp index b13c888e659d..5c85acab44ea 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp @@ -396,8 +396,8 @@ namespace TSQR { // require expensive reallocation every time we send / // receive data. resizeWork (numCols); - combine_.factor_pair (numCols, R_mine.get(), R_mine.lda(), - R_other.get(), R_other.lda(), + combine_.factor_pair (numCols, R_mine.data(), R_mine.lda(), + R_other.data(), R_other.lda(), &tau[0], &work_[0]); QFactors.push_back (R_other); tauArrays.push_back (tau); @@ -462,9 +462,9 @@ namespace TSQR { Q_other.fill (scalar_type (0)); combine_.apply_pair (ApplyType::NoTranspose, Q_mine.ncols(), Q_impl.ncols(), - Q_impl.get(), Q_impl.lda(), &tau[0], - Q_mine.get(), Q_mine.lda(), - Q_other.get(), Q_other.lda(), &work_[0]); + Q_impl.data(), Q_impl.lda(), &tau[0], + Q_mine.data(), Q_mine.lda(), + Q_other.data(), Q_other.lda(), &work_[0]); // Send the resulting Q_other, and the final R factor, to P_mid. 
send_Q_R (Q_other, R_mine, P_mid); newpos = curpos - 1; diff --git a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp index 94dfdac67005..2a8634f6a263 100644 --- a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp @@ -241,7 +241,7 @@ namespace TSQR { // This part has O(P) communication for P MPI processes. using TSQR::Random::randomGlobalMatrix; // Help the C++ compiler with type inference. - mat_view_type A_local_view (A_local.nrows(), A_local.ncols(), A_local.get(), A_local.lda()); + mat_view_type A_local_view (A_local.nrows(), A_local.ncols(), A_local.data(), A_local.lda()); const magnitude_type* const singVals = (numCols == 0) ? NULL : &singularValues[0]; randomGlobalMatrix (&gen, A_local_view, singVals, ordinalMessenger.getRawPtr(), @@ -258,8 +258,8 @@ namespace TSQR { // we have to make a copy in order to validate the final // result. if (contiguousCacheBlocks) { - tsqr->cache_block (numRowsLocal, numCols, A_copy.get(), - A_local.get(), A_local.lda()); + tsqr->cache_block (numRowsLocal, numCols, A_copy.data(), + A_local.data(), A_local.lda()); if (debug) { Teuchos::barrier (*comm); if (myRank == 0) @@ -275,9 +275,9 @@ namespace TSQR { // wanted. if (testFactorExplicit) { tsqr->factorExplicitRaw (A_copy.nrows (), A_copy.ncols (), - A_copy.get (), A_copy.lda (), - Q_local.get (), Q_local.lda (), - R.get (), R.lda (), + A_copy.data (), A_copy.lda (), + Q_local.data (), Q_local.lda (), + R.data (), R.lda (), contiguousCacheBlocks); if (debug) { Teuchos::barrier (*comm); @@ -288,16 +288,16 @@ namespace TSQR { else { // Factor the (copy of the) matrix. 
factor_output_type factorOutput = - tsqr->factor (numRowsLocal, numCols, A_copy.get(), A_copy.lda(), - R.get(), R.lda(), contiguousCacheBlocks); + tsqr->factor (numRowsLocal, numCols, A_copy.data(), A_copy.lda(), + R.data(), R.lda(), contiguousCacheBlocks); if (debug) { Teuchos::barrier (*comm); if (myRank == 0) cerr << "-- Finished Tsqr::factor" << endl; } // Compute the explicit Q factor in Q_local. - tsqr->explicit_Q (numRowsLocal, numCols, A_copy.get(), A_copy.lda(), - factorOutput, numCols, Q_local.get(), Q_local.lda(), + tsqr->explicit_Q (numRowsLocal, numCols, A_copy.data(), A_copy.lda(), + factorOutput, numCols, Q_local.data(), Q_local.lda(), contiguousCacheBlocks); if (debug) { Teuchos::barrier (*comm); @@ -322,8 +322,8 @@ namespace TSQR { const magnitude_type tol = STM::zero(); const ordinal_type rank = tsqr->revealRankRaw (Q_local.nrows (), Q_local.ncols (), - Q_local.get (), Q_local.lda (), - R.get (), R.lda (), tol, + Q_local.data (), Q_local.lda (), + R.data (), R.lda (), tol, contiguousCacheBlocks); magnitude_type two_to_the_numCols = STM::one(); @@ -361,8 +361,8 @@ namespace TSQR { // We can use A_copy as scratch space for // un-cache-blocking Q_local, since we're done using // A_copy for other things. - tsqr->un_cache_block (numRowsLocal, numCols, A_copy.get(), - A_copy.lda(), Q_local.get()); + tsqr->un_cache_block (numRowsLocal, numCols, A_copy.data(), + A_copy.lda(), Q_local.data()); // Overwrite Q_local with the un-cache-blocked Q factor. deep_copy (Q_local, A_copy); if (debug) { @@ -374,8 +374,8 @@ namespace TSQR { // Test accuracy of the factorization. 
const std::vector results = - global_verify (numRowsLocal, numCols, A_local.get(), A_local.lda(), - Q_local.get(), Q_local.lda(), R.get(), R.lda(), + global_verify (numRowsLocal, numCols, A_local.data(), A_local.lda(), + Q_local.data(), Q_local.lda(), R.data(), R.lda(), scalarMessenger.getRawPtr()); if (debug) { Teuchos::barrier (*comm); diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp index 2e5082b2ed93..fd904a8305ca 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp @@ -173,8 +173,8 @@ namespace TSQR { // We should only call this if A_top.ncols() > 0 and therefore // work.size() > 0, but we've already checked for that, so we // don't have to check again. - combine.factor_first (A_top.nrows(), A_top.ncols(), A_top.get(), - A_top.lda(), &tau[0], &work[0]); + combine.factor_first (A_top.nrows(), A_top.ncols(), A_top.data(), + A_top.lda(), tau.data(), work.data()); return tau; } @@ -190,9 +190,9 @@ namespace TSQR { // tau.size() > 0 and work.size() > 0, but we've already // checked for that, so we don't have to check again. combine.factor_inner (A_cur.nrows(), A_top.ncols(), - A_top.get(), A_top.lda(), - A_cur.get(), A_cur.lda(), - &tau[0], &work[0]); + A_top.data(), A_top.lda(), + A_cur.data(), A_cur.lda(), + tau.data(), work.data()); return tau; } @@ -395,8 +395,8 @@ namespace TSQR { // If we get this far, it's fair to assume that we have // checked whether tau and work have nonzero lengths. combine.apply_first (applyType, C_top.nrows(), C_top.ncols(), - Q_top.ncols(), Q_top.get(), Q_top.lda(), - &tau[0], C_top.get(), C_top.lda(), &work[0]); + Q_top.ncols(), Q_top.data(), Q_top.lda(), + tau.data(), C_top.data(), C_top.lda(), work.data()); } void @@ -418,10 +418,10 @@ namespace TSQR { // If we get this far, it's fair to assume that we have // checked whether tau and work have nonzero lengths. 
combine.apply_inner (applyType, C_cur.nrows(), C_cur.ncols(), - Q_cur.ncols(), Q_cur.get(), Q_cur.lda(), + Q_cur.ncols(), Q_cur.data(), Q_cur.lda(), tau.data(), - C_top.get(), C_top.lda(), - C_cur.get(), C_cur.lda(), + C_top.data(), C_top.lda(), + C_cur.data(), C_cur.lda(), work.data()); } @@ -538,7 +538,7 @@ namespace TSQR { // the top ncols rows of C_top) with zeros. mat_view_type C_top_rest (C_top.nrows() - C_top.ncols(), C_top.ncols(), - C_top.get() + C_top.ncols(), + C_top.data() + C_top.ncols(), C_top.lda()); C_top_rest.fill (Scalar {}); } @@ -848,8 +848,8 @@ namespace TSQR { // Q_cur := Q_temp * B. blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.nrows(), numCols, numCols, Scalar (1.0), - Q_temp.get(), Q_temp.lda(), B_.get(), B_.lda(), - Scalar(0), Q_cur.get(), Q_cur.lda()); + Q_temp.data(), Q_temp.lda(), B_.data(), B_.lda(), + Scalar(0), Q_cur.data(), Q_cur.lda()); } /// \brief Multiply (in place) each cache block in the range by B_. @@ -1454,11 +1454,11 @@ namespace TSQR { "factorSecondPass: result.topBlocks[0] is an empty view." << suffix); mat_view_type R_top_square (R_top.ncols(), R_top.ncols(), - R_top.get(), R_top.lda()); + R_top.data(), R_top.lda()); R.fill (Scalar {}); // Only copy the upper triangle of R_top into R. - copy_upper_triangle (R.ncols(), R.ncols(), R.get(), R.lda(), - R_top.get(), R_top.lda()); + copy_upper_triangle (R.ncols(), R.ncols(), R.data(), R.lda(), + R_top.data(), R_top.lda()); return result; } @@ -1553,8 +1553,8 @@ namespace TSQR { // The statement below only works if R_top and R_bot have a // nonzero (and the same) number of columns, but we have already // checked that above. 
- combine_.factor_pair (R_top.ncols(), R_top.get(), R_top.lda(), - R_bot.get(), R_bot.lda(), tau.data(), + combine_.factor_pair (R_top.ncols(), R_top.data(), R_top.lda(), + R_bot.data(), R_bot.lda(), tau.data(), work_.data()); return tau; } @@ -1609,9 +1609,9 @@ namespace TSQR { // have a nonzero (and the same) number of columns, but we have // already checked that above. combine_.apply_pair (applyType, C_top.ncols(), R_bot.ncols(), - R_bot.get(), R_bot.lda(), &tau[0], - C_top.get(), C_top.lda(), - C_bot.get(), C_bot.lda(), &work_[0]); + R_bot.data(), R_bot.lda(), tau.data(), + C_top.data(), C_top.lda(), + C_bot.data(), C_bot.lda(), work_.data()); } void @@ -1645,7 +1645,7 @@ namespace TSQR { // Top blocks of C are the whole cache blocks. We only want to // affect the top ncols x ncols part of each of those blocks in // this method. - mat_view_type C_top_square (numCols, numCols, topBlocksOfC[0].get(), + mat_view_type C_top_square (numCols, numCols, topBlocksOfC[0].data(), topBlocksOfC[0].lda()); if (applyType.transposed ()) { // Don't include the topmost (index 0) partition in the @@ -1656,7 +1656,7 @@ namespace TSQR { // empty, and we can skip over them. const mat_view_type& C_cur = topBlocksOfC[partIdx]; if (! C_cur.empty()) { - mat_view_type C_cur_square (numCols, numCols, C_cur.get (), + mat_view_type C_cur_square (numCols, numCols, C_cur.data (), C_cur.lda ()); // If explicitQ: We've already done the first pass and // filled the top blocks of C. @@ -1682,7 +1682,7 @@ namespace TSQR { const mat_view_type& C_cur = topBlocksOfC[partIdx]; if (! C_cur.empty()) { mat_view_type C_cur_square (numCols, numCols, - C_cur.get (), C_cur.lda ()); + C_cur.data (), C_cur.lda ()); // The "first" pass (actually the last, only named // "first" by analogy with factorFirstPass()) will // fill the rest of these top blocks. 
For now, we diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp index 26c5bffcc2af..ba3e195e42f0 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp @@ -143,7 +143,7 @@ namespace TSQR { const Ordinal ldr = numCols; // Create a test problem - nodeTestProblem (gen, numRows, numCols, A.get(), A.lda(), true); + nodeTestProblem (gen, numRows, numCols, A.data(), A.lda(), true); if (debug) { cerr << "-- Generated test problem" << endl; @@ -151,7 +151,7 @@ namespace TSQR { if (A.nrows() <= 30) { cerr << "A = " << endl; print_local_matrix (cerr, A.nrows(), A.ncols(), - A.get(), A.lda()); + A.data(), A.lda()); cerr << endl << endl; } } @@ -167,13 +167,13 @@ namespace TSQR { if (A_copy.nrows() <= 30) { cerr << "A_copy = " << endl; print_local_matrix (cerr, A_copy.nrows(), A_copy.ncols(), - A_copy.get(), A_copy.lda()); + A_copy.data(), A_copy.lda()); cerr << endl << endl; } } } else { - actor.cache_block (numRows, numCols, A_copy.get(), A.get(), A.lda()); + actor.cache_block (numRows, numCols, A_copy.data(), A.data(), A.lda()); if (debug) { cerr << "-- Reorganized test matrix to have contiguous " "cache blocks" << endl; @@ -181,7 +181,7 @@ namespace TSQR { if (A_copy.nrows() <= 30) { cerr << "A_copy = " << endl; print_local_matrix (cerr, A_copy.nrows(), A_copy.ncols(), - A_copy.get(), A_copy.lda()); + A_copy.data(), A_copy.lda()); cerr << endl << endl; } } @@ -193,7 +193,7 @@ namespace TSQR { A2.fill (std::numeric_limits::quiet_NaN()); } - actor.un_cache_block (numRows, numCols, A2.get(), A2.lda(), A_copy.get()); + actor.un_cache_block (numRows, numCols, A2.data(), A2.lda(), A_copy.data()); if (matrix_equal (A, A2)) { if (debug) cerr << "-- Cache blocking test succeeded!" 
<< endl; @@ -206,10 +206,10 @@ namespace TSQR { if (A.nrows() <= 30 && A2.nrows() <= 30) { cerr << "A = " << endl; print_local_matrix (cerr, A.nrows(), A.ncols(), - A.get(), A.lda()); + A.data(), A.lda()); cerr << endl << "A2 = " << endl; print_local_matrix (cerr, A2.nrows(), A2.ncols(), - A2.get(), A2.lda()); + A2.data(), A2.lda()); cerr << endl; } } @@ -231,8 +231,8 @@ namespace TSQR { // Factor the matrix and compute the explicit Q factor factor_output_type factor_output = - actor.factor (numRows, numCols, A_copy.get(), A_copy.lda(), - R.get(), R.lda(), contiguousCacheBlocks); + actor.factor (numRows, numCols, A_copy.data(), A_copy.lda(), + R.data(), R.lda(), contiguousCacheBlocks); if (debug) { cerr << "-- Finished factor()" << endl; cerr << "-- Calling explicit_Q()" << endl; @@ -245,14 +245,14 @@ namespace TSQR { mat_view_type Q_top = actor.top_block (Q.view (), contiguousCacheBlocks); mat_view_type Q_top_square (Q_top.ncols(), Q_top.ncols(), - Q_top.get(), Q_top.lda()); + Q_top.data(), Q_top.lda()); Q_top_square.fill (Scalar {}); for (Ordinal j = 0; j < Q_top_square.ncols(); ++j) { Q_top_square(j,j) = Scalar (1.0); } } - actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(), - factor_output, numCols, Q.get(), Q.lda(), + actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.lda(), + factor_output, numCols, Q.data(), Q.lda(), contiguousCacheBlocks); if (debug) { cerr << "-- Finished explicit_Q()" << endl; @@ -264,8 +264,8 @@ namespace TSQR { // cache blocks. if (contiguousCacheBlocks) { // Use A_copy as temporary storage for un-cache-blocking Q. 
- actor.un_cache_block (numRows, numCols, A_copy.get(), - A_copy.lda(), Q.get()); + actor.un_cache_block (numRows, numCols, A_copy.data(), + A_copy.lda(), Q.data()); deep_copy (Q, A_copy); if (debug) { cerr << "-- Un-cache-blocked output Q factor" << endl; @@ -278,18 +278,18 @@ namespace TSQR { if (Q.nrows() <= 30) { cerr << endl << "-- Q factor:" << endl; print_local_matrix (cerr, Q.nrows(), Q.ncols(), - Q.get(), Q.lda()); + Q.data(), Q.lda()); cerr << endl << endl; } cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, numCols, numCols, R.get(), R.lda()); + print_local_matrix (cerr, numCols, numCols, R.data(), R.lda()); cerr << endl; } // Validate the factorization std::vector results = - local_verify (numRows, numCols, A.get(), lda, - Q.get(), ldq, R.get(), ldr); + local_verify (numRows, numCols, A.data(), lda, + Q.data(), ldq, R.data(), ldr); if (debug) cerr << "-- Finished local_verify" << endl; @@ -411,13 +411,13 @@ namespace TSQR { R.fill (Scalar {}); // Create a test problem - nodeTestProblem (gen, numRows, numCols, A.get(), A.lda(), false); + nodeTestProblem (gen, numRows, numCols, A.data(), A.lda(), false); // Copy A into A_copy, since TSQR overwrites the input. If // specified, rearrange the data in A_copy so that the data in // each cache block is contiguously stored. if (contiguousCacheBlocks) { - actor.cache_block (numRows, numCols, A_copy.get(), A.get(), A.lda()); + actor.cache_block (numRows, numCols, A_copy.data(), A.data(), A.lda()); } else { deep_copy (A_copy, A); } @@ -429,14 +429,14 @@ namespace TSQR { // Factor the matrix in-place in A_copy, and extract the // resulting R factor into R. 
factor_output_type factor_output = - actor.factor (numRows, numCols, A_copy.get(), A_copy.lda(), - R.get(), R.lda(), contiguousCacheBlocks); + actor.factor (numRows, numCols, A_copy.data(), A_copy.lda(), + R.data(), R.lda(), contiguousCacheBlocks); // Compute the explicit Q factor (which was stored // implicitly in A_copy and factor_output) and store in Q. // We don't need to un-cache-block the output, because we // aren't verifying it here. - actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(), - factor_output, numCols, Q.get(), Q.lda(), + actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.lda(), + factor_output, numCols, Q.data(), Q.lda(), contiguousCacheBlocks); } @@ -449,14 +449,14 @@ namespace TSQR { // Factor the matrix in-place in A_copy, and extract the // resulting R factor into R. factor_output_type factor_output = - actor.factor (numRows, numCols, A_copy.get(), A_copy.lda(), - R.get(), R.lda(), contiguousCacheBlocks); + actor.factor (numRows, numCols, A_copy.data(), A_copy.lda(), + R.data(), R.lda(), contiguousCacheBlocks); // Compute the explicit Q factor (which was stored // implicitly in A_copy and factor_output) and store in Q. // We don't need to un-cache-block the output, because we // aren't verifying it here. 
- actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(), - factor_output, numCols, Q.get(), Q.lda(), + actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.lda(), + factor_output, numCols, Q.data(), Q.lda(), contiguousCacheBlocks); } const double timing = timer.stop(); diff --git a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp index 2f3d99cef4ba..c0644d0fd127 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp @@ -95,8 +95,8 @@ namespace TSQR { const first_ordinal_type ncols = A.ncols(); const second_ordinal_type B_lda = B.lda(); - first_pointer_type A_j = A.get(); - second_pointer_type B_j = B.get(); + first_pointer_type A_j = A.data(); + second_pointer_type B_j = B.data(); for (first_ordinal_type j = 0; j < ncols; ++j, A_j += A_lda, B_j += B_lda) for (first_ordinal_type i = 0; i < nrows; ++i) @@ -229,7 +229,7 @@ namespace TSQR { /// change any members of *this. Of course one may use the /// resulting pointer to fiddle with entries in the matrix, but /// that doesn't affect the MatView's properties. - pointer_type get() const { return A_; } + pointer_type data() const { return A_; } bool empty() const { return nrows() == 0 || ncols() == 0; } /// Return a "row block" (submatrix of consecutive rows in the @@ -248,7 +248,7 @@ namespace TSQR { } } #endif // TSQR_MATVIEW_DEBUG - return MatView (lastRow - firstRow + 1, ncols(), get() + firstRow, lda()); + return MatView (lastRow - firstRow + 1, ncols(), data() + firstRow, lda()); } /// Split off and return the top cache block of nrows_top rows. 
@@ -284,7 +284,7 @@ namespace TSQR { } #endif // TSQR_MATVIEW_DEBUG - Scalar* const A_top_ptr = get(); + Scalar* const A_top_ptr = data(); Scalar* A_rest_ptr; const Ordinal nrows_rest = nrows() - nrows_top; Ordinal lda_top, lda_rest; @@ -300,7 +300,7 @@ namespace TSQR { lda_rest = lda(); A_rest_ptr = A_top_ptr + nrows_top; } - MatView A_top (nrows_top, ncols(), get(), lda_top); + MatView A_top (nrows_top, ncols(), data(), lda_top); A_ = A_rest_ptr; nrows_ = nrows_rest; lda_ = lda_rest; @@ -320,7 +320,7 @@ namespace TSQR { throw std::invalid_argument ("nrows_bottom > nrows"); #endif // TSQR_MATVIEW_DEBUG - Scalar* const A_rest_ptr = get(); + Scalar* const A_rest_ptr = data(); Scalar* A_bottom_ptr; const Ordinal nrows_rest = nrows() - nrows_bottom; Ordinal lda_bottom, lda_rest; @@ -351,7 +351,7 @@ namespace TSQR { const ordinal_type num_cols = ncols(); const ordinal_type stride = lda(); - scalar_type* A_j = get(); + scalar_type* A_j = data(); for (ordinal_type j = 0; j < num_cols; ++j, A_j += stride) for (ordinal_type i = 0; i < num_rows; ++i) A_j[i] = value; @@ -359,12 +359,12 @@ namespace TSQR { bool operator== (const MatView& rhs) const { return nrows() == rhs.nrows() && ncols() == rhs.ncols() && - lda() == rhs.lda() && get() == rhs.get(); + lda() == rhs.lda() && data() == rhs.data(); } bool operator!= (const MatView& rhs) const { return nrows() != rhs.nrows() || ncols() != rhs.ncols() || - lda() != rhs.lda() || get() != rhs.get(); + lda() != rhs.lda() || data() != rhs.data(); } private: @@ -412,7 +412,7 @@ namespace TSQR { nrows_(view.nrows()), ncols_(view.ncols()), lda_(view.lda()), - A_(view.get()) + A_(view.data()) {} //! Assignment operator: Does a shallow (pointer) copy. 
@@ -421,7 +421,7 @@ namespace TSQR { nrows_ = view.nrows(); ncols_ = view.ncols(); lda_ = view.lda(); - A_ = view.get(); + A_ = view.data(); } return *this; } @@ -455,7 +455,7 @@ namespace TSQR { Ordinal nrows() const { return nrows_; } Ordinal ncols() const { return ncols_; } Ordinal lda() const { return lda_; } - pointer_type get() const { return A_; } + pointer_type data() const { return A_; } bool empty() const { return nrows() == 0 || ncols() == 0; } /// Return a "row block" (submatrix of consecutive rows in the @@ -467,7 +467,7 @@ namespace TSQR { if (firstRow < 0 || lastRow >= nrows()) throw std::invalid_argument ("Row range invalid"); #endif // TSQR_MATVIEW_DEBUG - return ConstMatView (lastRow - firstRow + 1, ncols(), get() + firstRow, lda()); + return ConstMatView (lastRow - firstRow + 1, ncols(), data() + firstRow, lda()); } @@ -496,7 +496,7 @@ namespace TSQR { throw std::invalid_argument ("nrows_top > nrows"); #endif // TSQR_MATVIEW_DEBUG - pointer_type const A_top_ptr = get(); + pointer_type const A_top_ptr = data(); pointer_type A_rest_ptr; const Ordinal nrows_rest = nrows() - nrows_top; Ordinal lda_top, lda_rest; @@ -512,7 +512,7 @@ namespace TSQR { lda_rest = lda(); A_rest_ptr = A_top_ptr + nrows_top; } - ConstMatView A_top (nrows_top, ncols(), get(), lda_top); + ConstMatView A_top (nrows_top, ncols(), data(), lda_top); A_ = A_rest_ptr; nrows_ = nrows_rest; lda_ = lda_rest; @@ -533,7 +533,7 @@ namespace TSQR { throw std::invalid_argument ("nrows_bottom > nrows"); #endif // TSQR_MATVIEW_DEBUG - pointer_type const A_rest_ptr = get(); + pointer_type const A_rest_ptr = data(); pointer_type A_bottom_ptr; const ordinal_type nrows_rest = nrows() - nrows_bottom; ordinal_type lda_bottom, lda_rest; @@ -559,12 +559,12 @@ namespace TSQR { bool operator== (const ConstMatView& rhs) const { return nrows() == rhs.nrows() && ncols() == rhs.ncols() && - lda() == rhs.lda() && get() == rhs.get(); + lda() == rhs.lda() && data() == rhs.data(); } bool operator!= (const 
ConstMatView& rhs) const { return nrows() != rhs.nrows() || ncols() != rhs.ncols() || - lda() != rhs.lda() || get() != rhs.get(); + lda() != rhs.lda() || data() != rhs.data(); } diff --git a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp index a5c975f70f9c..af8adaadd38f 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp @@ -191,7 +191,7 @@ namespace TSQR { A_ (verified_alloc_size (in.nrows(), in.ncols())) { if (! in.empty()) - copy_matrix (nrows(), ncols(), get(), lda(), in.get(), in.lda()); + copy_matrix (nrows(), ncols(), data(), lda(), in.data(), in.lda()); } //! Default constructor (constructs an empty matrix). @@ -204,7 +204,7 @@ namespace TSQR { /// /// This constructor allocates a new matrix and copies the /// elements of the input view into the resulting new matrix. - /// MatrixViewType must have nrows(), ncols(), get(), and lda() + /// MatrixViewType must have nrows(), ncols(), data(), and lda() /// methods that match MatView's methods. template Matrix (const MatrixViewType& in) : @@ -213,14 +213,14 @@ namespace TSQR { A_ (verified_alloc_size (in.nrows(), in.ncols())) { if (A_.size() != 0) - copy_matrix (nrows(), ncols(), get(), lda(), in.get(), in.lda()); + copy_matrix (nrows(), ncols(), data(), lda(), in.data(), in.lda()); } //! Fill all entries of the matrix with the given value. void fill (const Scalar value) { - fill_matrix (nrows(), ncols(), get(), lda(), value); + fill_matrix (nrows(), ncols(), data(), lda(), value); } /// \brief Non-const reference to element (i,j) of the matrix. @@ -248,7 +248,7 @@ namespace TSQR { template bool operator== (const MatrixViewType& B) const { - if (get() != B.get() || nrows() != B.nrows() || ncols() != B.ncols() || lda() != B.lda()) { + if (data() != B.data() || nrows() != B.nrows() || ncols() != B.ncols() || lda() != B.lda()) { return false; } else { return true; @@ -269,7 +269,7 @@ namespace TSQR { //! 
A non-const pointer to the matrix data. Scalar* - get() + data() { if (A_.size() > 0) return &A_[0]; @@ -279,7 +279,7 @@ namespace TSQR { //! A const pointer to the matrix data. const Scalar* - get() const + data() const { if (A_.size() > 0) return &A_[0]; @@ -289,13 +289,13 @@ namespace TSQR { //! A non-const view of the matrix. mat_view_type view () { - return mat_view_type (nrows(), ncols(), get(), lda()); + return mat_view_type (nrows(), ncols(), data(), lda()); } //! A const view of the matrix. const_mat_view_type const_view () const { return const_mat_view_type (nrows(), ncols(), - const_cast (get()), lda()); + const_cast (data()), lda()); } /// Change the dimensions of the matrix. Reallocate if necessary. diff --git a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp index 873c5627e5b0..b895d9bcdafc 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp @@ -103,8 +103,8 @@ namespace TSQR { // Factor the (copy of the) matrix. On output, the explicit Q // factor (of A_local) is in Q_local and the R factor is in R. 
orthogonalizer.mgs (Q_local.nrows(), Q_local.ncols(), - Q_local.get(), Q_local.lda(), - R.get(), R.lda()); + Q_local.data(), Q_local.lda(), + R.data(), R.lda()); if (b_debug) { messenger->barrier(); if (messenger->rank() == 0) @@ -211,7 +211,7 @@ namespace TSQR { scalarComm->barrier (); if (my_rank == 0) { cerr << endl << "R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.get(), R.lda()); + print_local_matrix (cerr, ncols, ncols, R.data(), R.lda()); cerr << endl; } scalarComm->barrier (); @@ -219,8 +219,8 @@ namespace TSQR { // Test accuracy of the resulting factorization std::vector results = - global_verify (nrows_local, ncols, A_local.get(), A_local.lda(), - Q_local.get(), Q_local.lda(), R.get(), R.lda(), + global_verify (nrows_local, ncols, A_local.data(), A_local.lda(), + Q_local.data(), Q_local.lda(), R.data(), R.lda(), scalarComm.get()); if (b_debug) { scalarComm->barrier(); @@ -294,8 +294,8 @@ namespace TSQR { // about the fact that we're overwriting the input; this is a // benchmark, not a numerical verification test. (We have the // latter implemented as mgs_verify() in this file.) - orthogonalizer.mgs (nrows_local, ncols, Q_local.get(), - Q_local.lda(), R.get(), R.lda()); + orthogonalizer.mgs (nrows_local, ncols, Q_local.data(), + Q_local.lda(), R.data(), R.lda()); // Timings in debug mode likely won't make sense, because Proc // 0 is outputting the debug messages to cerr. Nevertheless, // we don't put any "if(b_debug)" calls in the timing loop. diff --git a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp index 21077b665726..bf1e66cb703d 100644 --- a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp @@ -324,7 +324,7 @@ namespace TSQR { /// \brief Return view of topmost cache block of C /// /// \param C [in] Matrix (view), supporting the usual nrows(), - /// ncols(), get(), lda() interface. + /// ncols(), data(), lda() interface. 
/// \param contiguousCacheBlocks [in] Whether the cache blocks /// in C are stored contiguously. /// @@ -335,7 +335,7 @@ namespace TSQR { /// follows: /// \code /// MatrixViewType top = this->top_block (C, contig); - /// mat_view_type square (ncols, ncols, top.get(), top.lda()); + /// mat_view_type square (ncols, ncols, top.data(), top.lda()); /// \endcode virtual const_mat_view_type const_top_block (const const_mat_view_type& C, @@ -355,12 +355,12 @@ namespace TSQR { /// Tsqr::apply() need, do the following: /// \code /// MatrixViewType top = this->top_block (C, contig); - /// mat_view_type square (ncols, ncols, top.get(), top.lda()); + /// mat_view_type square (ncols, ncols, top.data(), top.lda()); /// \endcode /// /// Models for MatrixViewType are MatView and ConstMatView. /// MatrixViewType must have member functions nrows(), ncols(), - /// get(), and lda(), and its constructor must take the same four + /// data(), and lda(), and its constructor must take the same four /// arguments as the constructor of ConstMatView. template MatrixViewType @@ -372,7 +372,7 @@ namespace TSQR { // method. The only cast from const to nonconst may be in the // return value, but there it's legitimate since we're just // using the same constness as C has. 
- const_mat_view_type C_view (C.nrows(), C.ncols(), C.get(), C.lda()); + const_mat_view_type C_view (C.nrows(), C.ncols(), C.data(), C.lda()); const_mat_view_type C_top = const_top_block (C_view, contiguous_cache_blocks); TEUCHOS_TEST_FOR_EXCEPTION(C_top.nrows() < C_top.ncols(), std::logic_error, @@ -383,7 +383,7 @@ namespace TSQR { "developers."); typedef typename MatrixViewType::pointer_type ptr_type; return MatrixViewType (C_top.nrows(), C_top.ncols(), - const_cast (C_top.get()), + const_cast (C_top.data()), C_top.lda()); } @@ -519,9 +519,9 @@ namespace TSQR { "developers."; Scalar svd_lwork_scalar {}; - lapack.GESVD ('A', 'A', ncols, ncols, B.get(), B.lda(), - singular_values.data(), U_view.get(), U_view.lda(), - VT.get(), VT.lda(), &svd_lwork_scalar, svd_lwork, + lapack.GESVD ('A', 'A', ncols, ncols, B.data(), B.lda(), + singular_values.data(), U_view.data(), U_view.lda(), + VT.data(), VT.lda(), &svd_lwork_scalar, svd_lwork, svd_rwork.data()); // LAPACK returns the workspace array length as a Scalar. We // have to convert it back to an Ordinal in order to allocate @@ -561,9 +561,9 @@ namespace TSQR { // Compute SVD $B := U \Sigma V^*$. B is overwritten, which is // why we copied R into B (so that we don't overwrite R if R is // full rank). - lapack.GESVD ('A', 'A', ncols, ncols, B.get(), B.lda(), - singular_values.data(), U_view.get(), U_view.lda(), - VT.get(), VT.lda(), svd_work.data(), svd_lwork, + lapack.GESVD ('A', 'A', ncols, ncols, B.data(), B.lda(), + singular_values.data(), U_view.data(), U_view.lda(), + VT.data(), VT.lda(), svd_work.data(), svd_lwork, svd_rwork.data()); // // Compute the numerical rank of B, using the given relative @@ -633,14 +633,14 @@ namespace TSQR { // Compute numerical rank of the R factor using the SVD. // Store the left singular vectors in U. const Ordinal rank = - reveal_R_rank (ncols, R, ldr, U.get(), U.ldu(), tol); + reveal_R_rank (ncols, R, ldr, U.data(), U.ldu(), tol); // If R is full rank, we're done. 
Otherwise, reveal_R_rank() // already computed the SVD \f$R = U \Sigma V^*\f$ of (the // input) R, and overwrote R with \f$\Sigma V^*\f$. Now, we // compute \f$Q := Q \cdot U\f$, respecting cache blocks of Q. if (rank < ncols) { - Q_times_B (nrows, ncols, Q, ldq, U.get(), U.lda(), + Q_times_B (nrows, ncols, Q, ldq, U.data(), U.lda(), contiguousCacheBlocks); } return rank; diff --git a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp b/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp index ddfa3e4c39e0..153e658fb690 100644 --- a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp @@ -235,24 +235,24 @@ namespace TSQR { err_ << "-- Finished DistTsqr::factor" << endl; } // Compute the explicit Q factor - par.explicit_Q (numCols, Q_local.get(), Q_local.lda(), factorOutput); - if (debug_) - { - scalarComm_->barrier(); - if (myRank == 0) - err_ << "-- Finished DistTsqr::explicit_Q" << endl; + par.explicit_Q (numCols, Q_local.data(), Q_local.lda(), factorOutput); + if (debug_) { + scalarComm_->barrier(); + if (myRank == 0) { + err_ << "-- Finished DistTsqr::explicit_Q" << endl; } + } // Verify the factorization result_type result = - global_verify (numCols, numCols, A_local.get(), A_local.lda(), - Q_local.get(), Q_local.lda(), R.get(), R.lda(), + global_verify (numCols, numCols, A_local.data(), A_local.lda(), + Q_local.data(), Q_local.lda(), R.data(), R.lda(), scalarComm_.get()); - if (debug_) - { - scalarComm_->barrier(); - if (myRank == 0) - err_ << "-- Finished global_verify" << endl; + if (debug_) { + scalarComm_->barrier(); + if (myRank == 0) { + err_ << "-- Finished global_verify" << endl; } + } reportResults ("DistTsqr", numCols, result, additionalFieldNames, additionalData, printFieldNames && (! printedFieldNames)); @@ -261,17 +261,16 @@ namespace TSQR { } // Test DistTsqr::factorExplicit() - if (testFactorExplicit_) - { - // Factor the matrix and compute the explicit Q factor, both - // in a single operation. 
- par.factorExplicit (R.view(), Q_local.view()); - if (debug_) - { - scalarComm_->barrier(); - if (myRank == 0) - err_ << "-- Finished DistTsqr::factorExplicit" << endl; - } + if (testFactorExplicit_) { + // Factor the matrix and compute the explicit Q factor, both + // in a single operation. + par.factorExplicit (R.view(), Q_local.view()); + if (debug_) { + scalarComm_->barrier(); + if (myRank == 0) { + err_ << "-- Finished DistTsqr::factorExplicit" << endl; + } + } if (printMatrices_) { @@ -281,15 +280,15 @@ namespace TSQR { if (myRank == 0) { err_ << std::endl << "Computed R factor:" << std::endl; - print_local_matrix (err_, R.nrows(), R.ncols(), R.get(), R.lda()); + print_local_matrix (err_, R.nrows(), R.ncols(), R.data(), R.lda()); err_ << std::endl; } } // Verify the factorization result_type result = - global_verify (numCols, numCols, A_local.get(), A_local.lda(), - Q_local.get(), Q_local.lda(), R.get(), R.lda(), + global_verify (numCols, numCols, A_local.data(), A_local.lda(), + Q_local.data(), Q_local.lda(), R.data(), R.lda(), scalarComm_.get()); if (debug_) { @@ -386,34 +385,33 @@ namespace TSQR { // This modifies A_local on all procs, and A_global on Proc 0. par_tsqr_test_problem (gen_, A_local, A_global, numCols, scalarComm_); - if (printMatrices_) - { - const int myRank = scalarComm_->rank(); - - if (myRank == 0) - err_ << "Input matrix A:" << std::endl; - printGlobalMatrix (err_, A_local, scalarComm_.get(), ordinalComm_.get()); - if (myRank == 0) - err_ << std::endl; + if (printMatrices_) { + const int myRank = scalarComm_->rank(); + if (myRank == 0) { + err_ << "Input matrix A:" << std::endl; + } + printGlobalMatrix (err_, A_local, scalarComm_.get(), ordinalComm_.get()); + if (myRank == 0) { + err_ << std::endl; } + } // Copy the test problem input into R, since the factorization // will overwrite it in place with the final R factor. 
R.reshape (numCols, numCols); - R.fill (Scalar (0)); + R.fill (Scalar {}); deep_copy (R, A_local); // Prepare space in which to construct the explicit Q factor // (local component on this processor) Q_local.reshape (numRowsLocal, numCols); - Q_local.fill (Scalar(0)); + Q_local.fill (Scalar {}); } }; /// \class DistTsqrBenchmarker /// \brief Generic version of \c DistTsqr performance test. - /// template< class Ordinal, class Scalar, class TimerType > class DistTsqrBenchmarker { TSQR::Random::NormalGenerator< Ordinal, Scalar > gen_; @@ -576,7 +574,7 @@ namespace TSQR { // overwritten on output) factor_output_type factorOutput = par.factor (R.view()); // Compute the explicit Q factor - par.explicit_Q (numCols, Q_local.get(), Q_local.lda(), factorOutput); + par.explicit_Q (numCols, Q_local.data(), Q_local.lda(), factorOutput); } // Now do the actual timing runs. Benchmark DistTsqr @@ -589,7 +587,7 @@ namespace TSQR { // overwritten on output) factor_output_type factorOutput = par.factor (R.view()); // Compute the explicit Q factor - par.explicit_Q (numCols, Q_local.get(), Q_local.lda(), factorOutput); + par.explicit_Q (numCols, Q_local.data(), Q_local.lda(), factorOutput); } // Cumulative timing on this MPI process. // "Cumulative" means the elapsed time of numTrials executions. diff --git a/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp b/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp index a94f2a248ade..6126dfad53b3 100644 --- a/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp @@ -194,14 +194,14 @@ namespace TSQR { const ordinal_type ncols = R_stack.ncols(); // Copy data from top ncols x ncols block of R_stack into R_local. - const_view_type R_stack_view_first (ncols, ncols, R_stack.get(), R_stack.lda()); + const_view_type R_stack_view_first (ncols, ncols, R_stack.data(), R_stack.lda()); deep_copy (R_local, R_stack_view_first); // Loop through all other processors, sending each the next // ncols x ncols block of R_stack. 
RMessenger< ordinal_type, scalar_type > sender (messenger); for (int destProc = 1; destProc < nprocs; ++destProc) { - const scalar_type* const R_ptr = R_stack.get() + destProc*ncols; + const scalar_type* const R_ptr = R_stack.data() + destProc*ncols; const_view_type R_stack_view_cur (ncols, ncols, R_ptr, R_stack.lda()); sender.send (R_stack_view_cur, destProc); } @@ -234,13 +234,13 @@ namespace TSQR { const ordinal_type ncols = R_stack.ncols(); // Copy data from R_local into top ncols x ncols block of R_stack. - mat_view_type R_stack_view_first (ncols, ncols, R_stack.get(), R_stack.lda()); + mat_view_type R_stack_view_first (ncols, ncols, R_stack.data(), R_stack.lda()); deep_copy (R_stack_view_first, R_local); // Loop through all other processors, fetching their matrix data. RMessenger< ordinal_type, scalar_type > receiver (messenger); for (int srcProc = 1; srcProc < nprocs; ++srcProc) { - const scalar_type* const R_ptr = R_stack.get() + srcProc*ncols; + const scalar_type* const R_ptr = R_stack.data() + srcProc*ncols; mat_view_type R_stack_view_cur (ncols, ncols, R_ptr, R_stack.lda()); // Fill (the lower triangle) with zeros, since // RMessenger::recv() only writes to the upper triangle. diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp index 44e6a243954d..38cc74e3ce47 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp @@ -67,7 +67,7 @@ namespace TSQR { if (nrows == lda) { // A is stored contiguously. const ordinal_type nelts = nrows * ncols; - scalar_type* const A_ptr = A.get (); + scalar_type* const A_ptr = A.data (); for (ordinal_type k = 0; k < nelts; ++k) { A_ptr[k] /= denom; } @@ -120,15 +120,15 @@ namespace TSQR { // Generate a random ncols by ncols upper triangular matrix R // with the given singular values. 
Matrix R (ncols, ncols, scalar_type {}); - matGen.fill_random_R (ncols, R.get(), R.lda(), singular_values); + matGen.fill_random_R (ncols, R.data(), R.lda(), singular_values); // Broadcast R to all the processors. - scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc); + scalarMessenger->broadcast (R.data(), ncols*ncols, rootProc); // Generate (for myself) a random nrowsLocal x ncols // orthogonal matrix, stored in explicit form. Matrix Q_local (nrowsLocal, ncols); - matGen.explicit_Q (nrowsLocal, ncols, Q_local.get(), Q_local.lda()); + matGen.explicit_Q (nrowsLocal, ncols, Q_local.data(), Q_local.lda()); // Scale the (local) orthogonal matrix by the number of // processors P, to make the columns of the global matrix Q @@ -147,9 +147,9 @@ namespace TSQR { // A_local := Q_local * R blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols, - scalar_type(1), Q_local.get(), Q_local.lda(), - R.get(), R.lda(), - scalar_type(0), A_local.get(), A_local.lda()); + scalar_type(1), Q_local.data(), Q_local.lda(), + R.data(), R.lda(), + scalar_type(0), A_local.data(), A_local.lda()); for (int recvProc = 1; recvProc < nprocs; ++recvProc) { // Ask the receiving processor how big (i.e., how many rows) @@ -163,17 +163,17 @@ namespace TSQR { // Compute a random nrowsRemote * ncols orthogonal // matrix Q_local, for the current receiving processor. - matGen.explicit_Q (nrowsRemote, ncols, Q_local.get(), Q_local.lda()); + matGen.explicit_Q (nrowsRemote, ncols, Q_local.data(), Q_local.lda()); // Send Q_local to the current receiving processor. - scalarMessenger->send (Q_local.get(), nrowsRemote*ncols, recvProc, 0); + scalarMessenger->send (Q_local.data(), nrowsRemote*ncols, recvProc, 0); } } else { // Receive the R factor from Proc 0. There's only 1 R // factor for all the processes. 
Matrix R (ncols, ncols, scalar_type {}); - scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc); + scalarMessenger->broadcast (R.data(), ncols*ncols, rootProc); // Q_local (nrows_local by ncols, random orthogonal matrix) // will be received from Proc 0, where it was generated. @@ -185,7 +185,7 @@ namespace TSQR { ordinalMessenger->send (&nrowsLocal, 1, rootProc, 0); // Receive the orthogonal matrix from Proc 0. - scalarMessenger->recv (Q_local.get(), recvSize, rootProc, 0); + scalarMessenger->recv (Q_local.data(), recvSize, rootProc, 0); // Scale the (local) orthogonal matrix by the number of // processors, to make the global matrix Q orthogonal. @@ -201,9 +201,9 @@ namespace TSQR { // A_local := Q_local * R blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols, - scalar_type(1), Q_local.get(), Q_local.lda(), - R.get(), R.lda(), - scalar_type(0), A_local.get(), A_local.lda()); + scalar_type(1), Q_local.data(), Q_local.lda(), + R.data(), R.lda(), + scalar_type(0), A_local.data(), A_local.lda()); } } } // namespace Random diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp index edd294b4e85d..f43eb7d3d579 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp @@ -160,7 +160,7 @@ namespace TSQR { implicit_Q (MatrixViewType& Q, typename MatrixViewType::scalar_type tau[]) { - implicit_Q (Q.nrows(), Q.ncols(), Q.get(), Q.lda(), tau); + implicit_Q (Q.nrows(), Q.ncols(), Q.data(), Q.lda(), tau); } void @@ -195,16 +195,16 @@ namespace TSQR { Scalar _lwork1, _lwork2; Impl::Lapack lapack; lapack.apply_Q_factor ('L', 'N', nrows, ncols, ncols, - U.get(), U.lda(), tau_U.data(), + U.data(), U.lda(), tau_U.data(), A, lda, &_lwork1, -1); if (STS::isComplex) { lapack.apply_Q_factor ('R', 'C', nrows, ncols, ncols, - V.get(), V.lda(), tau_V.data(), + V.data(), V.lda(), tau_V.data(), A, lda, &_lwork2, -1); } else { 
lapack.apply_Q_factor ('R', 'T', nrows, ncols, ncols, - V.get(), V.lda(), tau_V.data(), + V.data(), V.lda(), tau_V.data(), A, lda, &_lwork2, -1); } @@ -215,16 +215,16 @@ namespace TSQR { // Apply U to the left side of A, and V^H to the right side of A. lapack.apply_Q_factor ('L', 'N', nrows, ncols, ncols, - U.get(), U.lda(), tau_U.data(), + U.data(), U.lda(), tau_U.data(), A, lda, work.data(), lwork); if (STS::isComplex) { lapack.apply_Q_factor ('R', 'C', nrows, ncols, ncols, - V.get(), V.lda(), tau_V.data(), + V.data(), V.lda(), tau_V.data(), A, lda, work.data(), lwork); } else { lapack.apply_Q_factor ('R', 'T', nrows, ncols, ncols, - V.get(), V.lda(), tau_V.data(), + V.data(), V.lda(), tau_V.data(), A, lda, work.data(), lwork); } } diff --git a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp b/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp index f81a75573e9e..15b9c2701e16 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp @@ -167,7 +167,7 @@ namespace TSQR { const Ordinal ldr = ncols; // Create a test problem - nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), true); + nodeTestProblem (generator, nrows, ncols, A.data(), A.lda(), true); if (save_matrices) { string filename = "A_" + shortDatatype + ".txt"; @@ -175,7 +175,7 @@ namespace TSQR { cerr << "-- Saving test problem to \"" << filename << "\"" << endl; } std::ofstream fileOut (filename.c_str()); - print_local_matrix (fileOut, nrows, ncols, A.get(), A.lda()); + print_local_matrix (fileOut, nrows, ncols, A.data(), A.lda()); fileOut.close(); } @@ -193,7 +193,7 @@ namespace TSQR { } } else { - actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda()); + actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.lda()); if (b_debug) { cerr << "-- Reorganized test matrix to have contiguous " "cache blocks" << endl; @@ -205,8 +205,8 @@ namespace TSQR { if (std::numeric_limits::has_quiet_NaN) { A2.fill (std::numeric_limits::quiet_NaN ()); } - 
actor.un_cache_block (nrows, ncols, A2.get (), A2.lda (), - A_copy.get ()); + actor.un_cache_block (nrows, ncols, A2.data (), A2.lda (), + A_copy.data ()); if (matrix_equal (A, A2)) { if (b_debug) { cerr << "-- Cache blocking test succeeded!" << endl; @@ -225,7 +225,7 @@ namespace TSQR { // Count the number of cache blocks that factor() will use. // This is only for diagnostic purposes. numCacheBlocks = - actor.factor_num_cache_blocks (nrows, ncols, A_copy.get(), + actor.factor_num_cache_blocks (nrows, ncols, A_copy.data(), A_copy.lda(), contiguous_cache_blocks); // In debug mode, report how many cache blocks factor() will use. if (b_debug) { @@ -237,8 +237,8 @@ namespace TSQR { typedef typename SequentialTsqr::FactorOutput factor_output_type; factor_output_type factorOutput = - actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), - R.get(), R.lda(), contiguous_cache_blocks); + actor.factor (nrows, ncols, A_copy.data(), A_copy.lda(), + R.data(), R.lda(), contiguous_cache_blocks); if (b_debug) { cerr << "-- Finished SequentialTsqr::factor" << endl; } @@ -248,12 +248,12 @@ namespace TSQR { cerr << "-- Saving R factor to \"" << filename << "\"" << endl; } std::ofstream fileOut (filename.c_str ()); - print_local_matrix (fileOut, ncols, ncols, R.get (), R.lda ()); + print_local_matrix (fileOut, ncols, ncols, R.data (), R.lda ()); fileOut.close (); } - actor.explicit_Q (nrows, ncols, A_copy.get(), lda, factorOutput, - ncols, Q.get(), Q.lda(), contiguous_cache_blocks); + actor.explicit_Q (nrows, ncols, A_copy.data(), lda, factorOutput, + ncols, Q.data(), Q.lda(), contiguous_cache_blocks); if (b_debug) { cerr << "-- Finished SequentialTsqr::explicit_Q" << endl; } @@ -263,7 +263,7 @@ namespace TSQR { // currently support contiguous cache blocks. if (contiguous_cache_blocks) { // Use A_copy as temporary storage for un-cache-blocking Q. 
- actor.un_cache_block (nrows, ncols, A_copy.get(), A_copy.lda(), Q.get()); + actor.un_cache_block (nrows, ncols, A_copy.data(), A_copy.lda(), Q.data()); deep_copy (Q, A_copy); if (b_debug) { cerr << "-- Un-cache-blocked output Q factor" << endl; @@ -276,20 +276,20 @@ namespace TSQR { cerr << "-- Saving Q factor to \"" << filename << "\"" << endl; } std::ofstream fileOut (filename.c_str()); - print_local_matrix (fileOut, nrows, ncols, Q.get(), Q.lda()); + print_local_matrix (fileOut, nrows, ncols, Q.data(), Q.lda()); fileOut.close(); } // Print out the R factor if (false && b_debug) { cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.get(), R.lda()); + print_local_matrix (cerr, ncols, ncols, R.data(), R.lda()); cerr << endl; } // Validate the factorization vector< magnitude_type > results = - local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, R.get(), ldr); + local_verify (nrows, ncols, A.data(), lda, Q.data(), ldq, R.data(), ldr); if (b_debug) { cerr << "-- Finished local_verify" << endl; } @@ -466,7 +466,7 @@ namespace TSQR { const Ordinal ldr = ncols; // Create a test problem - nodeTestProblem (generator, nrows, ncols, A.get (), A.lda (), true); + nodeTestProblem (generator, nrows, ncols, A.data (), A.lda (), true); if (b_debug) { cerr << "-- Generated test problem" << endl; @@ -488,15 +488,15 @@ namespace TSQR { // the strict lower triangle of R. R.fill (Scalar {}); - lapack.compute_QR (nrows, ncols, A_copy.get(), A_copy.lda(), + lapack.compute_QR (nrows, ncols, A_copy.data(), A_copy.lda(), tau.data(), work.data(), lwork); // Copy out the R factor from A_copy (where we computed the QR // factorization in place) into R. 
- copy_upper_triangle (ncols, ncols, R.get(), ldr, A_copy.get(), lda); + copy_upper_triangle (ncols, ncols, R.data(), ldr, A_copy.data(), lda); if (b_debug) { cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.get(), R.lda()); + print_local_matrix (cerr, ncols, ncols, R.data(), R.lda()); cerr << endl; } @@ -504,13 +504,13 @@ namespace TSQR { // result of the factorization into Q. deep_copy (Q, A_copy); - lapack.compute_explicit_Q (nrows, ncols, ncols, Q.get(), ldq, + lapack.compute_explicit_Q (nrows, ncols, ncols, Q.data(), ldq, tau.data(), work.data(), lwork); // Validate the factorization std::vector results = - local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, - R.get(), ldr); + local_verify (nrows, ncols, A.data(), lda, Q.data(), ldq, + R.data(), ldr); // Print the results if (human_readable) { @@ -671,7 +671,7 @@ namespace TSQR { const Ordinal ldr = numCols; // Create a test problem - nodeTestProblem (gen_, numRows, numCols, A.get(), lda, false); + nodeTestProblem (gen_, numRows, numCols, A.data(), lda, false); // Copy A into Q, since LAPACK QR overwrites the input. We only // need Q because LAPACK's computation of the explicit Q factor @@ -692,15 +692,15 @@ namespace TSQR { timer.start(); for (int trialNum = 0; trialNum < numTrials; ++trialNum) { lapack_.compute_QR (numRows, numCols, - Q.get(), ldq, tau.data(), + Q.data(), ldq, tau.data(), work.data(), lwork); // Extract the upper triangular factor R from Q (where it // was computed in place by GEQRF), since UNGQR will // overwrite all of Q with the explicit Q factor. 
- copy_upper_triangle (numRows, numCols, R.get(), ldr, - Q.get(), ldq); + copy_upper_triangle (numRows, numCols, R.data(), ldr, + Q.data(), ldq); lapack_.compute_explicit_Q (numRows, numCols, numCols, - Q.get(), ldq, tau.data(), + Q.data(), ldq, tau.data(), work.data(), lwork); } const double lapackTiming = timer.stop(); @@ -920,7 +920,7 @@ namespace TSQR { const Ordinal ldq = numRows; // Create a test problem - nodeTestProblem (gen_, numRows, numCols, A.get(), lda, false); + nodeTestProblem (gen_, numRows, numCols, A.data(), lda, false); // Copy A into A_copy, since TSQR overwrites the input deep_copy (A_copy, A); @@ -935,14 +935,14 @@ namespace TSQR { typedef typename SequentialTsqr::FactorOutput factor_output_type; factor_output_type factorOutput = - actor.factor (numRows, numCols, A_copy.get(), lda, - R.get(), R.lda(), contiguousCacheBlocks); + actor.factor (numRows, numCols, A_copy.data(), lda, + R.data(), R.lda(), contiguousCacheBlocks); // Compute the explicit Q factor. Unlike with LAPACK QR, // this doesn't happen in place: the implicit Q factor is // stored in A_copy, and the explicit Q factor is written to // Q. - actor.explicit_Q (numRows, numCols, A_copy.get(), lda, factorOutput, - numCols, Q.get(), ldq, contiguousCacheBlocks); + actor.explicit_Q (numRows, numCols, A_copy.data(), lda, factorOutput, + numCols, Q.data(), ldq, contiguousCacheBlocks); } const double seqTsqrTiming = timer.stop(); reportResults (numTrials, numRows, numCols, actor.cache_size_hint(), diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp index fea4c49094c4..732bc4fbde93 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp @@ -144,8 +144,8 @@ namespace TSQR { // // FIXME (mfh 08 Oct 2014) Shouldn't this be CONJ_TRANS? 
blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, A_cur.nrows (), - Scalar (1), A_cur.get (), A_cur.lda (), A_cur.get (), - A_cur.lda (), Scalar (0), ATA.get (), ATA.lda ()); + Scalar (1), A_cur.data (), A_cur.lda (), A_cur.data (), + A_cur.lda (), Scalar (0), ATA.data (), ATA.lda ()); // Process the remaining cache blocks in order. while (! A_rest.empty ()) { A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); @@ -153,8 +153,8 @@ namespace TSQR { // // FIXME (mfh 08 Oct 2014) Shouldn't this be CONJ_TRANS? blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, A_cur.nrows (), - Scalar (1), A_cur.get (), A_cur.lda (), A_cur.get (), - A_cur.lda (), Scalar (1), ATA.get (), ATA.lda ()); + Scalar (1), A_cur.data (), A_cur.lda (), A_cur.data (), + A_cur.lda (), Scalar (1), ATA.data (), ATA.lda ()); } } else { @@ -163,12 +163,12 @@ namespace TSQR { // FIXME (mfh 08 Oct 2014) Shouldn't this be CONJ_TRANS? blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, nrows, Scalar (1), A, lda, A, lda, - Scalar (0), ATA.get (), ATA.lda ()); + Scalar (0), ATA.data (), ATA.lda ()); } // Compute the Cholesky factorization of ATA in place, so that // A^T * A = R^T * R, where R is ncols x ncols upper triangular. - lapack.POTRF ('U', ncols, ATA.get(), ATA.lda()); + lapack.POTRF ('U', ncols, ATA.data(), ATA.lda()); // FIXME (mfh 22 June 2010, mfh 21 Nov 2019) The right thing to // do on failure of above would be to resort to a rank-revealing // factorization, as Stathopoulos and Wu (2002) do with their @@ -176,7 +176,7 @@ namespace TSQR { // Copy out the R factor fill_matrix (ncols, ncols, R, ldr, Scalar {}); - copy_upper_triangle (ncols, ncols, R, ldr, ATA.get(), ATA.lda()); + copy_upper_triangle (ncols, ncols, R, ldr, ATA.data(), ATA.lda()); // Compute A := A * R^{-1}. We do this in place in A, using // BLAS' TRSM with the R factor (form POTRF) stored in the upper @@ -194,15 +194,15 @@ namespace TSQR { // Compute A_cur / R (Matlab notation for A_cur * R^{-1}) in place. 
blas.TRSM (RIGHT_SIDE, UPPER_TRI, NO_TRANS, NON_UNIT_DIAG, - A_cur.nrows (), ncols, Scalar (1), ATA.get (), ATA.lda (), - A_cur.get (), A_cur.lda ()); + A_cur.nrows (), ncols, Scalar (1), ATA.data (), ATA.lda (), + A_cur.data (), A_cur.lda ()); // Process the remaining cache blocks in order. while (! A_rest.empty ()) { A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); blas.TRSM (RIGHT_SIDE, UPPER_TRI, NO_TRANS, NON_UNIT_DIAG, - A_cur.nrows (), ncols, Scalar (1), ATA.get (), ATA.lda (), - A_cur.get (), A_cur.lda ()); + A_cur.nrows (), ncols, Scalar (1), ATA.data (), ATA.lda (), + A_cur.data (), A_cur.lda ()); } } @@ -293,7 +293,7 @@ namespace TSQR { /// \note The returned view is not necessarily square, though it /// must have at least as many rows as columns. For a square /// ncols by ncols block, as needed in TSQR::Tsqr::apply(), if - /// the output is ret, do mat_view_type(ncols, ncols, ret.get(), + /// the output is ret, do mat_view_type(ncols, ncols, ret.data(), /// ret.lda()) to get an ncols by ncols block. 
template< class MatrixViewType > MatrixViewType diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp index eefa5cb1b0cd..b2a8dcd1b673 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp @@ -159,9 +159,9 @@ namespace TSQR { std::vector& work) const { const LocalOrdinal ncols = A_top.ncols(); - combine.factor_first (A_top.nrows(), ncols, A_top.get(), A_top.lda(), + combine.factor_first (A_top.nrows(), ncols, A_top.data(), A_top.lda(), tau.data(), work.data()); - return mat_view_type(ncols, ncols, A_top.get(), A_top.lda()); + return mat_view_type(ncols, ncols, A_top.data(), A_top.lda()); } /// Apply the Q factor of the first (topmost) cache blocks, as @@ -178,8 +178,8 @@ namespace TSQR { { const LocalOrdinal nrowsLocal = Q_first.nrows(); combine.apply_first (applyType, nrowsLocal, C_first.ncols(), - Q_first.ncols(), Q_first.get(), Q_first.lda(), - tau.data(), C_first.get(), C_first.lda(), work.data()); + Q_first.ncols(), Q_first.data(), Q_first.lda(), + tau.data(), C_first.data(), C_first.lda(), work.data()); } void @@ -197,9 +197,9 @@ namespace TSQR { combine.apply_inner (apply_type, nrows_local, ncols_C, ncols_Q, - Q_cur.get(), C_cur.lda(), tau.data(), - C_top.get(), C_top.lda(), - C_cur.get(), C_cur.lda(), work.data()); + Q_cur.data(), C_cur.lda(), tau.data(), + C_top.data(), C_top.lda(), + C_cur.data(), C_cur.lda(), work.data()); } void @@ -212,8 +212,8 @@ namespace TSQR { const LocalOrdinal nrows_local = A_cur.nrows(); const LocalOrdinal ncols = A_cur.ncols(); - combine.factor_inner (nrows_local, ncols, R.get(), R.lda(), - A_cur.get(), A_cur.lda(), tau.data(), + combine.factor_inner (nrows_local, ncols, R.data(), R.lda(), + A_cur.data(), A_cur.lda(), tau.data(), work.data()); } @@ -487,7 +487,7 @@ namespace TSQR { fill_matrix (ncols, ncols, R, ldr, Teuchos::ScalarTraits::zero()); // Copy out the upper triangle of the R factor from A into R. 
- copy_upper_triangle (ncols, ncols, R, ldr, A_top.get(), A_top.lda()); + copy_upper_triangle (ncols, ncols, R, ldr, A_top.data(), A_top.lda()); } /// \brief Compute the QR factorization of the matrix A. @@ -541,7 +541,7 @@ namespace TSQR { // R_view (a view of the topmost cache block of A) into the R // output argument. fill_matrix (ncols, ncols, R, ldr, Scalar(0)); - copy_upper_triangle (ncols, ncols, R, ldr, R_view.get(), R_view.lda()); + copy_upper_triangle (ncols, ncols, R, ldr, R_view.data(), R_view.lda()); return tau_arrays; } @@ -762,8 +762,8 @@ namespace TSQR { deep_copy (Q_cur_copy, Q_cur); // Q_cur := Q_cur_copy * B. blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.nrows (), ncols, ncols, - Scalar (1.0), Q_cur_copy.get (), Q_cur_copy.lda (), - B, ldb, Scalar (0.0), Q_cur.get (), Q_cur.lda ()); + Scalar (1.0), Q_cur_copy.data (), Q_cur_copy.lda (), + B, ldb, Scalar (0.0), Q_cur.data (), Q_cur.lda ()); } } diff --git a/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp index a34e69f494a3..abf4475a21f3 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp @@ -121,7 +121,7 @@ namespace TSQR { const Ordinal ldr = ncols; // Create a test problem - nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), true); + nodeTestProblem (generator, nrows, ncols, A.data(), A.lda(), true); if (b_debug) { cerr << "-- Generated test problem" << endl; @@ -137,7 +137,7 @@ namespace TSQR { } } else { - actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda()); + actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.lda()); if (b_debug) { cerr << "-- Reorganized test matrix to have contiguous " "cache blocks" << endl; @@ -148,7 +148,7 @@ namespace TSQR { if (std::numeric_limits< Scalar >::has_quiet_NaN) { A2.fill (std::numeric_limits< Scalar >::quiet_NaN()); } - actor.un_cache_block (nrows, ncols, A2.get(), A2.lda(), A_copy.get()); + actor.un_cache_block (nrows, ncols, A2.data(), A2.lda(), 
A_copy.data()); if (matrix_equal (A, A2)) { if (b_debug) { cerr << "-- Cache blocking test succeeded!" << endl; @@ -166,13 +166,13 @@ namespace TSQR { // Factor the matrix and compute the explicit Q factor factor_output_type factor_output = - actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), R.get(), + actor.factor (nrows, ncols, A_copy.data(), A_copy.lda(), R.data(), R.lda(), contiguous_cache_blocks); if (b_debug) { cerr << "-- Finished TbbTsqr::factor" << endl; } - actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), factor_output, - ncols, Q.get(), Q.lda(), contiguous_cache_blocks); + actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.lda(), factor_output, + ncols, Q.data(), Q.lda(), contiguous_cache_blocks); if (b_debug) { cerr << "-- Finished TbbTsqr::explicit_Q" << endl; } @@ -183,7 +183,7 @@ namespace TSQR { // cache blocks. if (contiguous_cache_blocks) { // Use A_copy as temporary storage for un-cache-blocking Q. - actor.un_cache_block (nrows, ncols, A_copy.get(), A_copy.lda(), Q.get()); + actor.un_cache_block (nrows, ncols, A_copy.data(), A_copy.lda(), Q.data()); deep_copy (Q, A_copy); if (b_debug) { cerr << "-- Un-cache-blocked output Q factor" << endl; @@ -193,13 +193,13 @@ namespace TSQR { // Print out the R factor if (b_debug) { cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.get(), R.lda()); + print_local_matrix (cerr, ncols, ncols, R.data(), R.lda()); cerr << endl; } // Validate the factorization std::vector< magnitude_type > results = - local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, R.get(), ldr); + local_verify (nrows, ncols, A.data(), lda, Q.data(), ldq, R.data(), ldr); if (b_debug) { cerr << "-- Finished local_verify" << endl; } @@ -298,13 +298,13 @@ namespace TSQR { R.fill (scalar_type(0)); // Create a test problem - nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), false); + nodeTestProblem (generator, nrows, ncols, A.data(), A.lda(), false); // Copy A into A_copy, since 
TSQR overwrites the input. If // specified, rearrange the data in A_copy so that the data in // each cache block is contiguously stored. if (contiguous_cache_blocks) { - actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda()); + actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.lda()); } else { deep_copy (A_copy, A); @@ -318,14 +318,14 @@ namespace TSQR { // resulting R factor into R. typedef typename node_tsqr_type::FactorOutput factor_output_type; factor_output_type factor_output = - actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), - R.get(), R.lda(), contiguous_cache_blocks); + actor.factor (nrows, ncols, A_copy.data(), A_copy.lda(), + R.data(), R.lda(), contiguous_cache_blocks); // Compute the explicit Q factor (which was stored // implicitly in A_copy and factor_output) and store in Q. // We don't need to un-cache-block the output, because we // aren't verifying it here. - actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), - factor_output, ncols, Q.get(), Q.lda(), + actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.lda(), + factor_output, ncols, Q.data(), Q.lda(), contiguous_cache_blocks); } @@ -339,14 +339,14 @@ namespace TSQR { // resulting R factor into R. typedef typename node_tsqr_type::FactorOutput factor_output_type; factor_output_type factor_output = - actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), - R.get(), R.lda(), contiguous_cache_blocks); + actor.factor (nrows, ncols, A_copy.data(), A_copy.lda(), + R.data(), R.lda(), contiguous_cache_blocks); // Compute the explicit Q factor (which was stored // implicitly in A_copy and factor_output) and store in Q. // We don't need to un-cache-block the output, because we // aren't verifying it here. 
- actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), - factor_output, ncols, Q.get(), Q.lda(), + actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.lda(), + factor_output, ncols, Q.data(), Q.lda(), contiguous_cache_blocks); } const double tbb_tsqr_timing = timer.stop(); diff --git a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp index 201d8a0db6c3..8a39d66e64f5 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp @@ -85,8 +85,8 @@ namespace TSQR { // If specified, rearrange cache blocks in the copy. if (contiguousCacheBlocks) { - tsqr.cache_block (nrows_local, ncols, A_copy.get(), - A_local.get(), A_local.lda()); + tsqr.cache_block (nrows_local, ncols, A_copy.data(), + A_local.data(), A_local.lda()); if (b_debug) { scalarComm->barrier (); if (scalarComm->rank () == 0) @@ -110,8 +110,8 @@ namespace TSQR { else { // Factor the (copy of the) matrix. factor_output_type factorOutput = - tsqr.factor (nrows_local, ncols, A_copy.get(), A_copy.lda(), - R.get(), R.lda(), contiguousCacheBlocks); + tsqr.factor (nrows_local, ncols, A_copy.data(), A_copy.lda(), + R.data(), R.lda(), contiguousCacheBlocks); if (b_debug) { scalarComm->barrier (); if (scalarComm->rank () == 0) @@ -120,8 +120,8 @@ namespace TSQR { // Compute the explicit Q factor in Q_local tsqr.explicit_Q (nrows_local, - ncols, A_copy.get(), A_copy.lda(), factorOutput, - ncols, Q_local.get(), Q_local.lda(), + ncols, A_copy.data(), A_copy.lda(), factorOutput, + ncols, Q_local.data(), Q_local.lda(), contiguousCacheBlocks); if (b_debug) { scalarComm->barrier (); @@ -136,8 +136,8 @@ namespace TSQR { if (contiguousCacheBlocks) { // We can use A_copy as scratch space for un-cache-blocking // Q_local, since we're done using A_copy for other things. 
- tsqr.un_cache_block (nrows_local, ncols, A_copy.get(), - A_copy.lda(), Q_local.get()); + tsqr.un_cache_block (nrows_local, ncols, A_copy.data(), + A_copy.lda(), Q_local.data()); // Overwrite Q_local with the un-cache-blocked Q factor. deep_copy (Q_local, A_copy); @@ -321,7 +321,7 @@ namespace TSQR { scalarComm->barrier (); if (my_rank == 0) { cerr << endl << "R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.get(), R.lda()); + print_local_matrix (cerr, ncols, ncols, R.data(), R.lda()); cerr << endl; } scalarComm->barrier (); @@ -329,8 +329,8 @@ namespace TSQR { // Test accuracy of the resulting factorization std::vector< magnitude_type > results = - global_verify (nrows_local, ncols, A_local.get(), A_local.lda(), - Q_local.get(), Q_local.lda(), R.get(), R.lda(), + global_verify (nrows_local, ncols, A_local.data(), A_local.lda(), + Q_local.data(), Q_local.lda(), R.data(), R.lda(), scalarComm.get()); if (b_debug) { scalarComm->barrier (); @@ -442,8 +442,8 @@ namespace TSQR { const ordinal_type ncols = A_local.ncols(); if (contiguousCacheBlocks) { - tsqr.cache_block (nrows_local, ncols, A_copy.get(), - A_local.get(), A_local.lda()); + tsqr.cache_block (nrows_local, ncols, A_copy.data(), + A_local.data(), A_local.lda()); if (b_debug) { messenger->barrier (); if (messenger->rank () == 0) { @@ -496,11 +496,11 @@ namespace TSQR { // un-cache-blocking the output (when cache blocks are // stored contiguously). 
factor_output_type factor_output = - tsqr.factor (nrows_local, ncols, A_copy.get(), A_copy.lda(), - R.get(), R.lda(), contiguousCacheBlocks); + tsqr.factor (nrows_local, ncols, A_copy.data(), A_copy.lda(), + R.data(), R.lda(), contiguousCacheBlocks); tsqr.explicit_Q (nrows_local, - ncols, A_copy.get(), A_copy.lda(), factor_output, - ncols, Q_local.get(), Q_local.lda(), + ncols, A_copy.data(), A_copy.lda(), factor_output, + ncols, Q_local.data(), Q_local.lda(), contiguousCacheBlocks); // Timings in debug mode likely won't make sense, because // Proc 0 is outputting the debug messages to cerr. diff --git a/packages/tpetra/tsqr/src/Tsqr_generateStack.hpp b/packages/tpetra/tsqr/src/Tsqr_generateStack.hpp index a432dc4d6962..74ba0846d08c 100644 --- a/packages/tpetra/tsqr/src/Tsqr_generateStack.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_generateStack.hpp @@ -84,14 +84,13 @@ namespace TSQR { TSQR::Random::MatrixGenerator matGen (generator); const Ordinal numRows = numProcs * numCols; A_global.reshape (numRows, numCols); - A_global.fill (Scalar(0)); + A_global.fill (Scalar {}); - for (int p = 0; p < numProcs; ++p) - { - Scalar* const curptr = A_global.get() + p*numCols; - mat_view_type R_cur (numCols, numCols, curptr, numRows); - matGen.fill_random_R (numCols, R_cur.get(), numRows, singularValues); - } + for (int p = 0; p < numProcs; ++p) { + Scalar* const curptr = A_global.data() + p*numCols; + mat_view_type R_cur (numCols, numCols, curptr, numRows); + matGen.fill_random_R (numCols, R_cur.data(), numRows, singularValues); + } } /// \brief Generate a random test problem for the distributed-memory part of TSQR. 
diff --git a/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp b/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp index ec9cdaa39a82..f935a1b1e655 100644 --- a/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp @@ -86,7 +86,7 @@ namespace TSQR { // Print the remote matrix data // out << "Processor " << my_rank << ":" << endl; print_local_matrix (out, A_local.nrows(), A_local.ncols(), - A_local.get(), A_local.lda()); + A_local.data(), A_local.lda()); // Space for remote matrix data. Other processors are allowed // to have different nrows_local values; we make space as @@ -126,11 +126,11 @@ namespace TSQR { // Receive the remote matrix data, which we assume is // stored contiguously. - scalarComm->recv (A_remote.get(), dims[0]*dims[1], srcProc, 0); + scalarComm->recv (A_remote.data(), dims[0]*dims[1], srcProc, 0); // Print the remote matrix data // out << "Processor " << proc << ":" << endl; - print_local_matrix (out, dims[0], dims[0], A_remote.get(), A_remote.lda()); + print_local_matrix (out, dims[0], dims[0], A_remote.data(), A_remote.lda()); } } else @@ -148,7 +148,7 @@ namespace TSQR { deep_copy (A_buf, A_local); // Send the actual data to proc 0. - scalarComm->send (A_buf.get(), nrowsLocal*ncols, rootProc, 0); + scalarComm->send (A_buf.data(), nrowsLocal*ncols, rootProc, 0); } scalarComm->barrier (); } From 1434fe0db62a59dcc5cb9512a982b69ebf9c9c8d Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 22 Nov 2019 14:09:14 -0700 Subject: [PATCH 09/50] TSQR: Make deep_copy and matrix_equal more generic The goal is to replace MatView etc. with Kokkos::View. 
--- packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp | 22 ++++---- packages/tpetra/tsqr/src/Tsqr_MatView.hpp | 55 +++++++++----------- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp index 5c85acab44ea..a3f678b471e5 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp @@ -398,7 +398,7 @@ namespace TSQR { resizeWork (numCols); combine_.factor_pair (numCols, R_mine.data(), R_mine.lda(), R_other.data(), R_other.lda(), - &tau[0], &work_[0]); + tau.data(), work_.data()); QFactors.push_back (R_other); tauArrays.push_back (tau); } @@ -462,9 +462,9 @@ namespace TSQR { Q_other.fill (scalar_type (0)); combine_.apply_pair (ApplyType::NoTranspose, Q_mine.ncols(), Q_impl.ncols(), - Q_impl.data(), Q_impl.lda(), &tau[0], + Q_impl.data(), Q_impl.lda(), tau.data(), Q_mine.data(), Q_mine.lda(), - Q_other.data(), Q_other.lda(), &work_[0]); + Q_other.data(), Q_other.lda(), work_.data()); // Send the resulting Q_other, and the final R factor, to P_mid. send_Q_R (Q_other, R_mine, P_mid); newpos = curpos - 1; @@ -504,11 +504,11 @@ namespace TSQR { resizeWork (numElts); // Pack the Q data into the workspace array. - mat_view_type Q_contig (Q.nrows(), Q.ncols(), &work_[0], Q.nrows()); + mat_view_type Q_contig (Q.nrows(), Q.ncols(), work_.data(), Q.nrows()); deep_copy (Q_contig, Q); // Pack the R data into the workspace array. pack_R (R, &work_[Q_size]); - messenger_->send (&work_[0], numElts, destProc, 0); + messenger_->send (work_.data(), numElts, destProc, 0); } template< class MatrixType1, class MatrixType2 > @@ -529,10 +529,10 @@ namespace TSQR { // to grow again. resizeWork (numElts); - messenger_->recv (&work_[0], numElts, srcProc, 0); + messenger_->recv (work_.data(), numElts, srcProc, 0); // Unpack the C data from the workspace array. 
- deep_copy (Q, mat_view_type (Q.nrows(), Q.ncols(), &work_[0], Q.nrows())); + deep_copy (Q, mat_view_type (Q.nrows(), Q.ncols(), work_.data(), Q.nrows())); // Unpack the R data from the workspace array. unpack_R (R, &work_[Q_size]); } @@ -551,8 +551,8 @@ namespace TSQR { // to grow again. resizeWork (numElts); // Pack the R data into the workspace array. - pack_R (R, &work_[0]); - messenger_->send (&work_[0], numElts, destProc, 0); + pack_R (R, work_.data()); + messenger_->send (work_.data(), numElts, destProc, 0); } template< class MatrixType > @@ -568,9 +568,9 @@ namespace TSQR { // correct, but may require reallocation of data when it needs // to grow again. resizeWork (numElts); - messenger_->recv (&work_[0], numElts, srcProc, 0); + messenger_->recv (work_.data(), numElts, srcProc, 0); // Unpack the R data from the workspace array. - unpack_R (R, &work_[0]); + unpack_R (R, work_.data()); } template< class MatrixType > diff --git a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp index c0644d0fd127..94d146e1a513 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp @@ -59,50 +59,47 @@ namespace TSQR { void deep_copy (MatrixViewType1& A, const MatrixViewType2& B) { - const typename MatrixViewType1::ordinal_type A_nrows = A.nrows (); - const typename MatrixViewType1::ordinal_type A_ncols = A.ncols (); - if (A_nrows != B.nrows () || A_ncols != B.ncols ()) { + const ptrdiff_t A_nrows (A.nrows ()); + const ptrdiff_t A_ncols (A.ncols ()); + if (A_nrows != ptrdiff_t (B.nrows ()) || + A_ncols != ptrdiff_t (B.ncols ())) { using std::endl; std::ostringstream os; os << "deep_copy: dimensions of A (output matrix) and B (input matrix) " << "are not compatible. 
A is " << A.nrows () << " x " << A.ncols () - << ", and B is " << B.nrows () << " x " << B.ncols () << "."; + << ", but B is " << B.nrows () << " x " << B.ncols () << "."; throw std::invalid_argument(os.str()); } - for (typename MatrixViewType1::ordinal_type j = 0; j < A_ncols; ++j) { - typename MatrixViewType1::scalar_type* const A_j = &A(0,j); - const typename MatrixViewType2::scalar_type* const B_j = &B(0,j); - for (typename MatrixViewType1::ordinal_type i = 0; i < A_nrows; ++i) { + for (ptrdiff_t j = 0; j < A_ncols; ++j) { + auto* const A_j = &A(0,j); + const auto* const B_j = &B(0,j); + for (ptrdiff_t i = 0; i < A_nrows; ++i) { A_j[i] = B_j[i]; } } } - template< class FirstMatrixViewType, class SecondMatrixViewType > + template bool - matrix_equal (FirstMatrixViewType& A, SecondMatrixViewType& B) + matrix_equal (const FirstMatrixViewType& A, + const SecondMatrixViewType& B) { - if (A.nrows() != B.nrows() || A.ncols() != B.ncols()) + if (A.nrows() != B.nrows() || A.ncols() != B.ncols()) { return false; - - typedef typename FirstMatrixViewType::ordinal_type first_ordinal_type; - typedef typename SecondMatrixViewType::ordinal_type second_ordinal_type; - typedef typename FirstMatrixViewType::pointer_type first_pointer_type; - typedef typename SecondMatrixViewType::pointer_type second_pointer_type; - - const first_ordinal_type nrows = A.nrows(); - const first_ordinal_type A_lda = A.lda(); - const first_ordinal_type ncols = A.ncols(); - const second_ordinal_type B_lda = B.lda(); - - first_pointer_type A_j = A.data(); - second_pointer_type B_j = B.data(); - - for (first_ordinal_type j = 0; j < ncols; ++j, A_j += A_lda, B_j += B_lda) - for (first_ordinal_type i = 0; i < nrows; ++i) - if (A_j[i] != B_j[i]) + } + const ptrdiff_t nrows (A.nrows()); + const ptrdiff_t A_lda (A.lda()); + const ptrdiff_t ncols (A.ncols()); + const ptrdiff_t B_lda (B.lda()); + const auto* A_j = A.data(); + const auto* B_j = B.data(); + for (ptrdiff_t j = 0; j < ncols; ++j, A_j += A_lda, B_j 
+= B_lda) { + for (ptrdiff_t i = 0; i < nrows; ++i) { + if (A_j[i] != B_j[i]) { return false; - + } + } + } return true; } From ebd5b6afdcee0b9cf80e5f43bacd8072666202f3 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 22 Nov 2019 14:35:02 -0700 Subject: [PATCH 10/50] TSQR: {Matrix,*MatView}{nrows,ncols} -> extent({0,1}) Replace the nrows and ncols methods in Matrix, MatView, and ConstMatView with extent (as in Kokkos::View and mdspan). The goal is to replace these classes (or at least the nonowning MatView and ConstMatView) with Kokkos::View. --- .../tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp | 11 +- .../tsqr/src/TbbTsqr_CacheBlockTask.hpp | 4 +- .../tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp | 28 ++-- .../tpetra/tsqr/src/TbbTsqr_FactorTask.hpp | 12 +- .../tsqr/src/TbbTsqr_FillWithZerosTask.hpp | 4 +- .../tpetra/tsqr/src/TbbTsqr_Partitioner.hpp | 4 +- .../tsqr/src/TbbTsqr_RevealRankTask.hpp | 11 +- .../tsqr/src/TbbTsqr_TbbParallelTsqr.hpp | 14 +- .../tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp | 40 +++--- .../tsqr/src/TbbTsqr_UnCacheBlockTask.hpp | 85 ++++++------ packages/tpetra/tsqr/src/Tsqr.hpp | 10 +- .../tpetra/tsqr/src/Tsqr_CacheBlocker.hpp | 48 ++++--- packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 20 +-- packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp | 2 +- packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp | 50 +++---- .../tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp | 11 +- .../tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp | 108 +++++++-------- .../tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp | 26 ++-- packages/tpetra/tsqr/src/Tsqr_MatView.hpp | 123 +++++++++--------- packages/tpetra/tsqr/src/Tsqr_Matrix.hpp | 38 +++--- packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp | 6 +- packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp | 16 +-- packages/tpetra/tsqr/src/Tsqr_ParTest.hpp | 56 ++++---- packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp | 12 +- .../tsqr/src/Tsqr_Random_GlobalMatrix.hpp | 8 +- .../tsqr/src/Tsqr_Random_MatrixGenerator.hpp | 2 +- .../tsqr/src/Tsqr_SequentialCholeskyQR.hpp | 13 +- 
.../tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp | 38 +++--- packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp | 2 +- packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp | 8 +- .../tsqr/src/Tsqr_printGlobalMatrix.hpp | 6 +- 31 files changed, 410 insertions(+), 406 deletions(-) diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp index 1fc01b237d29..c90d5ca63188 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp @@ -128,7 +128,7 @@ namespace TSQR { // that case, Q_split.second and C_split.second (the // bottom block) will be empty. We can deal with this by // treating it as the base case. - if (Q_split.second.empty() || Q_split.second.nrows() == 0) + if (Q_split.second.empty() || Q_split.second.extent(0) == 0) { execute_base_case (); return NULL; @@ -191,9 +191,9 @@ namespace TSQR { TimerType timer(""); timer.start(); const std::vector& seq_outputs = factor_output_.first; - seq_.apply (apply_type_, Q_.nrows(), Q_.ncols(), + seq_.apply (apply_type_, Q_.extent(0), Q_.extent(1), Q_.data(), Q_.lda(), seq_outputs[P_first_], - C_.ncols(), C_.data(), C_.lda(), + C_.extent(1), C_.data(), C_.lda(), contiguous_cache_blocks_); my_seq_timing_ = timer.stop(); } @@ -211,8 +211,9 @@ namespace TSQR { const ParOutput& par_output = factor_output_.second; const std::vector& tau = par_output[P_bot]; - std::vector work (C_top.ncols()); - combine_.apply_pair (apply_type_, C_top.ncols(), Q_bot.ncols(), + std::vector work (C_top.extent(1)); + combine_.apply_pair (apply_type_, + C_top.extent(1), Q_bot.extent(1), Q_bot.data(), Q_bot.lda(), tau.data(), C_top.data(), C_top.lda(), C_bot.data(), C_bot.lda(), work.data()); diff --git a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp index 38e7aea13629..2aeda840c12a 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp @@ 
-99,7 +99,7 @@ namespace TSQR { // the same way.) In that case, out_split.second and // in_split.second (the bottom block) will be empty. We // can deal with this by treating it as the base case. - if (out_split.second.empty() || out_split.second.nrows() == 0) + if (out_split.second.empty() || out_split.second.extent(0) == 0) { execute_base_case (); return nullptr; @@ -134,7 +134,7 @@ namespace TSQR { void execute_base_case () { - seq_.cache_block (A_out_.nrows(), A_out_.ncols(), + seq_.cache_block (A_out_.extent(0), A_out_.extent(1), A_out_.data(), A_in_.data(), A_in_.lda()); } }; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp index e204cb99daab..e01f4c467a6a 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp @@ -87,7 +87,7 @@ namespace TSQR { // has too few rows to be worth splitting. In that case, // Q_split.second (the bottom block) will be empty. We // can deal with this by treating it as the base case. - if (Q_split.second.empty() || Q_split.second.nrows() == 0) { + if (Q_split.second.empty() || Q_split.second.extent(0) == 0) { execute_base_case (); return NULL; } @@ -122,22 +122,22 @@ namespace TSQR { execute_base_case () { // Fill my partition with zeros. - seq_.fill_with_zeros (Q_out_.nrows(), Q_out_.ncols(), Q_out_.data(), + seq_.fill_with_zeros (Q_out_.extent(0), Q_out_.extent(1), Q_out_.data(), Q_out_.lda(), contiguous_cache_blocks_); // If our partition is the first (topmost), fill it with - // the first Q_out.ncols() columns of the identity matrix. - if (P_first_ == 0) - { - // Fetch the topmost cache block of my partition. Its - // leading dimension should be set correctly by - // top_block(). - mat_view_type Q_out_top = - seq_.top_block (Q_out_, contiguous_cache_blocks_); - // Set the top block of Q_out to the first ncols - // columns of the identity matrix. 
- for (LocalOrdinal j = 0; j < Q_out_top.ncols(); ++j) - Q_out_top(j,j) = Scalar(1); + // the first Q_out.extent(1) columns of the identity matrix. + if (P_first_ == 0) { + // Fetch the topmost cache block of my partition. Its + // leading dimension should be set correctly by + // top_block(). + mat_view_type Q_out_top = + seq_.top_block (Q_out_, contiguous_cache_blocks_); + // Set the top block of Q_out to the first ncols + // columns of the identity matrix. + for (LocalOrdinal j = 0; j < Q_out_top.extent(1); ++j) { + Q_out_top(j,j) = Scalar(1); } + } } }; } // namespace TBB diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp index 0fc2d2df2e11..8b27cd2c39da 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp @@ -120,7 +120,7 @@ namespace TSQR { // has too few rows to be worth splitting. In that case, // A_split.second (the bottom block) will be empty. We // can deal with this by treating it as the base case. - if (A_split.second.empty() || A_split.second.nrows() == 0) + if (A_split.second.empty() || A_split.second.extent(0) == 0) { execute_base_case (); return NULL; @@ -195,13 +195,13 @@ namespace TSQR { "partitions are the same."); // We only read and write the upper ncols x ncols triangle of // each block. 
- TEUCHOS_TEST_FOR_EXCEPTION(A_top.ncols() != A_bot.ncols(), std::logic_error, + TEUCHOS_TEST_FOR_EXCEPTION(A_top.extent(1) != A_bot.extent(1), std::logic_error, thePrefix << "The top cache block A_top is " - << A_top.nrows() << " x " << A_top.ncols() + << A_top.extent(0) << " x " << A_top.extent(1) << ", and the bottom cache block A_bot is " - << A_bot.nrows() << " x " << A_bot.ncols() + << A_bot.extent(0) << " x " << A_bot.extent(1) << "; this means we can't factor [A_top; A_bot]."); - const LocalOrdinal ncols = A_top.ncols(); + const LocalOrdinal ncols = A_top.extent(1); std::vector& tau = par_output_[P_bot]; std::vector work (ncols); combine_.factor_pair (ncols, A_top.data(), A_top.lda(), @@ -214,7 +214,7 @@ namespace TSQR { TimerType timer(""); timer.start(); seq_outputs_[P_first_] = - seq_.factor (A_.nrows(), A_.ncols(), A_.data(), + seq_.factor (A_.extent(0), A_.extent(1), A_.data(), A_.lda(), contiguous_cache_blocks_); // Assign the topmost cache block of the current partition to // *A_top_ptr_. Every base case invocation does this, so that diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp index e6f040b9b7e8..1e965b0348a4 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp @@ -89,7 +89,7 @@ namespace TSQR { // has too few rows to be worth splitting. In that case, // C_split.second (the bottom block) will be empty. We // can deal with this by treating it as the base case. - if (C_split.second.empty() || C_split.second.nrows() == 0) { + if (C_split.second.empty() || C_split.second.extent(0) == 0) { execute_base_case (); return nullptr; } @@ -124,7 +124,7 @@ namespace TSQR { execute_base_case () { // Fill my partition with zeros. 
- seq_.fill_with_zeros (C_.nrows(), C_.ncols(), C_.data(), + seq_.fill_with_zeros (C_.extent(0), C_.extent(1), C_.data(), C_.lda(), contiguous_cache_blocks_); } }; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp b/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp index a1087e0142cd..f8992729587a 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp @@ -95,8 +95,8 @@ namespace TSQR { const size_t num_partitions_top = P_mid - P_first + 1; //const size_t num_partitions_bottom = P_last - P_mid; const size_t num_partitions = P_last - P_first + 1; - const ordinal_type nrows = A.nrows(); - const ordinal_type ncols = A.ncols(); + const ordinal_type nrows = A.extent(0); + const ordinal_type ncols = A.extent(1); if (! should_split (nrows, ncols, num_partitions)) { return std::make_pair (MatrixViewType(A), MatrixViewType()); diff --git a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp index 33fa6071fb2b..c7c184d865ad 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp @@ -85,7 +85,7 @@ namespace TSQR { // doesn't suggest any orthogonality of the B input matrix, // though in this case B is U and U is orthogonal // (resp. unitary if Scalar is complex). - seq_.Q_times_B (Q_.nrows(), Q_.ncols(), Q_.data(), Q_.lda(), + seq_.Q_times_B (Q_.extent(0), Q_.extent(1), Q_.data(), Q_.lda(), U_.data(), U_.lda(), contiguous_cache_blocks_); } @@ -112,11 +112,10 @@ namespace TSQR { // has too few rows to be worth splitting. In that case, // out_split.second (the bottom block) will be empty. We // can deal with this by treating it as the base case. 
- if (out_split.second.empty() || out_split.second.nrows() == 0) - { - execute_base_case (); - return NULL; - } + if (out_split.second.empty() || out_split.second.extent(0) == 0) { + execute_base_case (); + return nullptr; + } // "c": continuation task tbb::empty_task& c = diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp index 26c7261dfe25..2fa287765a9c 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp @@ -105,7 +105,7 @@ namespace TSQR { // C_split.second (the bottom block) will be empty. We // deal with this in the same way as the base case // (P_first == P_last) above. - if (C_split.second.empty() || C_split.second.nrows() == 0) + if (C_split.second.empty() || C_split.second.extent(0) == 0) return seq_.top_block (C_split.first, contiguous_cache_blocks); else return top_block_helper (P_first, P_mid, C_split.first, @@ -326,7 +326,7 @@ namespace TSQR { } // Copy the R factor out of A_top into R. - seq_.extract_R (A_top.nrows(), A_top.ncols(), A_top.data(), + seq_.extract_R (A_top.extent(0), A_top.extent(1), A_top.data(), A_top.lda(), R, ldr, contiguous_cache_blocks); // Save the timings for future reference @@ -643,9 +643,9 @@ namespace TSQR { const_mat_view_type Q_top = seq_.top_block (Q, contiguous_cache_blocks); mat_view_type C_top = seq_.top_block (C, contiguous_cache_blocks); top_blocks[P_first] = - std::make_pair (const_mat_view_type (Q_top.ncols(), Q_top.ncols(), + std::make_pair (const_mat_view_type (Q_top.extent(1), Q_top.extent(1), Q_top.data(), Q_top.lda()), - mat_view_type (C_top.ncols(), C_top.ncols(), + mat_view_type (C_top.extent(1), C_top.extent(1), C_top.data(), C_top.lda())); } else { @@ -664,14 +664,14 @@ namespace TSQR { // Q, and Q_split.second (the bottom block) will be empty. // Ditto for C_split. We deal with this in the same way // as the base case (P_first == P_last) above. 
- if (Q_split.second.empty() || Q_split.second.nrows() == 0) { + if (Q_split.second.empty() || Q_split.second.extent(0) == 0) { const_mat_view_type Q_top = seq_.top_block (Q, contiguous_cache_blocks); mat_view_type C_top = seq_.top_block (C, contiguous_cache_blocks); top_blocks[P_first] = - std::make_pair (const_mat_view_type (Q_top.ncols(), Q_top.ncols(), + std::make_pair (const_mat_view_type (Q_top.extent(1), Q_top.extent(1), Q_top.data(), Q_top.lda()), - mat_view_type (C_top.ncols(), C_top.ncols(), + mat_view_type (C_top.extent(1), C_top.extent(1), C_top.data(), C_top.lda())); } else { diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp index c4f4e6a09588..1f3a4ac252a7 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp @@ -59,13 +59,13 @@ namespace TSQR { } else if (P_first == P_last) { CacheBlocker< LocalOrdinal, Scalar > - blocker (Q_out.nrows(), Q_out.ncols(), + blocker (Q_out.extent(0), Q_out.extent(1), seq_.cache_blocking_strategy()); // Fill my partition with zeros. blocker.fill_with_zeros (Q_out, contiguous_cache_blocks); // If our partition is the first (topmost), fill it with - // the first Q_out.ncols() columns of the identity matrix. + // the first Q_out.extent(1) columns of the identity matrix. if (P_first == 0) { // Fetch the topmost cache block of my partition. 
Its // leading dimension should be set correctly by @@ -73,7 +73,7 @@ namespace TSQR { mat_view Q_out_top = blocker.top_block (Q_out, contiguous_cache_blocks); - for (LocalOrdinal j = 0; j < Q_out_top.ncols(); ++j) + for (LocalOrdinal j = 0; j < Q_out_top.extent(1); ++j) Q_out_top(j,j) = Scalar(1); } } @@ -110,7 +110,7 @@ namespace TSQR { } else if (P_first == P_last) { std::pair results = - seq_.factor (A.nrows(), A.ncols(), A.data(), A.lda(), + seq_.factor (A.extent(0), A.extent(1), A.data(), A.lda(), contiguous_cache_blocks); seq_outputs[P_first] = results.first; A_top = A; @@ -136,7 +136,7 @@ namespace TSQR { // If we're completely done, extract the final R factor from // the topmost partition. if (depth == 0) { - seq_.extract_R (A_top.nrows(), A_top.ncols(), A_top.data(), + seq_.extract_R (A_top.extent(0), A_top.extent(1), A_top.data(), A_top.lda(), R, ldr, contiguous_cache_blocks); } return A_top; @@ -184,12 +184,12 @@ namespace TSQR { return; else if (P_first == P_last) { - CacheBlocker< LocalOrdinal, Scalar > blocker (Q.nrows(), Q.ncols(), seq_.cache_blocking_strategy()); + CacheBlocker< LocalOrdinal, Scalar > blocker (Q.extent(0), Q.extent(1), seq_.cache_blocking_strategy()); const_mat_view Q_top = blocker.top_block (Q, contiguous_cache_blocks); mat_view C_top = blocker.top_block (C, contiguous_cache_blocks); top_blocks[P_first] = - std::make_pair (const_mat_view (Q_top.ncols(), Q_top.ncols(), Q_top.data(), Q_top.lda()), - mat_view (C_top.ncols(), C_top.ncols(), C_top.data(), C_top.lda())); + std::make_pair (const_mat_view (Q_top.extent(1), Q_top.extent(1), Q_top.data(), Q_top.lda()), + mat_view (C_top.extent(1), C_top.extent(1), C_top.data(), C_top.lda())); } else { @@ -227,8 +227,8 @@ namespace TSQR { else if (P_first == P_last) { const std::vector< SeqOutput >& seq_outputs = factor_output.first; - seq_.apply ("N", Q.nrows(), Q.ncols(), Q.data(), Q.lda(), - seq_outputs[P_first], C.ncols(), C.data(), + seq_.apply ("N", Q.extent(0), Q.extent(1), Q.data(), 
Q.lda(), + seq_outputs[P_first], C.extent(1), C.data(), C.lda(), contiguous_cache_blocks); } else @@ -270,8 +270,8 @@ namespace TSQR { } else if (P_first == P_last) { const std::vector& seq_outputs = factor_output.first; - seq_.apply (op, Q.nrows(), Q.ncols(), Q.data(), Q.lda(), - seq_outputs[P_first], C.ncols(), C.data(), + seq_.apply (op, Q.extent(0), Q.extent(1), Q.data(), Q.lda(), + seq_outputs[P_first], C.extent(1), C.data(), C.lda(), contiguous_cache_blocks); return std::make_pair (Q, C); } @@ -319,9 +319,9 @@ namespace TSQR { } // We only read and write the upper ncols x ncols triangle of // each block. - const LocalOrdinal ncols = A_top.ncols(); - if (A_bot.ncols() != ncols) - throw std::logic_error("A_bot.ncols() != A_top.ncols()"); + const LocalOrdinal ncols = A_top.extent(1); + if (A_bot.extent(1) != ncols) + throw std::logic_error("A_bot.extent(1) != A_top.extent(1)"); std::vector< Scalar >& tau = par_outputs[P_bot]; std::vector< Scalar > work (ncols); @@ -347,10 +347,10 @@ namespace TSQR { throw std::logic_error ("apply_pair: should never get here!"); } const std::vector& tau = tau_arrays[P_bot]; - std::vector work (C_top.ncols()); + std::vector work (C_top.extent(1)); TSQR::Combine combine_; - combine_.apply_pair (trans.c_str(), C_top.ncols(), Q_bot.ncols(), + combine_.apply_pair (trans.c_str(), C_top.extent(1), Q_bot.extent(1), Q_bot.data(), Q_bot.lda(), &tau[0], C_top.data(), C_top.lda(), C_bot.data(), C_bot.lda(), &work[0]); @@ -367,7 +367,7 @@ namespace TSQR { if (P_first > P_last) return; else if (P_first == P_last) - seq_.cache_block (A_out.nrows(), A_out.ncols(), A_out.data(), + seq_.cache_block (A_out.extent(0), A_out.extent(1), A_out.data(), A_in.data(), A_in.lda()); else { @@ -395,8 +395,8 @@ namespace TSQR { return; } else if (P_first == P_last) { - seq_.un_cache_block (A_out.nrows(), A_out.ncols(), A_out.data(), - A_out.lda(), A_in.data()); + seq_.un_cache_block (A_out.extent(0), A_out.extent(1), + A_out.data(), A_out.lda(), A_in.data()); 
} else { const size_t P_mid = (P_first + P_last) / 2; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp index 287a238af9f6..55ae23b63e76 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp @@ -76,52 +76,50 @@ namespace TSQR { { using tbb::task; - if (P_first_ > P_last_ || A_out_.empty() || A_in_.empty()) - return NULL; - else if (P_first_ == P_last_) - { + if (P_first_ > P_last_ || A_out_.empty() || A_in_.empty()) { + return nullptr; + } + else if (P_first_ == P_last_) { + execute_base_case (); + return nullptr; + } + else { + // Recurse on two intervals: [P_first, P_mid] and + // [P_mid+1, P_last]. + const size_t P_mid = (P_first_ + P_last_) / 2; + split_t out_split = + partitioner_.split (A_out_, P_first_, P_mid, P_last_, false); + const_split_t in_split = + partitioner_.split (A_in_, P_first_, P_mid, P_last_, true); + + // The partitioner may decide that the current blocks A_out_ + // and A_in_ have too few rows to be worth splitting. (It + // should split both A_out_ and A_in_ in the same way.) In + // that case, out_split.second and in_split.second (the + // bottom block) will be empty. We can deal with this by + // treating it as the base case. + if (out_split.second.empty() || out_split.second.extent(0) == 0) { execute_base_case (); - return NULL; + return nullptr; } - else - { - // Recurse on two intervals: [P_first, P_mid] and - // [P_mid+1, P_last]. - const size_t P_mid = (P_first_ + P_last_) / 2; - split_t out_split = - partitioner_.split (A_out_, P_first_, P_mid, P_last_, false); - const_split_t in_split = - partitioner_.split (A_in_, P_first_, P_mid, P_last_, true); - - // The partitioner may decide that the current blocks - // A_out_ and A_in_ have too few rows to be worth - // splitting. (It should split both A_out_ and A_in_ in - // the same way.) 
In that case, out_split.second and - // in_split.second (the bottom block) will be empty. We - // can deal with this by treating it as the base case. - if (out_split.second.empty() || out_split.second.nrows() == 0) - { - execute_base_case (); - return NULL; - } - // "c": continuation task - tbb::empty_task& c = - *new( allocate_continuation() ) tbb::empty_task; - // Recurse on the split - UnCacheBlockTask& topTask = *new( c.allocate_child() ) - UnCacheBlockTask (P_first_, P_mid, out_split.first, + // "c": continuation task + tbb::empty_task& c = + *new( allocate_continuation() ) tbb::empty_task; + // Recurse on the split + UnCacheBlockTask& topTask = *new( c.allocate_child() ) + UnCacheBlockTask (P_first_, P_mid, out_split.first, in_split.first, seq_); - UnCacheBlockTask& botTask = *new( c.allocate_child() ) - UnCacheBlockTask (P_mid+1, P_last_, out_split.second, + UnCacheBlockTask& botTask = *new( c.allocate_child() ) + UnCacheBlockTask (P_mid+1, P_last_, out_split.second, in_split.second, seq_); - // Set reference count of parent (in this case, the - // continuation task) to 2 (since 2 children -- no - // additional task since no waiting). - c.set_ref_count (2); - c.spawn (botTask); - return &topTask; // scheduler bypass optimization - } + // Set reference count of parent (in this case, the + // continuation task) to 2 (since 2 children -- no + // additional task since no waiting). 
+ c.set_ref_count (2); + c.spawn (botTask); + return &topTask; // scheduler bypass optimization + } } private: @@ -134,8 +132,9 @@ namespace TSQR { void execute_base_case () { - seq_.un_cache_block (A_out_.nrows(), A_out_.ncols(), - A_out_.data(), A_out_.lda(), A_in_.data()); + seq_.un_cache_block (A_out_.extent(0), A_out_.extent(1), + A_out_.data(), A_out_.lda(), + A_in_.data()); } }; diff --git a/packages/tpetra/tsqr/src/Tsqr.hpp b/packages/tpetra/tsqr/src/Tsqr.hpp index 0491249b8117..7738843a4b68 100644 --- a/packages/tpetra/tsqr/src/Tsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr.hpp @@ -270,12 +270,12 @@ namespace TSQR { // case Q is arranged using contiguous cache blocks. mat_view_type Q_top_block = nodeTsqr_->top_block (Q_rawView, contiguousCacheBlocks); - if (Q_top_block.nrows () < numCols) { + if (Q_top_block.extent (0) < numCols) { std::ostringstream os; os << "The top block of Q has too few rows. This means that the " << "the intranode TSQR implementation has a bug in its top_block" << "() method. The top block should have at least " << numCols - << " rows, but instead has only " << Q_top_block.ncols () + << " rows, but instead has only " << Q_top_block.extent (1) << " rows."; throw std::logic_error (os.str ()); } @@ -360,12 +360,12 @@ namespace TSQR { // case Q is arranged using contiguous cache blocks. mat_view_type Q_top_block = nodeTsqr_->top_block (Q_rawView, contiguousCacheBlocks); - if (Q_top_block.nrows () < numCols) { + if (Q_top_block.extent (0) < numCols) { std::ostringstream os; os << "The top block of Q has too few rows. This means that the " << "the intranode TSQR implementation has a bug in its top_block" << "() method. 
The top block should have at least " << numCols - << " rows, but instead has only " << Q_top_block.ncols () + << " rows, but instead has only " << Q_top_block.extent (1) << " rows."; throw std::logic_error (os.str ()); } @@ -539,7 +539,7 @@ namespace TSQR { matrix_type C_top (C_top_view); // Compute in place on all processors' C_top blocks. - distTsqr_->apply (applyType, C_top.ncols(), ncols_Q, C_top.data(), + distTsqr_->apply (applyType, C_top.extent(1), ncols_Q, C_top.data(), C_top.lda(), factor_output.second); // Copy the result from C_top back into the top ncols_C by diff --git a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp index 18bd3d83717b..53207ffbcdb1 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp @@ -105,7 +105,7 @@ namespace TSQR { nrows_ (num_rows), ncols_ (num_cols), strategy_ (strategy), - nrows_cache_block_ (strategy_.cache_block_num_rows (ncols())) + nrows_cache_block_ (strategy_.cache_block_num_rows (extent(1))) { validate (); } @@ -114,21 +114,21 @@ namespace TSQR { CacheBlocker () : nrows_ (0), ncols_ (0), - nrows_cache_block_ (strategy_.cache_block_num_rows (ncols())) + nrows_cache_block_ (strategy_.cache_block_num_rows (extent(1))) {} //! Copy constructor CacheBlocker (const CacheBlocker& rhs) : - nrows_ (rhs.nrows()), - ncols_ (rhs.ncols()), + nrows_ (rhs.extent(0)), + ncols_ (rhs.extent(1)), strategy_ (rhs.strategy_), nrows_cache_block_ (rhs.nrows_cache_block_) {} //! Assignment operator CacheBlocker& operator= (const CacheBlocker& rhs) { - nrows_ = rhs.nrows(); - ncols_ = rhs.ncols(); + nrows_ = rhs.extent(0); + ncols_ = rhs.extent(1); strategy_ = rhs.strategy_; nrows_cache_block_ = rhs.nrows_cache_block_; return *this; @@ -137,11 +137,9 @@ namespace TSQR { //! Cache size hint (in bytes). size_t cache_size_hint () const { return strategy_.cache_size_hint(); } - //! Number of rows in the matrix to block. 
- Ordinal nrows () const { return nrows_; } - - //! Number of columns in the matrix to block. - Ordinal ncols () const { return ncols_; } + constexpr Ordinal extent (const int r) const noexcept { + return r == 0 ? nrows_ : (r == 1 ? ncols_ : Ordinal(0)); + } /// \brief Split A in place into [A_top; A_rest]. /// @@ -161,7 +159,7 @@ namespace TSQR { /// columns with which this CacheBlocker was set up (rather than /// the number of columns in A, which may not be the same). The /// idea is to have the number and distribution of rows in the - /// cache blocks be the same as the original nrows() by ncols() + /// cache blocks be the same as the original extent(0) by extent(1) /// matrix with which this CacheBlocker was initialized. template< class MatrixViewType > MatrixViewType @@ -169,7 +167,7 @@ namespace TSQR { { typedef typename MatrixViewType::ordinal_type ordinal_type; const ordinal_type nrows_top = - strategy_.top_block_split_nrows (A.nrows(), ncols(), + strategy_.top_block_split_nrows (A.extent(0), extent(1), nrows_cache_block()); // split_top() sets A to A_rest, and returns A_top. return A.split_top (nrows_top, contiguous_cache_blocks); @@ -192,7 +190,7 @@ namespace TSQR { // Ignore the number of columns in A, since we want to block all // matrices using the same cache blocking strategy. const ordinal_type nrows_top = - strategy_.top_block_split_nrows (A.nrows(), ncols(), + strategy_.top_block_split_nrows (A.extent(0), extent(1), nrows_cache_block()); MatrixViewType A_copy (A); return A_copy.split_top (nrows_top, contiguous_cache_blocks); @@ -220,7 +218,7 @@ namespace TSQR { // Ignore the number of columns in A, since we want to block all // matrices using the same cache blocking strategy. const ordinal_type nrows_bottom = - strategy_.bottom_block_split_nrows (A.nrows(), ncols(), + strategy_.bottom_block_split_nrows (A.extent(0), extent(1), nrows_cache_block()); // split_bottom() sets A to A_rest, and returns A_bot. 
return A.split_bottom (nrows_bottom, contiguous_cache_blocks); @@ -342,7 +340,7 @@ namespace TSQR { // This call modifies A_out_rest. mat_view_type A_out_cur = split_top_block (A_out_rest, true); - copy_matrix (A_in_cur.nrows(), num_cols, A_out_cur.data(), + copy_matrix (A_in_cur.extent(0), num_cols, A_out_cur.data(), A_out_cur.lda(), A_in_cur.data(), A_in_cur.lda()); } } @@ -375,7 +373,7 @@ namespace TSQR { // This call modifies A_out_rest. mat_view_type A_out_cur = split_top_block (A_out_rest, false); - copy_matrix (A_in_cur.nrows(), num_cols, A_out_cur.data(), + copy_matrix (A_in_cur.extent(0), num_cols, A_out_cur.data(), A_out_cur.lda(), A_in_cur.data(), A_in_cur.lda()); } } @@ -404,7 +402,7 @@ namespace TSQR { // Total number of cache blocks. const ordinal_type num_cache_blocks = - strategy_.num_cache_blocks (A.nrows(), A.ncols(), nrows_cache_block()); + strategy_.num_cache_blocks (A.extent(0), A.extent(1), nrows_cache_block()); if (cache_block_index >= num_cache_blocks) return MatrixViewType (0, 0, NULL, 0); // empty @@ -414,7 +412,7 @@ namespace TSQR { // result[2] = pointer offset (A.data() + result[2]) // result[3] = leading dimension (a.k.a. stride) of the cache block std::vector result = - strategy_.cache_block_details (cache_block_index, A.nrows(), A.ncols(), + strategy_.cache_block_details (cache_block_index, A.extent(0), A.extent(1), A.lda(), nrows_cache_block(), contiguous_cache_blocks); if (result[1] == 0) @@ -424,7 +422,7 @@ namespace TSQR { // We expect that ordinal_type is signed, so adding signed // (ordinal_type) to unsigned (pointer) may raise compiler // warnings. 
- return MatrixViewType (result[1], A.ncols(), + return MatrixViewType (result[1], A.extent(1), A.data() + static_cast(result[2]), result[3]); } @@ -437,8 +435,8 @@ namespace TSQR { bool operator== (const CacheBlockingStrategy& rhs) const { - return nrows() == rhs.nrows() && - ncols() == rhs.ncols() && + return extent(0) == rhs.extent(0) && + extent(1) == rhs.extent(1) && strategy_ == rhs.strategy_; } @@ -465,8 +463,8 @@ namespace TSQR { /// For an explanation of "typical," see the documentation of /// CacheBlockingStrategy. In brief, some cache blocks may have /// more rows (up to but not including nrows_cache_block() + - /// ncols() rows), and some may have less (but no less than - /// ncols() rows). + /// extent(1) rows), and some may have less (but no less than + /// extent(1) rows). size_t nrows_cache_block () const { return nrows_cache_block_; } }; @@ -516,7 +514,7 @@ namespace TSQR { const bool reverse, const bool contiguousCacheBlocks) : A_ (A), - blocker_ (A_.nrows(), A_.ncols(), strategy), + blocker_ (A_.extent(0), A_.extent(1), strategy), curInd_ (currentIndex), reverse_ (reverse), contiguousCacheBlocks_ (contiguousCacheBlocks) diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index b02c267f91a7..75378cf9cd40 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -205,7 +205,7 @@ namespace TSQR { printMatrix (std::ostream& out, const MatrixViewType& A) { - print_local_matrix (out, A.nrows(), A.ncols(), A.data(), A.lda()); + print_local_matrix (out, A.extent(0), A.extent(1), A.data(), A.lda()); } template @@ -215,7 +215,7 @@ namespace TSQR { const MatrixViewType& Q, const MatrixViewType& R) { - return local_verify (A.nrows(), A.ncols(), A.data(), A.lda(), + return local_verify (A.extent(0), A.extent(1), A.data(), A.lda(), Q.data(), Q.lda(), R.data(), R.lda()); } @@ -357,18 +357,18 @@ namespace TSQR { { cerr << "Results of first test 
problem:" << endl; cerr << "-- Copy of test problem:" << endl; - print_local_matrix (cerr, A_R1R2.nrows(), A_R1R2.ncols(), + print_local_matrix (cerr, A_R1R2.extent(0), A_R1R2.extent(1), A_R1R2.data(), A_R1R2.lda()); cerr << endl << "-- Q factor:" << endl; - print_local_matrix (cerr, Q_R1R2.nrows(), Q_R1R2.ncols(), + print_local_matrix (cerr, Q_R1R2.extent(0), Q_R1R2.extent(1), Q_R1R2.data(), Q_R1R2.lda()); cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, R1.nrows(), R1.ncols(), + print_local_matrix (cerr, R1.extent(0), R1.extent(1), R1.data(), R1.lda()); cerr << endl; } const results_type firstResults = - local_verify (A_R1R2.nrows(), A_R1R2.ncols(), + local_verify (A_R1R2.extent(0), A_R1R2.extent(1), A_R1R2.data(), A_R1R2.lda(), Q_R1R2.data(), Q_R1R2.lda(), R1.data(), R1.lda()); @@ -394,18 +394,18 @@ namespace TSQR { { cerr << "Results of second test problem:" << endl; cerr << "-- Copy of test problem:" << endl; - print_local_matrix (cerr, A_R3A.nrows(), A_R3A.ncols(), + print_local_matrix (cerr, A_R3A.extent(0), A_R3A.extent(1), A_R3A.data(), A_R3A.lda()); cerr << endl << "-- Q factor:" << endl; - print_local_matrix (cerr, Q_R3A.nrows(), Q_R3A.ncols(), + print_local_matrix (cerr, Q_R3A.extent(0), Q_R3A.extent(1), Q_R3A.data(), Q_R3A.lda()); cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, R3.nrows(), R3.ncols(), + print_local_matrix (cerr, R3.extent(0), R3.extent(1), R3.data(), R3.lda()); cerr << endl; } const results_type secondResults = - local_verify (A_R3A.nrows(), A_R3A.ncols(), + local_verify (A_R3A.extent(0), A_R3A.extent(1), A_R3A.data(), A_R3A.lda(), Q_R3A.data(), Q_R3A.lda(), R3.data(), R3.lda()); diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp index 19f9fb7e9420..67acf2c371ee 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp @@ -274,7 +274,7 @@ namespace TSQR { "MessengerBase instance."); VecVec Q_factors, 
tau_arrays; DistTsqrHelper helper; - const ordinal_type ncols = R_mine.ncols(); + const ordinal_type ncols = R_mine.extent(1); std::vector< scalar_type > R_local (ncols*ncols); copy_matrix (ncols, ncols, R_local.data(), ncols, R_mine.data(), R_mine.lda()); diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp index a3f678b471e5..03d7db562680 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp @@ -106,20 +106,20 @@ namespace TSQR { void force (mat_view_type Q_mine, mat_view_type R_mine) { typedef Teuchos::ScalarTraits STS; - if (Q_mine.nrows() > 0 && Q_mine.ncols() > 0) { - for (int k = 0; k < R_mine.ncols(); ++k) { + if (Q_mine.extent(0) > 0 && Q_mine.extent(1) > 0) { + for (int k = 0; k < R_mine.extent(1); ++k) { if (R_mine(k,k) < STS::zero()) { // Scale column k of Q_mine. We use a raw pointer since // typically there are many rows in Q_mine, so this // operation should be fast. Scalar* const Q_k = &Q_mine(0,k); - for (int i = 0; i < Q_mine.nrows(); ++i) { + for (int i = 0; i < Q_mine.extent(0); ++i) { Q_k[i] = -Q_k[i]; } // Scale row k of R_mine. R_mine is upper triangular, // so we only have to scale right of (and including) the // diagonal entry. - for (int j = k; j < R_mine.ncols(); ++j) { + for (int j = k; j < R_mine.extent(1); ++j) { R_mine(k,j) = -R_mine(k,j); } } @@ -244,21 +244,21 @@ namespace TSQR { // R_mine has columns, but Q_mine may have any number of // columns. (It depends on how many columns of the explicit Q // factor we want to compute.) 
- if (R_mine.nrows() < R_mine.ncols()) + if (R_mine.extent(0) < R_mine.extent(1)) { std::ostringstream os; - os << "R factor input has fewer rows (" << R_mine.nrows() - << ") than columns (" << R_mine.ncols() << ")"; + os << "R factor input has fewer rows (" << R_mine.extent(0) + << ") than columns (" << R_mine.extent(1) << ")"; // This is a logic error because TSQR users should not be // calling this method directly. throw std::logic_error (os.str()); } - else if (Q_mine.nrows() != R_mine.ncols()) + else if (Q_mine.extent(0) != R_mine.extent(1)) { std::ostringstream os; os << "Q factor input must have the same number of rows as the R " - "factor input has columns. Q has " << Q_mine.nrows() - << " rows, but R has " << R_mine.ncols() << " columns."; + "factor input has columns. Q has " << Q_mine.extent(0) + << " rows, but R has " << R_mine.extent(1) << " columns."; // This is a logic error because TSQR users should not be // calling this method directly. throw std::logic_error (os.str()); @@ -310,11 +310,11 @@ namespace TSQR { Q_mine.fill (scalar_type (0)); if (messenger_->rank() == 0) { - for (ordinal_type j = 0; j < Q_mine.ncols(); ++j) + for (ordinal_type j = 0; j < Q_mine.extent(1); ++j) Q_mine(j, j) = scalar_type (1); } // Scratch space for computing results to send to other processors. - matrix_type Q_other (Q_mine.nrows(), Q_mine.ncols(), scalar_type (0)); + matrix_type Q_other (Q_mine.extent(0), Q_mine.extent(1), scalar_type (0)); const rank_type numSteps = QFactors.size() - 1; { @@ -387,7 +387,7 @@ namespace TSQR { // This only does anything if P_mine is either P_first or P_mid. if (P_mine == P_first) { - const ordinal_type numCols = R_mine.ncols(); + const ordinal_type numCols = R_mine.extent(1); matrix_type R_other (numCols, numCols); recv_R (R_other, P_mid); @@ -457,11 +457,11 @@ namespace TSQR { // Apply implicitly stored local Q factor to // [Q_mine; // Q_other] - // where Q_other = zeros(Q_mine.nrows(), Q_mine.ncols()). 
+ // where Q_other = zeros(Q_mine.extent(0), Q_mine.extent(1)). // Overwrite both Q_mine and Q_other with the result. Q_other.fill (scalar_type (0)); combine_.apply_pair (ApplyType::NoTranspose, - Q_mine.ncols(), Q_impl.ncols(), + Q_mine.extent(1), Q_impl.extent(1), Q_impl.data(), Q_impl.lda(), tau.data(), Q_mine.data(), Q_mine.lda(), Q_other.data(), Q_other.lda(), work_.data()); @@ -493,8 +493,8 @@ namespace TSQR { { StatTimeMonitor bcastCommMonitor (*bcastCommTime_, bcastCommStats_); - const ordinal_type R_numCols = R.ncols(); - const ordinal_type Q_size = Q.nrows() * Q.ncols(); + const ordinal_type R_numCols = R.extent(1); + const ordinal_type Q_size = Q.extent(0) * Q.extent(1); const ordinal_type R_size = (R_numCols * (R_numCols + 1)) / 2; const ordinal_type numElts = Q_size + R_size; @@ -504,7 +504,7 @@ namespace TSQR { resizeWork (numElts); // Pack the Q data into the workspace array. - mat_view_type Q_contig (Q.nrows(), Q.ncols(), work_.data(), Q.nrows()); + mat_view_type Q_contig (Q.extent(0), Q.extent(1), work_.data(), Q.extent(0)); deep_copy (Q_contig, Q); // Pack the R data into the workspace array. pack_R (R, &work_[Q_size]); @@ -519,8 +519,8 @@ namespace TSQR { { StatTimeMonitor bcastCommMonitor (*bcastCommTime_, bcastCommStats_); - const ordinal_type R_numCols = R.ncols(); - const ordinal_type Q_size = Q.nrows() * Q.ncols(); + const ordinal_type R_numCols = R.extent(1); + const ordinal_type Q_size = Q.extent(0) * Q.extent(1); const ordinal_type R_size = (R_numCols * (R_numCols + 1)) / 2; const ordinal_type numElts = Q_size + R_size; @@ -532,7 +532,7 @@ namespace TSQR { messenger_->recv (work_.data(), numElts, srcProc, 0); // Unpack the C data from the workspace array. - deep_copy (Q, mat_view_type (Q.nrows(), Q.ncols(), work_.data(), Q.nrows())); + deep_copy (Q, mat_view_type (Q.extent(0), Q.extent(1), work_.data(), Q.extent(0))); // Unpack the R data from the workspace array. 
unpack_R (R, &work_[Q_size]); } @@ -543,7 +543,7 @@ namespace TSQR { { StatTimeMonitor reduceCommMonitor (*reduceCommTime_, reduceCommStats_); - const ordinal_type numCols = R.ncols(); + const ordinal_type numCols = R.extent(1); const ordinal_type numElts = (numCols * (numCols+1)) / 2; // Don't shrink the workspace array; doing so would still be @@ -561,7 +561,7 @@ namespace TSQR { { StatTimeMonitor reduceCommMonitor (*reduceCommTime_, reduceCommStats_); - const ordinal_type numCols = R.ncols(); + const ordinal_type numCols = R.extent(1); const ordinal_type numElts = (numCols * (numCols+1)) / 2; // Don't shrink the workspace array; doing so would still be @@ -578,7 +578,7 @@ namespace TSQR { unpack_R (MatrixType& R, const scalar_type buf[]) { ordinal_type curpos = 0; - for (ordinal_type j = 0; j < R.ncols(); ++j) + for (ordinal_type j = 0; j < R.extent(1); ++j) { scalar_type* const R_j = &R(0, j); for (ordinal_type i = 0; i <= j; ++i) @@ -591,7 +591,7 @@ namespace TSQR { pack_R (const ConstMatrixType& R, scalar_type buf[]) { ordinal_type curpos = 0; - for (ordinal_type j = 0; j < R.ncols(); ++j) + for (ordinal_type j = 0; j < R.extent(1); ++j) { const scalar_type* const R_j = &R(0, j); for (ordinal_type i = 0; i <= j; ++i) diff --git a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp index 2a8634f6a263..0868c525ede8 100644 --- a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp @@ -240,9 +240,10 @@ namespace TSQR { // this process is in A_local) with the given singular values. // This part has O(P) communication for P MPI processes. using TSQR::Random::randomGlobalMatrix; - // Help the C++ compiler with type inference. - mat_view_type A_local_view (A_local.nrows(), A_local.ncols(), A_local.data(), A_local.lda()); - const magnitude_type* const singVals = (numCols == 0) ? 
NULL : &singularValues[0]; + mat_view_type A_local_view (A_local.extent(0), + A_local.extent(1), + A_local.data(), A_local.lda()); + const magnitude_type* const singVals = singularValues.data(); randomGlobalMatrix (&gen, A_local_view, singVals, ordinalMessenger.getRawPtr(), scalarMessenger.getRawPtr()); @@ -274,7 +275,7 @@ namespace TSQR { // factoring the matrix, when only the explicit Q factor is // wanted. if (testFactorExplicit) { - tsqr->factorExplicitRaw (A_copy.nrows (), A_copy.ncols (), + tsqr->factorExplicitRaw (A_copy.extent (0), A_copy.extent (1), A_copy.data (), A_copy.lda (), Q_local.data (), Q_local.lda (), R.data (), R.lda (), @@ -321,7 +322,7 @@ namespace TSQR { // actual numerical rank. const magnitude_type tol = STM::zero(); const ordinal_type rank = - tsqr->revealRankRaw (Q_local.nrows (), Q_local.ncols (), + tsqr->revealRankRaw (Q_local.extent (0), Q_local.extent (1), Q_local.data (), Q_local.lda (), R.data (), R.lda (), tol, contiguousCacheBlocks); diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp index fd904a8305ca..15e744452b9d 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp @@ -168,12 +168,12 @@ namespace TSQR { const mat_view_type& A_top, std::vector& work) const { - std::vector tau (A_top.ncols()); + std::vector tau (A_top.extent(1)); - // We should only call this if A_top.ncols() > 0 and therefore + // We should only call this if A_top.extent(1) > 0 and therefore // work.size() > 0, but we've already checked for that, so we // don't have to check again. 
- combine.factor_first (A_top.nrows(), A_top.ncols(), A_top.data(), + combine.factor_first (A_top.extent(0), A_top.extent(1), A_top.data(), A_top.lda(), tau.data(), work.data()); return tau; } @@ -184,12 +184,12 @@ namespace TSQR { const mat_view_type& A_cur, std::vector& work) const { - std::vector tau (A_top.ncols()); + std::vector tau (A_top.extent(1)); - // We should only call this if A_top.ncols() > 0 and therefore + // We should only call this if A_top.extent(1) > 0 and therefore // tau.size() > 0 and work.size() > 0, but we've already // checked for that, so we don't have to check again. - combine.factor_inner (A_cur.nrows(), A_top.ncols(), + combine.factor_inner (A_cur.extent(0), A_top.extent(1), A_top.data(), A_top.lda(), A_cur.data(), A_cur.lda(), tau.data(), work.data()); @@ -211,7 +211,7 @@ namespace TSQR { // Workspace is created here, because it must not be shared // among threads. - std::vector work (A_.ncols()); + std::vector work (A_.extent(1)); // Range of cache blocks to factor. cb_range_type cbRange (A_, strategy_, cbIndices.first, @@ -339,7 +339,7 @@ namespace TSQR { } else { const std::pair cbIndices = - cacheBlockIndexRange (A_.nrows(), A_.ncols(), partitionIndex, + cacheBlockIndexRange (A_.extent(0), A_.extent(1), partitionIndex, numPartitions_, strategy_); // It's legitimate, though suboptimal, for some partitions // not to get any work to do (in this case, not to get any @@ -385,17 +385,17 @@ namespace TSQR { const mat_view_type& C_top, std::vector& work) const { - TEUCHOS_TEST_FOR_EXCEPTION(tau.size() < static_cast (Q_top.ncols()), + TEUCHOS_TEST_FOR_EXCEPTION(tau.size() < static_cast (Q_top.extent(1)), std::logic_error, "ApplyFirstPass::applyFirstCacheBlock: tau.size() " "(= " << tau.size() << ") < number of columns " - << Q_top.ncols() << " in the Q factor. Please " + << Q_top.extent(1) << " in the Q factor. 
Please " "report this bug to the Kokkos developers."); // If we get this far, it's fair to assume that we have // checked whether tau and work have nonzero lengths. - combine.apply_first (applyType, C_top.nrows(), C_top.ncols(), - Q_top.ncols(), Q_top.data(), Q_top.lda(), + combine.apply_first (applyType, C_top.extent(0), C_top.extent(1), + Q_top.extent(1), Q_top.data(), Q_top.lda(), tau.data(), C_top.data(), C_top.lda(), work.data()); } @@ -409,16 +409,16 @@ namespace TSQR { std::vector& work) const { TEUCHOS_TEST_FOR_EXCEPTION - (tau.size() < static_cast (Q_cur.ncols()), + (tau.size() < static_cast (Q_cur.extent(1)), std::logic_error, "ApplyFirstPass::applyCacheBlock: tau.size() " "(= " << tau.size() << ") < number of columns " - << Q_cur.ncols() << " in the Q factor." + << Q_cur.extent(1) << " in the Q factor." " Please report this bug to the Tpetra developers."); // If we get this far, it's fair to assume that we have // checked whether tau and work have nonzero lengths. - combine.apply_inner (applyType, C_cur.nrows(), C_cur.ncols(), - Q_cur.ncols(), Q_cur.data(), Q_cur.lda(), + combine.apply_inner (applyType, C_cur.extent(0), C_cur.extent(1), + Q_cur.extent(1), Q_cur.data(), Q_cur.lda(), tau.data(), C_top.data(), C_top.lda(), C_cur.data(), C_cur.lda(), @@ -466,11 +466,11 @@ namespace TSQR { "indices [" << cbIndices.first << ", " << cbIndices.second << ") is not empty." << suffix); - // Task-local workspace array of length C_.ncols(). Workspace + // Task-local workspace array of length C_.extent(1). Workspace // must be per task, else there will be race conditions as // different tasks attempt to write to and read from the same // workspace simultaneously. 
- std::vector work (C_.ncols()); + std::vector work (C_.extent(1)); Combine combine; if (applyType.transposed ()) { @@ -492,7 +492,7 @@ namespace TSQR { if (explicitQ_) { C_top.fill (Scalar {}); if (partitionIndex == 0) { - for (LocalOrdinal j = 0; j < C_top.ncols(); ++j) { + for (LocalOrdinal j = 0; j < C_top.extent(1); ++j) { C_top(j,j) = Scalar (1.0); } } @@ -536,9 +536,9 @@ namespace TSQR { // internode part of the Q factor via DistTsqr). However, // we still need to fill the rest of C_top (everything but // the top ncols rows of C_top) with zeros. - mat_view_type C_top_rest (C_top.nrows() - C_top.ncols(), - C_top.ncols(), - C_top.data() + C_top.ncols(), + mat_view_type C_top_rest (C_top.extent(0) - C_top.extent(1), + C_top.extent(1), + C_top.data() + C_top.extent(1), C_top.lda()); C_top_rest.fill (Scalar {}); } @@ -662,7 +662,7 @@ namespace TSQR { // We use the same cache block indices for Q and for C. std::pair cbIndices = - cacheBlockIndexRange (Q_.nrows(), Q_.ncols(), partitionIndex, + cacheBlockIndexRange (Q_.extent(0), Q_.extent(1), partitionIndex, numPartitions_, strategy_); if (cbIndices.second <= cbIndices.first) return; @@ -671,15 +671,15 @@ namespace TSQR { size_t (cbIndices.second)); TEUCHOS_TEST_FOR_EXCEPTION (cbIndices.first < LocalOrdinal(0), std::logic_error, - prefix << "cacheBlockIndexRange(" << Q_.nrows () << ", " - << Q_.ncols() << ", " << partitionIndex << ", " + prefix << "cacheBlockIndexRange(" << Q_.extent (0) << ", " + << Q_.extent(1) << ", " << partitionIndex << ", " << numPartitions_ << ", strategy) returned a cache block " "range " << cbIndices.first << "," << cbIndices.second << " with negative starting index." 
<< suffix); TEUCHOS_TEST_FOR_EXCEPTION (cbInds.second > tauArrays_.size (), std::logic_error, - prefix << "cacheBlockIndexRange(" << Q_.nrows () << ", " - << Q_.ncols() << ", " << partitionIndex << ", " + prefix << "cacheBlockIndexRange(" << Q_.extent (0) << ", " + << Q_.extent(1) << ", " << partitionIndex << ", " << numPartitions_ << ", strategy) returned a cache block " "range" << cbIndices.first << "," << cbIndices.second << " with starting index larger than the number of tau " @@ -760,14 +760,14 @@ namespace TSQR { unblock_ (unblock) { TEUCHOS_TEST_FOR_EXCEPTION - (A_in_.nrows() != A_out_.nrows() || - A_in_.ncols() != A_out_.ncols(), + (A_in_.extent(0) != A_out_.extent(0) || + A_in_.extent(1) != A_out_.extent(1), std::invalid_argument, "A_in and A_out do not have the same dimensions: " - "A_in is " << A_in_.nrows() << " by " - << A_in_.ncols() << ", but A_out is " - << A_out_.nrows() << " by " - << A_out_.ncols() << "."); + "A_in is " << A_in_.extent(0) << " by " + << A_in_.extent(1) << ", but A_out is " + << A_out_.extent(0) << " by " + << A_out_.extent(1) << "."); TEUCHOS_TEST_FOR_EXCEPTION (numPartitions_ < 1, std::invalid_argument, "The number of partitions " << numPartitions_ @@ -788,7 +788,7 @@ namespace TSQR { else { using index_range_type = std::pair; const index_range_type cbIndices = - cacheBlockIndexRange (A_in_.nrows (), A_in_.ncols (), + cacheBlockIndexRange (A_in_.extent (0), A_in_.extent (1), partitionIndex, numPartitions_, strategy_); // It's perfectly legal for a partitioning to assign zero // cache block indices to a particular partition. In that @@ -837,16 +837,16 @@ namespace TSQR { Matrix& Q_temp) const { using Teuchos::NO_TRANS; - const LocalOrdinal numCols = Q_cur.ncols (); + const LocalOrdinal numCols = Q_cur.extent (1); // GEMM doesn't like aliased arguments, so we use a copy. We // only copy the current cache block, rather than all of Q; // this saves memory. 
- Q_temp.reshape (Q_cur.nrows (), numCols); + Q_temp.reshape (Q_cur.extent (0), numCols); deep_copy (Q_temp, Q_cur); // Q_cur := Q_temp * B. - blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.nrows(), numCols, numCols, + blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.extent(0), numCols, numCols, Scalar (1.0), Q_temp.data(), Q_temp.lda(), B_.data(), B_.lda(), Scalar(0), Q_cur.data(), Q_cur.lda()); @@ -909,7 +909,7 @@ namespace TSQR { else { typedef std::pair index_range_type; const index_range_type cbIndices = - cacheBlockIndexRange (Q_.nrows (), Q_.ncols (), partitionIndex, + cacheBlockIndexRange (Q_.extent (0), Q_.extent (1), partitionIndex, numPartitions_, strategy_); if (cbIndices.first >= cbIndices.second) { return; @@ -987,7 +987,7 @@ namespace TSQR { else { typedef std::pair index_range_type; const index_range_type cbIndices = - cacheBlockIndexRange (A_.nrows(), A_.ncols(), partitionIndex, + cacheBlockIndexRange (A_.extent(0), A_.extent(1), partitionIndex, numPartitions_, strategy_); if (cbIndices.first >= cbIndices.second) { return; @@ -1423,9 +1423,9 @@ namespace TSQR { return FactorOutput (0, 0); } const LO numRowsPerCacheBlock = - strategy_.cache_block_num_rows (A.ncols()); + strategy_.cache_block_num_rows (A.extent(1)); const LO numCacheBlocks = - strategy_.num_cache_blocks (A.nrows(), A.ncols(), numRowsPerCacheBlock); + strategy_.num_cache_blocks (A.extent(0), A.extent(1), numRowsPerCacheBlock); // // Compute the first factorization pass (over partitions). // @@ -1453,11 +1453,11 @@ namespace TSQR { (R_top.empty (), std::logic_error, prefix << "After " "factorSecondPass: result.topBlocks[0] is an empty view." << suffix); - mat_view_type R_top_square (R_top.ncols(), R_top.ncols(), + mat_view_type R_top_square (R_top.extent(1), R_top.extent(1), R_top.data(), R_top.lda()); R.fill (Scalar {}); // Only copy the upper triangle of R_top into R. 
- copy_upper_triangle (R.ncols(), R.ncols(), R.data(), R.lda(), + copy_upper_triangle (R.extent(1), R.extent(1), R.data(), R.lda(), R_top.data(), R_top.lda()); return result; } @@ -1496,12 +1496,12 @@ namespace TSQR { { using index_range_type = std::pair; using blocker_type = CacheBlocker; - blocker_type C_blocker (C.nrows(), C.ncols(), strategy_); + blocker_type C_blocker (C.extent(0), C.extent(1), strategy_); // For each partition, collect its top block of C. for (int partIdx = 0; partIdx < numParts; ++partIdx) { const index_range_type cbIndices = - cacheBlockIndexRange (C.nrows(), C.ncols(), partIdx, + cacheBlockIndexRange (C.extent(0), C.extent(1), partIdx, numParts, strategy_); if (cbIndices.first >= cbIndices.second) { topBlocksOfC[partIdx] = mat_view_type (0, 0, nullptr, 0); @@ -1541,11 +1541,11 @@ namespace TSQR { (work_.size() == 0, std::logic_error, "Workspace array work_ has length zero."); TEUCHOS_TEST_FOR_EXCEPTION - (work_.size() < size_t (R_top.ncols()), std::logic_error, + (work_.size() < size_t (R_top.extent(1)), std::logic_error, "Workspace array work_ has length = " << work_.size() - << " < R_top.ncols() = " << R_top.ncols() << "."); + << " < R_top.extent(1) = " << R_top.extent(1) << "."); - std::vector tau (R_top.ncols ()); + std::vector tau (R_top.extent (1)); // Our convention for such helper methods is for the immediate // parent to allocate workspace (the work_ array in this case). @@ -1553,7 +1553,7 @@ namespace TSQR { // The statement below only works if R_top and R_bot have a // nonzero (and the same) number of columns, but we have already // checked that above. - combine_.factor_pair (R_top.ncols(), R_top.data(), R_top.lda(), + combine_.factor_pair (R_top.extent(1), R_top.data(), R_top.lda(), R_bot.data(), R_bot.lda(), tau.data(), work_.data()); return tau; @@ -1587,7 +1587,7 @@ namespace TSQR { // However, other partitions besides the top one might be empty, // in which case their top blocks will be empty. 
We skip over // the empty partitions in the loop below. - work_.resize (size_t (topBlocks[0].ncols())); + work_.resize (size_t (topBlocks[0].extent(1))); for (int partIdx = 1; partIdx < numPartitions; ++partIdx) { if (! topBlocks[partIdx].empty ()) { tauArrays[partIdx-1] = factorPair (topBlocks[0], topBlocks[partIdx]); @@ -1608,7 +1608,7 @@ namespace TSQR { // The statement below only works if C_top, R_bot, and C_bot // have a nonzero (and the same) number of columns, but we have // already checked that above. - combine_.apply_pair (applyType, C_top.ncols(), R_bot.ncols(), + combine_.apply_pair (applyType, C_top.extent(1), R_bot.extent(1), R_bot.data(), R_bot.lda(), tau.data(), C_top.data(), C_top.lda(), C_bot.data(), C_bot.lda(), work_.data()); @@ -1639,7 +1639,7 @@ namespace TSQR { << factorOutput.secondPassTauArrays.size() << ") != number of partitions minus 1 (= " << (numParts-1) << ")." << suffix); - const LocalOrdinal numCols = topBlocksOfC[0].ncols(); + const LocalOrdinal numCols = topBlocksOfC[0].extent(1); work_.resize (size_t (numCols)); // Top blocks of C are the whole cache blocks. We only want to @@ -1667,7 +1667,7 @@ namespace TSQR { } } else { // In non-transposed mode, when computing the first - // C.ncols() columns of the explicit Q factor, intranode + // C.extent(1) columns of the explicit Q factor, intranode // TSQR would run after internode TSQR (i.e., DistTsqr) // (even if only running on a single node in non-MPI mode). // Therefore, internode TSQR is responsible for filling the @@ -1719,7 +1719,7 @@ namespace TSQR { const bool contiguous_cache_blocks) const { typedef CacheBlocker blocker_type; - blocker_type blocker (C.nrows(), C.ncols(), strategy_); + blocker_type blocker (C.extent(0), C.extent(1), strategy_); // C_top_block is a view of the topmost cache block of C. 
// C_top_block should have >= ncols rows, otherwise either cache diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp index ba3e195e42f0..a54c64d95eb8 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp @@ -148,9 +148,9 @@ namespace TSQR { if (debug) { cerr << "-- Generated test problem" << endl; // Don't print the matrix if it's too big. - if (A.nrows() <= 30) { + if (A.extent(0) <= 30) { cerr << "A = " << endl; - print_local_matrix (cerr, A.nrows(), A.ncols(), + print_local_matrix (cerr, A.extent(0), A.extent(1), A.data(), A.lda()); cerr << endl << endl; } @@ -164,9 +164,9 @@ namespace TSQR { if (debug) { cerr << "-- Copied test problem from A into A_copy" << endl; // Don't print the matrix if it's too big. - if (A_copy.nrows() <= 30) { + if (A_copy.extent(0) <= 30) { cerr << "A_copy = " << endl; - print_local_matrix (cerr, A_copy.nrows(), A_copy.ncols(), + print_local_matrix (cerr, A_copy.extent(0), A_copy.extent(1), A_copy.data(), A_copy.lda()); cerr << endl << endl; } @@ -178,9 +178,9 @@ namespace TSQR { cerr << "-- Reorganized test matrix to have contiguous " "cache blocks" << endl; // Don't print the matrix if it's too big. - if (A_copy.nrows() <= 30) { + if (A_copy.extent(0) <= 30) { cerr << "A_copy = " << endl; - print_local_matrix (cerr, A_copy.nrows(), A_copy.ncols(), + print_local_matrix (cerr, A_copy.extent(0), A_copy.extent(1), A_copy.data(), A_copy.lda()); cerr << endl << endl; } @@ -203,12 +203,12 @@ namespace TSQR { cerr << "*** Cache blocking test failed! A != A2 ***" << endl << endl; // Don't print the matrices if they are too big. 
- if (A.nrows() <= 30 && A2.nrows() <= 30) { + if (A.extent(0) <= 30 && A2.extent(0) <= 30) { cerr << "A = " << endl; - print_local_matrix (cerr, A.nrows(), A.ncols(), + print_local_matrix (cerr, A.extent(0), A.extent(1), A.data(), A.lda()); cerr << endl << "A2 = " << endl; - print_local_matrix (cerr, A2.nrows(), A2.ncols(), + print_local_matrix (cerr, A2.extent(0), A2.extent(1), A2.data(), A2.lda()); cerr << endl; } @@ -244,10 +244,10 @@ namespace TSQR { { mat_view_type Q_top = actor.top_block (Q.view (), contiguousCacheBlocks); - mat_view_type Q_top_square (Q_top.ncols(), Q_top.ncols(), + mat_view_type Q_top_square (Q_top.extent(1), Q_top.extent(1), Q_top.data(), Q_top.lda()); Q_top_square.fill (Scalar {}); - for (Ordinal j = 0; j < Q_top_square.ncols(); ++j) { + for (Ordinal j = 0; j < Q_top_square.extent(1); ++j) { Q_top_square(j,j) = Scalar (1.0); } } @@ -275,9 +275,9 @@ namespace TSQR { // Print out the Q and R factors in debug mode. if (debug) { // Don't print the matrix if it's too big. 
- if (Q.nrows() <= 30) { + if (Q.extent(0) <= 30) { cerr << endl << "-- Q factor:" << endl; - print_local_matrix (cerr, Q.nrows(), Q.ncols(), + print_local_matrix (cerr, Q.extent(0), Q.extent(1), Q.data(), Q.lda()); cerr << endl << endl; } diff --git a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp index 94d146e1a513..0bd6fce365db 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp @@ -59,15 +59,16 @@ namespace TSQR { void deep_copy (MatrixViewType1& A, const MatrixViewType2& B) { - const ptrdiff_t A_nrows (A.nrows ()); - const ptrdiff_t A_ncols (A.ncols ()); - if (A_nrows != ptrdiff_t (B.nrows ()) || - A_ncols != ptrdiff_t (B.ncols ())) { + const ptrdiff_t A_nrows (A.extent (0)); + const ptrdiff_t A_ncols (A.extent (1)); + if (A_nrows != ptrdiff_t (B.extent (0)) || + A_ncols != ptrdiff_t (B.extent (1))) { using std::endl; std::ostringstream os; - os << "deep_copy: dimensions of A (output matrix) and B (input matrix) " - << "are not compatible. A is " << A.nrows () << " x " << A.ncols () - << ", but B is " << B.nrows () << " x " << B.ncols () << "."; + os << "deep_copy: dimensions of A (output matrix) and B (input " + "matrix) are not compatible. 
A is " << A.extent (0) << " x " + << A.extent (1) << ", but B is " << B.extent (0) << " x " + << B.extent (1) << "."; throw std::invalid_argument(os.str()); } for (ptrdiff_t j = 0; j < A_ncols; ++j) { @@ -84,12 +85,12 @@ namespace TSQR { matrix_equal (const FirstMatrixViewType& A, const SecondMatrixViewType& B) { - if (A.nrows() != B.nrows() || A.ncols() != B.ncols()) { + if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { return false; } - const ptrdiff_t nrows (A.nrows()); + const ptrdiff_t nrows (A.extent(0)); const ptrdiff_t A_lda (A.lda()); - const ptrdiff_t ncols (A.ncols()); + const ptrdiff_t ncols (A.extent(1)); const ptrdiff_t B_lda (B.lda()); const auto* A_j = A.data(); const auto* B_j = B.data(); @@ -196,18 +197,18 @@ namespace TSQR { { #ifdef TSQR_MATVIEW_DEBUG if (std::numeric_limits< Ordinal >::is_signed) { - if (i < 0 || i >= nrows()) { + if (i < 0 || i >= extent(0)) { throw std::invalid_argument("Row range invalid"); } - else if (j < 0 || j >= ncols()) { + else if (j < 0 || j >= extent(1)) { throw std::invalid_argument("Column range invalid"); } } else { - if (i >= nrows()) { + if (i >= extent(0)) { throw std::invalid_argument("Row range invalid"); } - else if (j >= ncols()) { + else if (j >= extent(1)) { throw std::invalid_argument("Column range invalid"); } } @@ -218,8 +219,10 @@ namespace TSQR { return A_[i + j*lda()]; } - Ordinal nrows() const { return nrows_; } - Ordinal ncols() const { return ncols_; } + constexpr Ordinal extent(const int r) const noexcept { + return r == 0 ? nrows_ : (r == 1 ? ncols_ : Ordinal(0)); + } + Ordinal lda() const { return lda_; } /// \note The function is const, only because returning A_ doesn't @@ -227,7 +230,7 @@ namespace TSQR { /// resulting pointer to fiddle with entries in the matrix, but /// that doesn't affect the MatView's properties. 
pointer_type data() const { return A_; } - bool empty() const { return nrows() == 0 || ncols() == 0; } + bool empty() const { return extent(0) == 0 || extent(1) == 0; } /// Return a "row block" (submatrix of consecutive rows in the /// inclusive range [firstRow,lastRow]). @@ -235,17 +238,17 @@ namespace TSQR { { #ifdef TSQR_MATVIEW_DEBUG if (std::numeric_limits< Ordinal >::is_signed) { - if (firstRow < 0 || firstRow > lastRow || lastRow >= nrows()) { + if (firstRow < 0 || firstRow > lastRow || lastRow >= extent(0)) { throw std::invalid_argument ("Row range invalid"); } } else { - if (firstRow > lastRow || lastRow >= nrows()) { + if (firstRow > lastRow || lastRow >= extent(0)) { throw std::invalid_argument ("Row range invalid"); } } #endif // TSQR_MATVIEW_DEBUG - return MatView (lastRow - firstRow + 1, ncols(), data() + firstRow, lda()); + return MatView (lastRow - firstRow + 1, extent(1), data() + firstRow, lda()); } /// Split off and return the top cache block of nrows_top rows. @@ -273,23 +276,23 @@ namespace TSQR { os << "nrows_top (= " << nrows_top << ") < 0"; throw std::invalid_argument (os.str()); } - else if (nrows_top > nrows()) + else if (nrows_top > extent(0)) { std::ostringstream os; - os << "nrows_top (= " << nrows_top << ") > nrows (= " << nrows() << ")"; + os << "nrows_top (= " << nrows_top << ") > nrows (= " << extent(0) << ")"; throw std::invalid_argument (os.str()); } #endif // TSQR_MATVIEW_DEBUG Scalar* const A_top_ptr = data(); Scalar* A_rest_ptr; - const Ordinal nrows_rest = nrows() - nrows_top; + const Ordinal nrows_rest = extent(0) - nrows_top; Ordinal lda_top, lda_rest; if (b_contiguous_blocks) { lda_top = nrows_top; lda_rest = nrows_rest; - A_rest_ptr = A_top_ptr + nrows_top * ncols(); + A_rest_ptr = A_top_ptr + nrows_top * extent(1); } else { @@ -297,7 +300,7 @@ namespace TSQR { lda_rest = lda(); A_rest_ptr = A_top_ptr + nrows_top; } - MatView A_top (nrows_top, ncols(), data(), lda_top); + MatView A_top (nrows_top, extent(1), data(), 
lda_top); A_ = A_rest_ptr; nrows_ = nrows_rest; lda_ = lda_rest; @@ -313,19 +316,19 @@ namespace TSQR { #ifdef TSQR_MATVIEW_DEBUG if (std::numeric_limits< Ordinal >::is_signed && nrows_bottom < 0) throw std::invalid_argument ("nrows_bottom < 0"); - if (nrows_bottom > nrows()) + if (nrows_bottom > extent(0)) throw std::invalid_argument ("nrows_bottom > nrows"); #endif // TSQR_MATVIEW_DEBUG Scalar* const A_rest_ptr = data(); Scalar* A_bottom_ptr; - const Ordinal nrows_rest = nrows() - nrows_bottom; + const Ordinal nrows_rest = extent(0) - nrows_bottom; Ordinal lda_bottom, lda_rest; if (b_contiguous_blocks) { lda_bottom = nrows_bottom; - lda_rest = nrows() - nrows_bottom; - A_bottom_ptr = A_rest_ptr + nrows_rest * ncols(); + lda_rest = extent(0) - nrows_bottom; + A_bottom_ptr = A_rest_ptr + nrows_rest * extent(1); } else { @@ -333,7 +336,7 @@ namespace TSQR { lda_rest = lda(); A_bottom_ptr = A_rest_ptr + nrows_rest; } - MatView A_bottom (nrows_bottom, ncols(), A_bottom_ptr, lda_bottom); + MatView A_bottom (nrows_bottom, extent(1), A_bottom_ptr, lda_bottom); A_ = A_rest_ptr; nrows_ = nrows_rest; lda_ = lda_rest; @@ -344,8 +347,8 @@ namespace TSQR { void fill (const scalar_type& value) { - const ordinal_type num_rows = nrows(); - const ordinal_type num_cols = ncols(); + const ordinal_type num_rows = extent(0); + const ordinal_type num_cols = extent(1); const ordinal_type stride = lda(); scalar_type* A_j = data(); @@ -355,12 +358,12 @@ namespace TSQR { } bool operator== (const MatView& rhs) const { - return nrows() == rhs.nrows() && ncols() == rhs.ncols() && + return extent(0) == rhs.extent(0) && extent(1) == rhs.extent(1) && lda() == rhs.lda() && data() == rhs.data(); } bool operator!= (const MatView& rhs) const { - return nrows() != rhs.nrows() || ncols() != rhs.ncols() || + return extent(0) != rhs.extent(0) || extent(1) != rhs.extent(1) || lda() != rhs.lda() || data() != rhs.data(); } @@ -406,8 +409,8 @@ namespace TSQR { } ConstMatView (const ConstMatView& view) : - 
nrows_(view.nrows()), - ncols_(view.ncols()), + nrows_(view.extent(0)), + ncols_(view.extent(1)), lda_(view.lda()), A_(view.data()) {} @@ -415,8 +418,8 @@ namespace TSQR { //! Assignment operator: Does a shallow (pointer) copy. ConstMatView& operator= (const ConstMatView& view) { if (this != &view) { - nrows_ = view.nrows(); - ncols_ = view.ncols(); + nrows_ = view.extent(0); + ncols_ = view.extent(1); lda_ = view.lda(); A_ = view.data(); } @@ -427,18 +430,18 @@ namespace TSQR { { #ifdef TSQR_MATVIEW_DEBUG if (std::numeric_limits< Ordinal >::is_signed) { - if (i < 0 || i >= nrows()) { + if (i < 0 || i >= extent(0)) { throw std::invalid_argument("Row range invalid"); } - else if (j < 0 || j >= ncols()) { + else if (j < 0 || j >= extent(1)) { throw std::invalid_argument("Column range invalid"); } } else { - if (i >= nrows()) { + if (i >= extent(0)) { throw std::invalid_argument("Row range invalid"); } - else if (j >= ncols()) { + else if (j >= extent(1)) { throw std::invalid_argument("Column range invalid"); } } @@ -449,11 +452,15 @@ namespace TSQR { return A_[i + j*lda()]; } - Ordinal nrows() const { return nrows_; } - Ordinal ncols() const { return ncols_; } + constexpr Ordinal extent(const int r) const noexcept { + return r == 0 ? nrows_ : (r == 1 ? ncols_ : Ordinal(0)); + } + Ordinal lda() const { return lda_; } + pointer_type data() const { return A_; } - bool empty() const { return nrows() == 0 || ncols() == 0; } + + bool empty() const { return extent(0) == 0 || extent(1) == 0; } /// Return a "row block" (submatrix of consecutive rows in the /// inclusive range [firstRow,lastRow]). 
@@ -461,10 +468,10 @@ namespace TSQR { const Ordinal lastRow) const { #ifdef TSQR_MATVIEW_DEBUG - if (firstRow < 0 || lastRow >= nrows()) + if (firstRow < 0 || lastRow >= extent(0)) throw std::invalid_argument ("Row range invalid"); #endif // TSQR_MATVIEW_DEBUG - return ConstMatView (lastRow - firstRow + 1, ncols(), data() + firstRow, lda()); + return ConstMatView (lastRow - firstRow + 1, extent(1), data() + firstRow, lda()); } @@ -489,19 +496,19 @@ namespace TSQR { #ifdef TSQR_MATVIEW_DEBUG if (std::numeric_limits< Ordinal >::is_signed && nrows_top < 0) throw std::invalid_argument ("nrows_top < 0"); - if (nrows_top > nrows()) + if (nrows_top > extent(0)) throw std::invalid_argument ("nrows_top > nrows"); #endif // TSQR_MATVIEW_DEBUG pointer_type const A_top_ptr = data(); pointer_type A_rest_ptr; - const Ordinal nrows_rest = nrows() - nrows_top; + const Ordinal nrows_rest = extent(0) - nrows_top; Ordinal lda_top, lda_rest; if (b_contiguous_blocks) { lda_top = nrows_top; lda_rest = nrows_rest; - A_rest_ptr = A_top_ptr + nrows_top * ncols(); + A_rest_ptr = A_top_ptr + nrows_top * extent(1); } else { @@ -509,7 +516,7 @@ namespace TSQR { lda_rest = lda(); A_rest_ptr = A_top_ptr + nrows_top; } - ConstMatView A_top (nrows_top, ncols(), data(), lda_top); + ConstMatView A_top (nrows_top, extent(1), data(), lda_top); A_ = A_rest_ptr; nrows_ = nrows_rest; lda_ = lda_rest; @@ -526,19 +533,19 @@ namespace TSQR { #ifdef TSQR_MATVIEW_DEBUG if (std::numeric_limits< Ordinal >::is_signed && nrows_bottom < 0) throw std::invalid_argument ("nrows_bottom < 0"); - if (nrows_bottom > nrows()) + if (nrows_bottom > extent(0)) throw std::invalid_argument ("nrows_bottom > nrows"); #endif // TSQR_MATVIEW_DEBUG pointer_type const A_rest_ptr = data(); pointer_type A_bottom_ptr; - const ordinal_type nrows_rest = nrows() - nrows_bottom; + const ordinal_type nrows_rest = extent(0) - nrows_bottom; ordinal_type lda_bottom, lda_rest; if (b_contiguous_blocks) { lda_bottom = nrows_bottom; - lda_rest = 
nrows() - nrows_bottom; - A_bottom_ptr = A_rest_ptr + nrows_rest * ncols(); + lda_rest = extent(0) - nrows_bottom; + A_bottom_ptr = A_rest_ptr + nrows_rest * extent(1); } else { @@ -546,7 +553,7 @@ namespace TSQR { lda_rest = lda(); A_bottom_ptr = A_rest_ptr + nrows_rest; } - ConstMatView A_bottom (nrows_bottom, ncols(), A_bottom_ptr, lda_bottom); + ConstMatView A_bottom (nrows_bottom, extent(1), A_bottom_ptr, lda_bottom); A_ = A_rest_ptr; nrows_ = nrows_rest; lda_ = lda_rest; @@ -555,12 +562,12 @@ namespace TSQR { } bool operator== (const ConstMatView& rhs) const { - return nrows() == rhs.nrows() && ncols() == rhs.ncols() && + return extent(0) == rhs.extent(0) && extent(1) == rhs.extent(1) && lda() == rhs.lda() && data() == rhs.data(); } bool operator!= (const ConstMatView& rhs) const { - return nrows() != rhs.nrows() || ncols() != rhs.ncols() || + return extent(0) != rhs.extent(0) || extent(1) != rhs.extent(1) || lda() != rhs.lda() || data() != rhs.data(); } diff --git a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp index af8adaadd38f..74dceb335a5d 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp @@ -186,12 +186,12 @@ namespace TSQR { /// default copy constructor would override the generic matrix /// view "copy constructor" below. Matrix (const Matrix& in) : - nrows_ (in.nrows()), - ncols_ (in.ncols()), - A_ (verified_alloc_size (in.nrows(), in.ncols())) + nrows_ (in.extent(0)), + ncols_ (in.extent(1)), + A_ (verified_alloc_size (in.extent(0), in.extent(1))) { if (! in.empty()) - copy_matrix (nrows(), ncols(), data(), lda(), in.data(), in.lda()); + copy_matrix (extent(0), extent(1), data(), lda(), in.data(), in.lda()); } //! Default constructor (constructs an empty matrix). @@ -204,23 +204,23 @@ namespace TSQR { /// /// This constructor allocates a new matrix and copies the /// elements of the input view into the resulting new matrix. 
- /// MatrixViewType must have nrows(), ncols(), data(), and lda() + /// MatrixViewType must have extent(0), extent(1), data(), and lda() /// methods that match MatView's methods. template Matrix (const MatrixViewType& in) : - nrows_ (in.nrows()), - ncols_ (in.ncols()), - A_ (verified_alloc_size (in.nrows(), in.ncols())) + nrows_ (in.extent(0)), + ncols_ (in.extent(1)), + A_ (verified_alloc_size (in.extent(0), in.extent(1))) { if (A_.size() != 0) - copy_matrix (nrows(), ncols(), data(), lda(), in.data(), in.lda()); + copy_matrix (extent(0), extent(1), data(), lda(), in.data(), in.lda()); } //! Fill all entries of the matrix with the given value. void fill (const Scalar value) { - fill_matrix (nrows(), ncols(), data(), lda(), value); + fill_matrix (extent(0), extent(1), data(), lda(), value); } /// \brief Non-const reference to element (i,j) of the matrix. @@ -248,24 +248,22 @@ namespace TSQR { template bool operator== (const MatrixViewType& B) const { - if (data() != B.data() || nrows() != B.nrows() || ncols() != B.ncols() || lda() != B.lda()) { + if (data() != B.data() || extent(0) != B.extent(0) || extent(1) != B.extent(1) || lda() != B.lda()) { return false; } else { return true; } } - //! Number of rows in the matrix. - Ordinal nrows() const { return nrows_; } - - //! Number of columns in the matrix. - Ordinal ncols() const { return ncols_; } + constexpr Ordinal extent (const int r) const noexcept { + return r == 0 ? nrows_ : (r == 1 ? ncols_ : Ordinal(0)); + } //! Leading dimension (a.k.a. stride) of the matrix. Ordinal lda() const { return nrows_; } //! Whether the matrix is empty (has either zero rows or zero columns). - bool empty() const { return nrows() == 0 || ncols() == 0; } + bool empty() const { return extent(0) == 0 || extent(1) == 0; } //! A non-const pointer to the matrix data. Scalar* @@ -289,12 +287,12 @@ namespace TSQR { //! A non-const view of the matrix. 
mat_view_type view () { - return mat_view_type (nrows(), ncols(), data(), lda()); + return mat_view_type (extent(0), extent(1), data(), lda()); } //! A const view of the matrix. const_mat_view_type const_view () const { - return const_mat_view_type (nrows(), ncols(), + return const_mat_view_type (extent(0), extent(1), const_cast (data()), lda()); } @@ -311,7 +309,7 @@ namespace TSQR { void reshape (const Ordinal num_rows, const Ordinal num_cols) { - if (num_rows == nrows() && num_cols == ncols()) + if (num_rows == extent(0) && num_cols == extent(1)) return; // no need to reallocate or do anything else const size_t alloc_size = verified_alloc_size (num_rows, num_cols); diff --git a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp index b895d9bcdafc..3aade82f0fc0 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp @@ -102,7 +102,7 @@ namespace TSQR { // Factor the (copy of the) matrix. On output, the explicit Q // factor (of A_local) is in Q_local and the R factor is in R. - orthogonalizer.mgs (Q_local.nrows(), Q_local.ncols(), + orthogonalizer.mgs (Q_local.extent(0), Q_local.extent(1), Q_local.data(), Q_local.lda(), R.data(), R.lda()); if (b_debug) { @@ -277,8 +277,8 @@ namespace TSQR { TSQR::Test::verifyTimerConcept(); - const ordinal_type nrows_local = Q_local.nrows(); - const ordinal_type ncols = Q_local.ncols(); + const ordinal_type nrows_local = Q_local.extent(0); + const ordinal_type ncols = Q_local.extent(1); // Benchmark MGS for ntrials trials. 
The answer (the numerical // results of the factorization) is only valid if ntrials == 1, diff --git a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp index bf1e66cb703d..d8cb2925ec4c 100644 --- a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp @@ -323,8 +323,8 @@ namespace TSQR { protected: /// \brief Return view of topmost cache block of C /// - /// \param C [in] Matrix (view), supporting the usual nrows(), - /// ncols(), data(), lda() interface. + /// \param C [in] Matrix (view), supporting the usual extent(0), + /// extent(1), data(), lda() interface. /// \param contiguousCacheBlocks [in] Whether the cache blocks /// in C are stored contiguously. /// @@ -351,7 +351,7 @@ namespace TSQR { /// Return a view of the topmost cache block (on this node) of the /// given matrix C. This is not necessarily square, though it /// must have at least as many rows as columns. For a view of the - /// first C.ncols() rows of that block, which methods like + /// first C.extent(1) rows of that block, which methods like /// Tsqr::apply() need, do the following: /// \code /// MatrixViewType top = this->top_block (C, contig); @@ -359,7 +359,7 @@ namespace TSQR { /// \endcode /// /// Models for MatrixViewType are MatView and ConstMatView. - /// MatrixViewType must have member functions nrows(), ncols(), + /// MatrixViewType must have member functions extent(0), extent(1), /// data(), and lda(), and its constructor must take the same four /// arguments as the constructor of ConstMatView. template @@ -372,17 +372,17 @@ namespace TSQR { // method. The only cast from const to nonconst may be in the // return value, but there it's legitimate since we're just // using the same constness as C has. 
- const_mat_view_type C_view (C.nrows(), C.ncols(), C.data(), C.lda()); + const_mat_view_type C_view (C.extent(0), C.extent(1), C.data(), C.lda()); const_mat_view_type C_top = const_top_block (C_view, contiguous_cache_blocks); - TEUCHOS_TEST_FOR_EXCEPTION(C_top.nrows() < C_top.ncols(), std::logic_error, + TEUCHOS_TEST_FOR_EXCEPTION(C_top.extent(0) < C_top.extent(1), std::logic_error, "The subclass of NodeTsqr has a bug in const_top_block" "(); it returned a block with fewer rows than columns " - "(" << C_top.nrows() << " rows and " << C_top.ncols() + "(" << C_top.extent(0) << " rows and " << C_top.extent(1) << " columns). Please report this bug to the Kokkos " "developers."); typedef typename MatrixViewType::pointer_type ptr_type; - return MatrixViewType (C_top.nrows(), C_top.ncols(), + return MatrixViewType (C_top.extent(0), C_top.extent(1), const_cast (C_top.data()), C_top.lda()); } diff --git a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp b/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp index 153e658fb690..76b4560e70d6 100644 --- a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp @@ -272,36 +272,36 @@ namespace TSQR { } } - if (printMatrices_) - { - if (myRank == 0) - err_ << std::endl << "Computed Q factor:" << std::endl; - printGlobalMatrix (err_, Q_local, scalarComm_.get(), ordinalComm_.get()); - if (myRank == 0) - { - err_ << std::endl << "Computed R factor:" << std::endl; - print_local_matrix (err_, R.nrows(), R.ncols(), R.data(), R.lda()); - err_ << std::endl; - } - } + if (printMatrices_) { + if (myRank == 0) { + err_ << std::endl << "Computed Q factor:" << std::endl; + } + printGlobalMatrix (err_, Q_local, scalarComm_.get(), ordinalComm_.get()); + if (myRank == 0) { + err_ << std::endl << "Computed R factor:" << std::endl; + print_local_matrix (err_, R.extent(0), R.extent(1), R.data(), R.lda()); + err_ << std::endl; + } + } - // Verify the factorization - result_type result = - global_verify (numCols, numCols, 
A_local.data(), A_local.lda(), - Q_local.data(), Q_local.lda(), R.data(), R.lda(), - scalarComm_.get()); - if (debug_) - { - scalarComm_->barrier(); - if (myRank == 0) - err_ << "-- Finished global_verify" << endl; - } - reportResults ("DistTsqrRB", numCols, result, - additionalFieldNames, additionalData, - printFieldNames && (! printedFieldNames)); - if (printFieldNames && (! printedFieldNames)) - printedFieldNames = true; + // Verify the factorization + result_type result = + global_verify (numCols, numCols, A_local.data(), A_local.lda(), + Q_local.data(), Q_local.lda(), R.data(), R.lda(), + scalarComm_.get()); + if (debug_) { + scalarComm_->barrier(); + if (myRank == 0) { + err_ << "-- Finished global_verify" << endl; + } + } + reportResults ("DistTsqrRB", numCols, result, + additionalFieldNames, additionalData, + printFieldNames && (! printedFieldNames)); + if (printFieldNames && (! printedFieldNames)) { + printedFieldNames = true; } + } } private: diff --git a/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp b/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp index 6126dfad53b3..64ac372b73c6 100644 --- a/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp @@ -82,7 +82,7 @@ namespace TSQR { void recv (MatrixViewType& R, const int srcProc) { - const typename MatrixViewType::ordinal_type ncols = R.ncols(); + const typename MatrixViewType::ordinal_type ncols = R.extent(1); const Ordinal buflen = buffer_length (ncols); buffer_.resize (buflen); messenger_->recv (&buffer_[0], buflen, srcProc, 0); @@ -96,7 +96,7 @@ namespace TSQR { const int myRank = messenger_->rank(); if (myRank == rootProc) pack (R); - messenger_->broadcast (&buffer_[0], buffer_length (R.ncols()), rootProc); + messenger_->broadcast (buffer_.data(), buffer_length (R.extent(1)), rootProc); if (myRank != rootProc) unpack (R); } @@ -138,7 +138,7 @@ namespace TSQR { typedef typename ConstMatrixViewType::ordinal_type view_ordinal_type; typedef typename std::vector< 
Scalar >::iterator iter_type; - const view_ordinal_type ncols = R.ncols(); + const view_ordinal_type ncols = R.extent(1); const Ordinal buf_length = buffer_length (ncols); buffer_.resize (buf_length); iter_type iter = buffer_.begin(); @@ -156,7 +156,7 @@ namespace TSQR { typedef typename MatrixViewType::ordinal_type view_ordinal_type; typedef typename std::vector< Scalar >::const_iterator const_iter_type; - const view_ordinal_type ncols = R.ncols(); + const view_ordinal_type ncols = R.extent(1); const_iter_type iter = buffer_.begin(); for (view_ordinal_type j = 0; j < ncols; ++j) { std::copy (iter, iter + (j+1), &R(0,j)); @@ -191,7 +191,7 @@ namespace TSQR { const int my_rank = messenger->rank(); if (my_rank == 0) { - const ordinal_type ncols = R_stack.ncols(); + const ordinal_type ncols = R_stack.extent(1); // Copy data from top ncols x ncols block of R_stack into R_local. const_view_type R_stack_view_first (ncols, ncols, R_stack.data(), R_stack.lda()); @@ -231,7 +231,7 @@ namespace TSQR { const int my_rank = messenger->rank(); if (my_rank == 0) { - const ordinal_type ncols = R_stack.ncols(); + const ordinal_type ncols = R_stack.extent(1); // Copy data from R_local into top ncols x ncols block of R_stack. mat_view_type R_stack_view_first (ncols, ncols, R_stack.data(), R_stack.lda()); diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp index 38cc74e3ce47..2b0f075aad5b 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp @@ -61,8 +61,8 @@ namespace TSQR { typedef typename MatrixViewType::ordinal_type ordinal_type; typedef typename MatrixViewType::scalar_type scalar_type; - const ordinal_type nrows = A.nrows(); - const ordinal_type ncols = A.ncols(); + const ordinal_type nrows = A.extent(0); + const ordinal_type ncols = A.extent(1); const ordinal_type lda = A.lda(); if (nrows == lda) { // A is stored contiguously. 
@@ -100,8 +100,8 @@ namespace TSQR { const int myRank = ordinalMessenger->rank(); Impl::SystemBlas blas; - const ordinal_type nrowsLocal = A_local.nrows(); - const ordinal_type ncols = A_local.ncols(); + const ordinal_type nrowsLocal = A_local.extent(0); + const ordinal_type ncols = A_local.extent(1); // Theory: Suppose there are P processors. Proc q wants an m_q by n // component of the matrix A, which we write as A_q. On Proc 0, we diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp index f43eb7d3d579..e8af89e0878b 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp @@ -160,7 +160,7 @@ namespace TSQR { implicit_Q (MatrixViewType& Q, typename MatrixViewType::scalar_type tau[]) { - implicit_Q (Q.nrows(), Q.ncols(), Q.data(), Q.lda(), tau); + implicit_Q (Q.extent(0), Q.extent(1), Q.data(), Q.lda(), tau); } void diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp index 732bc4fbde93..e0a09da84780 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp @@ -143,7 +143,7 @@ namespace TSQR { // Process the first cache block: ATA := A_cur^T * A_cur // // FIXME (mfh 08 Oct 2014) Shouldn't this be CONJ_TRANS? - blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, A_cur.nrows (), + blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, A_cur.extent (0), Scalar (1), A_cur.data (), A_cur.lda (), A_cur.data (), A_cur.lda (), Scalar (0), ATA.data (), ATA.lda ()); // Process the remaining cache blocks in order. @@ -152,7 +152,7 @@ namespace TSQR { // ATA := ATA + A_cur^T * A_cur // // FIXME (mfh 08 Oct 2014) Shouldn't this be CONJ_TRANS? 
- blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, A_cur.nrows (), + blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, A_cur.extent (0), Scalar (1), A_cur.data (), A_cur.lda (), A_cur.data (), A_cur.lda (), Scalar (1), ATA.data (), ATA.lda ()); } @@ -194,14 +194,14 @@ namespace TSQR { // Compute A_cur / R (Matlab notation for A_cur * R^{-1}) in place. blas.TRSM (RIGHT_SIDE, UPPER_TRI, NO_TRANS, NON_UNIT_DIAG, - A_cur.nrows (), ncols, Scalar (1), ATA.data (), ATA.lda (), + A_cur.extent (0), ncols, Scalar (1), ATA.data (), ATA.lda (), A_cur.data (), A_cur.lda ()); // Process the remaining cache blocks in order. while (! A_rest.empty ()) { A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); blas.TRSM (RIGHT_SIDE, UPPER_TRI, NO_TRANS, NON_UNIT_DIAG, - A_cur.nrows (), ncols, Scalar (1), ATA.data (), ATA.lda (), + A_cur.extent (0), ncols, Scalar (1), ATA.data (), ATA.lda (), A_cur.data (), A_cur.lda ()); } } @@ -305,14 +305,15 @@ namespace TSQR { // blocks (in C) may or may not be stored contiguously. If they // are stored contiguously, the CacheBlocker knows the right // layout, based on the cache blocking strategy. - CacheBlocker< LocalOrdinal, Scalar > blocker (C.nrows(), C.ncols(), strategy_); + CacheBlocker blocker + (C.extent(0), C.extent(1), strategy_); // C_top_block is a view of the topmost cache block of C. // C_top_block should have >= ncols rows, otherwise either cache // blocking is broken or the input matrix C itself had fewer // rows than columns. 
MatrixViewType C_top_block = blocker.top_block (C, contiguous_cache_blocks); - if (C_top_block.nrows() < C_top_block.ncols()) + if (C_top_block.extent(0) < C_top_block.extent(1)) throw std::logic_error ("C\'s topmost cache block has fewer rows than " "columns"); return C_top_block; diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp index b2a8dcd1b673..6754e9011a10 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp @@ -131,7 +131,7 @@ namespace TSQR { /// Overwrite the upper triangle of A_top with the R factor, and /// return a view of the R factor (stored in place in A_top). /// Overwrite the (strict) lower triangle of A_top, and the - /// A_top.ncols() entries of tau, with an implicit representation + /// A_top.extent(1) entries of tau, with an implicit representation /// of the Q factor. /// /// \param combine [in/out] Implementation of linear algebra @@ -139,16 +139,16 @@ namespace TSQR { /// implementations use scratch space. /// /// \param A_top [in/out] On input: the first (topmost) cache - /// block of the matrix. Prerequisite: A_top.nrows() >= - /// A.top.ncols(). On output, the upper triangle of A_top is + /// block of the matrix. Prerequisite: A_top.extent(0) >= + /// A.top.extent(1). On output, the upper triangle of A_top is /// overwritten with the R factor, and the lower trapezoid of /// A_top is overwritten with part of the implicit /// representation of the Q factor. /// - /// \param tau [out] Array of length >= A_top.ncols(). On output: + /// \param tau [out] Array of length >= A_top.extent(1). On output: /// the TAU array (see the LAPACK documentation for _GEQRF). /// - /// \param work [out] Workspace array of length >= A_top.ncols(). + /// \param work [out] Workspace array of length >= A_top.extent(1). /// /// \return A view of the upper triangle of A_top, containing the /// R factor. 
@@ -158,8 +158,8 @@ namespace TSQR { std::vector& tau, std::vector& work) const { - const LocalOrdinal ncols = A_top.ncols(); - combine.factor_first (A_top.nrows(), ncols, A_top.data(), A_top.lda(), + const LocalOrdinal ncols = A_top.extent(1); + combine.factor_first (A_top.extent(0), ncols, A_top.data(), A_top.lda(), tau.data(), work.data()); return mat_view_type(ncols, ncols, A_top.data(), A_top.lda()); } @@ -176,9 +176,9 @@ namespace TSQR { mat_view_type& C_first, std::vector& work) const { - const LocalOrdinal nrowsLocal = Q_first.nrows(); - combine.apply_first (applyType, nrowsLocal, C_first.ncols(), - Q_first.ncols(), Q_first.data(), Q_first.lda(), + const LocalOrdinal nrowsLocal = Q_first.extent(0); + combine.apply_first (applyType, nrowsLocal, C_first.extent(1), + Q_first.extent(1), Q_first.data(), Q_first.lda(), tau.data(), C_first.data(), C_first.lda(), work.data()); } @@ -191,9 +191,9 @@ namespace TSQR { mat_view_type& C_cur, std::vector& work) const { - const LocalOrdinal nrows_local = Q_cur.nrows(); - const LocalOrdinal ncols_Q = Q_cur.ncols(); - const LocalOrdinal ncols_C = C_cur.ncols(); + const LocalOrdinal nrows_local = Q_cur.extent(0); + const LocalOrdinal ncols_Q = Q_cur.extent(1); + const LocalOrdinal ncols_C = C_cur.extent(1); combine.apply_inner (apply_type, nrows_local, ncols_C, ncols_Q, @@ -209,8 +209,8 @@ namespace TSQR { std::vector& tau, std::vector& work) const { - const LocalOrdinal nrows_local = A_cur.nrows(); - const LocalOrdinal ncols = A_cur.ncols(); + const LocalOrdinal nrows_local = A_cur.extent(0); + const LocalOrdinal ncols = A_cur.extent(1); combine.factor_inner (nrows_local, ncols, R.data(), R.lda(), A_cur.data(), A_cur.lda(), tau.data(), @@ -758,12 +758,12 @@ namespace TSQR { // GEMM doesn't like aliased arguments, so we use a copy. // We only copy the current cache block, rather than all of // Q; this saves memory. 
- Q_cur_copy.reshape (Q_cur.nrows (), ncols); + Q_cur_copy.reshape (Q_cur.extent (0), ncols); deep_copy (Q_cur_copy, Q_cur); // Q_cur := Q_cur_copy * B. - blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.nrows (), ncols, ncols, + blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.extent (0), ncols, ncols, Scalar (1.0), Q_cur_copy.data (), Q_cur_copy.lda (), - B, ldb, Scalar (0.0), Q_cur.data (), Q_cur.lda ()); + B, ldb, Scalar {}, Q_cur.data (), Q_cur.lda ()); } } @@ -861,7 +861,7 @@ namespace TSQR { // are stored contiguously, the CacheBlocker knows the right // layout, based on the cache blocking strategy. typedef CacheBlocker blocker_type; - blocker_type blocker (C.nrows(), C.ncols(), strategy_); + blocker_type blocker (C.extent(0), C.extent(1), strategy_); // C_top_block is a view of the topmost cache block of C. // C_top_block should have >= ncols rows, otherwise either cache diff --git a/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp b/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp index b514294b4436..c61f4051e08a 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp @@ -80,7 +80,7 @@ namespace TSQR { typedef typename Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type; const int myRank = scalarComm->rank(); - const ordinal_type ncols = A_local.ncols(); + const ordinal_type ncols = A_local.extent(1); if (myRank == 0) { // Generate some singular values for the test problem. diff --git a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp index 8a39d66e64f5..1d829e975bb5 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp @@ -80,8 +80,8 @@ namespace TSQR { using std::cerr; using std::endl; - const ordinal_type nrows_local = A_local.nrows(); - const ordinal_type ncols = A_local.ncols(); + const ordinal_type nrows_local = A_local.extent(0); + const ordinal_type ncols = A_local.extent(1); // If specified, rearrange cache blocks in the copy. 
if (contiguousCacheBlocks) { @@ -438,8 +438,8 @@ namespace TSQR { using std::cout; using std::endl; - const ordinal_type nrows_local = A_local.nrows(); - const ordinal_type ncols = A_local.ncols(); + const ordinal_type nrows_local = A_local.extent(0); + const ordinal_type ncols = A_local.extent(1); if (contiguousCacheBlocks) { tsqr.cache_block (nrows_local, ncols, A_copy.data(), diff --git a/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp b/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp index f935a1b1e655..e907b5fccf5c 100644 --- a/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp @@ -77,15 +77,15 @@ namespace TSQR { const int myRank = scalarComm->rank (); const int nprocs = scalarComm->size (); - const LocalOrdinal nrowsLocal = A_local.nrows(); - const LocalOrdinal ncols = A_local.ncols(); + const LocalOrdinal nrowsLocal = A_local.extent(0); + const LocalOrdinal ncols = A_local.extent(1); const Scalar quiet_NaN = STS::nan(); if (myRank == 0) { // Print the remote matrix data // out << "Processor " << my_rank << ":" << endl; - print_local_matrix (out, A_local.nrows(), A_local.ncols(), + print_local_matrix (out, A_local.extent(0), A_local.extent(1), A_local.data(), A_local.lda()); // Space for remote matrix data. Other processors are allowed From 81461cecb9f1d5c683ea137b02e628e99066b42c Mon Sep 17 00:00:00 2001 From: Alexander Heinlein Date: Mon, 25 Nov 2019 11:23:45 +0100 Subject: [PATCH 11/50] Unique domain map for Phi_. Still some error... 
--- .../CoarseSpaces/FROSch_CoarseSpace_decl.hpp | 83 +++++++----- .../CoarseSpaces/FROSch_CoarseSpace_def.hpp | 127 +++++++++++++----- .../FROSch_LocalPartitionOfUnityBasis_def.hpp | 14 +- .../FROSch_CoarseOperator_decl.hpp | 4 +- .../FROSch_CoarseOperator_def.hpp | 39 +++--- .../FROSch_GDSWCoarseOperator_def.hpp | 49 ++++--- .../FROSch_HarmonicCoarseOperator_decl.hpp | 4 +- .../FROSch_HarmonicCoarseOperator_def.hpp | 80 ++++++----- .../FROSch_IPOUHarmonicCoarseOperator_def.hpp | 8 +- .../FROSch_RGDSWCoarseOperator_def.hpp | 6 +- .../FROSch_SchwarzOperator_decl.hpp | 1 + .../frosch/src/Tools/FROSch_Tools_decl.hpp | 2 +- .../frosch/src/Tools/FROSch_Tools_def.hpp | 2 +- 13 files changed, 254 insertions(+), 165 deletions(-) diff --git a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp index 5a6daaecdfa6..fb37d0a6247d 100644 --- a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp @@ -61,39 +61,50 @@ namespace FROSch { protected: - using XMap = Map; - using XMapPtr = RCP; - using ConstXMapPtr = RCP; - using XMapPtrVec = Array; - - using XMatrix = Matrix; - using XMatrixPtr = RCP; - - using XMultiVector = MultiVector; - using XMultiVectorPtr = RCP; - using XMultiVectorPtrVec = Array; - - using ParameterListPtr = RCP; + using CommPtr = RCP >; + + using XMap = Map; + using XMapPtr = RCP; + using ConstXMapPtr = RCP; + using XMapPtrVec = Array; + using ConstXMapPtrVec = Array; + + using XMatrix = Matrix; + using XMatrixPtr = RCP; + + using XMultiVector = MultiVector; + using XMultiVectorPtr = RCP; + using ConstXMultiVectorPtr = RCP; + using ConstXMultiVectorPtrVec = Array; + + using ParameterListPtr = RCP; - using UN = unsigned; + using UN = unsigned; + using UNVec = Array; + using ConstUNVecView = ArrayView; - using LOVec = Array; - using GOVec = Array; - using 
LOVecPtr = ArrayRCP; - using LOVecPtr2D = ArrayRCP; + using LOVec = Array; + using LOVecPtr = ArrayRCP; + using LOVecPtr2D = ArrayRCP; + + using GOVec = Array; - using SCVec = Array; + using SCVec = Array; public: - CoarseSpace(); + CoarseSpace(CommPtr mpiComm, + CommPtr serialComm); - int addSubspace(XMapPtr subspaceBasisMap, - XMultiVectorPtr subspaceBasis = null); + int addSubspace(ConstXMapPtr subspaceBasisMap, + ConstXMapPtr subspaceBasisMapUnique = null, + ConstXMultiVectorPtr subspaceBasis = null, + UN offset = 0); int assembleCoarseSpace(); int buildGlobalBasisMatrix(ConstXMapPtr rowMap, + ConstXMapPtr rangeMap, ConstXMapPtr repeatedMap, SC treshold); @@ -105,30 +116,42 @@ namespace FROSch { bool hasBasisMap() const; - XMapPtr getBasisMap() const; + ConstXMapPtr getBasisMap() const; + + bool hasBasisMapUnique() const; + + ConstXMapPtr getBasisMapUnique() const; bool hasAssembledBasis() const; - XMultiVectorPtr getAssembledBasis() const; + ConstXMultiVectorPtr getAssembledBasis() const; + + ConstUNVecView getLocalSubspaceSizes() const; bool hasGlobalBasisMatrix() const; XMatrixPtr getGlobalBasisMatrix() const; - protected: - - ConstXMapPtr SerialRowMap_; + protected: - XMapPtrVec UnassembledBasesMaps_; + CommPtr MpiComm_; + CommPtr SerialComm_; + + ConstXMapPtrVec UnassembledBasesMaps_; + ConstXMapPtrVec UnassembledBasesMapsUnique_; - XMultiVectorPtrVec UnassembledSubspaceBases_; + ConstXMultiVectorPtrVec UnassembledSubspaceBases_; + + LOVec Offsets_; XMapPtr AssembledBasisMap_; + XMapPtr AssembledBasisMapUnique_; XMultiVectorPtr AssembledBasis_; + UNVec LocalSubspacesSizes_; + XMatrixPtr GlobalBasisMatrix_; - }; } diff --git a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp index e48c0d490928..60afe891b080 100644 --- a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp +++ 
b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp @@ -51,12 +51,18 @@ namespace FROSch { using namespace Xpetra; template - CoarseSpace::CoarseSpace() : - SerialRowMap_ (), + CoarseSpace::CoarseSpace(CommPtr mpiComm, + CommPtr serialComm) : + MpiComm_ (mpiComm), + SerialComm_ (serialComm), UnassembledBasesMaps_ (0), + UnassembledBasesMapsUnique_ (0), UnassembledSubspaceBases_ (0), + Offsets_ (0), AssembledBasisMap_ (), + AssembledBasisMapUnique_ (), AssembledBasis_ (), + LocalSubspacesSizes_ (0), GlobalBasisMatrix_ () { @@ -64,64 +70,92 @@ namespace FROSch { // Will man Informationen über die Subspaces als strings reingeben? template - int CoarseSpace::addSubspace(XMapPtr subspaceBasisMap, - XMultiVectorPtr localSubspaceBasis) + int CoarseSpace::addSubspace(ConstXMapPtr subspaceBasisMap, + ConstXMapPtr subspaceBasisMapUnique, + ConstXMultiVectorPtr subspaceBasis, + UN offset) { - FROSCH_ASSERT(!subspaceBasisMap.is_null(),"subspaceBasisMap.is_null()"); - if (!localSubspaceBasis.is_null()) { - FROSCH_ASSERT(localSubspaceBasis->getNumVectors()==subspaceBasisMap->getNodeNumElements(),"localSubspaceBasis->getNumVectors()!=subspaceBasisMap->getNodeNumElements()"); - if (!SerialRowMap_.is_null()) { - FROSCH_ASSERT(SerialRowMap_->isSameAs(*localSubspaceBasis->getMap()),"!UnassembledSubspaceBases_[0]->isSameAs(localSubspaceBasis->getMap())"); - } else { - SerialRowMap_ = localSubspaceBasis->getMap(); - } + FROSCH_ASSERT(!subspaceBasisMap.is_null(),"FROSch::CoarseSpace : ERROR: subspaceBasisMap.is_null()"); + if (!subspaceBasis.is_null()) { + FROSCH_ASSERT(subspaceBasis->getNumVectors()==subspaceBasisMap->getNodeNumElements(),"FROSch::CoarseSpace : ERROR: subspaceBasis->getNumVectors()!=subspaceBasisMap->getNodeNumElements()"); } else { - FROSCH_ASSERT(subspaceBasisMap->getNodeNumElements()==0,"subspaceBasisMap->getNodeNumElements()!=0"); + FROSCH_ASSERT(subspaceBasisMap->getNodeNumElements()==0,"FROSch::CoarseSpace : ERROR: 
subspaceBasisMap->getNodeNumElements()!=0"); } + if (subspaceBasisMapUnique.is_null()) subspaceBasisMapUnique = BuildUniqueMap(subspaceBasisMap); + UnassembledBasesMaps_.push_back(subspaceBasisMap); - UnassembledSubspaceBases_.push_back(localSubspaceBasis); + UnassembledBasesMapsUnique_.push_back(subspaceBasisMapUnique); + UnassembledSubspaceBases_.push_back(subspaceBasis); + Offsets_.push_back(offset); + LocalSubspacesSizes_.push_back(subspaceBasisMap->getNodeNumElements()); + return 0; } template int CoarseSpace::assembleCoarseSpace() { - FROSCH_ASSERT(UnassembledBasesMaps_.size()>0,"UnassembledBasesMaps_.size()==0"); - FROSCH_ASSERT(UnassembledSubspaceBases_.size()>0,"UnassembledSubspaceBases_.size()==0"); + FROSCH_ASSERT(UnassembledBasesMaps_.size()>0,"FROSch::CoarseSpace : ERROR: UnassembledBasesMaps_.size()==0"); + FROSCH_ASSERT(UnassembledBasesMapsUnique_.size()>0,"FROSch::CoarseSpace : ERROR: UnassembledBasesMapsUnique_.size()==0"); + FROSCH_ASSERT(UnassembledSubspaceBases_.size()>0,"FROSch::CoarseSpace : ERROR: UnassembledSubspaceBases_.size()==0"); UN itmp = 0; LOVecPtr2D partMappings; + + // BasisMap AssembledBasisMap_ = AssembleMaps(UnassembledBasesMaps_(),partMappings); - if (!AssembledBasisMap_.is_null()&&!SerialRowMap_.is_null()) { + + // BasisMapUnique + AssembledBasisMapUnique_ = AssembleMaps(UnassembledBasesMapsUnique_(),partMappings); + + // Basis + if (!AssembledBasisMap_.is_null()) { if (AssembledBasisMap_->getGlobalNumElements()>0) { // AH 02/12/2019: Is this the right condition? Seems to work for now... - AssembledBasis_ = MultiVectorFactory::Build(SerialRowMap_,AssembledBasisMap_->getNodeNumElements()); - for (UN i=0; igetNodeNumElements(); j++) { - AssembledBasis_->getDataNonConst(itmp).deepCopy(UnassembledSubspaceBases_[i]->getData(j)()); // Here, we copy data. Do we need to do this? 
- itmp++; + LO totalSize = -1; + for (UN i=0; igetLocalLength()+Offsets_[i])); + } + XMapPtr serialMap = MapFactory::Build(AssembledBasisMap_->lib(),totalSize,0,this->SerialComm_); + + AssembledBasis_ = MultiVectorFactory::Build(serialMap,AssembledBasisMap_->getNodeNumElements()); + for (UN i=0; igetNumVectors(); j++) { + for (UN k=0; kgetLocalLength(); k++) { + FROSCH_ASSERT(itmpgetNumVectors(),"FROSch::CoarseSpace : ERROR: itmp>=AssembledBasis_->getNumVectors()"); + FROSCH_ASSERT(k+Offsets_[i]getLocalLength(),"FROSch::CoarseSpace : ERROR: k+Offsets_[i]>=AssembledBasis_->getLocalLength()"); + AssembledBasis_->replaceLocalValue(k+Offsets_[i],itmp,UnassembledSubspaceBases_[i]->getData(j)[k]); + } + itmp++; + } } } } } UnassembledBasesMaps_.resize(0); + UnassembledBasesMapsUnique_.resize(0); UnassembledSubspaceBases_.resize(0); + Offsets_.resize(0); UnassembledBasesMaps_.push_back(AssembledBasisMap_); + UnassembledBasesMapsUnique_.push_back(AssembledBasisMapUnique_); UnassembledSubspaceBases_.push_back(AssembledBasis_); + Offsets_.push_back(0); return 0; } template int CoarseSpace::buildGlobalBasisMatrix(ConstXMapPtr rowMap, + ConstXMapPtr rangeMap, ConstXMapPtr repeatedMap, SC treshold) { - FROSCH_ASSERT(!AssembledBasisMap_.is_null(),"AssembledBasisMap_.is_null()."); - FROSCH_ASSERT(!AssembledBasis_.is_null(),"AssembledBasis_.is_null()."); + FROSCH_ASSERT(!AssembledBasisMap_.is_null(),"FROSch::CoarseSpace : ERROR: AssembledBasisMap_.is_null()."); + FROSCH_ASSERT(!AssembledBasis_.is_null(),"FROSch::CoarseSpace : ERROR: AssembledBasis_.is_null()."); - GlobalBasisMatrix_ = MatrixFactory::Build(rowMap,AssembledBasisMap_,AssembledBasisMap_->getNodeNumElements()); // Nonzeroes abhängig von dim/dofs!!! + GlobalBasisMatrix_ = MatrixFactory::Build(rowMap,AssembledBasisMap_->getNodeNumElements()); // Nonzeroes abhängig von dim/dofs!!! 
LO iD; SC valueTmp; @@ -134,17 +168,17 @@ namespace FROSch { for (UN j=0; jgetNumVectors(); j++) { valueTmp=AssembledBasis_->getData(j)[i]; if (fabs(valueTmp)>treshold) { - indices.push_back( AssembledBasisMap_->getGlobalElement(j) ); + indices.push_back(AssembledBasisMap_->getGlobalElement(j)); values.push_back(valueTmp); } } iD = rowMap->getLocalElement(repeatedMap->getGlobalElement(i)); if (iD!=-1) { - GlobalBasisMatrix_->insertGlobalValues( repeatedMap->getGlobalElement(i) ,indices(),values()); + GlobalBasisMatrix_->insertGlobalValues(repeatedMap->getGlobalElement(i),indices(),values()); } } - GlobalBasisMatrix_->fillComplete(AssembledBasisMap_,rowMap); + GlobalBasisMatrix_->fillComplete(AssembledBasisMapUnique_,rangeMap); return 0; } @@ -155,11 +189,15 @@ namespace FROSch { // FROSCH_ASSERT(UnassembledSubspaceBases_.size()>0,"UnassembledSubspaceBases_.size()==0"); UnassembledBasesMaps_.resize(0); + UnassembledBasesMapsUnique_.resize(0); UnassembledSubspaceBases_.resize(0); AssembledBasisMap_.reset(); + AssembledBasisMapUnique_.reset(); AssembledBasis_.reset(); - + + LocalSubspacesSizes_.resize(0); + GlobalBasisMatrix_.reset(); return 0; @@ -168,7 +206,7 @@ namespace FROSch { template int CoarseSpace::checkForLinearDependencies() { - FROSCH_ASSERT(false,"This is not implemented yet."); + FROSCH_ASSERT(false,"FROSch::CoarseSpace : ERROR: This is not implemented yet."); return 0; } @@ -185,12 +223,25 @@ namespace FROSch { } template - typename CoarseSpace::XMapPtr CoarseSpace::getBasisMap() const + typename CoarseSpace::ConstXMapPtr CoarseSpace::getBasisMap() const { - FROSCH_ASSERT(!AssembledBasisMap_.is_null(),"AssembledBasisMap_.is_null()."); + FROSCH_ASSERT(!AssembledBasisMap_.is_null(),"FROSch::CoarseSpace : ERROR: AssembledBasisMap_.is_null()."); return AssembledBasisMap_; } + template + bool CoarseSpace::hasBasisMapUnique() const + { + return !AssembledBasisMapUnique_.is_null(); + } + + template + typename CoarseSpace::ConstXMapPtr 
CoarseSpace::getBasisMapUnique() const + { + FROSCH_ASSERT(!AssembledBasisMapUnique_.is_null(),"FROSch::CoarseSpace : ERROR: AssembledBasisMapUnique_.is_null()."); + return AssembledBasisMapUnique_; + } + template bool CoarseSpace::hasAssembledBasis() const { @@ -198,12 +249,18 @@ namespace FROSch { } template - typename CoarseSpace::XMultiVectorPtr CoarseSpace::getAssembledBasis() const + typename CoarseSpace::ConstXMultiVectorPtr CoarseSpace::getAssembledBasis() const { - FROSCH_ASSERT(!AssembledBasis_.is_null(),"AssembledBasis_.is_null()."); + FROSCH_ASSERT(!AssembledBasis_.is_null(),"FROSch::CoarseSpace : ERROR: AssembledBasis_.is_null()."); return AssembledBasis_; } + template + typename CoarseSpace::ConstUNVecView CoarseSpace::getLocalSubspaceSizes() const + { + return LocalSubspacesSizes_(); + } + template bool CoarseSpace::hasGlobalBasisMatrix() const { @@ -213,7 +270,7 @@ namespace FROSch { template typename CoarseSpace::XMatrixPtr CoarseSpace::getGlobalBasisMatrix() const { - FROSCH_ASSERT(!GlobalBasisMatrix_.is_null(),"GlobalBasisMatrix_.is_null()."); + FROSCH_ASSERT(!GlobalBasisMatrix_.is_null(),"FROSch::CoarseSpace : ERROR: GlobalBasisMatrix_.is_null()."); return GlobalBasisMatrix_; } } diff --git a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_LocalPartitionOfUnityBasis_def.hpp b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_LocalPartitionOfUnityBasis_def.hpp index 3ce380b83dcb..72d60d3a0c0e 100644 --- a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_LocalPartitionOfUnityBasis_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_LocalPartitionOfUnityBasis_def.hpp @@ -93,7 +93,7 @@ namespace FROSch { FROSCH_ASSERT(!PartitionOfUnity_.is_null(),"Partition Of Unity is not set."); FROSCH_ASSERT(!PartitionOfUnityMaps_.is_null(),"Partition Of Unity Map is not set."); - LocalPartitionOfUnitySpace_ = CoarseSpacePtr(new CoarseSpace()); + LocalPartitionOfUnitySpace_ = CoarseSpacePtr(new 
CoarseSpace(this->MpiComm_,this->SerialComm_)); XMultiVectorPtrVecPtr2D tmpBasis(PartitionOfUnity_.size()); for (UN i=0; igetNumVectors()); XMultiVectorPtr entityBasis = MultiVectorFactory::Build(PartitionOfUnity_[i]->getMap(),PartitionOfUnity_[i]->getNumVectors()); entityBasis->scale(ScalarTraits::zero()); @@ -152,11 +152,13 @@ namespace FROSch { entityBasis->getDataNonConst(k).deepCopy(tmpBasis[i][k]->getData(j)()); // Here, we copy data. Do we need to do this? } } - LocalPartitionOfUnitySpace_->addSubspace(PartitionOfUnityMaps_[i],entityBasis); + LocalPartitionOfUnitySpace_->addSubspace(PartitionOfUnityMaps_[i],null,entityBasis); + } else { + LocalPartitionOfUnitySpace_->addSubspace(PartitionOfUnityMaps_[i]); } - } else { - LocalPartitionOfUnitySpace_->addSubspace(PartitionOfUnityMaps_[i]); } + } else { + if (this->MpiComm_->getRank()==0) std::cout << "FROSch::LocalPartitionOfUnityBasis : WARNING: PartitionOfUnityMaps_[i].is_null()" << std::endl; } } diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_decl.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_decl.hpp index 1b9505f8ecb1..4bb36902b5c8 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_decl.hpp @@ -143,7 +143,7 @@ namespace FROSch { XMatrixPtr buildCoarseMatrix(); - int buildCoarseSolveMap(); + int buildCoarseSolveMap(ConstXMapPtr coarseMapUnique); CommPtr CoarseSolveComm_; @@ -168,9 +168,7 @@ namespace FROSch { mutable XMultiVectorPtr YCoarseSolveTmp_; ConstXMapPtrVecPtr GatheringMaps_; - XMapPtr CoarseMap_; XMapPtr CoarseSolveMap_; - XMapPtr CoarseSolveRepeatedMap_; SubdomainSolverPtr CoarseSolver_; diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp index 24df4fdbb5e4..b557f5bf8b1d 
100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp @@ -57,7 +57,7 @@ namespace FROSch { CoarseSolveComm_ (), OnCoarseSolveComm_ (false), NumProcsCoarseSolve_ (0), - CoarseSpace_ (new CoarseSpace()), + CoarseSpace_ (new CoarseSpace(this->MpiComm_,this->SerialComm_)), Phi_ (), CoarseMatrix_ (), XTmp_ (), @@ -69,9 +69,7 @@ namespace FROSch { YCoarseSolve_ (), YCoarseSolveTmp_ (), GatheringMaps_ (0), - CoarseMap_ (), CoarseSolveMap_ (), - CoarseSolveRepeatedMap_ (), CoarseSolver_ (), DistributionList_ (sublist(parameterList,"Distribution")), CoarseSolveExporters_ (0) @@ -110,7 +108,7 @@ namespace FROSch { if (CoarseSpace_->hasUnassembledMaps()) { // If there is no unassembled basis, the current Phi_ should already be correct CoarseSpace_->assembleCoarseSpace(); FROSCH_ASSERT(CoarseSpace_->hasAssembledBasis(),"FROSch::CoarseOperator : !CoarseSpace_->hasAssembledBasis()"); - CoarseSpace_->buildGlobalBasisMatrix(this->K_->getRangeMap(),subdomainMap,this->ParameterList_->get("Threshold Phi",1.e-8)); + CoarseSpace_->buildGlobalBasisMatrix(this->K_->getRowMap(),this->K_->getRangeMap(),subdomainMap,this->ParameterList_->get("Threshold Phi",1.e-8)); FROSCH_ASSERT(CoarseSpace_->hasGlobalBasisMatrix(),"FROSch::CoarseOperator : !CoarseSpace_->hasGlobalBasisMatrix()"); Phi_ = CoarseSpace_->getGlobalBasisMatrix(); } @@ -171,7 +169,7 @@ namespace FROSch { { FROSCH_TIMER_START_LEVELID(applyPhiTTime,"CoarseOperator::applyPhiT"); // AH 08/22/2019 TODO: We cannot ger rid of the Build() calls because of "XCoarse_ = XCoarseSolveTmp_;". This is basically caused by the whole Gathering Map strategy. As soon as we have replaced this, we can get rid of the Build() calls - XCoarse_ = MultiVectorFactory::Build(CoarseSpace_->getBasisMap(),x.getNumVectors()); // AH 08/22/2019 TODO: Can we get rid of this? 
If possible, we should remove the whole GatheringMaps idea and replace it by some smart all-to-all MPI communication + XCoarse_ = MultiVectorFactory::Build(CoarseSpace_->getBasisMapUnique(),x.getNumVectors()); // AH 08/22/2019 TODO: Can we get rid of this? If possible, we should remove the whole GatheringMaps idea and replace it by some smart all-to-all MPI communication { #ifdef FROSCH_COARSEOPERATOR_DETAIL_TIMERS FROSCH_TIMER_START_LEVELID(applyTime,"apply"); @@ -232,7 +230,7 @@ namespace FROSch { } YCoarseSolveTmp_ = YCoarse_; } - YCoarse_ = MultiVectorFactory::Build(CoarseSpace_->getBasisMap(),x.getNumVectors()); + YCoarse_ = MultiVectorFactory::Build(CoarseSpace_->getBasisMapUnique(),x.getNumVectors()); { #ifdef FROSCH_COARSEOPERATOR_DETAIL_TIMERS FROSCH_TIMER_START_LEVELID(applyTime,"doImport"); @@ -288,11 +286,11 @@ namespace FROSch { } } k0 = tmpCoarseMatrix; - + } else if (!DistributionList_->get("Type","linear").compare("Zoltan2")) { #ifdef HAVE_SHYLU_DDFROSCH_ZOLTAN2 GatheringMaps_[0] = rcp_const_cast (BuildUniqueMap(k0->getRowMap())); - CoarseSolveExporters_[0] = ExportFactory::Build(CoarseSpace_->getBasisMap(),GatheringMaps_[0]); + CoarseSolveExporters_[0] = ExportFactory::Build(CoarseSpace_->getBasisMapUnique(),GatheringMaps_[0]); if (NumProcsCoarseSolve_ < this->MpiComm_->getSize()) { XMatrixPtr k0Unique = MatrixFactory::Build(GatheringMaps_[0],k0->getGlobalMaxNumRowEntries()); @@ -307,13 +305,13 @@ namespace FROSch { k0 = k0Unique; GatheringMaps_[0] = k0->getRowMap(); - CoarseSolveExporters_[0] = ExportFactory::Build(CoarseSpace_->getBasisMap(),GatheringMaps_[0]); + CoarseSolveExporters_[0] = ExportFactory::Build(CoarseSpace_->getBasisMapUnique(),GatheringMaps_[0]); if (GatheringMaps_[0]->getNodeNumElements()>0) { OnCoarseSolveComm_=true; } CoarseSolveComm_ = this->MpiComm_->split(!OnCoarseSolveComm_,this->MpiComm_->getRank()); - CoarseSolveMap_ = 
MapFactory::Build(CoarseSpace_->getBasisMap()->lib(),-1,GatheringMaps_[0]->getNodeElementList(),0,CoarseSolveComm_); + CoarseSolveMap_ = MapFactory::Build(CoarseSpace_->getBasisMapUnique()->lib(),-1,GatheringMaps_[0]->getNodeElementList(),0,CoarseSolveComm_); } #else ThrowErrorMissingPackage("FROSch::CoarseOperator","Zoltan2"); @@ -409,8 +407,9 @@ namespace FROSch { typename CoarseOperator::XMatrixPtr CoarseOperator::buildCoarseMatrix() { FROSCH_TIMER_START_LEVELID(buildCoarseMatrixTime,"CoarseOperator::buildCoarseMatrix"); - XMatrixPtr k0 = MatrixFactory::Build(CoarseSpace_->getBasisMap(),CoarseSpace_->getBasisMap()->getNodeNumElements()); - + RCP fancy = fancyOStream(rcpFromRef(std::cout)); + + XMatrixPtr k0 = MatrixFactory::Build(CoarseSpace_->getBasisMapUnique(),CoarseSpace_->getBasisMap()->getNodeNumElements()); if (this->ParameterList_->get("Use Triple MatrixMultiply",false)) { TripleMatrixMultiply::MultiplyRAP(*Phi_,true,*this->K_,false,*Phi_,false,*k0); } else { @@ -422,7 +421,7 @@ namespace FROSch { } template - int CoarseOperator::buildCoarseSolveMap() + int CoarseOperator::buildCoarseSolveMap(ConstXMapPtr coarseMapUnique) { FROSCH_TIMER_START_LEVELID(buildCoarseSolveMapTime,"CoarseOperator::buildCoarseSolveMap"); NumProcsCoarseSolve_ = DistributionList_->get("NumProcs",1); @@ -455,7 +454,7 @@ namespace FROSch { #endif LO numProcsGatheringStep = this->MpiComm_->getSize(); - GO numGlobalIndices = CoarseMap_->getMaxAllGlobalIndex()+1; + GO numGlobalIndices = coarseMapUnique->getMaxAllGlobalIndex()+1; int numMyRows; double gatheringFactor = pow(double(this->MpiComm_->getSize())/double(NumProcsCoarseSolve_),1.0/double(gatheringSteps)); @@ -474,7 +473,7 @@ namespace FROSch { #ifdef FROSCH_COARSEOPERATOR_DETAIL_TIMERS FROSCH_TIMER_START_LEVELID(gatheringMapsTime,"Gathering Maps"); #endif - GatheringMaps_[i] = MapFactory::Build(CoarseMap_->lib(),-1,numMyRows,0,this->MpiComm_); + GatheringMaps_[i] = 
MapFactory::Build(coarseMapUnique->lib(),-1,numMyRows,0,this->MpiComm_); } } @@ -490,7 +489,7 @@ namespace FROSch { #ifdef FROSCH_COARSEOPERATOR_DETAIL_TIMERS FROSCH_TIMER_START_LEVELID(gatheringMapsTime,"Gathering Maps"); #endif - GatheringMaps_[gatheringSteps-1] = MapFactory::Build(CoarseMap_->lib(),-1,numMyRows,0,this->MpiComm_); + GatheringMaps_[gatheringSteps-1] = MapFactory::Build(coarseMapUnique->lib(),-1,numMyRows,0,this->MpiComm_); } //cout << *GatheringMaps_->at(gatheringSteps-1); @@ -509,7 +508,7 @@ namespace FROSch { #ifdef FROSCH_COARSEOPERATOR_DETAIL_TIMERS FROSCH_TIMER_START_LEVELID(coarseCommMapTime,"Coarse Communicator Map"); #endif - CoarseSolveMap_ = MapFactory::Build(CoarseMap_->lib(),-1,GatheringMaps_[GatheringMaps_.size()-1]->getNodeElementList(),0,CoarseSolveComm_); + CoarseSolveMap_ = MapFactory::Build(coarseMapUnique->lib(),-1,GatheringMaps_[GatheringMaps_.size()-1]->getNodeElementList(),0,CoarseSolveComm_); } // Possibly change the Send type for this Exporter @@ -522,7 +521,7 @@ namespace FROSch { #ifdef FROSCH_COARSEOPERATOR_DETAIL_TIMERS FROSCH_TIMER_START_LEVELID(coarseSolveExportersTime,"Build Exporters"); #endif - CoarseSolveExporters_[0] = ExportFactory::Build(CoarseMap_,GatheringMaps_[0]); + CoarseSolveExporters_[0] = ExportFactory::Build(coarseMapUnique,GatheringMaps_[0]); CoarseSolveExporters_[0]->setDistributorParameters(gatheringCommunicationList); // Set the parameter list for the communication of the exporter } #ifdef FROSCH_COARSEOPERATOR_EXPORT_AND_IMPORT @@ -530,7 +529,7 @@ namespace FROSch { #ifdef FROSCH_COARSEOPERATOR_DETAIL_TIMERS FROSCH_TIMER_START_LEVELID(coarseSolveImportersTime,"Build Importers"); #endif - CoarseSolveImporters_[0] = ImportFactory::Build(GatheringMaps_[0],CoarseMap_); + CoarseSolveImporters_[0] = ImportFactory::Build(GatheringMaps_[0],coarseMapUnique); CoarseSolveImporters_[0]->setDistributorParameters(gatheringCommunicationList); // Set the parameter list for the communication of the exporter } 
#endif @@ -572,7 +571,7 @@ namespace FROSch { ------------------------------------------------------------------------------\n\ Coarse problem statistics\n\ ------------------------------------------------------------------------------\n\ - Dimension of the coarse problem --- " << CoarseMap_->getMaxAllGlobalIndex()+1 << "\n\ + Dimension of the coarse problem --- " << coarseMapUnique->getMaxAllGlobalIndex()+1 << "\n\ Number of processes --- " << NumProcsCoarseSolve_ << "\n\ ------------------------------------------------------------------------------\n"; } diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_GDSWCoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_GDSWCoarseOperator_def.hpp index f0fcd50a545d..dc5169fc2001 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_GDSWCoarseOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_GDSWCoarseOperator_def.hpp @@ -65,8 +65,8 @@ namespace FROSch { { FROSCH_TIMER_START_LEVELID(initializeTime,"GDSWCoarseOperator::initialize"); buildCoarseSpace(dimension,repeatedMap); - this->CoarseMap_ = this->assembleCoarseMap(); - this->buildCoarseSolveMap(); + this->assembleInterfaceCoarseSpace(); + this->buildCoarseSolveMap(this->AssembledInterfaceCoarseSpace_->getBasisMapUnique()); this->IsInitialized_ = true; this->IsComputed_ = false; return 0; @@ -79,8 +79,8 @@ namespace FROSch { { FROSCH_TIMER_START_LEVELID(initializeTime,"GDSWCoarseOperator::initialize"); buildCoarseSpace(dimension,repeatedMap,dirichletBoundaryDofs); - this->CoarseMap_ = this->assembleCoarseMap(); - this->buildCoarseSolveMap(); + this->assembleInterfaceCoarseSpace(); + this->buildCoarseSolveMap(this->AssembledInterfaceCoarseSpace_->getBasisMapUnique()); this->IsInitialized_ = true; this->IsComputed_ = false; return 0; @@ -94,8 +94,8 @@ namespace FROSch { { FROSCH_TIMER_START_LEVELID(initializeTime,"GDSWCoarseOperator::initialize"); 
buildCoarseSpace(dimension,dofsPerNode,repeatedNodesMap,repeatedDofMaps); - this->CoarseMap_ = this->assembleCoarseMap(); - this->buildCoarseSolveMap(); + this->assembleInterfaceCoarseSpace(); + this->buildCoarseSolveMap(this->AssembledInterfaceCoarseSpace_->getBasisMapUnique()); this->IsInitialized_ = true; this->IsComputed_ = false; return 0; @@ -110,8 +110,8 @@ namespace FROSch { { FROSCH_TIMER_START_LEVELID(initializeTime,"GDSWCoarseOperator::initialize"); buildCoarseSpace(dimension,dofsPerNode,repeatedNodesMap,repeatedDofMaps,dirichletBoundaryDofs); - this->CoarseMap_ = this->assembleCoarseMap(); - this->buildCoarseSolveMap(); + this->assembleInterfaceCoarseSpace(); + this->buildCoarseSolveMap(this->AssembledInterfaceCoarseSpace_->getBasisMapUnique()); this->IsInitialized_ = true; this->IsComputed_ = false; return 0; @@ -126,8 +126,8 @@ namespace FROSch { { FROSCH_TIMER_START_LEVELID(initializeTime,"GDSWCoarseOperator::initialize"); buildCoarseSpace(dimension,dofsPerNode,repeatedNodesMap,repeatedDofMaps,nodeList); - this->CoarseMap_ = this->assembleCoarseMap(); - this->buildCoarseSolveMap(); + this->assembleInterfaceCoarseSpace(); + this->buildCoarseSolveMap(this->AssembledInterfaceCoarseSpace_->getBasisMapUnique()); this->IsInitialized_ = true; this->IsComputed_ = false; return 0; @@ -143,8 +143,8 @@ namespace FROSch { { FROSCH_TIMER_START_LEVELID(initializeTime,"GDSWCoarseOperator::initialize"); buildCoarseSpace(dimension,dofsPerNode,repeatedNodesMap,repeatedDofMaps,dirichletBoundaryDofs,nodeList); - this->CoarseMap_ = this->assembleCoarseMap(); - this->buildCoarseSolveMap(); + this->assembleInterfaceCoarseSpace(); + this->buildCoarseSolveMap(this->AssembledInterfaceCoarseSpace_->getBasisMapUnique()); this->IsInitialized_ = true; this->IsComputed_ = false; return 0; @@ -160,14 +160,13 @@ namespace FROSch { { FROSCH_TIMER_START_LEVELID(initializeTime,"GDSWCoarseOperator::initialize"); 
buildCoarseSpace(dimension,dofsPerNodeVec,repeatedNodesMapVec,repeatedDofMapsVec,dirichletBoundaryDofsVec,nodeListVec); - this->CoarseMap_ = this->assembleCoarseMap(); - this->buildCoarseSolveMap(); + this->assembleInterfaceCoarseSpace(); + this->buildCoarseSolveMap(this->AssembledInterfaceCoarseSpace_->getBasisMapUnique()); this->IsInitialized_ = true; this->IsComputed_ = false; return 0; } - template void GDSWCoarseOperator::describe(FancyOStream &out, const EVerbosityLevel verbLevel) const @@ -414,7 +413,7 @@ namespace FROSch { } } - this->InterfaceCoarseSpaces_[blockId].reset(new CoarseSpace()); + this->InterfaceCoarseSpaces_[blockId].reset(new CoarseSpace(this->MpiComm_,this->SerialComm_)); if (useForCoarseSpace && (useVertexTranslations||useShortEdgeTranslations||useShortEdgeRotations||useStraightEdgeTranslations||useStraightEdgeRotations||useEdgeTranslations||useEdgeRotations||useFaceTranslations||useFaceRotations)) { @@ -441,58 +440,58 @@ namespace FROSch { if (useVertexTranslations) { XMultiVectorPtrVecPtr translations = this->computeTranslations(blockId,DDInterface_->getVertices()); for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getVertices()->getEntityMap(),translations[i]); + this->InterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getVertices()->getEntityMap(),null,translations[i]); } } // ShortEdges if (useShortEdgeTranslations) { XMultiVectorPtrVecPtr translations = this->computeTranslations(blockId,DDInterface_->getShortEdges()); for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getShortEdges()->getEntityMap(),translations[i]); + this->InterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getShortEdges()->getEntityMap(),null,translations[i]); } } if (useShortEdgeRotations) { XMultiVectorPtrVecPtr rotations = this->computeRotations(blockId,dimension,nodeList,DDInterface_->getShortEdges(),(dimension==3)); for (UN i=0; 
iInterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getShortEdges()->getEntityMap(),rotations[i]); + this->InterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getShortEdges()->getEntityMap(),null,rotations[i]); } } // StraightEdges if (useStraightEdgeTranslations) { XMultiVectorPtrVecPtr translations = this->computeTranslations(blockId,DDInterface_->getStraightEdges()); - for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getStraightEdges()->getEntityMap(),translations[i]); + for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getStraightEdges()->getEntityMap(),null,translations[i]); } } if (useStraightEdgeRotations) { XMultiVectorPtrVecPtr rotations = this->computeRotations(blockId,dimension,nodeList,DDInterface_->getStraightEdges(),(dimension==3)); for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getStraightEdges()->getEntityMap(),rotations[i]); + this->InterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getStraightEdges()->getEntityMap(),null,rotations[i]); } } // Edges if (useEdgeTranslations) { XMultiVectorPtrVecPtr translations = this->computeTranslations(blockId,DDInterface_->getEdges()); for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getEdges()->getEntityMap(),translations[i]); + this->InterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getEdges()->getEntityMap(),null,translations[i]); } } if (useEdgeRotations) { XMultiVectorPtrVecPtr rotations = this->computeRotations(blockId,dimension,nodeList,DDInterface_->getEdges()); for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getEdges()->getEntityMap(),rotations[i]); + this->InterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getEdges()->getEntityMap(),null,rotations[i]); } } // Faces if (useFaceTranslations) { XMultiVectorPtrVecPtr translations = this->computeTranslations(blockId,DDInterface_->getFaces()); for (UN i=0; 
iInterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getFaces()->getEntityMap(),translations[i]); + this->InterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getFaces()->getEntityMap(),null,translations[i]); } } if (useFaceRotations) { XMultiVectorPtrVecPtr rotations = this->computeRotations(blockId,dimension,nodeList,DDInterface_->getFaces()); for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getFaces()->getEntityMap(),rotations[i]); + this->InterfaceCoarseSpaces_[blockId]->addSubspace(DDInterface_->getFaces()->getEntityMap(),null,rotations[i]); } } diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp index 210a7b854dce..41d103608b97 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp @@ -84,6 +84,7 @@ namespace FROSch { using UN = typename SchwarzOperator::UN; using UNVec = typename SchwarzOperator::UNVec; using UNVecPtr = typename SchwarzOperator::UNVecPtr; + using ConstUNVecView = typename SchwarzOperator::ConstUNVecView; using LOVec = typename SchwarzOperator::LOVec; using LOVecPtr = typename SchwarzOperator::LOVecPtr; @@ -107,7 +108,7 @@ namespace FROSch { int intializeCoarseMap(); - XMapPtr assembleCoarseMap(); + int assembleInterfaceCoarseSpace(); int addZeroCoarseSpaceBlock(ConstXMapPtr dofsMap); @@ -137,6 +138,7 @@ namespace FROSch { SubdomainSolverPtr ExtensionSolver_; CoarseSpacePtrVecPtr InterfaceCoarseSpaces_; + CoarseSpacePtr AssembledInterfaceCoarseSpace_; UNVecPtr Dimensions_; UNVecPtr DofsPerNode_; diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp index 49e2f94d3358..f9f22a0aa87d 
100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp @@ -56,6 +56,7 @@ namespace FROSch { CoarseOperator (k,parameterList), ExtensionSolver_ (), InterfaceCoarseSpaces_ (0), + AssembledInterfaceCoarseSpace_ (new CoarseSpace(this->MpiComm_,this->SerialComm_)), Dimensions_ (0), DofsPerNode_ (0), GammaDofs_ (0), @@ -99,10 +100,10 @@ namespace FROSch { // Build the saddle point harmonic extensions XMultiVectorPtr localCoarseSpaceBasis; - if (this->CoarseMap_->getNodeNumElements()) { - localCoarseSpaceBasis = computeExtensions(repeatedMatrix->getRowMap(),this->CoarseMap_,indicesGammaDofsAll(),indicesIDofsAll(),kII,kIGamma); + if (AssembledInterfaceCoarseSpace_->getBasisMap()->getNodeNumElements()) { + localCoarseSpaceBasis = computeExtensions(repeatedMatrix->getRowMap(),AssembledInterfaceCoarseSpace_->getBasisMap(),indicesGammaDofsAll(),indicesIDofsAll(),kII,kIGamma); - coarseSpace->addSubspace(this->CoarseMap_,localCoarseSpaceBasis); + coarseSpace->addSubspace(AssembledInterfaceCoarseSpace_->getBasisMap(),AssembledInterfaceCoarseSpace_->getBasisMapUnique(),localCoarseSpaceBasis); } else { if (this->Verbose_) std::cout << "FROSch::HarmonicCoarseOperator : WARNING: The Coarse Space is empty. 
No extensions are computed" << std::endl; } @@ -111,28 +112,25 @@ namespace FROSch { } template - typename HarmonicCoarseOperator::XMapPtr HarmonicCoarseOperator::assembleCoarseMap() + int HarmonicCoarseOperator::assembleInterfaceCoarseSpace() { - FROSCH_TIMER_START_LEVELID(assembleCoarseMapTime,"HarmonicCoarseOperator::assembleCoarseMap"); - GOVec mapVector(0); - GO tmp = 0; + FROSCH_TIMER_START_LEVELID(assembleInterfaceCoarseSpaceTime,"HarmonicCoarseOperator::assembleInterfaceCoarseSpace"); + LO ii=0; for (UN i=0; ihasBasisMap()) { - for (UN j=0; jgetBasisMap()->getNodeNumElements(); j++) { - mapVector.push_back(InterfaceCoarseSpaces_[i]->getBasisMap()->getGlobalElement(j)+tmp); - } - if (InterfaceCoarseSpaces_[i]->getBasisMap()->getMaxAllGlobalIndex()>=0) { - tmp += InterfaceCoarseSpaces_[i]->getBasisMap()->getMaxAllGlobalIndex()+1; - } + FROSCH_ASSERT(InterfaceCoarseSpaces_[i]->hasBasisMapUnique(),"FROSch::HarmonicCoarseOperator : ERROR: !InterfaceCoarseSpaces_[i]->hasAssembledBasis()"); + this->AssembledInterfaceCoarseSpace_->addSubspace(InterfaceCoarseSpaces_[i]->getBasisMap(),InterfaceCoarseSpaces_[i]->getBasisMapUnique(),InterfaceCoarseSpaces_[i]->getAssembledBasis(),ii); } } + ii += InterfaceCoarseSpaces_[i]->getAssembledBasis()->getLocalLength(); + InterfaceCoarseSpaces_[i].reset(); } - return MapFactory::Build(DofsMaps_[0][0]->lib(),-1,mapVector(),0,this->MpiComm_); + return this->AssembledInterfaceCoarseSpace_->assembleCoarseSpace(); } template - int HarmonicCoarseOperator::addZeroCoarseSpaceBlock(ConstXMapPtr dofsMap) + int HarmonicCoarseOperator::addZeroCoarseSpaceBlock(ConstXMapPtr dofsMap) { FROSCH_TIMER_START_LEVELID(addZeroCoarseSpaceBlockTime,"HarmonicCoarseOperator::addZeroCoarseSpaceBlock"); // Das könnte man noch ändern @@ -160,7 +158,7 @@ namespace FROSch { XMultiVectorPtr mVPhiGamma; XMapPtr blockCoarseMap; if (useForCoarseSpace) { - InterfaceCoarseSpaces_[blockId].reset(new CoarseSpace()); + InterfaceCoarseSpaces_[blockId].reset(new 
CoarseSpace(this->MpiComm_,this->SerialComm_)); //Epetra_SerialComm serialComm; XMapPtr serialGammaMap = MapFactory::Build(dofsMap->lib(),dofsMap->getNodeNumElements(),0,this->SerialComm_); @@ -222,18 +220,18 @@ namespace FROSch { } if (useForCoarseSpace) { - InterfaceCoarseSpaces_[blockId].reset(new CoarseSpace()); + InterfaceCoarseSpaces_[blockId].reset(new CoarseSpace(this->MpiComm_,this->SerialComm_)); interior->buildEntityMap(nodesMap); XMultiVectorPtrVecPtr translations = computeTranslations(blockId,interior); for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(interior->getEntityMap(),translations[i]); + this->InterfaceCoarseSpaces_[blockId]->addSubspace(interior->getEntityMap(),null,translations[i]); } if (useRotations) { XMultiVectorPtrVecPtr rotations = computeRotations(blockId,dimension,nodeList,interior); for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(interior->getEntityMap(),rotations[i]); + this->InterfaceCoarseSpaces_[blockId]->addSubspace(interior->getEntityMap(),null,rotations[i]); } } @@ -461,26 +459,34 @@ namespace FROSch { //Build mVPhiGamma XMultiVectorPtr mVPhiGamma = MultiVectorFactory::Build(kIGamma->getDomainMap(),coarseMap->getNodeNumElements()); - LO jj=0; - LO kk=0; - UNVec numLocalBlockRows(NumberOfBlocks_); - for (UN i=0; ihasAssembledBasis()) { - numLocalBlockRows[i] = InterfaceCoarseSpaces_[i]->getAssembledBasis()->getNumVectors(); - for (j=0; jgetAssembledBasis()->getLocalLength(); k++) { - mVPhiGamma->replaceLocalValue(k+kk,j+jj,InterfaceCoarseSpaces_[i]->getAssembledBasis()->getData(j)[k]); - mVPhi->replaceLocalValue(indicesGammaDofsAll[k+kk],j+jj,InterfaceCoarseSpaces_[i]->getAssembledBasis()->getData(j)[k]); - } + if (AssembledInterfaceCoarseSpace_->hasAssembledBasis()) { + for (UN i=0; igetAssembledBasis()->getNumVectors(); i++) { + for (UN j=0; jgetAssembledBasis()->getLocalLength(); j++) { + mVPhiGamma->replaceLocalValue(j,i,AssembledInterfaceCoarseSpace_->getAssembledBasis()->getData(i)[j]); + 
mVPhi->replaceLocalValue(indicesGammaDofsAll[j],i,AssembledInterfaceCoarseSpace_->getAssembledBasis()->getData(i)[j]); } - } else { // Das ist für den Fall, dass keine Basisfunktionen für einen Block gebaut werden sollen - k=GammaDofs_[i].size(); } - jj += j; - kk += k; } +// LO jj=0; +// LO kk=0; +// UNVec numLocalBlockRows(NumberOfBlocks_); +// for (UN i=0; ihasAssembledBasis()) { +// numLocalBlockRows[i] = InterfaceCoarseSpaces_[i]->getAssembledBasis()->getNumVectors(); +// for (j=0; jgetAssembledBasis()->getLocalLength(); k++) { +// mVPhiGamma->replaceLocalValue(k+kk,j+jj,InterfaceCoarseSpaces_[i]->getAssembledBasis()->getData(j)[k]); +// mVPhi->replaceLocalValue(indicesGammaDofsAll[k+kk],j+jj,InterfaceCoarseSpaces_[i]->getAssembledBasis()->getData(j)[k]); +// } +// } +// } else { // Das ist für den Fall, dass keine Basisfunktionen für einen Block gebaut werden sollen +// k=GammaDofs_[i].size(); +// } +// jj += j; +// kk += k; +// } // RCP fancy = fancyOStream(rcpFromRef(std::cout)); this->Phi_->describe(*fancy,VERB_EXTREME); // Hier Multiplikation kIGamma*PhiGamma kIGamma->apply(*mVPhiGamma,*mVtmp); @@ -506,6 +512,8 @@ namespace FROSch { } LO itmp = 0; + ConstUNVecView numLocalBlockRows = AssembledInterfaceCoarseSpace_->getLocalSubspaceSizes(); + FROSCH_ASSERT(numLocalBlockRows.size()==NumberOfBlocks_,"FROSch::HarmonicCoarseOperator : ERROR: numLocalBlockRows.size()!=NumberOfBlocks_"); for (UN i=0; iCoarseMap_ = this->assembleCoarseMap(); - this->buildCoarseSolveMap(); + this->assembleInterfaceCoarseSpace(); + this->buildCoarseSolveMap(this->AssembledInterfaceCoarseSpace_->getBasisMapUnique()); this->IsInitialized_ = true; this->IsComputed_ = false; return ret; @@ -89,8 +89,8 @@ namespace FROSch { { FROSCH_TIMER_START_LEVELID(initializeTime,"IPOUHarmonicCoarseOperator::initialize"); buildCoarseSpace(dimension,dofsPerNodeVec,repeatedNodesMapVec,repeatedDofMapsVec,nullSpaceBasisVec,dirichletBoundaryDofsVec,nodeListVec); - this->CoarseMap_ = 
this->assembleCoarseMap(); - this->buildCoarseSolveMap(); + this->assembleInterfaceCoarseSpace(); + this->buildCoarseSolveMap(this->AssembledInterfaceCoarseSpace_->getBasisMapUnique()); this->IsInitialized_ = true; this->IsComputed_ = false; return 0; diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_RGDSWCoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_RGDSWCoarseOperator_def.hpp index 07cb78f85a46..e4cbc5ed7026 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_RGDSWCoarseOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_RGDSWCoarseOperator_def.hpp @@ -152,7 +152,7 @@ namespace FROSch { } } - this->InterfaceCoarseSpaces_[blockId].reset(new CoarseSpace()); + this->InterfaceCoarseSpaces_[blockId].reset(new CoarseSpace(this->MpiComm_,this->SerialComm_)); if (useForCoarseSpace) { @@ -177,12 +177,12 @@ namespace FROSch { XMultiVectorPtrVecPtr translations = this->computeTranslations(blockId,this->DDInterface_->getRoots(),entitySetVector,distanceFunction); for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(this->DDInterface_->getRoots()->getEntityMap(),translations[i]); + this->InterfaceCoarseSpaces_[blockId]->addSubspace(this->DDInterface_->getRoots()->getEntityMap(),null,translations[i]); } if (useRotations) { XMultiVectorPtrVecPtr rotations = this->computeRotations(blockId,dimension,nodeList,this->DDInterface_->getRoots(),entitySetVector,distanceFunction); - for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(this->DDInterface_->getRoots()->getEntityMap(),rotations[i]); + for (UN i=0; iInterfaceCoarseSpaces_[blockId]->addSubspace(this->DDInterface_->getRoots()->getEntityMap(),null,rotations[i]); } } diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_decl.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_decl.hpp index abd3e195e67e..35f22a75e4e9 100644 --- 
a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_decl.hpp @@ -141,6 +141,7 @@ namespace FROSch { using ConstUN = const UN; using UNVec = Array; using UNVecPtr = ArrayRCP; + using ConstUNVecView = ArrayView; using LOVec = Array; using LOVecPtr = ArrayRCP; diff --git a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_decl.hpp b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_decl.hpp index 01010f885617..b974c9d6ebf4 100644 --- a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_decl.hpp @@ -238,7 +238,7 @@ namespace FROSch { RCP > SortMapByGlobalIndex(RCP > inputMap); template - RCP > AssembleMaps(ArrayView > > mapVector, + RCP > AssembleMaps(ArrayView > > mapVector, ArrayRCP > &partMappings); template diff --git a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_def.hpp b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_def.hpp index 2537e8a715fe..bda865aedc96 100644 --- a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_def.hpp @@ -642,7 +642,7 @@ namespace FROSch { } template - RCP > AssembleMaps(ArrayView > > mapVector, + RCP > AssembleMaps(ArrayView > > mapVector, ArrayRCP > &partMappings) { FROSCH_TIMER_START(assembleMapsTime,"AssembleMaps"); From 1983753b76e9f7e1505b046747bd4c4e41957efd Mon Sep 17 00:00:00 2001 From: Alexander Heinlein Date: Mon, 25 Nov 2019 14:28:31 +0100 Subject: [PATCH 12/50] Fixing an issue with the unique maps. 
--- .../src/Adapters/Thyra_FROSchFactory_def.hpp | 6 ++--- .../CoarseSpaces/FROSch_CoarseSpace_decl.hpp | 6 ++--- .../CoarseSpaces/FROSch_CoarseSpace_def.hpp | 24 +++++++++++++++---- .../FROSch_CoarseOperator_def.hpp | 11 ++++----- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/packages/shylu/shylu_dd/frosch/src/Adapters/Thyra_FROSchFactory_def.hpp b/packages/shylu/shylu_dd/frosch/src/Adapters/Thyra_FROSchFactory_def.hpp index 18b5d03af72e..7f461d67484c 100644 --- a/packages/shylu/shylu_dd/frosch/src/Adapters/Thyra_FROSchFactory_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/Adapters/Thyra_FROSchFactory_def.hpp @@ -422,7 +422,7 @@ namespace Thyra { repeatedMap = rcp_dynamic_cast(xTpetraRepeatedMap); } else { #ifdef HAVE_SHYLU_DDFROSCH_EPETRA - if (comm->getRank()==0) std::cout << "FROSch::FROSchFactory : WARNING: Cannot retrieve Epetra objects from ParameterList. Use Xpetra isntead." << std::endl; + if (comm->getRank()==0) std::cout << "FROSch::FROSchFactory : WARNING: Cannot retrieve Epetra objects from ParameterList. Use Xpetra instead." << std::endl; #endif } } @@ -447,7 +447,7 @@ namespace Thyra { } else { #ifdef HAVE_SHYLU_DDFROSCH_EPETRA if (comm->getRank()==0) { - std::cout << "FROSch::FROSchFactory : WARNING: Cannot retrieve Epetra objects from ParameterList. Use Xpetra isntead." << std::endl; + std::cout << "FROSch::FROSchFactory : WARNING: Cannot retrieve Epetra objects from ParameterList. Use Xpetra instead." << std::endl; } #endif } @@ -473,7 +473,7 @@ namespace Thyra { } else { #ifdef HAVE_SHYLU_DDFROSCH_EPETRA if (comm->getRank()==0) { - std::cout << "FROSch::FROSchFactory : WARNING: Cannot retrieve Epetra objects from ParameterList. Use Xpetra isntead." << std::endl; + std::cout << "FROSch::FROSchFactory : WARNING: Cannot retrieve Epetra objects from ParameterList. Use Xpetra instead." 
<< std::endl; } #endif } diff --git a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp index fb37d0a6247d..6d2c4975068e 100644 --- a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp @@ -143,9 +143,9 @@ namespace FROSch { ConstXMultiVectorPtrVec UnassembledSubspaceBases_; LOVec Offsets_; - - XMapPtr AssembledBasisMap_; - XMapPtr AssembledBasisMapUnique_; + + ConstXMapPtr AssembledBasisMap_; + ConstXMapPtr AssembledBasisMapUnique_; XMultiVectorPtr AssembledBasis_; diff --git a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp index 60afe891b080..0040fe58fc31 100644 --- a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp @@ -81,7 +81,6 @@ namespace FROSch { } else { FROSCH_ASSERT(subspaceBasisMap->getNodeNumElements()==0,"FROSch::CoarseSpace : ERROR: subspaceBasisMap->getNodeNumElements()!=0"); } - if (subspaceBasisMapUnique.is_null()) subspaceBasisMapUnique = BuildUniqueMap(subspaceBasisMap); UnassembledBasesMaps_.push_back(subspaceBasisMap); UnassembledBasesMapsUnique_.push_back(subspaceBasisMapUnique); @@ -105,8 +104,25 @@ namespace FROSch { // BasisMap AssembledBasisMap_ = AssembleMaps(UnassembledBasesMaps_(),partMappings); - // BasisMapUnique - AssembledBasisMapUnique_ = AssembleMaps(UnassembledBasesMapsUnique_(),partMappings); + // BasisMapUnique - First, we check if any of the unassembled unique maps is null. 
In case, we re-build a unique map + bool buildUniqueMap = false; + UN i=0; + while (!buildUniqueMap && iMpiComm_,REDUCE_MAX,int(buildUniqueMap),ptr(&buildUniqueMapMax)); + + if (buildUniqueMapMax>0) { + if (this->MpiComm_->getRank()==0) std::cout << "FROSch::CoarseSpace : WARNING: We re-build a unique map of the AssembledBasisMap_." << std::endl; + AssembledBasisMapUnique_ = BuildUniqueMap(AssembledBasisMap_); + } else { + AssembledBasisMapUnique_ = AssembleMaps(UnassembledBasesMapsUnique_(),partMappings); + } + FROSCH_ASSERT(AssembledBasisMap_->getMaxAllGlobalIndex()==AssembledBasisMapUnique_->getMaxAllGlobalIndex(),"FROSch::CoarseSpace : ERROR: AssembledBasisMap_->getMaxAllGlobalIndex()!=AssembledBasisMapUnique_->getMaxAllGlobalIndex()"); + FROSCH_ASSERT(AssembledBasisMap_->getMinAllGlobalIndex()==AssembledBasisMapUnique_->getMinAllGlobalIndex(),"FROSch::CoarseSpace : ERROR: AssembledBasisMap_->getMinAllGlobalIndex()!=AssembledBasisMapUnique_->getMinAllGlobalIndex()"); + FROSCH_ASSERT(GO(AssembledBasisMapUnique_->getGlobalNumElements())==GO(AssembledBasisMapUnique_->getMaxAllGlobalIndex()+1),"FROSch::CoarseSpace : ERROR: AssembledBasisMapUnique_->getGlobalNumElements()!=(AssembledBasisMapUnique_->getMaxAllGlobalIndex()+1)"); // Basis if (!AssembledBasisMap_.is_null()) { @@ -137,7 +153,7 @@ namespace FROSch { UnassembledBasesMapsUnique_.resize(0); UnassembledSubspaceBases_.resize(0); Offsets_.resize(0); - + UnassembledBasesMaps_.push_back(AssembledBasisMap_); UnassembledBasesMapsUnique_.push_back(AssembledBasisMapUnique_); UnassembledSubspaceBases_.push_back(AssembledBasis_); diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp index b557f5bf8b1d..000dcf2033e5 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp 
@@ -407,15 +407,14 @@ namespace FROSch { typename CoarseOperator::XMatrixPtr CoarseOperator::buildCoarseMatrix() { FROSCH_TIMER_START_LEVELID(buildCoarseMatrixTime,"CoarseOperator::buildCoarseMatrix"); - RCP fancy = fancyOStream(rcpFromRef(std::cout)); - - XMatrixPtr k0 = MatrixFactory::Build(CoarseSpace_->getBasisMapUnique(),CoarseSpace_->getBasisMap()->getNodeNumElements()); + XMatrixPtr k0; if (this->ParameterList_->get("Use Triple MatrixMultiply",false)) { + k0 = MatrixFactory::Build(CoarseSpace_->getBasisMapUnique(),as(0)); TripleMatrixMultiply::MultiplyRAP(*Phi_,true,*this->K_,false,*Phi_,false,*k0); } else { - XMatrixPtr tmp = MatrixFactory::Build(this->K_->getRowMap(),50); - MatrixMatrix::Multiply(*this->K_,false,*Phi_,false,*tmp); - MatrixMatrix::Multiply(*Phi_,true,*tmp,false,*k0); + RCP fancy = fancyOStream(rcpFromRef(std::cout)); + XMatrixPtr tmp = MatrixMatrix::Multiply(*this->K_,false,*Phi_,false,*fancy); + k0 = MatrixMatrix::Multiply(*Phi_,true,*tmp,false,*fancy); } return k0; } From 69deac52cfbdbea404a57e803b2d9390efec62a1 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 25 Nov 2019 14:16:45 -0700 Subject: [PATCH 13/50] TSQR: Clean up Matrix & (Const)MatView The goal is to replace MatView and ConstMatView with Kokkos::View. Towards that end, I've started making MatView and ConstMatView have the same interface. I still need to replace member functions like fill with nonmember functions. 
--- .../tpetra/tsqr/src/TbbTsqr_Partitioner.hpp | 4 +- packages/tpetra/tsqr/src/Tsqr_MatView.hpp | 332 +++++++++--------- packages/tpetra/tsqr/src/Tsqr_Matrix.hpp | 124 +++---- packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp | 4 +- 4 files changed, 219 insertions(+), 245 deletions(-) diff --git a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp b/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp index f8992729587a..e9d2ad9c7896 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp @@ -89,8 +89,8 @@ namespace TSQR { const size_t P_last, const bool contiguous_cache_blocks) const { - typedef typename MatrixViewType::ordinal_type ordinal_type; - typedef typename MatrixViewType::pointer_type pointer_type; + using ordinal_type = typename MatrixViewType::ordinal_type; + using pointer_type = typename MatrixViewType::pointer; const size_t num_partitions_top = P_mid - P_first + 1; //const size_t num_partitions_bottom = P_last - P_mid; diff --git a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp index 0bd6fce365db..b02b6434795d 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp @@ -105,7 +105,7 @@ namespace TSQR { } #ifdef TSQR_MATVIEW_DEBUG - template< class Ordinal, class Scalar > + template class MatViewVerify { public: static void @@ -150,36 +150,38 @@ namespace TSQR { // Forward declaration - template< class Ordinal, class Scalar > + template class ConstMatView; // Forward declaration - template< class Ordinal, class Scalar > + template class Matrix; /// \class MatView /// /// A read-and-write nonowning view of a column-oriented matrix. 
- template< class Ordinal, class Scalar > + template class MatView { public: - typedef Scalar scalar_type; - typedef Ordinal ordinal_type; - typedef Scalar* pointer_type; + using scalar_type = Scalar; + using ordinal_type = Ordinal; + using pointer = Scalar*; + using reference = Scalar&; MatView () = default; - MatView (const Ordinal num_rows, - const Ordinal num_cols, - Scalar* const A, - const Ordinal leading_dim) : + MatView (const ordinal_type num_rows, + const ordinal_type num_cols, + pointer const A, + const ordinal_type leading_dim) : nrows_(num_rows), ncols_(num_cols), lda_(leading_dim), A_(A) { #ifdef TSQR_MATVIEW_DEBUG - MatViewVerify< Ordinal, Scalar >::verify (num_rows, num_cols, A, leading_dim); + MatViewVerify:: + verify (num_rows, num_cols, A, leading_dim); #endif // TSQR_MATVIEW_DEBUG } @@ -188,15 +190,12 @@ namespace TSQR { MatView (MatView&& view) = default; MatView& operator= (MatView&& view) = default; - /// \note The function is const, only because returning a - /// reference to the matrix data doesn't change any members of - /// *this. Of course one may use the resulting reference to - /// change an entry in the matrix, but that doesn't affect the - /// MatView's properties. - Scalar& operator() (const Ordinal i, const Ordinal j) const + reference + operator() (const ordinal_type i, + const ordinal_type j) const { #ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits< Ordinal >::is_signed) { + if (std::numeric_limits::is_signed) { if (i < 0 || i >= extent(0)) { throw std::invalid_argument("Row range invalid"); } @@ -212,32 +211,33 @@ namespace TSQR { throw std::invalid_argument("Column range invalid"); } } - if (A_ == NULL) { + if (A_ == nullptr) { throw std::logic_error("Attempt to reference NULL data"); } #endif // TSQR_MATVIEW_DEBUG return A_[i + j*lda()]; } - constexpr Ordinal extent(const int r) const noexcept { - return r == 0 ? nrows_ : (r == 1 ? 
ncols_ : Ordinal(0)); + constexpr ordinal_type extent(const int r) const noexcept { + return r == 0 ? nrows_ : (r == 1 ? ncols_ : ordinal_type(0)); } - Ordinal lda() const { return lda_; } + ordinal_type lda() const { return lda_; } /// \note The function is const, only because returning A_ doesn't /// change any members of *this. Of course one may use the /// resulting pointer to fiddle with entries in the matrix, but /// that doesn't affect the MatView's properties. - pointer_type data() const { return A_; } + pointer data() const { return A_; } bool empty() const { return extent(0) == 0 || extent(1) == 0; } /// Return a "row block" (submatrix of consecutive rows in the /// inclusive range [firstRow,lastRow]). - MatView row_block (const Ordinal firstRow, const Ordinal lastRow) + MatView row_block (const ordinal_type firstRow, + const ordinal_type lastRow) { #ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits< Ordinal >::is_signed) { + if (std::numeric_limits::is_signed) { if (firstRow < 0 || firstRow > lastRow || lastRow >= extent(0)) { throw std::invalid_argument ("Row range invalid"); } @@ -266,40 +266,37 @@ namespace TSQR { /// /// \return The top block of nrows_top rows. Data is a shallow /// copy of the data in *this. 
- MatView split_top (const Ordinal nrows_top, - const bool b_contiguous_blocks = false) + MatView + split_top (const ordinal_type nrows_top, + const bool b_contiguous_blocks = false) { #ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits< Ordinal >::is_signed && nrows_top < 0) - { - std::ostringstream os; - os << "nrows_top (= " << nrows_top << ") < 0"; - throw std::invalid_argument (os.str()); - } - else if (nrows_top > extent(0)) - { - std::ostringstream os; - os << "nrows_top (= " << nrows_top << ") > nrows (= " << extent(0) << ")"; - throw std::invalid_argument (os.str()); - } + if (std::numeric_limits::is_signed && nrows_top < 0) { + std::ostringstream os; + os << "nrows_top (= " << nrows_top << ") < 0"; + throw std::invalid_argument (os.str()); + } + else if (nrows_top > extent(0)) { + std::ostringstream os; + os << "nrows_top (= " << nrows_top << ") > nrows (= " << extent(0) << ")"; + throw std::invalid_argument (os.str()); + } #endif // TSQR_MATVIEW_DEBUG - Scalar* const A_top_ptr = data(); - Scalar* A_rest_ptr; - const Ordinal nrows_rest = extent(0) - nrows_top; - Ordinal lda_top, lda_rest; - if (b_contiguous_blocks) - { - lda_top = nrows_top; - lda_rest = nrows_rest; - A_rest_ptr = A_top_ptr + nrows_top * extent(1); - } - else - { - lda_top = lda(); - lda_rest = lda(); - A_rest_ptr = A_top_ptr + nrows_top; - } + pointer const A_top_ptr = data(); + pointer A_rest_ptr; + const ordinal_type nrows_rest = extent(0) - nrows_top; + ordinal_type lda_top, lda_rest; + if (b_contiguous_blocks) { + lda_top = nrows_top; + lda_rest = nrows_rest; + A_rest_ptr = A_top_ptr + nrows_top * extent(1); + } + else { + lda_top = lda(); + lda_rest = lda(); + A_rest_ptr = A_top_ptr + nrows_top; + } MatView A_top (nrows_top, extent(1), data(), lda_top); A_ = A_rest_ptr; nrows_ = nrows_rest; @@ -310,32 +307,33 @@ namespace TSQR { /// Split off and return the bottom block. Modify *this to be the /// "rest" of the matrix. 
- MatView split_bottom (const Ordinal nrows_bottom, - const bool b_contiguous_blocks = false) + MatView + split_bottom (const ordinal_type nrows_bottom, + const bool b_contiguous_blocks = false) { #ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits< Ordinal >::is_signed && nrows_bottom < 0) + if (std::numeric_limits::is_signed && nrows_bottom < 0) { throw std::invalid_argument ("nrows_bottom < 0"); - if (nrows_bottom > extent(0)) + } + if (nrows_bottom > extent(0)) { throw std::invalid_argument ("nrows_bottom > nrows"); + } #endif // TSQR_MATVIEW_DEBUG - Scalar* const A_rest_ptr = data(); - Scalar* A_bottom_ptr; - const Ordinal nrows_rest = extent(0) - nrows_bottom; - Ordinal lda_bottom, lda_rest; - if (b_contiguous_blocks) - { - lda_bottom = nrows_bottom; - lda_rest = extent(0) - nrows_bottom; - A_bottom_ptr = A_rest_ptr + nrows_rest * extent(1); - } - else - { - lda_bottom = lda(); - lda_rest = lda(); - A_bottom_ptr = A_rest_ptr + nrows_rest; - } + pointer const A_rest_ptr = data(); + pointer A_bottom_ptr; + const ordinal_type nrows_rest = extent(0) - nrows_bottom; + ordinal_type lda_bottom, lda_rest; + if (b_contiguous_blocks) { + lda_bottom = nrows_bottom; + lda_rest = extent(0) - nrows_bottom; + A_bottom_ptr = A_rest_ptr + nrows_rest * extent(1); + } + else { + lda_bottom = lda(); + lda_rest = lda(); + A_bottom_ptr = A_rest_ptr + nrows_rest; + } MatView A_bottom (nrows_bottom, extent(1), A_bottom_ptr, lda_bottom); A_ = A_rest_ptr; nrows_ = nrows_rest; @@ -352,9 +350,11 @@ namespace TSQR { const ordinal_type stride = lda(); scalar_type* A_j = data(); - for (ordinal_type j = 0; j < num_cols; ++j, A_j += stride) - for (ordinal_type i = 0; i < num_rows; ++i) + for (ordinal_type j = 0; j < num_cols; ++j, A_j += stride) { + for (ordinal_type i = 0; i < num_rows; ++i) { A_j[i] = value; + } + } } bool operator== (const MatView& rhs) const { @@ -371,112 +371,97 @@ namespace TSQR { ordinal_type nrows_ = 0; ordinal_type ncols_ = 0; ordinal_type lda_ = 0; - scalar_type* 
A_ = nullptr; + pointer A_ = nullptr; }; - /// \class ConstMatView /// /// A read-only view of a column-oriented matrix. - /// - /// \note Implicit promotion of a MatView to a ConstMatView is - /// forbidden, because it violates the expectation that - /// ConstMatView points to a matrix that doesn't change during the - /// computation. - template< class Ordinal, class Scalar > + template class ConstMatView { public: - typedef Scalar scalar_type; - typedef Ordinal ordinal_type; - typedef const Scalar* pointer_type; + using scalar_type = Scalar; + using ordinal_type = Ordinal; + using pointer = const Scalar*; - ConstMatView () : nrows_(0), ncols_(0), lda_(0), A_(NULL) {} + ConstMatView () = default; /// \note g++ with -Wall wants A_ to be initialized after lda_, /// otherwise it emits a compiler warning. - ConstMatView (const Ordinal num_rows, - const Ordinal num_cols, - const Scalar* const A, - const Ordinal leading_dim) : + ConstMatView (const ordinal_type num_rows, + const ordinal_type num_cols, + const scalar_type* const A, + const ordinal_type leading_dim) : nrows_(num_rows), ncols_(num_cols), lda_(leading_dim), A_(A) { #ifdef TSQR_MATVIEW_DEBUG - MatViewVerify< Ordinal, Scalar >::verify (num_rows, num_cols, A, leading_dim); + MatViewVerify:: + verify (num_rows, num_cols, A, leading_dim); #endif // TSQR_MATVIEW_DEBUG } - ConstMatView (const ConstMatView& view) : - nrows_(view.extent(0)), - ncols_(view.extent(1)), - lda_(view.lda()), - A_(view.data()) - {} - - //! Assignment operator: Does a shallow (pointer) copy. 
- ConstMatView& operator= (const ConstMatView& view) { - if (this != &view) { - nrows_ = view.extent(0); - ncols_ = view.extent(1); - lda_ = view.lda(); - A_ = view.data(); - } - return *this; - } + ConstMatView (const ConstMatView&) = default; + ConstMatView& operator= (const ConstMatView&) = default; + ConstMatView (ConstMatView&&) = default; + ConstMatView& operator= (ConstMatView&&) = default; - const Scalar& operator() (const Ordinal i, const Ordinal j) const + const scalar_type& + operator() (const ordinal_type i, const ordinal_type j) const { #ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits< Ordinal >::is_signed) { + if (std::numeric_limits::is_signed) { if (i < 0 || i >= extent(0)) { - throw std::invalid_argument("Row range invalid"); + throw std::invalid_argument("Row range invalid"); } else if (j < 0 || j >= extent(1)) { - throw std::invalid_argument("Column range invalid"); + throw std::invalid_argument("Column range invalid"); } } else { if (i >= extent(0)) { - throw std::invalid_argument("Row range invalid"); + throw std::invalid_argument("Row range invalid"); } else if (j >= extent(1)) { - throw std::invalid_argument("Column range invalid"); + throw std::invalid_argument("Column range invalid"); } } - if (A_ == NULL) { + if (A_ == nullptr) { throw std::logic_error("Attempt to reference NULL data"); } #endif // TSQR_MATVIEW_DEBUG return A_[i + j*lda()]; } - constexpr Ordinal extent(const int r) const noexcept { - return r == 0 ? nrows_ : (r == 1 ? ncols_ : Ordinal(0)); + constexpr ordinal_type extent(const int r) const noexcept { + return r == 0 ? nrows_ : (r == 1 ? ncols_ : ordinal_type(0)); } - Ordinal lda() const { return lda_; } + ordinal_type lda() const { return lda_; } - pointer_type data() const { return A_; } + pointer data() const { return A_; } bool empty() const { return extent(0) == 0 || extent(1) == 0; } /// Return a "row block" (submatrix of consecutive rows in the /// inclusive range [firstRow,lastRow]). 
- ConstMatView rowBlock (const Ordinal firstRow, - const Ordinal lastRow) const + ConstMatView + rowBlock (const ordinal_type firstRow, + const ordinal_type lastRow) const { #ifdef TSQR_MATVIEW_DEBUG - if (firstRow < 0 || lastRow >= extent(0)) + if (firstRow < 0 || lastRow >= extent(0)) { throw std::invalid_argument ("Row range invalid"); + } #endif // TSQR_MATVIEW_DEBUG - return ConstMatView (lastRow - firstRow + 1, extent(1), data() + firstRow, lda()); + return ConstMatView (lastRow - firstRow + 1, extent(1), + data() + firstRow, lda()); } - - /// Split off and return the top block. Modify *this to be the - /// "rest" of the matrix. + /// \brief Split off and return the top block. Modify *this to be + /// the "rest" of the matrix. /// /// \note Only use this method to split off a single cache block. /// It breaks if you try to use it otherwise. @@ -490,32 +475,32 @@ namespace TSQR { /// /// \return The top block of nrows_top rows. Data is a shallow /// copy of the data in *this. - ConstMatView split_top (const Ordinal nrows_top, + ConstMatView split_top (const ordinal_type nrows_top, const bool b_contiguous_blocks = false) { #ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits< Ordinal >::is_signed && nrows_top < 0) + if (std::numeric_limits::is_signed && nrows_top < 0) { throw std::invalid_argument ("nrows_top < 0"); - if (nrows_top > extent(0)) + } + if (nrows_top > extent(0)) { throw std::invalid_argument ("nrows_top > nrows"); + } #endif // TSQR_MATVIEW_DEBUG - pointer_type const A_top_ptr = data(); - pointer_type A_rest_ptr; - const Ordinal nrows_rest = extent(0) - nrows_top; - Ordinal lda_top, lda_rest; - if (b_contiguous_blocks) - { - lda_top = nrows_top; - lda_rest = nrows_rest; - A_rest_ptr = A_top_ptr + nrows_top * extent(1); - } - else - { - lda_top = lda(); - lda_rest = lda(); - A_rest_ptr = A_top_ptr + nrows_top; - } + pointer const A_top_ptr = data(); + pointer A_rest_ptr; + const ordinal_type nrows_rest = extent(0) - nrows_top; + ordinal_type 
lda_top, lda_rest; + if (b_contiguous_blocks) { + lda_top = nrows_top; + lda_rest = nrows_rest; + A_rest_ptr = A_top_ptr + nrows_top * extent(1); + } + else { + lda_top = lda(); + lda_rest = lda(); + A_rest_ptr = A_top_ptr + nrows_top; + } ConstMatView A_top (nrows_top, extent(1), data(), lda_top); A_ = A_rest_ptr; nrows_ = nrows_rest; @@ -524,35 +509,35 @@ namespace TSQR { return A_top; } - - /// Split off and return the bottom block. Modify *this to be the - /// "rest" of the matrix. - ConstMatView split_bottom (const Ordinal nrows_bottom, - const bool b_contiguous_blocks = false) + /// \brief Split off and return the bottom block. Modify *this to + /// be the "rest" of the matrix. + ConstMatView + split_bottom (const ordinal_type nrows_bottom, + const bool b_contiguous_blocks = false) { #ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits< Ordinal >::is_signed && nrows_bottom < 0) + if (std::numeric_limits::is_signed && nrows_bottom < 0) { throw std::invalid_argument ("nrows_bottom < 0"); - if (nrows_bottom > extent(0)) + } + if (nrows_bottom > extent(0)) { throw std::invalid_argument ("nrows_bottom > nrows"); + } #endif // TSQR_MATVIEW_DEBUG - pointer_type const A_rest_ptr = data(); - pointer_type A_bottom_ptr; + pointer const A_rest_ptr = data(); + pointer A_bottom_ptr; const ordinal_type nrows_rest = extent(0) - nrows_bottom; ordinal_type lda_bottom, lda_rest; - if (b_contiguous_blocks) - { - lda_bottom = nrows_bottom; - lda_rest = extent(0) - nrows_bottom; - A_bottom_ptr = A_rest_ptr + nrows_rest * extent(1); - } - else - { - lda_bottom = lda(); - lda_rest = lda(); - A_bottom_ptr = A_rest_ptr + nrows_rest; - } + if (b_contiguous_blocks) { + lda_bottom = nrows_bottom; + lda_rest = extent(0) - nrows_bottom; + A_bottom_ptr = A_rest_ptr + nrows_rest * extent(1); + } + else { + lda_bottom = lda(); + lda_rest = lda(); + A_bottom_ptr = A_rest_ptr + nrows_rest; + } ConstMatView A_bottom (nrows_bottom, extent(1), A_bottom_ptr, lda_bottom); A_ = A_rest_ptr; nrows_ = 
nrows_rest; @@ -571,10 +556,11 @@ namespace TSQR { lda() != rhs.lda() || data() != rhs.data(); } - private: - ordinal_type nrows_, ncols_, lda_; - pointer_type A_; + ordinal_type nrows_ = 0; + ordinal_type ncols_ = 0; + ordinal_type lda_ = 0; + pointer A_ = nullptr; }; } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp index 74dceb335a5d..f10d54e77cc7 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp @@ -90,31 +90,28 @@ namespace TSQR { verified_alloc_size (const Ordinal num_rows, const Ordinal num_cols) const { - if (! std::numeric_limits< Ordinal >::is_integer) - throw std::logic_error("Ordinal must be an integer type"); - + static_assert (std::numeric_limits::is_integer, + "Ordinal must be an integer type."); // Quick exit also checks for zero num_cols (which prevents // division by zero in the tests below). - if (num_rows == 0 || num_cols == 0) + if (num_rows == 0 || num_cols == 0) { return size_t(0); + } // If Ordinal is signed, make sure that num_rows and num_cols // are nonnegative. - if (std::numeric_limits< Ordinal >::is_signed) - { - if (num_rows < 0) - { - std::ostringstream os; - os << "# rows (= " << num_rows << ") < 0"; - throw std::logic_error (os.str()); - } - else if (num_cols < 0) - { - std::ostringstream os; - os << "# columns (= " << num_cols << ") < 0"; - throw std::logic_error (os.str()); - } + if (std::numeric_limits::is_signed) { + if (num_rows < 0) { + std::ostringstream os; + os << "# rows (= " << num_rows << ") < 0"; + throw std::logic_error (os.str()); + } + else if (num_cols < 0) { + std::ostringstream os; + os << "# columns (= " << num_cols << ") < 0"; + throw std::logic_error (os.str()); } + } // If Ordinal is bigger than a size_t, do special range // checking. The compiler warns (comparison of signed and @@ -125,20 +122,18 @@ namespace TSQR { // and see if we get the same result. 
If not, then we // definitely can't return a size_t product of num_rows and // num_cols. - if (! fits_in_size_t (num_rows)) - { - std::ostringstream os; - os << "# rows (= " << num_rows << ") > max size_t value (= " - << std::numeric_limits::max() << ")"; - throw std::range_error (os.str()); - } - else if (! fits_in_size_t (num_cols)) - { - std::ostringstream os; - os << "# columns (= " << num_cols << ") > max size_t value (= " - << std::numeric_limits::max() << ")"; - throw std::range_error (os.str()); - } + if (! fits_in_size_t (num_rows)) { + std::ostringstream os; + os << "# rows (= " << num_rows << ") > max size_t value (= " + << std::numeric_limits::max() << ")"; + throw std::range_error (os.str()); + } + else if (! fits_in_size_t (num_cols)) { + std::ostringstream os; + os << "# columns (= " << num_cols << ") > max size_t value (= " + << std::numeric_limits::max() << ")"; + throw std::range_error (os.str()); + } // Both num_rows and num_cols fit in a size_t, and are // nonnegative. Now check whether their product also fits in a @@ -146,22 +141,21 @@ namespace TSQR { // // Note: This may throw a SIGFPE (floating-point exception) if // num_cols is zero. Be sure to check first (above). 
- if (static_cast(num_rows) > - std::numeric_limits::max() / static_cast(num_cols)) - { - std::ostringstream os; - os << "num_rows (= " << num_rows << ") * num_cols (= " - << num_cols << ") > max size_t value (= " - << std::numeric_limits::max() << ")"; - throw std::range_error (os.str()); - } - return static_cast(num_rows) * static_cast(num_cols); + if (size_t (num_rows) > + std::numeric_limits::max() / size_t (num_cols)) { + std::ostringstream os; + os << "num_rows (= " << num_rows << ") * num_cols (= " + << num_cols << ") > max size_t value (= " + << std::numeric_limits::max() << ")"; + throw std::range_error (os.str()); + } + return size_t (num_rows) * size_t (num_cols); } public: - typedef Scalar scalar_type; - typedef Ordinal ordinal_type; - typedef Scalar* pointer_type; + using scalar_type = Scalar; + using ordinal_type = Ordinal; + using pointer_type = Scalar*; //! Constructor with dimensions. Matrix (const Ordinal num_rows, @@ -190,15 +184,14 @@ namespace TSQR { ncols_ (in.extent(1)), A_ (verified_alloc_size (in.extent(0), in.extent(1))) { - if (! in.empty()) - copy_matrix (extent(0), extent(1), data(), lda(), in.data(), in.lda()); + if (! in.empty()) { + copy_matrix (extent(0), extent(1), data(), lda(), + in.data(), in.lda()); + } } //! Default constructor (constructs an empty matrix). - Matrix () : nrows_(0), ncols_(0), A_(0) {} - - //! Trivial destructor. - ~Matrix () {} + Matrix () = default; /// \brief "Copy constructor" from a matrix view type. /// @@ -212,8 +205,10 @@ namespace TSQR { ncols_ (in.extent(1)), A_ (verified_alloc_size (in.extent(0), in.extent(1))) { - if (A_.size() != 0) - copy_matrix (extent(0), extent(1), data(), lda(), in.data(), in.lda()); + if (A_.size() != 0) { + copy_matrix (extent(0), extent(1), data(), lda(), + in.data(), in.lda()); + } } //! Fill all entries of the matrix with the given value. 
@@ -248,7 +243,8 @@ namespace TSQR { template bool operator== (const MatrixViewType& B) const { - if (data() != B.data() || extent(0) != B.extent(0) || extent(1) != B.extent(1) || lda() != B.lda()) { + if (data() != B.data() || extent(0) != B.extent(0) || + extent(1) != B.extent(1) || lda() != B.lda()) { return false; } else { return true; @@ -266,23 +262,15 @@ namespace TSQR { bool empty() const { return extent(0) == 0 || extent(1) == 0; } //! A non-const pointer to the matrix data. - Scalar* - data() + Scalar* data() { - if (A_.size() > 0) - return &A_[0]; - else - return static_cast (NULL); + return A_.size() != 0 ? A_.data () : nullptr; } //! A const pointer to the matrix data. - const Scalar* - data() const + const Scalar* data() const { - if (A_.size() > 0) - return &A_[0]; - else - return static_cast (NULL); + return A_.size() != 0 ? A_.data () : nullptr; } //! A non-const view of the matrix. @@ -320,9 +308,9 @@ namespace TSQR { private: //! Number of rows in the matrix. - Ordinal nrows_; + Ordinal nrows_ = 0; //! Number of columns in the matrix. - Ordinal ncols_; + Ordinal ncols_ = 0; /// \brief Where the entries of the matrix are stored. /// /// The matrix is stored using one-dimensional storage with diff --git a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp index d8cb2925ec4c..85cdb46fb8e1 100644 --- a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp @@ -381,9 +381,9 @@ namespace TSQR { "(" << C_top.extent(0) << " rows and " << C_top.extent(1) << " columns). 
Please report this bug to the Kokkos " "developers."); - typedef typename MatrixViewType::pointer_type ptr_type; + using pointer = typename MatrixViewType::pointer; return MatrixViewType (C_top.extent(0), C_top.extent(1), - const_cast (C_top.data()), + const_cast (C_top.data()), C_top.lda()); } From 2fc83dcd38ba38826d7343b908d7ccb0b6b308df Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 25 Nov 2019 16:18:31 -0700 Subject: [PATCH 14/50] TSQR::Mat{View,rix}: replace fill method w/ deep_copy The goal is eventual replacement of (at least) MatView and ConstMatView with Kokkos::View. I also added a constexpr stride method in imitation of Kokkos::View. --- packages/tpetra/tsqr/src/Tsqr.hpp | 2 +- .../tpetra/tsqr/src/Tsqr_CacheBlocker.hpp | 13 +- .../tsqr/src/Tsqr_CombineBenchmarker.hpp | 15 +- .../tpetra/tsqr/src/Tsqr_CombineDefault.hpp | 10 +- packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp | 33 +-- .../tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp | 35 +-- .../tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp | 14 +- .../tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp | 24 +- packages/tpetra/tsqr/src/Tsqr_MatView.hpp | 105 ++++--- packages/tpetra/tsqr/src/Tsqr_Matrix.hpp | 106 ++++--- packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp | 8 +- packages/tpetra/tsqr/src/Tsqr_ParTest.hpp | 10 +- packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp | 6 +- .../tsqr/src/Tsqr_Random_MatrixGenerator.hpp | 2 +- packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp | 22 +- packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp | 14 +- packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp | 278 +++++++++--------- packages/tpetra/tsqr/src/Tsqr_Util.hpp | 1 - .../tpetra/tsqr/src/Tsqr_generateStack.hpp | 6 +- 19 files changed, 367 insertions(+), 337 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr.hpp b/packages/tpetra/tsqr/src/Tsqr.hpp index 7738843a4b68..1e8282c98c7d 100644 --- a/packages/tpetra/tsqr/src/Tsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr.hpp @@ -450,7 +450,7 @@ namespace TSQR { const bool contiguousCacheBlocks = false) { mat_view_type R_view 
(ncols, ncols, R, ldr); - R_view.fill (STS::zero()); + deep_copy (R_view, Scalar {}); NodeOutput nodeResults = nodeTsqr_->factor (nrows_local, ncols, A_local, lda_local, R_view.data(), R_view.lda(), diff --git a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp index 53207ffbcdb1..9cbb86d28fd1 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp @@ -251,7 +251,7 @@ namespace TSQR { // This call modifies the matrix view A, but that's OK since // we passed the input view by copy, not by reference. MatrixViewType A_cur = split_top_block (A, contiguous_cache_blocks); - A_cur.fill (Scalar(0)); + deep_copy (A_cur, Scalar {}); } } @@ -289,12 +289,11 @@ namespace TSQR { // dimension is set correctly by A_rest.split_top(). mat_view_type A_rest (num_rows, num_cols, A, lda); - while (! A_rest.empty()) - { - // This call modifies A_rest. - mat_view_type A_cur = split_top_block (A_rest, contiguous_cache_blocks); - A_cur.fill (Scalar(0)); - } + while (! A_rest.empty()) { + // This call modifies A_rest. + mat_view_type A_cur = split_top_block (A_rest, contiguous_cache_blocks); + deep_copy (A_cur, Scalar {}); + } } /// \brief Cache-block the given A_in matrix into A_out. diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp index 7ba5e87d0dc1..52aa6203fdac 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp @@ -297,9 +297,10 @@ namespace TSQR { // A place to put the Q factor. matrix_type Q (numRows, numCols); - Q.fill (STS::zero()); - for (Ordinal j = 0; j < numCols; ++j) + deep_copy (Q, Scalar {}); + for (Ordinal j = 0; j < numCols; ++j) { Q(j,j) = STS::one(); + } // TAU array (Householder reflector scaling factors). std::vector tau (numCols); @@ -395,7 +396,7 @@ namespace TSQR { // A place to put the Q factor. 
matrix_type Q (numRows, numCols); - Q.fill (STS::zero()); + deep_copy (Q, Scalar {}); for (Ordinal j = 0; j < numCols; ++j) Q(j,j) = STS::one(); @@ -486,7 +487,7 @@ namespace TSQR { // A place to put the Q factor. matrix_type Q (numRows + numCols, numCols); - Q.fill (STS::zero()); + deep_copy (Q, Scalar {}); for (Ordinal j = 0; j < numCols; ++j) Q(j,j) = STS::one(); @@ -596,7 +597,7 @@ namespace TSQR { // A place to put the Q factor. matrix_type Q (numRows + numCols, numCols); - Q.fill (STS::zero()); + deep_copy (Q, Scalar {}); for (Ordinal j = 0; j < numCols; ++j) Q(j,j) = STS::one(); @@ -687,7 +688,7 @@ namespace TSQR { // A place to put the Q factor of [R1; R2]. matrix_type Q (2*numCols, numCols); - Q.fill (STS::zero()); + deep_copy (Q, Scalar {}); for (Ordinal j = 0; j < numCols; ++j) Q(j,j) = STS::one(); @@ -795,7 +796,7 @@ namespace TSQR { // A place to put the Q factor of [R1; R2]. matrix_type Q (2*numCols, numCols); - Q.fill (STS::zero()); + deep_copy (Q, Scalar {}); for (Ordinal j = 0; j < numCols; ++j) Q(j,j) = STS::one(); diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp index c19404fb2c9b..63c618c05746 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp @@ -135,13 +135,13 @@ namespace TSQR { const Ordinal numRows = m + ncols_Q; A_buf_.reshape (numRows, ncols_Q); - A_buf_.fill (Scalar(0)); + deep_copy (A_buf_, Scalar {}); const_mat_view_type A_bot (m, ncols_Q, A, lda); mat_view_type A_buf_bot (m, ncols_Q, &A_buf_(ncols_Q, 0), A_buf_.lda()); deep_copy (A_buf_bot, A_bot); C_buf_.reshape (numRows, ncols_C); - C_buf_.fill (Scalar(0)); + deep_copy (C_buf_, Scalar {}); mat_view_type C_buf_top (ncols_Q, ncols_C, &C_buf_(0, 0), C_buf_.lda()); mat_view_type C_buf_bot (m, ncols_C, &C_buf_(ncols_Q, 0), C_buf_.lda()); mat_view_type C_top_view (ncols_Q, ncols_C, C_top, ldc_top); @@ -173,7 +173,7 @@ namespace TSQR { const Ordinal numRows = m 
+ n; A_buf_.reshape (numRows, n); - A_buf_.fill (Scalar {}); + deep_copy (A_buf_, Scalar {}); // R might be a view of the upper triangle of a cache block, but // we only want to include the upper triangle in the // factorization. Thus, only copy the upper triangle of R into @@ -203,7 +203,7 @@ namespace TSQR { const Ordinal numRows = Ordinal(2) * n; A_buf_.reshape (numRows, n); - A_buf_.fill (Scalar {}); + deep_copy (A_buf_, Scalar {}); // Copy the inputs into the compute buffer. Only touch the // upper triangles of R_top and R_bot, since they each may be // views of some cache block (where the strict lower triangle @@ -239,7 +239,7 @@ namespace TSQR { const Ordinal numRows = Ordinal(2) * ncols_Q; A_buf_.reshape (numRows, ncols_Q); - A_buf_.fill (Scalar {}); + deep_copy (A_buf_, Scalar {}); copy_upper_triangle (ncols_Q, ncols_Q, &A_buf_(ncols_Q, 0), A_buf_.lda(), R_bot, ldr_bot); diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp index 03d7db562680..7fa749cfed34 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp @@ -297,24 +297,23 @@ namespace TSQR { factorReduce (R_mine, P_mine, P_first, P_last, QFactors, tauArrays); } - if (QFactors.size() != tauArrays.size()) - { - std::ostringstream os; - os << "QFactors and tauArrays should have the same number of element" - "s after factorReduce() returns, but they do not. QFactors has " - << QFactors.size() << " elements, but tauArrays has " - << tauArrays.size() << " elements."; - throw std::logic_error (os.str()); - } + if (QFactors.size() != tauArrays.size()) { + std::ostringstream os; + os << "QFactors and tauArrays should have the same number of element" + "s after factorReduce() returns, but they do not. 
QFactors has " + << QFactors.size() << " elements, but tauArrays has " + << tauArrays.size() << " elements."; + throw std::logic_error (os.str()); + } - Q_mine.fill (scalar_type (0)); - if (messenger_->rank() == 0) - { - for (ordinal_type j = 0; j < Q_mine.extent(1); ++j) - Q_mine(j, j) = scalar_type (1); + deep_copy (Q_mine, scalar_type {}); + if (messenger_->rank() == 0) { + for (ordinal_type j = 0; j < Q_mine.extent(1); ++j) { + Q_mine(j, j) = scalar_type (1); } + } // Scratch space for computing results to send to other processors. - matrix_type Q_other (Q_mine.extent(0), Q_mine.extent(1), scalar_type (0)); + matrix_type Q_other (Q_mine.extent(0), Q_mine.extent(1), scalar_type {}); const rank_type numSteps = QFactors.size() - 1; { @@ -326,7 +325,7 @@ namespace TSQR { if (forceNonnegativeDiagonal && ! QR_produces_R_factor_with_nonnegative_diagonal()) { - typedef Teuchos::ScalarTraits STS; + using STS = Teuchos::ScalarTraits; details::NonnegDiagForcer forcer; forcer.force (Q_mine, R_mine); } @@ -459,7 +458,7 @@ namespace TSQR { // Q_other] // where Q_other = zeros(Q_mine.extent(0), Q_mine.extent(1)). // Overwrite both Q_mine and Q_other with the result. - Q_other.fill (scalar_type (0)); + deep_copy (Q_other, scalar_type {}); combine_.apply_pair (ApplyType::NoTranspose, Q_mine.extent(1), Q_impl.extent(1), Q_impl.data(), Q_impl.lda(), tau.data(), diff --git a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp index 0868c525ede8..829eee19e8dd 100644 --- a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp @@ -194,46 +194,43 @@ namespace TSQR { matrix_type R (numCols, numCols); // Start out by filling the test problem with zeros. 
- typedef Teuchos::ScalarTraits STS; - A_local.fill (STS::zero()); - A_copy.fill (STS::zero()); - Q_local.fill (STS::zero()); - R.fill (STS::zero()); + deep_copy (A_local, Scalar {}); + deep_copy (A_copy, Scalar {}); + deep_copy (Q_local, Scalar {}); + deep_copy (R, Scalar {}); // Create some reasonable singular values for the test problem: // 1, 1/2, 1/4, 1/8, ... - typedef typename STS::magnitudeType magnitude_type; + using STS = Teuchos::ScalarTraits; + using magnitude_type = typename STS::magnitudeType; std::vector singularValues (numCols); - typedef Teuchos::ScalarTraits STM; + using STM = Teuchos::ScalarTraits; { const magnitude_type scalingFactor = STM::one() + STM::one(); magnitude_type curVal = STM::one(); - typedef typename std::vector::iterator iter_type; - for (iter_type it = singularValues.begin(); - it != singularValues.end(); ++it) - { - *it = curVal; - curVal = curVal / scalingFactor; - } + for (magnitude_type& singularValue : singularValues) { + singularValue = curVal; + curVal = curVal / scalingFactor; + } } // Construct a normal(0,1) pseudorandom number generator with // the given random seed. using TSQR::Random::NormalGenerator; - typedef NormalGenerator generator_type; + using generator_type = NormalGenerator; generator_type gen (randomSeed); // We need a Messenger for Ordinal-type data, so that we can // build a global random test matrix. - RCP > ordinalMessenger = - rcp_implicit_cast > (rcp (new TeuchosMessenger (comm))); + RCP> ordinalMessenger = + rcp_implicit_cast> (rcp (new TeuchosMessenger (comm))); // We also need a Messenger for Scalar-type data. The TSQR // implementation already constructed one, but it's OK to // construct another one; TeuchosMessenger is just a thin // wrapper over the Teuchos::Comm object. 
- RCP > scalarMessenger = - rcp_implicit_cast > (rcp (new TeuchosMessenger (comm))); + RCP> scalarMessenger = + rcp_implicit_cast> (rcp (new TeuchosMessenger (comm))); { // Generate a global distributed matrix (whose part local to diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp index 15e744452b9d..f8a8156c19ba 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp @@ -490,7 +490,7 @@ namespace TSQR { const_mat_view_type Q_top = *Q_rangeIter; mat_view_type C_top = *C_rangeIter; if (explicitQ_) { - C_top.fill (Scalar {}); + deep_copy (C_top, Scalar {}); if (partitionIndex == 0) { for (LocalOrdinal j = 0; j < C_top.extent(1); ++j) { C_top(j,j) = Scalar (1.0); @@ -517,7 +517,7 @@ namespace TSQR { ++Q_rangeIter; ++C_rangeIter; if (explicitQ_) { - C_cur.fill (Scalar {}); + deep_copy (C_cur, Scalar {}); } applyCacheBlock (combine, applyType, Q_cur, tauArrays_[curTauIndex++], @@ -540,7 +540,7 @@ namespace TSQR { C_top.extent(1), C_top.data() + C_top.extent(1), C_top.lda()); - C_top_rest.fill (Scalar {}); + deep_copy (C_top_rest, Scalar {}); } LocalOrdinal curTauIndex = cbIndices.second-1; @@ -570,7 +570,7 @@ namespace TSQR { mat_view_type C_cur = *C_rangeIter; if (explicitQ_) { - C_cur.fill (Scalar {}); + deep_copy (C_cur, Scalar {}); } TEUCHOS_TEST_FOR_EXCEPTION (curTauIndex < cbIndices.first, std::logic_error, @@ -946,7 +946,7 @@ namespace TSQR { iter_type end = cbRange.end(); while (iter != end) { mat_view_type A_cur = *iter; - A_cur.fill (value); + deep_copy (A_cur, value); ++iter; } } @@ -1455,7 +1455,7 @@ namespace TSQR { << suffix); mat_view_type R_top_square (R_top.extent(1), R_top.extent(1), R_top.data(), R_top.lda()); - R.fill (Scalar {}); + deep_copy (R, Scalar {}); // Only copy the upper triangle of R_top into R. 
copy_upper_triangle (R.extent(1), R.extent(1), R.data(), R.lda(), R_top.data(), R_top.lda()); @@ -1689,7 +1689,7 @@ namespace TSQR { // just fill the top n x n part of the top blocks // with zeros. if (explicitQ) { - C_cur_square.fill (Scalar {}); + deep_copy (C_cur_square, Scalar {}); } applyPair (applyType, factorOutput.topBlocks[partIdx], factorOutput.secondPassTauArrays[partIdx-1], diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp index a54c64d95eb8..b899cca15790 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp @@ -127,16 +127,16 @@ namespace TSQR { matrix_type Q (numRows, numCols); matrix_type R (numCols, numCols); if (std::numeric_limits::has_quiet_NaN) { - A.fill (std::numeric_limits::quiet_NaN()); - A_copy.fill (std::numeric_limits::quiet_NaN()); - Q.fill (std::numeric_limits::quiet_NaN()); - R.fill (std::numeric_limits::quiet_NaN()); + deep_copy (A, std::numeric_limits::quiet_NaN()); + deep_copy (A_copy, std::numeric_limits::quiet_NaN()); + deep_copy (Q, std::numeric_limits::quiet_NaN()); + deep_copy (R, std::numeric_limits::quiet_NaN()); } else { - A.fill (Scalar {}); - A_copy.fill (Scalar {}); - Q.fill (Scalar {}); - R.fill (Scalar {}); + deep_copy (A, Scalar {}); + deep_copy (A_copy, Scalar {}); + deep_copy (Q, Scalar {}); + deep_copy (R, Scalar {}); } const Ordinal lda = numRows; const Ordinal ldq = numRows; @@ -190,7 +190,7 @@ namespace TSQR { if (debug) { matrix_type A2 (numRows, numCols); if (std::numeric_limits::has_quiet_NaN) { - A2.fill (std::numeric_limits::quiet_NaN()); + deep_copy (A2, std::numeric_limits::quiet_NaN()); } actor.un_cache_block (numRows, numCols, A2.data(), A2.lda(), A_copy.data()); @@ -223,7 +223,7 @@ namespace TSQR { if (debug) { cerr << "-- Filling R with zeros" << endl; } - R.fill (Scalar {}); + deep_copy (R, Scalar {}); if (debug) { cerr << "-- Calling factor()" << endl; @@ 
-246,7 +246,7 @@ namespace TSQR { actor.top_block (Q.view (), contiguousCacheBlocks); mat_view_type Q_top_square (Q_top.extent(1), Q_top.extent(1), Q_top.data(), Q_top.lda()); - Q_top_square.fill (Scalar {}); + deep_copy (Q_top_square, Scalar {}); for (Ordinal j = 0; j < Q_top_square.extent(1); ++j) { Q_top_square(j,j) = Scalar (1.0); } @@ -408,7 +408,7 @@ namespace TSQR { // Fill R with zeros, since the factorization may not overwrite // the strict lower triangle of R. - R.fill (Scalar {}); + deep_copy (R, Scalar {}); // Create a test problem nodeTestProblem (gen, numRows, numCols, A.data(), A.lda(), false); diff --git a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp index b02b6434795d..b7bf49202333 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp @@ -46,39 +46,25 @@ #ifdef TSQR_MATVIEW_DEBUG # include #endif // TSQR_MATVIEW_DEBUG - #include #include -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - namespace TSQR { - template< class MatrixViewType1, class MatrixViewType2 > + template + class MatView; + + template void - deep_copy (MatrixViewType1& A, const MatrixViewType2& B) - { - const ptrdiff_t A_nrows (A.extent (0)); - const ptrdiff_t A_ncols (A.extent (1)); - if (A_nrows != ptrdiff_t (B.extent (0)) || - A_ncols != ptrdiff_t (B.extent (1))) { - using std::endl; - std::ostringstream os; - os << "deep_copy: dimensions of A (output matrix) and B (input " - "matrix) are not compatible. 
A is " << A.extent (0) << " x " - << A.extent (1) << ", but B is " << B.extent (0) << " x " - << B.extent (1) << "."; - throw std::invalid_argument(os.str()); - } - for (ptrdiff_t j = 0; j < A_ncols; ++j) { - auto* const A_j = &A(0,j); - const auto* const B_j = &B(0,j); - for (ptrdiff_t i = 0; i < A_nrows; ++i) { - A_j[i] = B_j[i]; - } - } - } + deep_copy (const MatView& tgt, + const SourceScalar& src); + + template class SourceMat> + void + deep_copy (const MatView& tgt, + const SourceMat& src); template bool @@ -222,7 +208,13 @@ namespace TSQR { return r == 0 ? nrows_ : (r == 1 ? ncols_ : ordinal_type(0)); } - ordinal_type lda() const { return lda_; } + constexpr ordinal_type stride(const int r) const noexcept { + return r == 0 ? ordinal_type(1) : (r == 1 ? lda_ : ordinal_type(0)); + } + + constexpr ordinal_type lda() const noexcept { + return stride(1); + } /// \note The function is const, only because returning A_ doesn't /// change any members of *this. Of course one may use the @@ -342,21 +334,6 @@ namespace TSQR { return A_bottom; } - void - fill (const scalar_type& value) - { - const ordinal_type num_rows = extent(0); - const ordinal_type num_cols = extent(1); - const ordinal_type stride = lda(); - - scalar_type* A_j = data(); - for (ordinal_type j = 0; j < num_cols; ++j, A_j += stride) { - for (ordinal_type i = 0; i < num_rows; ++i) { - A_j[i] = value; - } - } - } - bool operator== (const MatView& rhs) const { return extent(0) == rhs.extent(0) && extent(1) == rhs.extent(1) && lda() == rhs.lda() && data() == rhs.data(); @@ -563,6 +540,48 @@ namespace TSQR { pointer A_ = nullptr; }; + template + void + deep_copy (const MatView& tgt, const SourceScalar& src) + { + using ordinal_type = typename MatView::ordinal_type; + const ordinal_type num_rows = tgt.extent(0); + const ordinal_type num_cols = tgt.extent(1); + const ordinal_type stride = tgt.lda(); + auto* tgt_j = tgt.data(); + for (ordinal_type j = 0; j < num_cols; ++j, tgt_j += stride) { + for (ordinal_type 
i = 0; i < num_rows; ++i) { + tgt_j[i] = src; + } + } + } + + template class SourceMat> + void + deep_copy (const MatView& tgt, + const SourceMat& src) + { + const ptrdiff_t tgt_nrows (tgt.extent (0)); + const ptrdiff_t tgt_ncols (tgt.extent (1)); + if (tgt_nrows != ptrdiff_t (src.extent (0)) || + tgt_ncols != ptrdiff_t (src.extent (1))) { + std::ostringstream os; + os << "TSQR::deep_copy: dimensions of tgt (output matrix) and " + "src (input matrix) are not compatible. tgt is " + << tgt.extent (0) << " x " << tgt.extent (1) << ", but src " + "is " << src.extent (0) << " x " << src.extent (1) << "."; + throw std::invalid_argument (os.str ()); + } + for (ptrdiff_t j = 0; j < tgt_ncols; ++j) { + auto* const tgt_j = &tgt(0,j); + const auto* const src_j = &src(0,j); + for (ptrdiff_t i = 0; i < tgt_nrows; ++i) { + tgt_j[i] = src_j[i]; + } + } + } } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp index f10d54e77cc7..5b94a6c71afe 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp @@ -64,14 +64,19 @@ namespace TSQR { template class Matrix { public: - typedef MatView mat_view_type; - typedef ConstMatView const_mat_view_type; + using scalar_type = Scalar; + using ordinal_type = Ordinal; + using pointer = scalar_type*; + using const_pointer = const scalar_type*; + + using mat_view_type = MatView; + using const_mat_view_type = ConstMatView; private: static bool - fits_in_size_t (const Ordinal& ord) + fits_in_size_t (const ordinal_type& ord) { - const Ordinal result = static_cast< Ordinal > (static_cast< size_t > (ord)); + const ordinal_type result = ordinal_type (size_t (ord)); return (ord == result); } @@ -87,20 +92,20 @@ namespace TSQR { /// \param num_cols [in] Number of columns in the matrix /// \return num_rows*num_cols size_t - verified_alloc_size (const Ordinal num_rows, - const Ordinal num_cols) const + verified_alloc_size (const ordinal_type num_rows, + 
const ordinal_type num_cols) const { - static_assert (std::numeric_limits::is_integer, - "Ordinal must be an integer type."); + static_assert (std::numeric_limits::is_integer, + "ordinal_type must be an integer type."); // Quick exit also checks for zero num_cols (which prevents // division by zero in the tests below). if (num_rows == 0 || num_cols == 0) { return size_t(0); } - // If Ordinal is signed, make sure that num_rows and num_cols + // If ordinal_type is signed, make sure that num_rows and num_cols // are nonnegative. - if (std::numeric_limits::is_signed) { + if (std::numeric_limits::is_signed) { if (num_rows < 0) { std::ostringstream os; os << "# rows (= " << num_rows << ") < 0"; @@ -113,12 +118,12 @@ namespace TSQR { } } - // If Ordinal is bigger than a size_t, do special range + // If ordinal_type is bigger than a size_t, do special range // checking. The compiler warns (comparison of signed and - // unsigned) if Ordinal is a signed type and we try to do + // unsigned) if ordinal_type is a signed type and we try to do // "numeric_limits::max() < - // std::numeric_limits::max()", so instead we cast each - // of num_rows and num_cols to size_t and back to Ordinal again, + // std::numeric_limits::max()", so instead we cast each + // of num_rows and num_cols to size_t and back to ordinal_type again, // and see if we get the same result. If not, then we // definitely can't return a size_t product of num_rows and // num_cols. @@ -153,22 +158,18 @@ namespace TSQR { } public: - using scalar_type = Scalar; - using ordinal_type = Ordinal; - using pointer_type = Scalar*; - //! Constructor with dimensions. - Matrix (const Ordinal num_rows, - const Ordinal num_cols) : + Matrix (const ordinal_type num_rows, + const ordinal_type num_cols) : nrows_ (num_rows), ncols_ (num_cols), A_ (verified_alloc_size (num_rows, num_cols)) {} //! Constructor with dimensions and fill datum. 
- Matrix (const Ordinal num_rows, - const Ordinal num_cols, - const Scalar& value) : + Matrix (const ordinal_type num_rows, + const ordinal_type num_cols, + const scalar_type& value) : nrows_ (num_rows), ncols_ (num_cols), A_ (verified_alloc_size (num_rows, num_cols), value) @@ -211,18 +212,11 @@ namespace TSQR { } } - //! Fill all entries of the matrix with the given value. - void - fill (const Scalar value) - { - fill_matrix (extent(0), extent(1), data(), lda(), value); - } - /// \brief Non-const reference to element (i,j) of the matrix. /// /// \param i [in] Zero-based row index of the matrix. /// \param j [in] Zero-based column index of the matrix. - Scalar& operator() (const Ordinal i, const Ordinal j) { + scalar_type& operator() (const ordinal_type i, const ordinal_type j) { return A_[i + j*lda()]; } @@ -230,12 +224,12 @@ namespace TSQR { /// /// \param i [in] Zero-based row index of the matrix. /// \param j [in] Zero-based column index of the matrix. - const Scalar& operator() (const Ordinal i, const Ordinal j) const { + const scalar_type& operator() (const ordinal_type i, const ordinal_type j) const { return A_[i + j*lda()]; } //! 1-D std::vector - style access. - Scalar& operator[] (const Ordinal i) { + scalar_type& operator[] (const ordinal_type i) { return A_[i]; } @@ -251,24 +245,27 @@ namespace TSQR { } } - constexpr Ordinal extent (const int r) const noexcept { - return r == 0 ? nrows_ : (r == 1 ? ncols_ : Ordinal(0)); + constexpr ordinal_type extent (const int r) const noexcept { + return r == 0 ? nrows_ : (r == 1 ? ncols_ : ordinal_type(0)); } - //! Leading dimension (a.k.a. stride) of the matrix. - Ordinal lda() const { return nrows_; } + constexpr ordinal_type stride(const int r) const noexcept { + return r == 0 ? ordinal_type(1) : (r == 1 ? nrows_ : ordinal_type(0)); + } + + constexpr ordinal_type lda() const noexcept { return stride(1); } //! Whether the matrix is empty (has either zero rows or zero columns). 
bool empty() const { return extent(0) == 0 || extent(1) == 0; } //! A non-const pointer to the matrix data. - Scalar* data() + pointer data() { return A_.size() != 0 ? A_.data () : nullptr; } //! A const pointer to the matrix data. - const Scalar* data() const + const_pointer data() const { return A_.size() != 0 ? A_.data () : nullptr; } @@ -281,7 +278,7 @@ namespace TSQR { //! A const view of the matrix. const_mat_view_type const_view () const { return const_mat_view_type (extent(0), extent(1), - const_cast (data()), lda()); + const_cast (data()), lda()); } /// Change the dimensions of the matrix. Reallocate if necessary. @@ -295,7 +292,7 @@ namespace TSQR { /// not reinterpret the existing matrix data using different /// dimensions. void - reshape (const Ordinal num_rows, const Ordinal num_cols) + reshape (const ordinal_type num_rows, const ordinal_type num_cols) { if (num_rows == extent(0) && num_cols == extent(1)) return; // no need to reallocate or do anything else @@ -308,17 +305,38 @@ namespace TSQR { private: //! Number of rows in the matrix. - Ordinal nrows_ = 0; + ordinal_type nrows_ = 0; //! Number of columns in the matrix. - Ordinal ncols_ = 0; + ordinal_type ncols_ = 0; /// \brief Where the entries of the matrix are stored. /// /// The matrix is stored using one-dimensional storage with /// column-major (Fortran-style) indexing. This makes Matrix /// compatible with the BLAS and LAPACK. 
- std::vector A_; + std::vector A_; }; + template + void + deep_copy (Matrix& tgt, const SourceScalar& src) + { + MatView tgt_view (tgt.extent(0), tgt.extent(1), + tgt.data(), tgt.lda()); + deep_copy (tgt_view, src); + } + + template class SourceMat> + void + deep_copy (Matrix& tgt, + const SourceMat& src) + { + using mat_view_type = MatView; + mat_view_type tgt_view (tgt.extent(0), tgt.extent(1), + tgt.data(), tgt.lda()); + deep_copy (tgt_view, src); + } } // namespace TSQR #endif // __TSQR_Tsqr_Matrix_hpp diff --git a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp index 3aade82f0fc0..2c2ad7a442dd 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp @@ -145,9 +145,9 @@ namespace TSQR { // Set up storage for the test problem Matrix A_local (nrows_local, ncols); if (std::numeric_limits::has_quiet_NaN) { - A_local.fill (std::numeric_limits< Scalar >::quiet_NaN()); + deep_copy (A_local, std::numeric_limits::quiet_NaN()); } - Matrix R (ncols, ncols, Scalar(0)); + Matrix R (ncols, ncols, Scalar {}); // Generate the test problem. distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get()); @@ -341,8 +341,8 @@ namespace TSQR { // Set up storage for the test problem. Matrix A_local (nrows_local, ncols); - if (std::numeric_limits< Scalar >::has_quiet_NaN) { - A_local.fill (std::numeric_limits< Scalar >::quiet_NaN()); + if (std::numeric_limits::has_quiet_NaN) { + deep_copy (A_local, std::numeric_limits::quiet_NaN()); } Matrix R (ncols, ncols, Scalar(0)); diff --git a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp b/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp index 76b4560e70d6..040818017815 100644 --- a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp @@ -399,13 +399,13 @@ namespace TSQR { // Copy the test problem input into R, since the factorization // will overwrite it in place with the final R factor. 
R.reshape (numCols, numCols); - R.fill (Scalar {}); + deep_copy (R, Scalar {}); deep_copy (R, A_local); // Prepare space in which to construct the explicit Q factor // (local component on this processor) Q_local.reshape (numRowsLocal, numCols); - Q_local.fill (Scalar {}); + deep_copy (Q_local, Scalar {}); } }; @@ -749,7 +749,7 @@ namespace TSQR { // // A_global: Global matrix (only nonempty on Proc 0); only // used temporarily. - Matrix< Ordinal, Scalar > A_global; + Matrix A_global; // This modifies A_local on all procs, and A_global on Proc 0. par_tsqr_test_problem (gen_, A_local, A_global, numCols, scalarComm_); @@ -762,7 +762,7 @@ namespace TSQR { // Prepare space in which to construct the explicit Q factor // (local component on this processor) Q_local.reshape (numRowsLocal, numCols); - Q_local.fill (Scalar(0)); + deep_copy (Q_local, Scalar {}); } /// Make sure that timer_type satisfies the TimerType concept. @@ -770,7 +770,7 @@ namespace TSQR { static void conceptChecks () { - verifyTimerConcept< timer_type >(); + verifyTimerConcept(); } }; diff --git a/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp b/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp index 64ac372b73c6..1f6035759d3d 100644 --- a/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp @@ -208,8 +208,8 @@ namespace TSQR { } else { const int srcProc = 0; - R_local.fill (scalar_type(0)); - RMessenger< ordinal_type, scalar_type > receiver (messenger); + deep_copy (R_local, scalar_type {}); + RMessenger receiver (messenger); receiver.recv (R_local, srcProc); } } @@ -244,7 +244,7 @@ namespace TSQR { mat_view_type R_stack_view_cur (ncols, ncols, R_ptr, R_stack.lda()); // Fill (the lower triangle) with zeros, since // RMessenger::recv() only writes to the upper triangle. 
- R_stack_view_cur.fill (scalar_type (0)); + deep_copy (R_stack_view_cur, scalar_type {}); receiver.recv (R_stack_view_cur, srcProc); } } diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp index e8af89e0878b..e2293cc2b21a 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp @@ -181,7 +181,7 @@ namespace TSQR { // Fill A with zeros, and then make its diagonal the given set // of singular values. mat_view_type A_view (nrows, ncols, A, lda); - A_view.fill (Scalar {}); + deep_copy (A_view, Scalar {}); for (Ordinal j = 0; j < ncols; ++j) { A_view(j,j) = Scalar (singular_values[j]); } diff --git a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp b/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp index 15b9c2701e16..32d907da714b 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp @@ -157,10 +157,10 @@ namespace TSQR { Matrix Q (nrows, ncols); Matrix R (ncols, ncols); if (std::numeric_limits::has_quiet_NaN) { - A.fill (std::numeric_limits< Scalar>::quiet_NaN()); - A_copy.fill (std::numeric_limits::quiet_NaN()); - Q.fill (std::numeric_limits::quiet_NaN()); - R.fill (std::numeric_limits::quiet_NaN()); + deep_copy (A, std::numeric_limits< Scalar>::quiet_NaN()); + deep_copy (A_copy, std::numeric_limits::quiet_NaN()); + deep_copy (Q, std::numeric_limits::quiet_NaN()); + deep_copy (R, std::numeric_limits::quiet_NaN()); } const Ordinal lda = nrows; const Ordinal ldq = nrows; @@ -203,7 +203,7 @@ namespace TSQR { if (b_debug) { Matrix A2 (nrows, ncols); if (std::numeric_limits::has_quiet_NaN) { - A2.fill (std::numeric_limits::quiet_NaN ()); + deep_copy (A2, std::numeric_limits::quiet_NaN ()); } actor.un_cache_block (nrows, ncols, A2.data (), A2.lda (), A_copy.data ()); @@ -220,7 +220,7 @@ namespace TSQR { // Fill R with zeros, since the factorization may not overwrite // the strict lower 
triangle of R. - R.fill (Scalar (0)); + deep_copy (R, Scalar {}); // Count the number of cache blocks that factor() will use. // This is only for diagnostic purposes. @@ -456,10 +456,10 @@ namespace TSQR { Matrix Q (nrows, ncols); Matrix R (ncols, ncols); if (std::numeric_limits::has_quiet_NaN) { - A.fill (std::numeric_limits< Scalar>::quiet_NaN()); - A_copy.fill (std::numeric_limits::quiet_NaN()); - Q.fill (std::numeric_limits::quiet_NaN()); - R.fill (std::numeric_limits::quiet_NaN()); + deep_copy (A, std::numeric_limits< Scalar>::quiet_NaN()); + deep_copy (A_copy, std::numeric_limits::quiet_NaN()); + deep_copy (Q, std::numeric_limits::quiet_NaN()); + deep_copy (R, std::numeric_limits::quiet_NaN()); } const Ordinal lda = nrows; const Ordinal ldq = nrows; @@ -486,7 +486,7 @@ namespace TSQR { // Fill R with zeros, since the factorization may not overwrite // the strict lower triangle of R. - R.fill (Scalar {}); + deep_copy (R, Scalar {}); lapack.compute_QR (nrows, ncols, A_copy.data(), A_copy.lda(), tau.data(), work.data(), lwork); diff --git a/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp index abf4475a21f3..2d333e266b81 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp @@ -111,10 +111,10 @@ namespace TSQR { Matrix< Ordinal, Scalar > Q (nrows, ncols); Matrix< Ordinal, Scalar > R (ncols, ncols); if (std::numeric_limits< Scalar >::has_quiet_NaN) { - A.fill (std::numeric_limits< Scalar>::quiet_NaN()); - A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN()); - Q.fill (std::numeric_limits< Scalar >::quiet_NaN()); - R.fill (std::numeric_limits< Scalar >::quiet_NaN()); + deep_copy (A, std::numeric_limits< Scalar>::quiet_NaN()); + deep_copy (A_copy, std::numeric_limits< Scalar >::quiet_NaN()); + deep_copy (Q, std::numeric_limits< Scalar >::quiet_NaN()); + deep_copy (R, std::numeric_limits< Scalar >::quiet_NaN()); } const Ordinal lda = nrows; const Ordinal ldq = nrows; @@ -146,7 
+146,7 @@ namespace TSQR { if (b_debug) { Matrix< Ordinal, Scalar > A2 (nrows, ncols); if (std::numeric_limits< Scalar >::has_quiet_NaN) { - A2.fill (std::numeric_limits< Scalar >::quiet_NaN()); + deep_copy (A2, std::numeric_limits< Scalar >::quiet_NaN()); } actor.un_cache_block (nrows, ncols, A2.data(), A2.lda(), A_copy.data()); if (matrix_equal (A, A2)) { @@ -162,7 +162,7 @@ namespace TSQR { // Fill R with zeros, since the factorization may not overwrite // the strict lower triangle of R. - R.fill (Scalar(0)); + deep_copy (R, Scalar {}); // Factor the matrix and compute the explicit Q factor factor_output_type factor_output = @@ -295,7 +295,7 @@ namespace TSQR { // Fill R with zeros, since the factorization may not overwrite // the strict lower triangle of R. - R.fill (scalar_type(0)); + deep_copy (R, scalar_type {}); // Create a test problem nodeTestProblem (generator, nrows, ncols, A.data(), A.lda(), false); diff --git a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp index 1d829e975bb5..c52a485c79d5 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp @@ -231,8 +231,8 @@ namespace TSQR { Matrix< Ordinal, Scalar > A_local (nrows_local, ncols); Matrix< Ordinal, Scalar > Q_local (nrows_local, ncols); if (std::numeric_limits::has_quiet_NaN) { - A_local.fill (std::numeric_limits::quiet_NaN ()); - Q_local.fill (std::numeric_limits::quiet_NaN ()); + deep_copy (A_local, std::numeric_limits::quiet_NaN ()); + deep_copy (Q_local, std::numeric_limits::quiet_NaN ()); } Matrix R (ncols, ncols, Scalar(0)); @@ -259,9 +259,9 @@ namespace TSQR { // Factoring the matrix stored in A_local overwrites it, so we // make a copy of A_local. Initialize with NaNs to make sure // that cache blocking works correctly (if applicable). 
- Matrix< Ordinal, Scalar > A_copy (nrows_local, ncols); - if (std::numeric_limits< Scalar >::has_quiet_NaN) { - A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN ()); + Matrix A_copy (nrows_local, ncols); + if (std::numeric_limits::has_quiet_NaN) { + deep_copy (A_copy, std::numeric_limits< Scalar >::quiet_NaN ()); } // actual_cache_size_hint: "cache_size_hint" is just a @@ -610,42 +610,44 @@ namespace TSQR { const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs); // Set up storage for the test problem. - Matrix< Ordinal, Scalar > A_local (nrows_local, ncols); - Matrix< Ordinal, Scalar > Q_local (nrows_local, ncols); - if (std::numeric_limits< Scalar >::has_quiet_NaN) - { - A_local.fill (std::numeric_limits< Scalar >::quiet_NaN()); - Q_local.fill (std::numeric_limits< Scalar >::quiet_NaN()); - } - Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0)); + Matrix A_local (nrows_local, ncols); + Matrix Q_local (nrows_local, ncols); + if (std::numeric_limits::has_quiet_NaN) { + deep_copy (A_local, std::numeric_limits::quiet_NaN()); + deep_copy (Q_local, std::numeric_limits::quiet_NaN()); + } + Matrix R (ncols, ncols, Scalar {}); // Generate the test problem. - distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get()); - if (b_debug) - { - scalarComm->barrier(); - if (my_rank == 0) - cerr << "-- Generated test problem." << endl; + distributedTestProblem (generator, A_local, ordinalComm.get(), + scalarComm.get()); + if (b_debug) { + scalarComm->barrier(); + if (my_rank == 0) { + cerr << "-- Generated test problem." << endl; } + } // Make sure that the test problem (the matrix to factor) was // distributed correctly. 
- if (b_extra_debug && b_debug) - { - if (my_rank == 0) - cerr << "Test matrix A:" << endl; - scalarComm->barrier (); - printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); - scalarComm->barrier (); + if (b_extra_debug && b_debug) { + if (my_rank == 0) { + cerr << "Test matrix A:" << endl; } + scalarComm->barrier (); + printGlobalMatrix (cerr, A_local, scalarComm.get(), + ordinalComm.get()); + scalarComm->barrier (); + } // Factoring the matrix stored in A_local overwrites it, so we // make a copy of A_local. If specified, rearrange cache blocks // in the copy. Initialize with NaNs to make sure that cache // blocking worked correctly. - Matrix< Ordinal, Scalar > A_copy (nrows_local, ncols); - if (std::numeric_limits< Scalar >::has_quiet_NaN) - A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN()); + Matrix A_copy (nrows_local, ncols); + if (std::numeric_limits::has_quiet_NaN) { + deep_copy (A_copy, std::numeric_limits< Scalar >::quiet_NaN()); + } // actual_cache_size_hint: "cache_size_hint" is just a // suggestion. TSQR determines the cache block size itself; @@ -655,141 +657,137 @@ namespace TSQR { // value) for TSQR on this MPI node. double tsqr_timing; - if (which == "MpiTbbTSQR") - { + if (which == "MpiTbbTSQR") { #ifdef HAVE_KOKKOSTSQR_TBB - using Teuchos::RCP; - typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type; - typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type; - typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type; - - RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (num_cores, cache_size_hint)); - RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm)); - tsqr_type tsqr (nodeTsqr, distTsqr); - - // Run the benchmark. 
- tsqr_timing = - do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local, - A_copy, Q_local, R, ntrials, - contiguousCacheBlocks, - human_readable, b_debug); - - // Save the "actual" cache block size - actual_cache_size_hint = tsqr.cache_size_hint(); + using Teuchos::RCP; + typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type; + typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type; + typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type; + + RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (num_cores, cache_size_hint)); + RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm)); + tsqr_type tsqr (nodeTsqr, distTsqr); + + // Run the benchmark. + tsqr_timing = + do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local, + A_copy, Q_local, R, ntrials, + contiguousCacheBlocks, + human_readable, b_debug); + + // Save the "actual" cache block size + actual_cache_size_hint = tsqr.cache_size_hint(); #else - throw std::logic_error("TSQR not built with Intel TBB support"); + throw std::logic_error("TSQR not built with Intel TBB support"); #endif // HAVE_KOKKOSTSQR_TBB - } - else if (which == "MpiSeqTSQR") - { - using Teuchos::RCP; - typedef SequentialTsqr< Ordinal, Scalar > node_tsqr_type; - typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type; - typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type; - - // Set up TSQR. - RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (cache_size_hint)); - RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm)); - tsqr_type tsqr (nodeTsqr, distTsqr); - - // Run the benchmark. 
- tsqr_timing = - do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local, - A_copy, Q_local, R, ntrials, - contiguousCacheBlocks, - human_readable, b_debug); - // Save the "actual" cache block size - actual_cache_size_hint = tsqr.cache_size_hint(); - } - else + } + else if (which == "MpiSeqTSQR") { + using Teuchos::RCP; + using node_tsqr_type = SequentialTsqr; + using dist_tsqr_type = TSQR::DistTsqr; + using tsqr_type = typedef Tsqr; + + // Set up TSQR. + RCP nodeTsqr (new node_tsqr_type (cache_size_hint)); + RCP distTsqr (new dist_tsqr_type (scalarComm)); + tsqr_type tsqr (nodeTsqr, distTsqr); + + // Run the benchmark. + tsqr_timing = + do_tsqr_benchmark (which, tsqr, scalarComm, A_local, + A_copy, Q_local, R, ntrials, + contiguousCacheBlocks, + human_readable, b_debug); + // Save the "actual" cache block size + actual_cache_size_hint = tsqr.cache_size_hint(); + } + else { throw std::logic_error("Unknown TSQR implementation type \"" + which + "\""); + } // Find the min and max TSQR timing on all processors. const double min_tsqr_timing = scalarComm->globalMin (tsqr_timing); const double max_tsqr_timing = scalarComm->globalMax (tsqr_timing); // Print the results on Proc 0. 
- if (my_rank == 0) - { - if (human_readable) - { - std::string human_readable_name; + if (my_rank == 0) { + if (human_readable) { + std::string human_readable_name; - if (which == "MpiSeqTSQR") - human_readable_name = "MPI parallel / cache-blocked TSQR"; - else if (which == "MpiTbbTSQR") - { + if (which == "MpiSeqTSQR") { + human_readable_name = "MPI parallel / cache-blocked TSQR"; + } + else if (which == "MpiTbbTSQR") { #ifdef HAVE_KOKKOSTSQR_TBB - human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR"; + human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR"; #else - throw std::logic_error("TSQR not built with Intel TBB support"); + throw std::logic_error("TSQR not built with Intel TBB support"); #endif // HAVE_KOKKOSTSQR_TBB - } - else - throw std::logic_error("Unknown TSQR implementation type \"" + which + "\""); + } + else { + throw std::logic_error("Unknown TSQR implementation type \"" + which + "\""); + } - cout << human_readable_name << ":" << endl - << "Scalar type: " << scalarTypeName << endl - << "# rows: " << nrows_global << endl - << "# columns: " << ncols << endl - << "# MPI processes: " << nprocs << endl; + cout << human_readable_name << ":" << endl + << "Scalar type: " << scalarTypeName << endl + << "# rows: " << nrows_global << endl + << "# columns: " << ncols << endl + << "# MPI processes: " << nprocs << endl; #ifdef HAVE_KOKKOSTSQR_TBB - if (which == "MpiTbbTSQR") - cout << "# cores per process: " << num_cores << endl; + if (which == "MpiTbbTSQR") + cout << "# cores per process: " << num_cores << endl; #endif // HAVE_KOKKOSTSQR_TBB - cout << "Cache size hint in bytes: " << actual_cache_size_hint << endl - << "contiguous cache blocks? 
" << contiguousCacheBlocks << endl - << "# trials: " << ntrials << endl - << "Min total time (s) over all MPI processes: " - << min_tsqr_timing << endl - << "Max total time (s) over all MPI processes: " - << max_tsqr_timing << endl - << endl; - } - else - { - if (printFieldNames) - { - cout << "%" - << "method" - << ",scalarType" - << ",globalNumRows" - << ",numCols" - << ",numProcs" - << ",numCores" - << ",cacheSizeHint" - << ",contiguousCacheBlocks" - << ",numTrials" - << ",minTiming" - << ",maxTiming" - << endl; - } - cout << which - << "," << scalarTypeName - << "," << nrows_global - << "," << ncols - << "," << nprocs; + cout << "Cache size hint in bytes: " << actual_cache_size_hint << endl + << "contiguous cache blocks? " << contiguousCacheBlocks << endl + << "# trials: " << ntrials << endl + << "Min total time (s) over all MPI processes: " + << min_tsqr_timing << endl + << "Max total time (s) over all MPI processes: " + << max_tsqr_timing << endl + << endl; + } + else { + if (printFieldNames) { + cout << "%" + << "method" + << ",scalarType" + << ",globalNumRows" + << ",numCols" + << ",numProcs" + << ",numCores" + << ",cacheSizeHint" + << ",contiguousCacheBlocks" + << ",numTrials" + << ",minTiming" + << ",maxTiming" + << endl; + } + cout << which + << "," << scalarTypeName + << "," << nrows_global + << "," << ncols + << "," << nprocs; #ifdef HAVE_KOKKOSTSQR_TBB - if (which == "MpiTbbTSQR") - cout << "," << num_cores; - else - cout << ",1"; + if (which == "MpiTbbTSQR") { + cout << "," << num_cores; + } + else { + cout << ",1"; + } #else - cout << ",1"; + cout << ",1"; #endif // HAVE_KOKKOSTSQR_TBB - cout << "," << actual_cache_size_hint - << "," << contiguousCacheBlocks - << "," << ntrials - << "," << min_tsqr_timing - << "," << max_tsqr_timing - << endl; - } + cout << "," << actual_cache_size_hint + << "," << contiguousCacheBlocks + << "," << ntrials + << "," << min_tsqr_timing + << "," << max_tsqr_timing + << endl; } + } } - - } // namespace Test } // 
namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_Util.hpp b/packages/tpetra/tsqr/src/Tsqr_Util.hpp index 9224e4e5bfd8..2063ae9ddf5f 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Util.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Util.hpp @@ -159,7 +159,6 @@ namespace TSQR { } } - template< class Ordinal, class Scalar, class Generator > void generate_matrix (const Ordinal nrows, diff --git a/packages/tpetra/tsqr/src/Tsqr_generateStack.hpp b/packages/tpetra/tsqr/src/Tsqr_generateStack.hpp index 74ba0846d08c..51d97b9a4d69 100644 --- a/packages/tpetra/tsqr/src/Tsqr_generateStack.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_generateStack.hpp @@ -79,15 +79,15 @@ namespace TSQR { const int numProcs, const Ordinal numCols) { - typedef MatView mat_view_type; + using mat_view_type = MatView; TSQR::Random::MatrixGenerator matGen (generator); const Ordinal numRows = numProcs * numCols; A_global.reshape (numRows, numCols); - A_global.fill (Scalar {}); + deep_copy (A_global, Scalar {}); for (int p = 0; p < numProcs; ++p) { - Scalar* const curptr = A_global.data() + p*numCols; + auto* const curptr = A_global.data() + p*numCols; mat_view_type R_cur (numCols, numCols, curptr, numRows); matGen.fill_random_R (numCols, R_cur.data(), numRows, singularValues); } From 98bd27407a3c60956fe32eb48b4cd451dffc66a6 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 25 Nov 2019 18:08:46 -0700 Subject: [PATCH 15/50] TSQR::*Mat{View,rix}: Replace lda() with stride(r), as in Kokkos --- .../tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp | 10 +- .../tsqr/src/TbbTsqr_CacheBlockTask.hpp | 2 +- .../tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp | 5 +- .../tpetra/tsqr/src/TbbTsqr_FactorTask.hpp | 28 +- .../tsqr/src/TbbTsqr_FillWithZerosTask.hpp | 2 +- .../tpetra/tsqr/src/TbbTsqr_Partitioner.hpp | 4 +- .../tsqr/src/TbbTsqr_RevealRankTask.hpp | 81 +++--- .../tsqr/src/TbbTsqr_TbbParallelTsqr.hpp | 27 +- .../tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp | 31 +-- .../tsqr/src/TbbTsqr_UnCacheBlockTask.hpp | 2 +- 
packages/tpetra/tsqr/src/Tsqr.hpp | 16 +- .../tpetra/tsqr/src/Tsqr_CacheBlocker.hpp | 73 +++--- .../tsqr/src/Tsqr_CombineBenchmarker.hpp | 235 ++++++++--------- .../tpetra/tsqr/src/Tsqr_CombineDefault.hpp | 46 ++-- packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 110 ++++---- packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp | 7 +- packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp | 239 +++++++++--------- .../tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp | 28 +- .../tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp | 45 ++-- .../tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp | 52 ++-- packages/tpetra/tsqr/src/Tsqr_MatView.hpp | 83 +++--- packages/tpetra/tsqr/src/Tsqr_Matrix.hpp | 28 +- packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp | 12 +- packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp | 28 +- packages/tpetra/tsqr/src/Tsqr_ParTest.hpp | 16 +- packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp | 11 +- .../tsqr/src/Tsqr_Random_GlobalMatrix.hpp | 20 +- .../tsqr/src/Tsqr_Random_MatrixGenerator.hpp | 14 +- packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp | 40 ++- .../tsqr/src/Tsqr_SequentialCholeskyQR.hpp | 26 +- .../tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp | 26 +- packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp | 38 +-- packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp | 98 ++++--- .../tsqr/src/Tsqr_printGlobalMatrix.hpp | 4 +- 34 files changed, 746 insertions(+), 741 deletions(-) diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp index c90d5ca63188..4edacbee0255 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp @@ -192,8 +192,8 @@ namespace TSQR { timer.start(); const std::vector& seq_outputs = factor_output_.first; seq_.apply (apply_type_, Q_.extent(0), Q_.extent(1), - Q_.data(), Q_.lda(), seq_outputs[P_first_], - C_.extent(1), C_.data(), C_.lda(), + Q_.data(), Q_.stride(1), seq_outputs[P_first_], + C_.extent(1), C_.data(), C_.stride(1), contiguous_cache_blocks_); my_seq_timing_ = timer.stop(); } @@ -214,9 +214,9 @@ 
namespace TSQR { std::vector work (C_top.extent(1)); combine_.apply_pair (apply_type_, C_top.extent(1), Q_bot.extent(1), - Q_bot.data(), Q_bot.lda(), tau.data(), - C_top.data(), C_top.lda(), - C_bot.data(), C_bot.lda(), work.data()); + Q_bot.data(), Q_bot.stride(1), tau.data(), + C_top.data(), C_top.stride(1), + C_bot.data(), C_bot.stride(1), work.data()); } }; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp index 2aeda840c12a..a70ae206da55 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp @@ -135,7 +135,7 @@ namespace TSQR { execute_base_case () { seq_.cache_block (A_out_.extent(0), A_out_.extent(1), - A_out_.data(), A_in_.data(), A_in_.lda()); + A_out_.data(), A_in_.data(), A_in_.stride(1)); } }; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp index e01f4c467a6a..0a93f2723ac5 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp @@ -122,8 +122,9 @@ namespace TSQR { execute_base_case () { // Fill my partition with zeros. - seq_.fill_with_zeros (Q_out_.extent(0), Q_out_.extent(1), Q_out_.data(), - Q_out_.lda(), contiguous_cache_blocks_); + seq_.fill_with_zeros (Q_out_.extent(0), Q_out_.extent(1), + Q_out_.data(), Q_out_.stride(1), + contiguous_cache_blocks_); // If our partition is the first (topmost), fill it with // the first Q_out.extent(1) columns of the identity matrix. 
if (P_first_ == 0) { diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp index 8b27cd2c39da..8072f55ab4aa 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp @@ -189,23 +189,25 @@ namespace TSQR { mat_view_type& A_bot) { const char thePrefix[] = "TSQR::TBB::Factor::factor_pair: "; - TEUCHOS_TEST_FOR_EXCEPTION(P_top == P_bot, std::logic_error, - thePrefix << "Should never get here! P_top == P_bot (= " - << P_top << "), that is, the indices of the thread " - "partitions are the same."); + TEUCHOS_TEST_FOR_EXCEPTION + (P_top == P_bot, std::logic_error, thePrefix << "Should " + "never get here! P_top == P_bot (= " << P_top << "), that " + "is, the indices of the thread partitions are the same."); // We only read and write the upper ncols x ncols triangle of // each block. - TEUCHOS_TEST_FOR_EXCEPTION(A_top.extent(1) != A_bot.extent(1), std::logic_error, - thePrefix << "The top cache block A_top is " - << A_top.extent(0) << " x " << A_top.extent(1) - << ", and the bottom cache block A_bot is " - << A_bot.extent(0) << " x " << A_bot.extent(1) - << "; this means we can't factor [A_top; A_bot]."); + TEUCHOS_TEST_FOR_EXCEPTION + (A_top.extent(1) != A_bot.extent(1), std::logic_error, + thePrefix << "The top cache block A_top is " + << A_top.extent(0) << " x " << A_top.extent(1) + << ", and the bottom cache block A_bot is " + << A_bot.extent(0) << " x " << A_bot.extent(1) + << "; this means we can't factor [A_top; A_bot]."); const LocalOrdinal ncols = A_top.extent(1); std::vector& tau = par_output_[P_bot]; std::vector work (ncols); - combine_.factor_pair (ncols, A_top.data(), A_top.lda(), - A_bot.data(), A_bot.lda(), &tau[0], &work[0]); + combine_.factor_pair (ncols, A_top.data(), A_top.stride(1), + A_bot.data(), A_bot.stride(1), + tau.data(), work.data()); } void @@ -215,7 +217,7 @@ namespace TSQR { timer.start(); seq_outputs_[P_first_] = seq_.factor 
(A_.extent(0), A_.extent(1), A_.data(), - A_.lda(), contiguous_cache_blocks_); + A_.stride(1), contiguous_cache_blocks_); // Assign the topmost cache block of the current partition to // *A_top_ptr_. Every base case invocation does this, so that // we can combine subproblems. The root task also does this, diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp index 1e965b0348a4..8bc0f42264a7 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp @@ -125,7 +125,7 @@ namespace TSQR { { // Fill my partition with zeros. seq_.fill_with_zeros (C_.extent(0), C_.extent(1), C_.data(), - C_.lda(), contiguous_cache_blocks_); + C_.stride(1), contiguous_cache_blocks_); } }; } // namespace TBB diff --git a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp b/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp index e9d2ad9c7896..f37ab6a7a06c 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp @@ -124,8 +124,8 @@ namespace TSQR { } else { pointer_type A_bot_ptr = A.data() + num_rows_top; - MatrixViewType A_top (num_rows_top, ncols, A.data(), A.lda()); - MatrixViewType A_bot (num_rows_bot, ncols, A_bot_ptr, A.lda()); + MatrixViewType A_top (num_rows_top, ncols, A.data(), A.stride(1)); + MatrixViewType A_bot (num_rows_bot, ncols, A_bot_ptr, A.stride(1)); return std::make_pair (A_top, A_bot); } } diff --git a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp index c7c184d865ad..e03ea15f5660 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp @@ -85,55 +85,56 @@ namespace TSQR { // doesn't suggest any orthogonality of the B input matrix, // though in this case B is U and U is orthogonal // (resp. unitary if Scalar is complex). 
- seq_.Q_times_B (Q_.extent(0), Q_.extent(1), Q_.data(), Q_.lda(), - U_.data(), U_.lda(), contiguous_cache_blocks_); + seq_.Q_times_B (Q_.extent(0), Q_.extent(1), + Q_.data(), Q_.stride(1), + U_.data(), U_.stride(1), + contiguous_cache_blocks_); } tbb::task* execute () { using tbb::task; - if (P_first_ > P_last_ || Q_.empty()) - return NULL; // shouldn't get here, but just in case... - else if (P_first_ == P_last_) - { + if (P_first_ > P_last_ || Q_.empty()) { + return nullptr; // shouldn't get here, but just in case... + } + else if (P_first_ == P_last_) { + execute_base_case (); + return nullptr; + } + else { + // Recurse on two intervals: [P_first, P_mid] and + // [P_mid+1, P_last] + const size_t P_mid = (P_first_ + P_last_) / 2; + split_type out_split = + partitioner_.split (Q_, P_first_, P_mid, P_last_, + contiguous_cache_blocks_); + // The partitioner may decide that the current block Q_ has + // too few rows to be worth splitting. In that case, + // out_split.second (the bottom block) will be empty. We + // can deal with this by treating it as the base case. + if (out_split.second.empty() || out_split.second.extent(0) == 0) { execute_base_case (); - return NULL; + return nullptr; } - else - { - // Recurse on two intervals: [P_first, P_mid] and - // [P_mid+1, P_last] - const size_t P_mid = (P_first_ + P_last_) / 2; - split_type out_split = - partitioner_.split (Q_, P_first_, P_mid, P_last_, - contiguous_cache_blocks_); - // The partitioner may decide that the current block Q_ - // has too few rows to be worth splitting. In that case, - // out_split.second (the bottom block) will be empty. We - // can deal with this by treating it as the base case. 
- if (out_split.second.empty() || out_split.second.extent(0) == 0) { - execute_base_case (); - return nullptr; - } - // "c": continuation task - tbb::empty_task& c = - *new( allocate_continuation() ) tbb::empty_task; - // Recurse on the split - RevealRankTask& topTask = *new( c.allocate_child() ) - RevealRankTask (P_first_, P_mid, out_split.first, U_, - seq_, contiguous_cache_blocks_); - RevealRankTask& botTask = *new( c.allocate_child() ) - RevealRankTask (P_mid+1, P_last_, out_split.second, U_, - seq_, contiguous_cache_blocks_); - // Set reference count of parent (in this case, the - // continuation task) to 2 (since 2 children -- no - // additional task since no waiting). - c.set_ref_count (2); - c.spawn (botTask); - return &topTask; // scheduler bypass optimization - } + // "c": continuation task + tbb::empty_task& c = + *new( allocate_continuation() ) tbb::empty_task; + // Recurse on the split + RevealRankTask& topTask = *new( c.allocate_child() ) + RevealRankTask (P_first_, P_mid, out_split.first, U_, + seq_, contiguous_cache_blocks_); + RevealRankTask& botTask = *new( c.allocate_child() ) + RevealRankTask (P_mid+1, P_last_, out_split.second, U_, + seq_, contiguous_cache_blocks_); + // Set reference count of parent (in this case, the + // continuation task) to 2 (since 2 children -- no + // additional task since no waiting). + c.set_ref_count (2); + c.spawn (botTask); + return &topTask; // scheduler bypass optimization + } } private: diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp index 2fa287765a9c..db8a354f09ac 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp @@ -327,7 +327,7 @@ namespace TSQR { // Copy the R factor out of A_top into R. 
seq_.extract_R (A_top.extent(0), A_top.extent(1), A_top.data(), - A_top.lda(), R, ldr, contiguous_cache_blocks); + A_top.stride(1), R, ldr, contiguous_cache_blocks); // Save the timings for future reference if (min_seq_timing < min_seq_factor_timing_) @@ -514,15 +514,14 @@ namespace TSQR { const LocalOrdinal rank = reveal_R_rank (ncols, R, ldr, U.data(), U.ldu(), tol); - if (rank < ncols) - { - // If R is not full rank: reveal_R_rank() already computed - // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and - // overwrote R with \f$\Sigma V^*\f$. Now, we compute \f$Q - // := Q \cdot U\f$, respecting cache blocks of Q. - Q_times_B (nrows, ncols, Q, ldq, U.data(), U.lda(), - contiguous_cache_blocks); - } + if (rank < ncols) { + // If R is not full rank: reveal_R_rank() already computed + // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and + // overwrote R with \f$\Sigma V^*\f$. Now, we compute \f$Q + // := Q \cdot U\f$, respecting cache blocks of Q. + Q_times_B (nrows, ncols, Q, ldq, U.data(), U.stride(1), + contiguous_cache_blocks); + } return rank; } @@ -644,9 +643,9 @@ namespace TSQR { mat_view_type C_top = seq_.top_block (C, contiguous_cache_blocks); top_blocks[P_first] = std::make_pair (const_mat_view_type (Q_top.extent(1), Q_top.extent(1), - Q_top.data(), Q_top.lda()), + Q_top.data(), Q_top.stride(1)), mat_view_type (C_top.extent(1), C_top.extent(1), - C_top.data(), C_top.lda())); + C_top.data(), C_top.stride(1))); } else { // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] @@ -670,9 +669,9 @@ namespace TSQR { mat_view_type C_top = seq_.top_block (C, contiguous_cache_blocks); top_blocks[P_first] = std::make_pair (const_mat_view_type (Q_top.extent(1), Q_top.extent(1), - Q_top.data(), Q_top.lda()), + Q_top.data(), Q_top.stride(1)), mat_view_type (C_top.extent(1), C_top.extent(1), - C_top.data(), C_top.lda())); + C_top.data(), C_top.stride(1))); } else { build_partition_array (P_first, P_mid, top_blocks, diff --git 
a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp index 1f3a4ac252a7..5c1d584628d5 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp @@ -110,7 +110,7 @@ namespace TSQR { } else if (P_first == P_last) { std::pair results = - seq_.factor (A.extent(0), A.extent(1), A.data(), A.lda(), + seq_.factor (A.extent(0), A.extent(1), A.data(), A.stride(1), contiguous_cache_blocks); seq_outputs[P_first] = results.first; A_top = A; @@ -137,7 +137,7 @@ namespace TSQR { // the topmost partition. if (depth == 0) { seq_.extract_R (A_top.extent(0), A_top.extent(1), A_top.data(), - A_top.lda(), R, ldr, contiguous_cache_blocks); + A_top.stride(1), R, ldr, contiguous_cache_blocks); } return A_top; } @@ -188,8 +188,8 @@ namespace TSQR { const_mat_view Q_top = blocker.top_block (Q, contiguous_cache_blocks); mat_view C_top = blocker.top_block (C, contiguous_cache_blocks); top_blocks[P_first] = - std::make_pair (const_mat_view (Q_top.extent(1), Q_top.extent(1), Q_top.data(), Q_top.lda()), - mat_view (C_top.extent(1), C_top.extent(1), C_top.data(), C_top.lda())); + std::make_pair (const_mat_view (Q_top.extent(1), Q_top.extent(1), Q_top.data(), Q_top.stride(1)), + mat_view (C_top.extent(1), C_top.extent(1), C_top.data(), C_top.stride(1))); } else { @@ -227,9 +227,9 @@ namespace TSQR { else if (P_first == P_last) { const std::vector< SeqOutput >& seq_outputs = factor_output.first; - seq_.apply ("N", Q.extent(0), Q.extent(1), Q.data(), Q.lda(), + seq_.apply ("N", Q.extent(0), Q.extent(1), Q.data(), Q.stride(1), seq_outputs[P_first], C.extent(1), C.data(), - C.lda(), contiguous_cache_blocks); + C.stride(1), contiguous_cache_blocks); } else { @@ -270,9 +270,9 @@ namespace TSQR { } else if (P_first == P_last) { const std::vector& seq_outputs = factor_output.first; - seq_.apply (op, Q.extent(0), Q.extent(1), Q.data(), Q.lda(), + seq_.apply (op, 
Q.extent(0), Q.extent(1), Q.data(), Q.stride(1), seq_outputs[P_first], C.extent(1), C.data(), - C.lda(), contiguous_cache_blocks); + C.stride(1), contiguous_cache_blocks); return std::make_pair (Q, C); } else { @@ -327,8 +327,8 @@ namespace TSQR { std::vector< Scalar > work (ncols); TSQR::Combine< LocalOrdinal, Scalar > combine_; - combine_.factor_pair (ncols, A_top.data(), A_top.lda(), - A_bot.data(), A_bot.lda(), &tau[0], &work[0]); + combine_.factor_pair (ncols, A_top.data(), A_top.stride(1), + A_bot.data(), A_bot.stride(1), &tau[0], &work[0]); } template< class LocalOrdinal, class Scalar > @@ -351,9 +351,9 @@ namespace TSQR { TSQR::Combine combine_; combine_.apply_pair (trans.c_str(), C_top.extent(1), Q_bot.extent(1), - Q_bot.data(), Q_bot.lda(), &tau[0], - C_top.data(), C_top.lda(), - C_bot.data(), C_bot.lda(), &work[0]); + Q_bot.data(), Q_bot.stride(1), &tau[0], + C_top.data(), C_top.stride(1), + C_bot.data(), C_bot.stride(1), &work[0]); } template< class LocalOrdinal, class Scalar > @@ -368,7 +368,7 @@ namespace TSQR { return; else if (P_first == P_last) seq_.cache_block (A_out.extent(0), A_out.extent(1), A_out.data(), - A_in.data(), A_in.lda()); + A_in.data(), A_in.stride(1)); else { const size_t P_mid = (P_first + P_last) / 2; @@ -396,7 +396,8 @@ namespace TSQR { } else if (P_first == P_last) { seq_.un_cache_block (A_out.extent(0), A_out.extent(1), - A_out.data(), A_out.lda(), A_in.data()); + A_out.data(), A_out.stride(1), + A_in.data()); } else { const size_t P_mid = (P_first + P_last) / 2; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp index 55ae23b63e76..dffc07743d5c 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp @@ -133,7 +133,7 @@ namespace TSQR { execute_base_case () { seq_.un_cache_block (A_out_.extent(0), A_out_.extent(1), - A_out_.data(), A_out_.lda(), + A_out_.data(), A_out_.stride(1), 
A_in_.data()); } }; diff --git a/packages/tpetra/tsqr/src/Tsqr.hpp b/packages/tpetra/tsqr/src/Tsqr.hpp index 1e8282c98c7d..7b76a8b74fa0 100644 --- a/packages/tpetra/tsqr/src/Tsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr.hpp @@ -284,7 +284,7 @@ namespace TSQR { // part of the QR factorization. { mat_view_type Q_top (numCols, numCols, Q_top_block.data(), - Q_top_block.lda()); + Q_top_block.stride(1)); mat_view_type R_view (numCols, numCols, R, LDR); distTsqr_->factorExplicit (R_view, Q_top, forceNonnegativeDiagonal); } @@ -374,7 +374,7 @@ namespace TSQR { // part of the QR factorization. { mat_view_type Q_top (numCols, numCols, Q_top_block.data(), - Q_top_block.lda()); + Q_top_block.stride(1)); mat_view_type R_view (numCols, numCols, R, LDR); distTsqr_->factorExplicit (R_view, Q_top, forceNonnegativeDiagonal); } @@ -453,7 +453,7 @@ namespace TSQR { deep_copy (R_view, Scalar {}); NodeOutput nodeResults = nodeTsqr_->factor (nrows_local, ncols, A_local, lda_local, - R_view.data(), R_view.lda(), + R_view.data(), R_view.stride(1), contiguousCacheBlocks); DistOutput distResults = distTsqr_->factor (R_view); return std::make_pair (nodeResults, distResults); @@ -531,7 +531,7 @@ namespace TSQR { // View of the topmost ncols_C by ncols_C block of C. mat_view_type C_top_view (ncols_C, ncols_C, C_view_top_block.data(), - C_view_top_block.lda()); + C_view_top_block.stride(1)); if (! transposed) { // C_top (small compact storage) gets a deep copy of the top @@ -540,7 +540,7 @@ namespace TSQR { // Compute in place on all processors' C_top blocks. distTsqr_->apply (applyType, C_top.extent(1), ncols_Q, C_top.data(), - C_top.lda(), factor_output.second); + C_top.stride(1), factor_output.second); // Copy the result from C_top back into the top ncols_C by // ncols_C block of C_local. @@ -567,7 +567,7 @@ namespace TSQR { // Compute in place on all processors' C_top blocks. 
distTsqr_->apply (applyType, ncols_C, ncols_Q, C_top.data(), - C_top.lda(), factor_output.second); + C_top.stride(1), factor_output.second); // Copy the result from C_top back into the top ncols_C by // ncols_C block of C_local. @@ -764,13 +764,13 @@ namespace TSQR { // matrix_type U (ncols, ncols, STS::zero()); const ordinal_type rank = - reveal_R_rank (ncols, R, ldr, U.data(), U.lda(), tol); + reveal_R_rank (ncols, R, ldr, U.data(), U.stride(1), tol); if (rank < ncols) { // If R is not full rank: reveal_R_rank() already computed // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and // overwrote R with \f$\Sigma V^*\f$. Now, we compute \f$Q // := Q \cdot U\f$, respecting cache blocks of Q. - Q_times_B (nrows, ncols, Q, ldq, U.data(), U.lda(), + Q_times_B (nrows, ncols, Q, ldq, U.data(), U.stride(1), contiguousCacheBlocks); } return rank; diff --git a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp index 9cbb86d28fd1..b0087ef96193 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp @@ -242,17 +242,16 @@ namespace TSQR { fill_with_zeros (MatrixViewType A, const bool contiguous_cache_blocks) const { - // Note: if the cache blocks are stored contiguously, A.lda() + // Note: if the cache blocks are stored contiguously, A.stride(1) // won't be the correct leading dimension of A, but it won't // matter: we only ever operate on A_cur here, and A_cur's // leading dimension is set correctly by split_top_block(). - while (! A.empty()) - { - // This call modifies the matrix view A, but that's OK since - // we passed the input view by copy, not by reference. - MatrixViewType A_cur = split_top_block (A, contiguous_cache_blocks); - deep_copy (A_cur, Scalar {}); - } + while (! A.empty()) { + // This call modifies the matrix view A, but that's OK since + // we passed the input view by copy, not by reference. 
+ MatrixViewType A_cur = split_top_block (A, contiguous_cache_blocks); + deep_copy (A_cur, Scalar {}); + } } /// \brief Fill the matrix A with zeros, respecting cache blocks. @@ -328,20 +327,21 @@ namespace TSQR { // Leading dimension doesn't matter since A_out will be cache blocked. mat_view_type A_out_rest (num_rows, num_cols, A_out, lda_in); - while (! A_in_rest.empty()) - { - if (A_out_rest.empty()) - throw std::logic_error("A_out_rest is empty, but A_in_rest is not"); + while (! A_in_rest.empty()) { + if (A_out_rest.empty()) { + throw std::logic_error("A_out_rest is empty, but A_in_rest is not"); + } - // This call modifies A_in_rest. - const_mat_view_type A_in_cur = split_top_block (A_in_rest, false); + // This call modifies A_in_rest. + const_mat_view_type A_in_cur = split_top_block (A_in_rest, false); - // This call modifies A_out_rest. - mat_view_type A_out_cur = split_top_block (A_out_rest, true); + // This call modifies A_out_rest. + mat_view_type A_out_cur = split_top_block (A_out_rest, true); - copy_matrix (A_in_cur.extent(0), num_cols, A_out_cur.data(), - A_out_cur.lda(), A_in_cur.data(), A_in_cur.lda()); - } + copy_matrix (A_in_cur.extent(0), num_cols, + A_out_cur.data(), A_out_cur.stride(1), + A_in_cur.data(), A_in_cur.stride(1)); + } } //! "Un"-cache-block the given A_in matrix into A_out. @@ -361,20 +361,21 @@ namespace TSQR { const_mat_view_type A_in_rest (num_rows, num_cols, A_in, lda_out); mat_view_type A_out_rest (num_rows, num_cols, A_out, lda_out); - while (! A_in_rest.empty()) - { - if (A_out_rest.empty()) - throw std::logic_error("A_out_rest is empty, but A_in_rest is not"); + while (! A_in_rest.empty()) { + if (A_out_rest.empty()) { + throw std::logic_error("A_out_rest is empty, but A_in_rest is not"); + } - // This call modifies A_in_rest. - const_mat_view_type A_in_cur = split_top_block (A_in_rest, true); + // This call modifies A_in_rest. 
+ const_mat_view_type A_in_cur = split_top_block (A_in_rest, true); - // This call modifies A_out_rest. - mat_view_type A_out_cur = split_top_block (A_out_rest, false); + // This call modifies A_out_rest. + mat_view_type A_out_cur = split_top_block (A_out_rest, false); - copy_matrix (A_in_cur.extent(0), num_cols, A_out_cur.data(), - A_out_cur.lda(), A_in_cur.data(), A_in_cur.lda()); - } + copy_matrix (A_in_cur.extent(0), num_cols, + A_out_cur.data(), A_out_cur.stride(1), + A_in_cur.data(), A_in_cur.stride(1)); + } } /// \brief Return the cache block with index \c cache_block_index. @@ -411,18 +412,20 @@ namespace TSQR { // result[2] = pointer offset (A.data() + result[2]) // result[3] = leading dimension (a.k.a. stride) of the cache block std::vector result = - strategy_.cache_block_details (cache_block_index, A.extent(0), A.extent(1), - A.lda(), nrows_cache_block(), + strategy_.cache_block_details (cache_block_index, A.extent(0), + A.extent(1), A.stride(1), + nrows_cache_block(), contiguous_cache_blocks); - if (result[1] == 0) + if (result[1] == 0) { // For some reason, the cache block is empty. - return MatrixViewType (0, 0, NULL, 0); + return MatrixViewType (0, 0, nullptr, 0); + } // We expect that ordinal_type is signed, so adding signed // (ordinal_type) to unsigned (pointer) may raise compiler // warnings. 
return MatrixViewType (result[1], A.extent(1), - A.data() + static_cast(result[2]), + A.data() + size_t(result[2]), result[3]); } diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp index 52aa6203fdac..27ebc62b08be 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp @@ -293,7 +293,8 @@ namespace TSQR { matrix_type A (numRows, numCols); std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_svd (numRows, numCols, A.data(), A.lda(), sigmas.data()); + matGen.fill_random_svd (numRows, numCols, A.data(), + A.stride(1), sigmas.data()); // A place to put the Q factor. matrix_type Q (numRows, numCols); @@ -314,11 +315,11 @@ namespace TSQR { const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_first (numRows, numCols, A.data(), A.lda(), + combiner.factor_first (numRows, numCols, A.data(), A.stride(1), tau.data(), work.data()); combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.lda(), tau.data(), - Q.data(), Q.lda(), work.data()); + A.data(), A.stride(1), tau.data(), + Q.data(), Q.stride(1), work.data()); } // How much time numTrials runs must take in order for @@ -344,11 +345,11 @@ namespace TSQR { timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_first (numRows, numCols, A.data(), A.lda(), + combiner.factor_first (numRows, numCols, A.data(), A.stride(1), tau.data(), work.data()); combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.lda(), tau.data(), - Q.data(), Q.lda(), work.data()); + A.data(), A.stride(1), tau.data(), + Q.data(), Q.stride(1), work.data()); } theTime = timer.stop(); } while (theTime < minAcceptableTime && numTrials < maxNumTrials); @@ -377,10 +378,11 @@ namespace TSQR { const Ordinal numCols, const int numTrials) { - if (numRows == 0 
|| numCols == 0) + if (numRows == 0 || numCols == 0) { throw std::invalid_argument("Benchmarking does not make sense for " "a matrix with either zero rows or zero " "columns."); + } TEUCHOS_TEST_FOR_EXCEPTION(numTrials < 1, std::invalid_argument, "The number of trials must be positive, but " "numTrials = " << numTrials << "."); @@ -392,7 +394,8 @@ namespace TSQR { matrix_type A (numRows, numCols); std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_svd (numRows, numCols, A.data(), A.lda(), sigmas.data()); + matGen.fill_random_svd (numRows, numCols, A.data(), + A.stride(1), sigmas.data()); // A place to put the Q factor. matrix_type Q (numRows, numCols); @@ -410,27 +413,25 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. const int numWarmupRuns = 3; - for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) - { - combiner.factor_first (numRows, numCols, A.data(), A.lda(), - tau.data(), work.data()); - combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.lda(), tau.data(), - Q.data(), Q.lda(), work.data()); - } + for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { + combiner.factor_first (numRows, numCols, A.data(), A.stride(1), + tau.data(), work.data()); + combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, + A.data(), A.stride(1), tau.data(), + Q.data(), Q.stride(1), work.data()); + } // // The actual timing runs. 
// timer_type timer ("Combine first"); timer.start(); - for (int trial = 0; trial < numTrials; ++trial) - { - combiner.factor_first (numRows, numCols, A.data(), A.lda(), - tau.data(), work.data()); - combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.lda(), tau.data(), - Q.data(), Q.lda(), work.data()); - } + for (int trial = 0; trial < numTrials; ++trial) { + combiner.factor_first (numRows, numCols, A.data(), A.stride(1), + tau.data(), work.data()); + combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, + A.data(), A.stride(1), tau.data(), + Q.data(), Q.stride(1), work.data()); + } return timer.stop(); } @@ -478,12 +479,14 @@ namespace TSQR { matrix_type R (numCols, numCols); std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R.data(), R.lda(), sigmas.data()); + matGen.fill_random_R (numCols, R.data(), + R.stride(1), sigmas.data()); // Now generate a random cache block. matrix_type A (numRows, numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_svd (numRows, numCols, A.data(), A.lda(), sigmas.data()); + matGen.fill_random_svd (numRows, numCols, A.data(), + A.stride(1), sigmas.data()); // A place to put the Q factor. matrix_type Q (numRows + numCols, numCols); @@ -501,16 +504,15 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. 
const int numWarmupRuns = 3; - for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) - { - combiner.factor_inner (numRows, numCols, R.data(), R.lda(), - A.data(), A.lda(), tau.data(), work.data()); - combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.lda(), tau.data(), - &Q(0, 0), Q.lda(), - &Q(numCols, 0), Q.lda(), - work.data()); - } + for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { + combiner.factor_inner (numRows, numCols, R.data(), R.stride(1), + A.data(), A.stride(1), tau.data(), work.data()); + combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, + A.data(), A.stride(1), tau.data(), + &Q(0, 0), Q.stride(1), + &Q(numCols, 0), Q.stride(1), + work.data()); + } // How much time numTrials runs must take in order for // numTrials to be considered sufficiently large. @@ -533,16 +535,15 @@ namespace TSQR { do { numTrials *= 2; // First value of numTrials is 4. timer.start(); - for (int trial = 0; trial < numTrials; ++trial) - { - combiner.factor_inner (numRows, numCols, R.data(), R.lda(), - A.data(), A.lda(), tau.data(), work.data()); - combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.lda(), tau.data(), - &Q(0, 0), Q.lda(), - &Q(numCols, 0), Q.lda(), - work.data()); - } + for (int trial = 0; trial < numTrials; ++trial) { + combiner.factor_inner (numRows, numCols, R.data(), R.stride(1), + A.data(), A.stride(1), tau.data(), work.data()); + combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, + A.data(), A.stride(1), tau.data(), + &Q(0, 0), Q.stride(1), + &Q(numCols, 0), Q.stride(1), + work.data()); + } theTime = timer.stop(); } while (theTime < minAcceptableTime && numTrials < maxNumTrials); @@ -588,12 +589,12 @@ namespace TSQR { matrix_type R (numCols, numCols); std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R.data(), R.lda(), sigmas.data()); + matGen.fill_random_R (numCols, R.data(), R.stride(1), 
sigmas.data()); // Now generate a random cache block. matrix_type A (numRows, numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_svd (numRows, numCols, A.data(), A.lda(), sigmas.data()); + matGen.fill_random_svd (numRows, numCols, A.data(), A.stride(1), sigmas.data()); // A place to put the Q factor. matrix_type Q (numRows + numCols, numCols); @@ -611,31 +612,35 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. const int numWarmupRuns = 3; - for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) - { - combiner.factor_inner (numRows, numCols, R.data(), R.lda(), - A.data(), A.lda(), tau.data(), work.data()); - combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.lda(), tau.data(), - &Q(0, 0), Q.lda(), - &Q(numCols, 0), Q.lda(), - work.data()); - } + for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { + combiner.factor_inner (numRows, numCols, + R.data(), R.stride(1), + A.data(), A.stride(1), tau.data(), + work.data()); + combiner.apply_inner (ApplyType("N"), + numRows, numCols, numCols, + A.data(), A.stride(1), tau.data(), + &Q(0, 0), Q.stride(1), + &Q(numCols, 0), Q.stride(1), + work.data()); + } // // The actual timing runs. 
// timer_type timer ("Combine cache block"); timer.start(); - for (int trial = 0; trial < numTrials; ++trial) - { - combiner.factor_inner (numRows, numCols, R.data(), R.lda(), - A.data(), A.lda(), tau.data(), work.data()); - combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.lda(), tau.data(), - &Q(0, 0), Q.lda(), - &Q(numCols, 0), Q.lda(), - work.data()); - } + for (int trial = 0; trial < numTrials; ++trial) { + combiner.factor_inner (numRows, numCols, + R.data(), R.stride(1), + A.data(), A.stride(1), + tau.data(), work.data()); + combiner.apply_inner (ApplyType("N"), + numRows, numCols, numCols, + A.data(), A.stride(1), tau.data(), + &Q(0, 0), Q.stride(1), + &Q(numCols, 0), Q.stride(1), + work.data()); + } return timer.stop(); } @@ -679,12 +684,12 @@ namespace TSQR { matrix_type R1 (numCols, numCols); std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R1.data(), R1.lda(), sigmas.data()); + matGen.fill_random_R (numCols, R1.data(), R1.stride(1), sigmas.data()); // Now generate R2. matrix_type R2 (numCols, numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R2.data(), R2.lda(), sigmas.data()); + matGen.fill_random_R (numCols, R2.data(), R2.stride(1), sigmas.data()); // A place to put the Q factor of [R1; R2]. matrix_type Q (2*numCols, numCols); @@ -702,17 +707,16 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. 
const int numWarmupRuns = 3; - for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) - { - combiner.factor_pair (numCols, R1.data(), R1.lda(), - R2.data(), R2.lda(), - tau.data(), work.data()); - combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.data(), R2.lda(), tau.data(), - &Q(0, 0), Q.lda(), - &Q(numCols, 0), Q.lda(), - work.data()); - } + for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { + combiner.factor_pair (numCols, R1.data(), R1.stride(1), + R2.data(), R2.stride(1), + tau.data(), work.data()); + combiner.apply_pair (ApplyType("N"), numCols, numCols, + R2.data(), R2.stride(1), tau.data(), + &Q(0, 0), Q.stride(1), + &Q(numCols, 0), Q.stride(1), + work.data()); + } // How much time numTrials runs must take in order for // numTrials to be considered sufficiently large. @@ -735,17 +739,16 @@ namespace TSQR { do { numTrials *= 2; // First value of numTrials is 4. timer.start(); - for (int trial = 0; trial < numTrials; ++trial) - { - combiner.factor_pair (numCols, R1.data(), R1.lda(), - R2.data(), R2.lda(), - tau.data(), work.data()); - combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.data(), R2.lda(), tau.data(), - &Q(0, 0), Q.lda(), - &Q(numCols, 0), Q.lda(), - work.data()); - } + for (int trial = 0; trial < numTrials; ++trial) { + combiner.factor_pair (numCols, R1.data(), R1.stride(1), + R2.data(), R2.stride(1), + tau.data(), work.data()); + combiner.apply_pair (ApplyType("N"), numCols, numCols, + R2.data(), R2.stride(1), tau.data(), + &Q(0, 0), Q.stride(1), + &Q(numCols, 0), Q.stride(1), + work.data()); + } theTime = timer.stop(); } while (theTime < minAcceptableTime && numTrials < maxNumTrials); @@ -787,12 +790,12 @@ namespace TSQR { matrix_type R1 (numCols, numCols); std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R1.data(), R1.lda(), sigmas.data()); + matGen.fill_random_R (numCols, R1.data(), R1.stride(1), sigmas.data()); // Now generate R2. 
matrix_type R2 (numCols, numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R2.data(), R2.lda(), sigmas.data()); + matGen.fill_random_R (numCols, R2.data(), R2.stride(1), sigmas.data()); // A place to put the Q factor of [R1; R2]. matrix_type Q (2*numCols, numCols); @@ -810,33 +813,31 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. const int numWarmupRuns = 3; - for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) - { - combiner.factor_pair (numCols, R1.data(), R1.lda(), - R2.data(), R2.lda(), - tau.data(), work.data()); - combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.data(), R2.lda(), tau.data(), - &Q(0, 0), Q.lda(), - &Q(numCols, 0), Q.lda(), - work.data()); - } + for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { + combiner.factor_pair (numCols, R1.data(), R1.stride(1), + R2.data(), R2.stride(1), + tau.data(), work.data()); + combiner.apply_pair (ApplyType("N"), numCols, numCols, + R2.data(), R2.stride(1), tau.data(), + &Q(0, 0), Q.stride(1), + &Q(numCols, 0), Q.stride(1), + work.data()); + } // // The actual timing runs. 
// timer_type timer ("Combine pair"); timer.start(); - for (int trial = 0; trial < numTrials; ++trial) - { - combiner.factor_pair (numCols, R1.data(), R1.lda(), - R2.data(), R2.lda(), - tau.data(), work.data()); - combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.data(), R2.lda(), tau.data(), - &Q(0, 0), Q.lda(), - &Q(numCols, 0), Q.lda(), - work.data()); - } + for (int trial = 0; trial < numTrials; ++trial) { + combiner.factor_pair (numCols, R1.data(), R1.stride(1), + R2.data(), R2.stride(1), + tau.data(), work.data()); + combiner.apply_pair (ApplyType("N"), numCols, numCols, + R2.data(), R2.stride(1), tau.data(), + &Q(0, 0), Q.stride(1), + &Q(numCols, 0), Q.stride(1), + work.data()); + } return timer.stop(); } diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp index 63c618c05746..f7e9242ac921 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp @@ -137,13 +137,13 @@ namespace TSQR { A_buf_.reshape (numRows, ncols_Q); deep_copy (A_buf_, Scalar {}); const_mat_view_type A_bot (m, ncols_Q, A, lda); - mat_view_type A_buf_bot (m, ncols_Q, &A_buf_(ncols_Q, 0), A_buf_.lda()); + mat_view_type A_buf_bot (m, ncols_Q, &A_buf_(ncols_Q, 0), A_buf_.stride(1)); deep_copy (A_buf_bot, A_bot); C_buf_.reshape (numRows, ncols_C); deep_copy (C_buf_, Scalar {}); - mat_view_type C_buf_top (ncols_Q, ncols_C, &C_buf_(0, 0), C_buf_.lda()); - mat_view_type C_buf_bot (m, ncols_C, &C_buf_(ncols_Q, 0), C_buf_.lda()); + mat_view_type C_buf_top (ncols_Q, ncols_C, &C_buf_(0, 0), C_buf_.stride(1)); + mat_view_type C_buf_bot (m, ncols_C, &C_buf_(ncols_Q, 0), C_buf_.stride(1)); mat_view_type C_top_view (ncols_Q, ncols_C, C_top, ldc_top); mat_view_type C_bot_view (m, ncols_C, C_bot, ldc_bot); deep_copy (C_buf_top, C_top_view); @@ -152,8 +152,8 @@ namespace TSQR { const std::string trans = apply_type.toString (); const int lwork = ncols_C; lapack_.apply_Q_factor ('L', 
trans[0], numRows, ncols_C, ncols_Q, - A_buf_.data(), A_buf_.lda(), tau, - C_buf_.data(), C_buf_.lda(), + A_buf_.data(), A_buf_.stride(1), tau, + C_buf_.data(), C_buf_.stride(1), work, lwork); // Copy back the results. deep_copy (C_top_view, C_buf_top); @@ -178,17 +178,17 @@ namespace TSQR { // we only want to include the upper triangle in the // factorization. Thus, only copy the upper triangle of R into // the appropriate place in the buffer. - copy_upper_triangle (n, n, &A_buf_(0, 0), A_buf_.lda(), R, ldr); - copy_matrix (m, n, &A_buf_(n, 0), A_buf_.lda(), A, lda); + copy_upper_triangle (n, n, &A_buf_(0, 0), A_buf_.stride(1), R, ldr); + copy_matrix (m, n, &A_buf_(n, 0), A_buf_.stride(1), A, lda); const int lwork = n; - lapack_.compute_QR (numRows, n, A_buf_.data(), A_buf_.lda(), + lapack_.compute_QR (numRows, n, A_buf_.data(), A_buf_.stride(1), tau, work, lwork); // Copy back the results. R might be a view of the upper // triangle of a cache block, so only copy into the upper // triangle of R. - copy_upper_triangle (n, n, R, ldr, &A_buf_(0, 0), A_buf_.lda()); - copy_matrix (m, n, A, lda, &A_buf_(n, 0), A_buf_.lda()); + copy_upper_triangle (n, n, R, ldr, &A_buf_(0, 0), A_buf_.stride(1)); + copy_matrix (m, n, A, lda, &A_buf_(n, 0), A_buf_.stride(1)); } void @@ -209,18 +209,18 @@ namespace TSQR { // views of some cache block (where the strict lower triangle // contains things we don't want to include in the // factorization). - copy_upper_triangle (n, n, &A_buf_(0, 0), A_buf_.lda(), R_top, ldr_top); - copy_upper_triangle (n, n, &A_buf_(n, 0), A_buf_.lda(), R_bot, ldr_bot); + copy_upper_triangle (n, n, &A_buf_(0, 0), A_buf_.stride(1), R_top, ldr_top); + copy_upper_triangle (n, n, &A_buf_(n, 0), A_buf_.stride(1), R_bot, ldr_bot); const int lwork = n; - lapack_.compute_QR (numRows, n, A_buf_.data(), A_buf_.lda(), + lapack_.compute_QR (numRows, n, A_buf_.data(), A_buf_.stride(1), tau, work, lwork); // Copy back the results. 
Only read the upper triangles of the // two n by n row blocks of A_buf_ (this means we don't have to // zero out the strict lower triangles), and only touch the // upper triangles of R_top and R_bot. - copy_upper_triangle (n, n, R_top, ldr_top, &A_buf_(0, 0), A_buf_.lda()); - copy_upper_triangle (n, n, R_bot, ldr_bot, &A_buf_(n, 0), A_buf_.lda()); + copy_upper_triangle (n, n, R_top, ldr_top, &A_buf_(0, 0), A_buf_.stride(1)); + copy_upper_triangle (n, n, R_bot, ldr_bot, &A_buf_(n, 0), A_buf_.stride(1)); } void @@ -241,21 +241,23 @@ namespace TSQR { A_buf_.reshape (numRows, ncols_Q); deep_copy (A_buf_, Scalar {}); copy_upper_triangle (ncols_Q, ncols_Q, - &A_buf_(ncols_Q, 0), A_buf_.lda(), + &A_buf_(ncols_Q, 0), A_buf_.stride(1), R_bot, ldr_bot); C_buf_.reshape (numRows, ncols_C); - copy_matrix (ncols_Q, ncols_C, &C_buf_(0, 0), C_buf_.lda(), C_top, ldc_top); - copy_matrix (ncols_Q, ncols_C, &C_buf_(ncols_Q, 0), C_buf_.lda(), C_bot, ldc_bot); + copy_matrix (ncols_Q, ncols_C, &C_buf_(0, 0), C_buf_.stride(1), C_top, ldc_top); + copy_matrix (ncols_Q, ncols_C, &C_buf_(ncols_Q, 0), C_buf_.stride(1), C_bot, ldc_bot); const int lwork = ncols_Q; const std::string trans = apply_type.toString (); lapack_.apply_Q_factor ('L', trans[0], numRows, ncols_C, ncols_Q, - A_buf_.data(), A_buf_.lda(), tau, - C_buf_.data(), C_buf_.lda(), + A_buf_.data(), A_buf_.stride(1), tau, + C_buf_.data(), C_buf_.stride(1), work, lwork); // Copy back the results. 
- copy_matrix (ncols_Q, ncols_C, C_top, ldc_top, &C_buf_(0, 0), C_buf_.lda()); - copy_matrix (ncols_Q, ncols_C, C_bot, ldc_bot, &C_buf_(ncols_Q, 0), C_buf_.lda()); + copy_matrix (ncols_Q, ncols_C, C_top, ldc_top, + &C_buf_(0, 0), C_buf_.stride(1)); + copy_matrix (ncols_Q, ncols_C, C_bot, ldc_bot, + &C_buf_(ncols_Q, 0), C_buf_.stride(1)); } private: diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index 75378cf9cd40..2111ab4d6d16 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -205,7 +205,7 @@ namespace TSQR { printMatrix (std::ostream& out, const MatrixViewType& A) { - print_local_matrix (out, A.extent(0), A.extent(1), A.data(), A.lda()); + print_local_matrix (out, A.extent(0), A.extent(1), A.data(), A.stride(1)); } template @@ -215,8 +215,8 @@ namespace TSQR { const MatrixViewType& Q, const MatrixViewType& R) { - return local_verify (A.extent(0), A.extent(1), A.data(), A.lda(), - Q.data(), Q.lda(), R.data(), R.lda()); + return local_verify (A.extent(0), A.extent(1), A.data(), A.stride(1), + Q.data(), Q.stride(1), R.data(), R.stride(1)); } /// \brief Test accuracy of TSQR::Combine @@ -282,21 +282,21 @@ namespace TSQR { matrix_type R3 (numCols, numCols, Scalar(0)); matrix_type A (numRows, numCols, Scalar(0)); matgen_type matgen (gen); - matgen.fill_random_R (numCols, R1.data(), R1.lda(), &sigma_R1[0]); - matgen.fill_random_R (numCols, R2.data(), R2.lda(), &sigma_R2[0]); - matgen.fill_random_R (numCols, R3.data(), R3.lda(), &sigma_R3[0]); - matgen.fill_random_svd (numRows, numCols, A.data(), A.lda(), &sigma_A[0]); + matgen.fill_random_R (numCols, R1.data(), R1.stride(1), &sigma_R1[0]); + matgen.fill_random_R (numCols, R2.data(), R2.stride(1), &sigma_R2[0]); + matgen.fill_random_R (numCols, R3.data(), R3.stride(1), &sigma_R3[0]); + matgen.fill_random_svd (numRows, numCols, A.data(), A.stride(1), &sigma_A[0]); if (false && debug) { cerr << endl << 
"First test problem:" << endl; - print_local_matrix (cerr, numCols, numCols, R1.data(), R1.lda()); - print_local_matrix (cerr, numCols, numCols, R2.data(), R2.lda()); + print_local_matrix (cerr, numCols, numCols, R1.data(), R1.stride(1)); + print_local_matrix (cerr, numCols, numCols, R2.data(), R2.stride(1)); cerr << endl; cerr << endl << "Second test problem:" << endl; - print_local_matrix (cerr, numCols, numCols, R3.data(), R3.lda()); - print_local_matrix (cerr, numRows, numCols, A.data(), A.lda()); + print_local_matrix (cerr, numCols, numCols, R3.data(), R3.stride(1)); + print_local_matrix (cerr, numRows, numCols, A.data(), A.stride(1)); cerr << endl; } @@ -308,16 +308,16 @@ namespace TSQR { matrix_type A_R3A (numRows + numCols, numCols, Scalar(0)); // Copy [R1; R2] into A_R1R2. - copy_matrix (numCols, numCols, &A_R1R2(0, 0), A_R1R2.lda(), - R1.data(), R1.lda()); - copy_matrix (numCols, numCols, &A_R1R2(numCols, 0), A_R1R2.lda(), - R2.data(), R2.lda()); + copy_matrix (numCols, numCols, &A_R1R2(0, 0), A_R1R2.stride(1), + R1.data(), R1.stride(1)); + copy_matrix (numCols, numCols, &A_R1R2(numCols, 0), A_R1R2.stride(1), + R2.data(), R2.stride(1)); // Copy [R3; A] into A_R3A. - copy_matrix (numCols, numCols, &A_R3A(0, 0), A_R3A.lda(), - R3.data(), R3.lda()); - copy_matrix (numRows, numCols, &A_R3A(numCols, 0), A_R3A.lda(), - A.data(), A.lda()); + copy_matrix (numCols, numCols, &A_R3A(0, 0), A_R3A.stride(1), + R3.data(), R3.stride(1)); + copy_matrix (numRows, numCols, &A_R3A(numCols, 0), A_R3A.stride(1), + A.data(), A.stride(1)); // Space to put the explicit Q factors. 
matrix_type Q_R1R2 (Ordinal(2) * numCols, numCols, Scalar(0)); @@ -346,32 +346,32 @@ namespace TSQR { << " by " << numCols << endl << endl; Combine< Ordinal, Scalar > combiner; - combiner.factor_pair (numCols, R1.data(), R1.lda(), R2.data(), R2.lda(), + combiner.factor_pair (numCols, R1.data(), R1.stride(1), R2.data(), R2.stride(1), &tau_R1R2[0], work.data()); combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.data(), R2.lda(), &tau_R1R2[0], - &Q_R1R2(0, 0), Q_R1R2.lda(), - &Q_R1R2(numCols, 0), Q_R1R2.lda(), + R2.data(), R2.stride(1), &tau_R1R2[0], + &Q_R1R2(0, 0), Q_R1R2.stride(1), + &Q_R1R2(numCols, 0), Q_R1R2.stride(1), work.data()); if (debug) { cerr << "Results of first test problem:" << endl; cerr << "-- Copy of test problem:" << endl; print_local_matrix (cerr, A_R1R2.extent(0), A_R1R2.extent(1), - A_R1R2.data(), A_R1R2.lda()); + A_R1R2.data(), A_R1R2.stride(1)); cerr << endl << "-- Q factor:" << endl; print_local_matrix (cerr, Q_R1R2.extent(0), Q_R1R2.extent(1), - Q_R1R2.data(), Q_R1R2.lda()); + Q_R1R2.data(), Q_R1R2.stride(1)); cerr << endl << "-- R factor:" << endl; print_local_matrix (cerr, R1.extent(0), R1.extent(1), - R1.data(), R1.lda()); + R1.data(), R1.stride(1)); cerr << endl; } const results_type firstResults = local_verify (A_R1R2.extent(0), A_R1R2.extent(1), - A_R1R2.data(), A_R1R2.lda(), - Q_R1R2.data(), Q_R1R2.lda(), - R1.data(), R1.lda()); + A_R1R2.data(), A_R1R2.stride(1), + Q_R1R2.data(), Q_R1R2.stride(1), + R1.data(), R1.stride(1)); if (debug) cerr << "\\| A - Q*R \\|_F = " << firstResults[0] << endl << "\\| I - Q'*Q \\|_F = " << firstResults[1] << endl @@ -383,32 +383,32 @@ namespace TSQR { << "qr( [R3; A] ), with R3 " << numCols << " by " << numCols << " and A " << numRows << " by " << numCols << endl << endl; - combiner.factor_inner (numRows, numCols, R3.data(), R3.lda(), - A.data(), A.lda(), &tau_R3A[0], work.data()); + combiner.factor_inner (numRows, numCols, R3.data(), R3.stride(1), + A.data(), A.stride(1), &tau_R3A[0], 
work.data()); combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.lda(), &tau_R3A[0], - &Q_R3A(0, 0), Q_R3A.lda(), - &Q_R3A(numCols, 0), Q_R3A.lda(), + A.data(), A.stride(1), &tau_R3A[0], + &Q_R3A(0, 0), Q_R3A.stride(1), + &Q_R3A(numCols, 0), Q_R3A.stride(1), work.data()); if (debug) { cerr << "Results of second test problem:" << endl; cerr << "-- Copy of test problem:" << endl; print_local_matrix (cerr, A_R3A.extent(0), A_R3A.extent(1), - A_R3A.data(), A_R3A.lda()); + A_R3A.data(), A_R3A.stride(1)); cerr << endl << "-- Q factor:" << endl; print_local_matrix (cerr, Q_R3A.extent(0), Q_R3A.extent(1), - Q_R3A.data(), Q_R3A.lda()); + Q_R3A.data(), Q_R3A.stride(1)); cerr << endl << "-- R factor:" << endl; print_local_matrix (cerr, R3.extent(0), R3.extent(1), - R3.data(), R3.lda()); + R3.data(), R3.stride(1)); cerr << endl; } const results_type secondResults = local_verify (A_R3A.extent(0), A_R3A.extent(1), - A_R3A.data(), A_R3A.lda(), - Q_R3A.data(), Q_R3A.lda(), - R3.data(), R3.lda()); + A_R3A.data(), A_R3A.stride(1), + Q_R3A.data(), Q_R3A.stride(1), + R3.data(), R3.stride(1)); if (debug) cerr << "\\| A - Q*R \\|_F = " << secondResults[0] << endl << "\\| I - Q'*Q \\|_F = " << secondResults[1] << endl @@ -474,13 +474,13 @@ namespace TSQR { // Matrix consisting of two cache blocks. matrix_type A (Ordinal(2)*numRows, numCols, Scalar(0)); // Views of the two cache blocks. - mat_view_type A1 (numRows, numCols, &A(0,0), A.lda()); - mat_view_type A2 (numRows, numCols, &A(numRows,0), A.lda()); + mat_view_type A1 (numRows, numCols, &A(0,0), A.stride(1)); + mat_view_type A2 (numRows, numCols, &A(numRows,0), A.stride(1)); // Fill the two cache blocks with random test problems. 
matgen_type matgen (gen); - matgen.fill_random_svd (numRows, numCols, A1.data(), A1.lda(), &sigma_A1[0]); - matgen.fill_random_svd (numRows, numCols, A2.data(), A2.lda(), &sigma_A2[0]); + matgen.fill_random_svd (numRows, numCols, A1.data(), A1.stride(1), &sigma_A1[0]); + matgen.fill_random_svd (numRows, numCols, A2.data(), A2.stride(1), &sigma_A2[0]); if (false && debug) { @@ -508,8 +508,8 @@ namespace TSQR { Q(k, k) = Scalar(1); // Two cache blocks (as views) of Q. - mat_view_type Q1 (numRows, numCols, &Q(0,0), Q.lda()); - mat_view_type Q2 (numRows, numCols, &Q(numRows,0), Q.lda()); + mat_view_type Q1 (numRows, numCols, &Q(0,0), Q.stride(1)); + mat_view_type Q2 (numRows, numCols, &Q(numRows,0), Q.stride(1)); // Two tau factor arrays, one for each cache block. vector< Scalar > tau1 (numCols); @@ -527,13 +527,13 @@ namespace TSQR { Combine< Ordinal, Scalar > combiner; // qr( A1 ) - combiner.factor_first (numRows, numCols, A1.data(), A1.lda(), + combiner.factor_first (numRows, numCols, A1.data(), A1.stride(1), &tau1[0], work.data()); // View of numCols by numCols upper triangle of A1. - mat_view_type R1 (numCols, numCols, A1.data(), A1.lda()); + mat_view_type R1 (numCols, numCols, A1.data(), A1.stride(1)); // qr( [R1; A2] ) - combiner.factor_inner (numRows, numCols, R1.data(), R1.lda(), - A2.data(), A2.lda(), &tau2[0], work.data()); + combiner.factor_inner (numRows, numCols, R1.data(), R1.stride(1), + A2.data(), A2.stride(1), &tau2[0], work.data()); // Extract (a deep copy of) the R factor. matrix_type R (R1); // Zero out everything below the diagonal of R. @@ -545,13 +545,13 @@ namespace TSQR { // (working up the matrix A,) finishing with A1. 
combiner.apply_inner (ApplyType::NoTranspose, numRows, numCols, numCols, - A2.data(), A2.lda(), tau2.data(), - Q1.data(), Q1.lda(), - Q2.data(), Q2.lda(), work.data()); + A2.data(), A2.stride(1), tau2.data(), + Q1.data(), Q1.stride(1), + Q2.data(), Q2.stride(1), work.data()); combiner.apply_first (ApplyType::NoTranspose, numRows, numCols, numCols, - A1.data(), A.lda(), tau1.data(), - Q1.data(), Q1.lda(), work.data()); + A1.data(), A.stride(1), tau1.data(), + Q1.data(), Q1.stride(1), work.data()); if (debug) { cerr << "Results of first test problem:" << endl; cerr << "-- Test matrix A:" << endl; diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp index 67acf2c371ee..05cc1afc71e4 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp @@ -276,8 +276,8 @@ namespace TSQR { DistTsqrHelper helper; const ordinal_type ncols = R_mine.extent(1); - std::vector< scalar_type > R_local (ncols*ncols); - copy_matrix (ncols, ncols, R_local.data(), ncols, R_mine.data(), R_mine.lda()); + std::vector R_local (ncols*ncols); + copy_matrix (ncols, ncols, R_local.data(), ncols, R_mine.data(), R_mine.stride(1)); const int P = messenger_->size(); const int my_rank = messenger_->rank(); @@ -285,7 +285,8 @@ namespace TSQR { std::vector work (ncols); helper.factor_helper (ncols, R_local, my_rank, 0, P-1, first_tag, messenger_.get(), Q_factors, tau_arrays, work); - copy_matrix (ncols, ncols, R_mine.data(), R_mine.lda(), R_local.data(), ncols); + copy_matrix (ncols, ncols, R_mine.data(), R_mine.stride(1), + R_local.data(), ncols); return std::make_pair (Q_factors, tau_arrays); } diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp index 7fa749cfed34..cf421d2bdfce 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp @@ -338,72 +338,71 @@ namespace TSQR { const rank_type P_mine, const rank_type 
P_first, const rank_type P_last, - std::vector< matrix_type >& QFactors, - std::vector< std::vector< scalar_type > >& tauArrays) + std::vector& QFactors, + std::vector>& tauArrays) { - if (P_last < P_first) - { - std::ostringstream os; - os << "Programming error in factorReduce() recursion: interval " - "[P_first, P_last] is invalid: P_first = " << P_first - << ", P_last = " << P_last << "."; - throw std::logic_error (os.str()); + if (P_last < P_first) { + std::ostringstream os; + os << "factorReduce: Interval [P_first=" << P_first + << ", P_last=" << P_last << "] is invalid."; + throw std::logic_error (os.str()); + } + else if (P_mine < P_first || P_mine > P_last) { + std::ostringstream os; + os << "factorReduce: P_mine=" << P_mine << " is not in " + << "current process rank interval [P_first=" << P_first + << ", P_last=" << P_last << "]"; + throw std::logic_error (os.str()); + } + else if (P_last == P_first) { + return; // skip singleton intervals (see explanation below) + } + else { + // Recurse on two intervals: [P_first, P_mid-1] and [P_mid, + // P_last]. For example, if [P_first, P_last] = [0, 9], P_mid + // = floor( (0+9+1)/2 ) = 5 and the intervals are [0,4] and + // [5,9]. + // + // If [P_first, P_last] = [4,6], P_mid = floor( (4+6+1)/2 ) = + // 5 and the intervals are [4,4] (a singleton) and [5,6]. The + // latter case shows that singleton intervals may arise. We + // treat them as a base case in the recursion. Process 4 + // won't be skipped completely, though; it will get combined + // with the result from [5,6]. + + // Adding 1 and doing integer division works like "ceiling." 
+ const rank_type P_mid = (P_first + P_last + 1) / 2; + + if (P_mine < P_mid) { // Interval [P_first, P_mid-1] + factorReduce (R_mine, P_mine, P_first, P_mid - 1, + QFactors, tauArrays); } - else if (P_mine < P_first || P_mine > P_last) - { - std::ostringstream os; - os << "Programming error in factorReduce() recursion: P_mine (= " - << P_mine << ") is not in current process rank interval " - << "[P_first = " << P_first << ", P_last = " << P_last << "]"; - throw std::logic_error (os.str()); + else { // Interval [P_mid, P_last] + factorReduce (R_mine, P_mine, P_mid, P_last, + QFactors, tauArrays); } - else if (P_last == P_first) - return; // skip singleton intervals (see explanation below) - else - { - // Recurse on two intervals: [P_first, P_mid-1] and [P_mid, - // P_last]. For example, if [P_first, P_last] = [0, 9], - // P_mid = floor( (0+9+1)/2 ) = 5 and the intervals are - // [0,4] and [5,9]. - // - // If [P_first, P_last] = [4,6], P_mid = floor( (4+6+1)/2 ) - // = 5 and the intervals are [4,4] (a singleton) and [5,6]. - // The latter case shows that singleton intervals may arise. - // We treat them as a base case in the recursion. Process 4 - // won't be skipped completely, though; it will get combined - // with the result from [5,6]. - - // Adding 1 and doing integer division works like "ceiling." - const rank_type P_mid = (P_first + P_last + 1) / 2; - - if (P_mine < P_mid) // Interval [P_first, P_mid-1] - factorReduce (R_mine, P_mine, P_first, P_mid - 1, - QFactors, tauArrays); - else // Interval [P_mid, P_last] - factorReduce (R_mine, P_mine, P_mid, P_last, - QFactors, tauArrays); - - // This only does anything if P_mine is either P_first or P_mid. 
- if (P_mine == P_first) - { - const ordinal_type numCols = R_mine.extent(1); - matrix_type R_other (numCols, numCols); - recv_R (R_other, P_mid); - - std::vector< scalar_type > tau (numCols); - // Don't shrink the workspace array; doing so may - // require expensive reallocation every time we send / - // receive data. - resizeWork (numCols); - combine_.factor_pair (numCols, R_mine.data(), R_mine.lda(), - R_other.data(), R_other.lda(), - tau.data(), work_.data()); - QFactors.push_back (R_other); - tauArrays.push_back (tau); - } - else if (P_mine == P_mid) - send_R (R_mine, P_first); + + // This only does anything if P_mine is either P_first or P_mid. + if (P_mine == P_first) { + const ordinal_type numCols = R_mine.extent(1); + matrix_type R_other (numCols, numCols); + recv_R (R_other, P_mid); + + std::vector< scalar_type > tau (numCols); + // Don't shrink the workspace array; doing so may + // require expensive reallocation every time we send / + // receive data. + resizeWork (numCols); + combine_.factor_pair (numCols, R_mine.data(), R_mine.stride(1), + R_other.data(), R_other.stride(1), + tau.data(), work_.data()); + QFactors.push_back (R_other); + tauArrays.push_back (tau); + } + else if (P_mine == P_mid) { + send_R (R_mine, P_first); } + } } void @@ -417,71 +416,71 @@ namespace TSQR { std::vector< matrix_type >& QFactors, std::vector< std::vector< scalar_type > >& tauArrays) { - if (P_last < P_first) - { - std::ostringstream os; - os << "Programming error in explicitQBroadcast() recursion: interval" - " [P_first, P_last] is invalid: P_first = " << P_first - << ", P_last = " << P_last << "."; - throw std::logic_error (os.str()); + if (P_last < P_first) { + std::ostringstream os; + os << "explicitQBroadcast: interval [P_first=" << P_first + << ", P_last=" << P_last << "] is invalid."; + throw std::logic_error (os.str()); + } + else if (P_mine < P_first || P_mine > P_last) { + std::ostringstream os; + os << "explicitQBroadcast: P_mine=" << P_mine << " is not " + 
"in current process rank interval [P_first = " << P_first + << ", P_last = " << P_last << "]"; + throw std::logic_error (os.str()); + } + else if (P_last == P_first) { + return; // skip singleton intervals + } + else { + // Adding 1 and integer division works like "ceiling." + const rank_type P_mid = (P_first + P_last + 1) / 2; + rank_type newpos = curpos; + if (P_mine == P_first) { + if (curpos < 0) { + std::ostringstream os; + os << "Programming error: On the current P_first (= " + << P_first << ") proc: curpos (= " << curpos << ") < 0"; + throw std::logic_error (os.str()); + } + // Q_impl, tau: implicitly stored local Q factor. + matrix_type& Q_impl = QFactors[curpos]; + std::vector& tau = tauArrays[curpos]; + + // Apply implicitly stored local Q factor to + // [Q_mine; + // Q_other] + // where Q_other = zeros(Q_mine.extent(0), Q_mine.extent(1)). + // Overwrite both Q_mine and Q_other with the result. + deep_copy (Q_other, scalar_type {}); + combine_.apply_pair (ApplyType::NoTranspose, + Q_mine.extent(1), Q_impl.extent(1), + Q_impl.data(), Q_impl.stride(1), + tau.data(), + Q_mine.data(), Q_mine.stride(1), + Q_other.data(), Q_other.stride(1), + work_.data()); + // Send the resulting Q_other, and the final R factor, to P_mid. + send_Q_R (Q_other, R_mine, P_mid); + newpos = curpos - 1; } - else if (P_mine < P_first || P_mine > P_last) - { - std::ostringstream os; - os << "Programming error in explicitQBroadcast() recursion: P_mine " - "(= " << P_mine << ") is not in current process rank interval " - << "[P_first = " << P_first << ", P_last = " << P_last << "]"; - throw std::logic_error (os.str()); + else if (P_mine == P_mid) { + // P_first computed my explicit Q factor component. + // Receive it, and the final R factor, from P_first. + recv_Q_R (Q_mine, R_mine, P_first); } - else if (P_last == P_first) - return; // skip singleton intervals - else - { - // Adding 1 and integer division works like "ceiling." 
- const rank_type P_mid = (P_first + P_last + 1) / 2; - rank_type newpos = curpos; - if (P_mine == P_first) - { - if (curpos < 0) - { - std::ostringstream os; - os << "Programming error: On the current P_first (= " - << P_first << ") proc: curpos (= " << curpos << ") < 0"; - throw std::logic_error (os.str()); - } - // Q_impl, tau: implicitly stored local Q factor. - matrix_type& Q_impl = QFactors[curpos]; - std::vector< scalar_type >& tau = tauArrays[curpos]; - - // Apply implicitly stored local Q factor to - // [Q_mine; - // Q_other] - // where Q_other = zeros(Q_mine.extent(0), Q_mine.extent(1)). - // Overwrite both Q_mine and Q_other with the result. - deep_copy (Q_other, scalar_type {}); - combine_.apply_pair (ApplyType::NoTranspose, - Q_mine.extent(1), Q_impl.extent(1), - Q_impl.data(), Q_impl.lda(), tau.data(), - Q_mine.data(), Q_mine.lda(), - Q_other.data(), Q_other.lda(), work_.data()); - // Send the resulting Q_other, and the final R factor, to P_mid. - send_Q_R (Q_other, R_mine, P_mid); - newpos = curpos - 1; - } - else if (P_mine == P_mid) - // P_first computed my explicit Q factor component. - // Receive it, and the final R factor, from P_first. 
- recv_Q_R (Q_mine, R_mine, P_first); - if (P_mine < P_mid) // Interval [P_first, P_mid-1] - explicitQBroadcast (R_mine, Q_mine, Q_other, - P_mine, P_first, P_mid - 1, - newpos, QFactors, tauArrays); - else // Interval [P_mid, P_last] + if (P_mine < P_mid) { // Interval [P_first, P_mid-1] + explicitQBroadcast (R_mine, Q_mine, Q_other, + P_mine, P_first, P_mid - 1, + newpos, QFactors, tauArrays); + } + else { // Interval [P_mid, P_last] explicitQBroadcast (R_mine, Q_mine, Q_other, P_mine, P_mid, P_last, newpos, QFactors, tauArrays); } + } } template< class ConstMatrixType1, class ConstMatrixType2 > diff --git a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp index 829eee19e8dd..89f91f788cdc 100644 --- a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp @@ -239,7 +239,7 @@ namespace TSQR { using TSQR::Random::randomGlobalMatrix; mat_view_type A_local_view (A_local.extent(0), A_local.extent(1), - A_local.data(), A_local.lda()); + A_local.data(), A_local.stride(1)); const magnitude_type* const singVals = singularValues.data(); randomGlobalMatrix (&gen, A_local_view, singVals, ordinalMessenger.getRawPtr(), @@ -257,7 +257,7 @@ namespace TSQR { // result. if (contiguousCacheBlocks) { tsqr->cache_block (numRowsLocal, numCols, A_copy.data(), - A_local.data(), A_local.lda()); + A_local.data(), A_local.stride(1)); if (debug) { Teuchos::barrier (*comm); if (myRank == 0) @@ -273,9 +273,9 @@ namespace TSQR { // wanted. if (testFactorExplicit) { tsqr->factorExplicitRaw (A_copy.extent (0), A_copy.extent (1), - A_copy.data (), A_copy.lda (), - Q_local.data (), Q_local.lda (), - R.data (), R.lda (), + A_copy.data (), A_copy.stride (1), + Q_local.data (), Q_local.stride (1), + R.data (), R.stride (1), contiguousCacheBlocks); if (debug) { Teuchos::barrier (*comm); @@ -286,16 +286,16 @@ namespace TSQR { else { // Factor the (copy of the) matrix. 
factor_output_type factorOutput = - tsqr->factor (numRowsLocal, numCols, A_copy.data(), A_copy.lda(), - R.data(), R.lda(), contiguousCacheBlocks); + tsqr->factor (numRowsLocal, numCols, A_copy.data(), A_copy.stride(1), + R.data(), R.stride(1), contiguousCacheBlocks); if (debug) { Teuchos::barrier (*comm); if (myRank == 0) cerr << "-- Finished Tsqr::factor" << endl; } // Compute the explicit Q factor in Q_local. - tsqr->explicit_Q (numRowsLocal, numCols, A_copy.data(), A_copy.lda(), - factorOutput, numCols, Q_local.data(), Q_local.lda(), + tsqr->explicit_Q (numRowsLocal, numCols, A_copy.data(), A_copy.stride(1), + factorOutput, numCols, Q_local.data(), Q_local.stride(1), contiguousCacheBlocks); if (debug) { Teuchos::barrier (*comm); @@ -320,8 +320,8 @@ namespace TSQR { const magnitude_type tol = STM::zero(); const ordinal_type rank = tsqr->revealRankRaw (Q_local.extent (0), Q_local.extent (1), - Q_local.data (), Q_local.lda (), - R.data (), R.lda (), tol, + Q_local.data (), Q_local.stride (1), + R.data (), R.stride (1), tol, contiguousCacheBlocks); magnitude_type two_to_the_numCols = STM::one(); @@ -360,7 +360,7 @@ namespace TSQR { // un-cache-blocking Q_local, since we're done using // A_copy for other things. tsqr->un_cache_block (numRowsLocal, numCols, A_copy.data(), - A_copy.lda(), Q_local.data()); + A_copy.stride(1), Q_local.data()); // Overwrite Q_local with the un-cache-blocked Q factor. deep_copy (Q_local, A_copy); if (debug) { @@ -372,8 +372,8 @@ namespace TSQR { // Test accuracy of the factorization. 
const std::vector results = - global_verify (numRowsLocal, numCols, A_local.data(), A_local.lda(), - Q_local.data(), Q_local.lda(), R.data(), R.lda(), + global_verify (numRowsLocal, numCols, A_local.data(), A_local.stride(1), + Q_local.data(), Q_local.stride(1), R.data(), R.stride(1), scalarMessenger.getRawPtr()); if (debug) { Teuchos::barrier (*comm); diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp index f8a8156c19ba..87a9aaf37d0f 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp @@ -174,7 +174,7 @@ namespace TSQR { // work.size() > 0, but we've already checked for that, so we // don't have to check again. combine.factor_first (A_top.extent(0), A_top.extent(1), A_top.data(), - A_top.lda(), tau.data(), work.data()); + A_top.stride(1), tau.data(), work.data()); return tau; } @@ -190,8 +190,8 @@ namespace TSQR { // tau.size() > 0 and work.size() > 0, but we've already // checked for that, so we don't have to check again. combine.factor_inner (A_cur.extent(0), A_top.extent(1), - A_top.data(), A_top.lda(), - A_cur.data(), A_cur.lda(), + A_top.data(), A_top.stride(1), + A_cur.data(), A_cur.stride(1), tau.data(), work.data()); return tau; } @@ -395,8 +395,8 @@ namespace TSQR { // If we get this far, it's fair to assume that we have // checked whether tau and work have nonzero lengths. combine.apply_first (applyType, C_top.extent(0), C_top.extent(1), - Q_top.extent(1), Q_top.data(), Q_top.lda(), - tau.data(), C_top.data(), C_top.lda(), work.data()); + Q_top.extent(1), Q_top.data(), Q_top.stride(1), + tau.data(), C_top.data(), C_top.stride(1), work.data()); } void @@ -418,10 +418,10 @@ namespace TSQR { // If we get this far, it's fair to assume that we have // checked whether tau and work have nonzero lengths. 
combine.apply_inner (applyType, C_cur.extent(0), C_cur.extent(1), - Q_cur.extent(1), Q_cur.data(), Q_cur.lda(), + Q_cur.extent(1), Q_cur.data(), Q_cur.stride(1), tau.data(), - C_top.data(), C_top.lda(), - C_cur.data(), C_cur.lda(), + C_top.data(), C_top.stride(1), + C_cur.data(), C_cur.stride(1), work.data()); } @@ -539,7 +539,7 @@ namespace TSQR { mat_view_type C_top_rest (C_top.extent(0) - C_top.extent(1), C_top.extent(1), C_top.data() + C_top.extent(1), - C_top.lda()); + C_top.stride(1)); deep_copy (C_top_rest, Scalar {}); } LocalOrdinal curTauIndex = cbIndices.second-1; @@ -848,8 +848,8 @@ namespace TSQR { // Q_cur := Q_temp * B. blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.extent(0), numCols, numCols, Scalar (1.0), - Q_temp.data(), Q_temp.lda(), B_.data(), B_.lda(), - Scalar(0), Q_cur.data(), Q_cur.lda()); + Q_temp.data(), Q_temp.stride(1), B_.data(), B_.stride(1), + Scalar(0), Q_cur.data(), Q_cur.stride(1)); } /// \brief Multiply (in place) each cache block in the range by B_. @@ -1454,11 +1454,11 @@ namespace TSQR { "factorSecondPass: result.topBlocks[0] is an empty view." << suffix); mat_view_type R_top_square (R_top.extent(1), R_top.extent(1), - R_top.data(), R_top.lda()); + R_top.data(), R_top.stride(1)); deep_copy (R, Scalar {}); // Only copy the upper triangle of R_top into R. - copy_upper_triangle (R.extent(1), R.extent(1), R.data(), R.lda(), - R_top.data(), R_top.lda()); + copy_upper_triangle (R.extent(1), R.extent(1), R.data(), R.stride(1), + R_top.data(), R_top.stride(1)); return result; } @@ -1553,8 +1553,8 @@ namespace TSQR { // The statement below only works if R_top and R_bot have a // nonzero (and the same) number of columns, but we have already // checked that above. 
- combine_.factor_pair (R_top.extent(1), R_top.data(), R_top.lda(), - R_bot.data(), R_bot.lda(), tau.data(), + combine_.factor_pair (R_top.extent(1), R_top.data(), R_top.stride(1), + R_bot.data(), R_bot.stride(1), tau.data(), work_.data()); return tau; } @@ -1609,9 +1609,9 @@ namespace TSQR { // have a nonzero (and the same) number of columns, but we have // already checked that above. combine_.apply_pair (applyType, C_top.extent(1), R_bot.extent(1), - R_bot.data(), R_bot.lda(), tau.data(), - C_top.data(), C_top.lda(), - C_bot.data(), C_bot.lda(), work_.data()); + R_bot.data(), R_bot.stride(1), tau.data(), + C_top.data(), C_top.stride(1), + C_bot.data(), C_bot.stride(1), work_.data()); } void @@ -1646,7 +1646,7 @@ namespace TSQR { // affect the top ncols x ncols part of each of those blocks in // this method. mat_view_type C_top_square (numCols, numCols, topBlocksOfC[0].data(), - topBlocksOfC[0].lda()); + topBlocksOfC[0].stride(1)); if (applyType.transposed ()) { // Don't include the topmost (index 0) partition in the // iteration; that corresponds to C_top_square. @@ -1657,7 +1657,7 @@ namespace TSQR { const mat_view_type& C_cur = topBlocksOfC[partIdx]; if (! C_cur.empty()) { mat_view_type C_cur_square (numCols, numCols, C_cur.data (), - C_cur.lda ()); + C_cur.stride (1)); // If explicitQ: We've already done the first pass and // filled the top blocks of C. applyPair (applyType, factorOutput.topBlocks[partIdx], @@ -1682,7 +1682,8 @@ namespace TSQR { const mat_view_type& C_cur = topBlocksOfC[partIdx]; if (! C_cur.empty()) { mat_view_type C_cur_square (numCols, numCols, - C_cur.data (), C_cur.lda ()); + C_cur.data (), + C_cur.stride (1)); // The "first" pass (actually the last, only named // "first" by analogy with factorFirstPass()) will // fill the rest of these top blocks. 
For now, we diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp index b899cca15790..ab3f0411d22d 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp @@ -143,7 +143,7 @@ namespace TSQR { const Ordinal ldr = numCols; // Create a test problem - nodeTestProblem (gen, numRows, numCols, A.data(), A.lda(), true); + nodeTestProblem (gen, numRows, numCols, A.data(), A.stride(1), true); if (debug) { cerr << "-- Generated test problem" << endl; @@ -151,7 +151,7 @@ namespace TSQR { if (A.extent(0) <= 30) { cerr << "A = " << endl; print_local_matrix (cerr, A.extent(0), A.extent(1), - A.data(), A.lda()); + A.data(), A.stride(1)); cerr << endl << endl; } } @@ -167,13 +167,13 @@ namespace TSQR { if (A_copy.extent(0) <= 30) { cerr << "A_copy = " << endl; print_local_matrix (cerr, A_copy.extent(0), A_copy.extent(1), - A_copy.data(), A_copy.lda()); + A_copy.data(), A_copy.stride(1)); cerr << endl << endl; } } } else { - actor.cache_block (numRows, numCols, A_copy.data(), A.data(), A.lda()); + actor.cache_block (numRows, numCols, A_copy.data(), A.data(), A.stride(1)); if (debug) { cerr << "-- Reorganized test matrix to have contiguous " "cache blocks" << endl; @@ -181,7 +181,7 @@ namespace TSQR { if (A_copy.extent(0) <= 30) { cerr << "A_copy = " << endl; print_local_matrix (cerr, A_copy.extent(0), A_copy.extent(1), - A_copy.data(), A_copy.lda()); + A_copy.data(), A_copy.stride(1)); cerr << endl << endl; } } @@ -193,7 +193,7 @@ namespace TSQR { deep_copy (A2, std::numeric_limits::quiet_NaN()); } - actor.un_cache_block (numRows, numCols, A2.data(), A2.lda(), A_copy.data()); + actor.un_cache_block (numRows, numCols, A2.data(), A2.stride(1), A_copy.data()); if (matrix_equal (A, A2)) { if (debug) cerr << "-- Cache blocking test succeeded!" 
<< endl; @@ -206,10 +206,10 @@ namespace TSQR { if (A.extent(0) <= 30 && A2.extent(0) <= 30) { cerr << "A = " << endl; print_local_matrix (cerr, A.extent(0), A.extent(1), - A.data(), A.lda()); + A.data(), A.stride(1)); cerr << endl << "A2 = " << endl; print_local_matrix (cerr, A2.extent(0), A2.extent(1), - A2.data(), A2.lda()); + A2.data(), A2.stride(1)); cerr << endl; } } @@ -231,8 +231,8 @@ namespace TSQR { // Factor the matrix and compute the explicit Q factor factor_output_type factor_output = - actor.factor (numRows, numCols, A_copy.data(), A_copy.lda(), - R.data(), R.lda(), contiguousCacheBlocks); + actor.factor (numRows, numCols, A_copy.data(), A_copy.stride(1), + R.data(), R.stride(1), contiguousCacheBlocks); if (debug) { cerr << "-- Finished factor()" << endl; cerr << "-- Calling explicit_Q()" << endl; @@ -245,14 +245,14 @@ namespace TSQR { mat_view_type Q_top = actor.top_block (Q.view (), contiguousCacheBlocks); mat_view_type Q_top_square (Q_top.extent(1), Q_top.extent(1), - Q_top.data(), Q_top.lda()); + Q_top.data(), Q_top.stride(1)); deep_copy (Q_top_square, Scalar {}); for (Ordinal j = 0; j < Q_top_square.extent(1); ++j) { Q_top_square(j,j) = Scalar (1.0); } } - actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.lda(), - factor_output, numCols, Q.data(), Q.lda(), + actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.stride(1), + factor_output, numCols, Q.data(), Q.stride(1), contiguousCacheBlocks); if (debug) { cerr << "-- Finished explicit_Q()" << endl; @@ -265,7 +265,7 @@ namespace TSQR { if (contiguousCacheBlocks) { // Use A_copy as temporary storage for un-cache-blocking Q. 
actor.un_cache_block (numRows, numCols, A_copy.data(), - A_copy.lda(), Q.data()); + A_copy.stride(1), Q.data()); deep_copy (Q, A_copy); if (debug) { cerr << "-- Un-cache-blocked output Q factor" << endl; @@ -278,11 +278,11 @@ namespace TSQR { if (Q.extent(0) <= 30) { cerr << endl << "-- Q factor:" << endl; print_local_matrix (cerr, Q.extent(0), Q.extent(1), - Q.data(), Q.lda()); + Q.data(), Q.stride(1)); cerr << endl << endl; } cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, numCols, numCols, R.data(), R.lda()); + print_local_matrix (cerr, numCols, numCols, R.data(), R.stride(1)); cerr << endl; } @@ -411,13 +411,13 @@ namespace TSQR { deep_copy (R, Scalar {}); // Create a test problem - nodeTestProblem (gen, numRows, numCols, A.data(), A.lda(), false); + nodeTestProblem (gen, numRows, numCols, A.data(), A.stride(1), false); // Copy A into A_copy, since TSQR overwrites the input. If // specified, rearrange the data in A_copy so that the data in // each cache block is contiguously stored. if (contiguousCacheBlocks) { - actor.cache_block (numRows, numCols, A_copy.data(), A.data(), A.lda()); + actor.cache_block (numRows, numCols, A_copy.data(), A.data(), A.stride(1)); } else { deep_copy (A_copy, A); } @@ -429,14 +429,14 @@ namespace TSQR { // Factor the matrix in-place in A_copy, and extract the // resulting R factor into R. factor_output_type factor_output = - actor.factor (numRows, numCols, A_copy.data(), A_copy.lda(), - R.data(), R.lda(), contiguousCacheBlocks); + actor.factor (numRows, numCols, A_copy.data(), A_copy.stride(1), + R.data(), R.stride(1), contiguousCacheBlocks); // Compute the explicit Q factor (which was stored // implicitly in A_copy and factor_output) and store in Q. // We don't need to un-cache-block the output, because we // aren't verifying it here. 
- actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.lda(), - factor_output, numCols, Q.data(), Q.lda(), + actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.stride(1), + factor_output, numCols, Q.data(), Q.stride(1), contiguousCacheBlocks); } @@ -449,14 +449,14 @@ namespace TSQR { // Factor the matrix in-place in A_copy, and extract the // resulting R factor into R. factor_output_type factor_output = - actor.factor (numRows, numCols, A_copy.data(), A_copy.lda(), - R.data(), R.lda(), contiguousCacheBlocks); + actor.factor (numRows, numCols, A_copy.data(), A_copy.stride(1), + R.data(), R.stride(1), contiguousCacheBlocks); // Compute the explicit Q factor (which was stored // implicitly in A_copy and factor_output) and store in Q. // We don't need to un-cache-block the output, because we // aren't verifying it here. - actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.lda(), - factor_output, numCols, Q.data(), Q.lda(), + actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.stride(1), + factor_output, numCols, Q.data(), Q.stride(1), contiguousCacheBlocks); } const double timing = timer.stop(); diff --git a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp index b7bf49202333..62e538a6d0cf 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp @@ -75,9 +75,9 @@ namespace TSQR { return false; } const ptrdiff_t nrows (A.extent(0)); - const ptrdiff_t A_lda (A.lda()); + const ptrdiff_t A_lda (A.stride(1)); const ptrdiff_t ncols (A.extent(1)); - const ptrdiff_t B_lda (B.lda()); + const ptrdiff_t B_lda (B.stride(1)); const auto* A_j = A.data(); const auto* B_j = B.data(); for (ptrdiff_t j = 0; j < ncols; ++j, A_j += A_lda, B_j += B_lda) { @@ -176,6 +176,14 @@ namespace TSQR { MatView (MatView&& view) = default; MatView& operator= (MatView&& view) = default; + constexpr ordinal_type extent(const int r) const noexcept { + return r == 0 ? nrows_ : (r == 1 ? 
ncols_ : ordinal_type(0)); + } + + constexpr ordinal_type stride(const int r) const noexcept { + return r == 0 ? ordinal_type(1) : (r == 1 ? lda_ : ordinal_type(0)); + } + reference operator() (const ordinal_type i, const ordinal_type j) const @@ -201,26 +209,11 @@ namespace TSQR { throw std::logic_error("Attempt to reference NULL data"); } #endif // TSQR_MATVIEW_DEBUG - return A_[i + j*lda()]; - } - - constexpr ordinal_type extent(const int r) const noexcept { - return r == 0 ? nrows_ : (r == 1 ? ncols_ : ordinal_type(0)); - } - - constexpr ordinal_type stride(const int r) const noexcept { - return r == 0 ? ordinal_type(1) : (r == 1 ? lda_ : ordinal_type(0)); - } - - constexpr ordinal_type lda() const noexcept { - return stride(1); + return A_[i + j * this->stride(1)]; } - /// \note The function is const, only because returning A_ doesn't - /// change any members of *this. Of course one may use the - /// resulting pointer to fiddle with entries in the matrix, but - /// that doesn't affect the MatView's properties. pointer data() const { return A_; } + bool empty() const { return extent(0) == 0 || extent(1) == 0; } /// Return a "row block" (submatrix of consecutive rows in the @@ -240,7 +233,7 @@ namespace TSQR { } } #endif // TSQR_MATVIEW_DEBUG - return MatView (lastRow - firstRow + 1, extent(1), data() + firstRow, lda()); + return MatView (lastRow - firstRow + 1, extent(1), data() + firstRow, stride(1)); } /// Split off and return the top cache block of nrows_top rows. 
@@ -285,8 +278,8 @@ namespace TSQR { A_rest_ptr = A_top_ptr + nrows_top * extent(1); } else { - lda_top = lda(); - lda_rest = lda(); + lda_top = stride(1); + lda_rest = stride(1); A_rest_ptr = A_top_ptr + nrows_top; } MatView A_top (nrows_top, extent(1), data(), lda_top); @@ -322,8 +315,8 @@ namespace TSQR { A_bottom_ptr = A_rest_ptr + nrows_rest * extent(1); } else { - lda_bottom = lda(); - lda_rest = lda(); + lda_bottom = stride(1); + lda_rest = stride(1); A_bottom_ptr = A_rest_ptr + nrows_rest; } MatView A_bottom (nrows_bottom, extent(1), A_bottom_ptr, lda_bottom); @@ -336,12 +329,12 @@ namespace TSQR { bool operator== (const MatView& rhs) const { return extent(0) == rhs.extent(0) && extent(1) == rhs.extent(1) && - lda() == rhs.lda() && data() == rhs.data(); + stride(1) == rhs.stride(1) && data() == rhs.data(); } bool operator!= (const MatView& rhs) const { return extent(0) != rhs.extent(0) || extent(1) != rhs.extent(1) || - lda() != rhs.lda() || data() != rhs.data(); + stride(1) != rhs.stride(1) || data() != rhs.data(); } private: @@ -360,6 +353,7 @@ namespace TSQR { using scalar_type = Scalar; using ordinal_type = Ordinal; using pointer = const Scalar*; + using reference = const Scalar&; ConstMatView () = default; @@ -385,8 +379,17 @@ namespace TSQR { ConstMatView (ConstMatView&&) = default; ConstMatView& operator= (ConstMatView&&) = default; - const scalar_type& - operator() (const ordinal_type i, const ordinal_type j) const + constexpr ordinal_type extent(const int r) const noexcept { + return r == 0 ? nrows_ : (r == 1 ? ncols_ : ordinal_type(0)); + } + + constexpr ordinal_type stride(const int r) const noexcept { + return r == 0 ? ordinal_type(1) : (r == 1 ? 
lda_ : ordinal_type(0)); + } + + reference + operator() (const ordinal_type i, + const ordinal_type j) const { #ifdef TSQR_MATVIEW_DEBUG if (std::numeric_limits::is_signed) { @@ -409,15 +412,9 @@ namespace TSQR { throw std::logic_error("Attempt to reference NULL data"); } #endif // TSQR_MATVIEW_DEBUG - return A_[i + j*lda()]; - } - - constexpr ordinal_type extent(const int r) const noexcept { - return r == 0 ? nrows_ : (r == 1 ? ncols_ : ordinal_type(0)); + return A_[i + j * this->stride(1)]; } - ordinal_type lda() const { return lda_; } - pointer data() const { return A_; } bool empty() const { return extent(0) == 0 || extent(1) == 0; } @@ -434,7 +431,7 @@ namespace TSQR { } #endif // TSQR_MATVIEW_DEBUG return ConstMatView (lastRow - firstRow + 1, extent(1), - data() + firstRow, lda()); + data() + firstRow, stride(1)); } /// \brief Split off and return the top block. Modify *this to be @@ -474,8 +471,8 @@ namespace TSQR { A_rest_ptr = A_top_ptr + nrows_top * extent(1); } else { - lda_top = lda(); - lda_rest = lda(); + lda_top = stride(1); + lda_rest = stride(1); A_rest_ptr = A_top_ptr + nrows_top; } ConstMatView A_top (nrows_top, extent(1), data(), lda_top); @@ -511,8 +508,8 @@ namespace TSQR { A_bottom_ptr = A_rest_ptr + nrows_rest * extent(1); } else { - lda_bottom = lda(); - lda_rest = lda(); + lda_bottom = stride(1); + lda_rest = stride(1); A_bottom_ptr = A_rest_ptr + nrows_rest; } ConstMatView A_bottom (nrows_bottom, extent(1), A_bottom_ptr, lda_bottom); @@ -525,12 +522,12 @@ namespace TSQR { bool operator== (const ConstMatView& rhs) const { return extent(0) == rhs.extent(0) && extent(1) == rhs.extent(1) && - lda() == rhs.lda() && data() == rhs.data(); + stride(1) == rhs.stride(1) && data() == rhs.data(); } bool operator!= (const ConstMatView& rhs) const { return extent(0) != rhs.extent(0) || extent(1) != rhs.extent(1) || - lda() != rhs.lda() || data() != rhs.data(); + stride(1) != rhs.stride(1) || data() != rhs.data(); } private: @@ -547,7 +544,7 @@ 
namespace TSQR { using ordinal_type = typename MatView::ordinal_type; const ordinal_type num_rows = tgt.extent(0); const ordinal_type num_cols = tgt.extent(1); - const ordinal_type stride = tgt.lda(); + const ordinal_type stride = tgt.stride(1); auto* tgt_j = tgt.data(); for (ordinal_type j = 0; j < num_cols; ++j, tgt_j += stride) { for (ordinal_type i = 0; i < num_rows; ++i) { diff --git a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp index 5b94a6c71afe..42789bad14a8 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp @@ -186,8 +186,8 @@ namespace TSQR { A_ (verified_alloc_size (in.extent(0), in.extent(1))) { if (! in.empty()) { - copy_matrix (extent(0), extent(1), data(), lda(), - in.data(), in.lda()); + copy_matrix (extent(0), extent(1), data(), stride(1), + in.data(), in.stride(1)); } } @@ -198,8 +198,8 @@ namespace TSQR { /// /// This constructor allocates a new matrix and copies the /// elements of the input view into the resulting new matrix. - /// MatrixViewType must have extent(0), extent(1), data(), and lda() - /// methods that match MatView's methods. + /// MatrixViewType must have extent(0), extent(1), data(), and + /// stride(1) methods that match MatView's methods. template Matrix (const MatrixViewType& in) : nrows_ (in.extent(0)), @@ -207,8 +207,8 @@ namespace TSQR { A_ (verified_alloc_size (in.extent(0), in.extent(1))) { if (A_.size() != 0) { - copy_matrix (extent(0), extent(1), data(), lda(), - in.data(), in.lda()); + copy_matrix (extent(0), extent(1), data(), stride(1), + in.data(), in.stride(1)); } } @@ -217,7 +217,7 @@ namespace TSQR { /// \param i [in] Zero-based row index of the matrix. /// \param j [in] Zero-based column index of the matrix. scalar_type& operator() (const ordinal_type i, const ordinal_type j) { - return A_[i + j*lda()]; + return A_[i + j*stride(1)]; } /// \brief Const reference to element (i,j) of the matrix. 
@@ -225,7 +225,7 @@ namespace TSQR { /// \param i [in] Zero-based row index of the matrix. /// \param j [in] Zero-based column index of the matrix. const scalar_type& operator() (const ordinal_type i, const ordinal_type j) const { - return A_[i + j*lda()]; + return A_[i + j*stride(1)]; } //! 1-D std::vector - style access. @@ -238,7 +238,7 @@ namespace TSQR { bool operator== (const MatrixViewType& B) const { if (data() != B.data() || extent(0) != B.extent(0) || - extent(1) != B.extent(1) || lda() != B.lda()) { + extent(1) != B.extent(1) || stride(1) != B.stride(1)) { return false; } else { return true; @@ -253,8 +253,6 @@ namespace TSQR { return r == 0 ? ordinal_type(1) : (r == 1 ? nrows_ : ordinal_type(0)); } - constexpr ordinal_type lda() const noexcept { return stride(1); } - //! Whether the matrix is empty (has either zero rows or zero columns). bool empty() const { return extent(0) == 0 || extent(1) == 0; } @@ -272,13 +270,13 @@ namespace TSQR { //! A non-const view of the matrix. mat_view_type view () { - return mat_view_type (extent(0), extent(1), data(), lda()); + return mat_view_type (extent(0), extent(1), data(), stride(1)); } //! A const view of the matrix. const_mat_view_type const_view () const { return const_mat_view_type (extent(0), extent(1), - const_cast (data()), lda()); + const_cast (data()), stride(1)); } /// Change the dimensions of the matrix. Reallocate if necessary. 
@@ -321,7 +319,7 @@ namespace TSQR { deep_copy (Matrix& tgt, const SourceScalar& src) { MatView tgt_view (tgt.extent(0), tgt.extent(1), - tgt.data(), tgt.lda()); + tgt.data(), tgt.stride(1)); deep_copy (tgt_view, src); } @@ -334,7 +332,7 @@ namespace TSQR { { using mat_view_type = MatView; mat_view_type tgt_view (tgt.extent(0), tgt.extent(1), - tgt.data(), tgt.lda()); + tgt.data(), tgt.stride(1)); deep_copy (tgt_view, src); } } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp index 2c2ad7a442dd..0114982140a5 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp @@ -103,8 +103,8 @@ namespace TSQR { // Factor the (copy of the) matrix. On output, the explicit Q // factor (of A_local) is in Q_local and the R factor is in R. orthogonalizer.mgs (Q_local.extent(0), Q_local.extent(1), - Q_local.data(), Q_local.lda(), - R.data(), R.lda()); + Q_local.data(), Q_local.stride(1), + R.data(), R.stride(1)); if (b_debug) { messenger->barrier(); if (messenger->rank() == 0) @@ -211,7 +211,7 @@ namespace TSQR { scalarComm->barrier (); if (my_rank == 0) { cerr << endl << "R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.data(), R.lda()); + print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1)); cerr << endl; } scalarComm->barrier (); @@ -219,8 +219,8 @@ namespace TSQR { // Test accuracy of the resulting factorization std::vector results = - global_verify (nrows_local, ncols, A_local.data(), A_local.lda(), - Q_local.data(), Q_local.lda(), R.data(), R.lda(), + global_verify (nrows_local, ncols, A_local.data(), A_local.stride(1), + Q_local.data(), Q_local.stride(1), R.data(), R.stride(1), scalarComm.get()); if (b_debug) { scalarComm->barrier(); @@ -295,7 +295,7 @@ namespace TSQR { // benchmark, not a numerical verification test. (We have the // latter implemented as mgs_verify() in this file.) 
orthogonalizer.mgs (nrows_local, ncols, Q_local.data(), - Q_local.lda(), R.data(), R.lda()); + Q_local.stride(1), R.data(), R.stride(1)); // Timings in debug mode likely won't make sense, because Proc // 0 is outputting the debug messages to cerr. Nevertheless, // we don't put any "if(b_debug)" calls in the timing loop. diff --git a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp index 85cdb46fb8e1..f27745ad7c07 100644 --- a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp @@ -324,7 +324,7 @@ namespace TSQR { /// \brief Return view of topmost cache block of C /// /// \param C [in] Matrix (view), supporting the usual extent(0), - /// extent(1), data(), lda() interface. + /// extent(1), data(), stride(1) interface. /// \param contiguousCacheBlocks [in] Whether the cache blocks /// in C are stored contiguously. /// @@ -335,7 +335,7 @@ namespace TSQR { /// follows: /// \code /// MatrixViewType top = this->top_block (C, contig); - /// mat_view_type square (ncols, ncols, top.data(), top.lda()); + /// mat_view_type square (ncols, ncols, top.data(), top.stride(1)); /// \endcode virtual const_mat_view_type const_top_block (const const_mat_view_type& C, @@ -355,12 +355,12 @@ namespace TSQR { /// Tsqr::apply() need, do the following: /// \code /// MatrixViewType top = this->top_block (C, contig); - /// mat_view_type square (ncols, ncols, top.data(), top.lda()); + /// mat_view_type square (ncols, ncols, top.data(), top.stride(1)); /// \endcode /// /// Models for MatrixViewType are MatView and ConstMatView. /// MatrixViewType must have member functions extent(0), extent(1), - /// data(), and lda(), and its constructor must take the same four + /// data(), and stride(1), and its constructor must take the same four /// arguments as the constructor of ConstMatView. template MatrixViewType @@ -372,7 +372,7 @@ namespace TSQR { // method. 
The only cast from const to nonconst may be in the // return value, but there it's legitimate since we're just // using the same constness as C has. - const_mat_view_type C_view (C.extent(0), C.extent(1), C.data(), C.lda()); + const_mat_view_type C_view (C.extent(0), C.extent(1), C.data(), C.stride(1)); const_mat_view_type C_top = const_top_block (C_view, contiguous_cache_blocks); TEUCHOS_TEST_FOR_EXCEPTION(C_top.extent(0) < C_top.extent(1), std::logic_error, @@ -384,7 +384,7 @@ namespace TSQR { using pointer = typename MatrixViewType::pointer; return MatrixViewType (C_top.extent(0), C_top.extent(1), const_cast (C_top.data()), - C_top.lda()); + C_top.stride(1)); } /// \brief Does factor() compute R with nonnegative diagonal? @@ -519,9 +519,9 @@ namespace TSQR { "developers."; Scalar svd_lwork_scalar {}; - lapack.GESVD ('A', 'A', ncols, ncols, B.data(), B.lda(), - singular_values.data(), U_view.data(), U_view.lda(), - VT.data(), VT.lda(), &svd_lwork_scalar, svd_lwork, + lapack.GESVD ('A', 'A', ncols, ncols, B.data(), B.stride(1), + singular_values.data(), U_view.data(), U_view.stride(1), + VT.data(), VT.stride(1), &svd_lwork_scalar, svd_lwork, svd_rwork.data()); // LAPACK returns the workspace array length as a Scalar. We // have to convert it back to an Ordinal in order to allocate @@ -561,9 +561,9 @@ namespace TSQR { // Compute SVD $B := U \Sigma V^*$. B is overwritten, which is // why we copied R into B (so that we don't overwrite R if R is // full rank). 
- lapack.GESVD ('A', 'A', ncols, ncols, B.data(), B.lda(), - singular_values.data(), U_view.data(), U_view.lda(), - VT.data(), VT.lda(), svd_work.data(), svd_lwork, + lapack.GESVD ('A', 'A', ncols, ncols, B.data(), B.stride(1), + singular_values.data(), U_view.data(), U_view.stride(1), + VT.data(), VT.stride(1), svd_work.data(), svd_lwork, svd_rwork.data()); // // Compute the numerical rank of B, using the given relative @@ -633,14 +633,14 @@ namespace TSQR { // Compute numerical rank of the R factor using the SVD. // Store the left singular vectors in U. const Ordinal rank = - reveal_R_rank (ncols, R, ldr, U.data(), U.ldu(), tol); + reveal_R_rank (ncols, R, ldr, U.data(), U.stride(1), tol); // If R is full rank, we're done. Otherwise, reveal_R_rank() // already computed the SVD \f$R = U \Sigma V^*\f$ of (the // input) R, and overwrote R with \f$\Sigma V^*\f$. Now, we // compute \f$Q := Q \cdot U\f$, respecting cache blocks of Q. if (rank < ncols) { - Q_times_B (nrows, ncols, Q, ldq, U.data(), U.lda(), + Q_times_B (nrows, ncols, Q, ldq, U.data(), U.stride(1), contiguousCacheBlocks); } return rank; diff --git a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp b/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp index 040818017815..530dba578814 100644 --- a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp @@ -235,7 +235,7 @@ namespace TSQR { err_ << "-- Finished DistTsqr::factor" << endl; } // Compute the explicit Q factor - par.explicit_Q (numCols, Q_local.data(), Q_local.lda(), factorOutput); + par.explicit_Q (numCols, Q_local.data(), Q_local.stride(1), factorOutput); if (debug_) { scalarComm_->barrier(); if (myRank == 0) { @@ -244,8 +244,8 @@ namespace TSQR { } // Verify the factorization result_type result = - global_verify (numCols, numCols, A_local.data(), A_local.lda(), - Q_local.data(), Q_local.lda(), R.data(), R.lda(), + global_verify (numCols, numCols, A_local.data(), A_local.stride(1), + Q_local.data(), Q_local.stride(1), R.data(), 
R.stride(1), scalarComm_.get()); if (debug_) { scalarComm_->barrier(); @@ -279,15 +279,15 @@ namespace TSQR { printGlobalMatrix (err_, Q_local, scalarComm_.get(), ordinalComm_.get()); if (myRank == 0) { err_ << std::endl << "Computed R factor:" << std::endl; - print_local_matrix (err_, R.extent(0), R.extent(1), R.data(), R.lda()); + print_local_matrix (err_, R.extent(0), R.extent(1), R.data(), R.stride(1)); err_ << std::endl; } } // Verify the factorization result_type result = - global_verify (numCols, numCols, A_local.data(), A_local.lda(), - Q_local.data(), Q_local.lda(), R.data(), R.lda(), + global_verify (numCols, numCols, A_local.data(), A_local.stride(1), + Q_local.data(), Q_local.stride(1), R.data(), R.stride(1), scalarComm_.get()); if (debug_) { scalarComm_->barrier(); @@ -574,7 +574,7 @@ namespace TSQR { // overwritten on output) factor_output_type factorOutput = par.factor (R.view()); // Compute the explicit Q factor - par.explicit_Q (numCols, Q_local.data(), Q_local.lda(), factorOutput); + par.explicit_Q (numCols, Q_local.data(), Q_local.stride(1), factorOutput); } // Now do the actual timing runs. Benchmark DistTsqr @@ -587,7 +587,7 @@ namespace TSQR { // overwritten on output) factor_output_type factorOutput = par.factor (R.view()); // Compute the explicit Q factor - par.explicit_Q (numCols, Q_local.data(), Q_local.lda(), factorOutput); + par.explicit_Q (numCols, Q_local.data(), Q_local.stride(1), factorOutput); } // Cumulative timing on this MPI process. // "Cumulative" means the elapsed time of numTrials executions. diff --git a/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp b/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp index 1f6035759d3d..60b043d09e9e 100644 --- a/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp @@ -194,7 +194,8 @@ namespace TSQR { const ordinal_type ncols = R_stack.extent(1); // Copy data from top ncols x ncols block of R_stack into R_local. 
- const_view_type R_stack_view_first (ncols, ncols, R_stack.data(), R_stack.lda()); + const_view_type R_stack_view_first (ncols, ncols, R_stack.data(), + R_stack.stride(1)); deep_copy (R_local, R_stack_view_first); // Loop through all other processors, sending each the next @@ -202,7 +203,7 @@ namespace TSQR { RMessenger< ordinal_type, scalar_type > sender (messenger); for (int destProc = 1; destProc < nprocs; ++destProc) { const scalar_type* const R_ptr = R_stack.data() + destProc*ncols; - const_view_type R_stack_view_cur (ncols, ncols, R_ptr, R_stack.lda()); + const_view_type R_stack_view_cur (ncols, ncols, R_ptr, R_stack.stride(1)); sender.send (R_stack_view_cur, destProc); } } @@ -234,14 +235,16 @@ namespace TSQR { const ordinal_type ncols = R_stack.extent(1); // Copy data from R_local into top ncols x ncols block of R_stack. - mat_view_type R_stack_view_first (ncols, ncols, R_stack.data(), R_stack.lda()); + mat_view_type R_stack_view_first (ncols, ncols, R_stack.data(), + R_stack.stride(1)); deep_copy (R_stack_view_first, R_local); // Loop through all other processors, fetching their matrix data. RMessenger< ordinal_type, scalar_type > receiver (messenger); for (int srcProc = 1; srcProc < nprocs; ++srcProc) { const scalar_type* const R_ptr = R_stack.data() + srcProc*ncols; - mat_view_type R_stack_view_cur (ncols, ncols, R_ptr, R_stack.lda()); + mat_view_type R_stack_view_cur (ncols, ncols, R_ptr, + R_stack.stride(1)); // Fill (the lower triangle) with zeros, since // RMessenger::recv() only writes to the upper triangle. 
deep_copy (R_stack_view_cur, scalar_type {}); diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp index 2b0f075aad5b..75db87ce86aa 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp @@ -63,7 +63,7 @@ namespace TSQR { const ordinal_type nrows = A.extent(0); const ordinal_type ncols = A.extent(1); - const ordinal_type lda = A.lda(); + const ordinal_type lda = A.stride(1); if (nrows == lda) { // A is stored contiguously. const ordinal_type nelts = nrows * ncols; @@ -120,7 +120,7 @@ namespace TSQR { // Generate a random ncols by ncols upper triangular matrix R // with the given singular values. Matrix R (ncols, ncols, scalar_type {}); - matGen.fill_random_R (ncols, R.data(), R.lda(), singular_values); + matGen.fill_random_R (ncols, R.data(), R.stride(1), singular_values); // Broadcast R to all the processors. scalarMessenger->broadcast (R.data(), ncols*ncols, rootProc); @@ -128,7 +128,7 @@ namespace TSQR { // Generate (for myself) a random nrowsLocal x ncols // orthogonal matrix, stored in explicit form. 
Matrix Q_local (nrowsLocal, ncols); - matGen.explicit_Q (nrowsLocal, ncols, Q_local.data(), Q_local.lda()); + matGen.explicit_Q (nrowsLocal, ncols, Q_local.data(), Q_local.stride(1)); // Scale the (local) orthogonal matrix by the number of // processors P, to make the columns of the global matrix Q @@ -147,9 +147,9 @@ namespace TSQR { // A_local := Q_local * R blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols, - scalar_type(1), Q_local.data(), Q_local.lda(), - R.data(), R.lda(), - scalar_type(0), A_local.data(), A_local.lda()); + scalar_type(1), Q_local.data(), Q_local.stride(1), + R.data(), R.stride(1), + scalar_type(0), A_local.data(), A_local.stride(1)); for (int recvProc = 1; recvProc < nprocs; ++recvProc) { // Ask the receiving processor how big (i.e., how many rows) @@ -163,7 +163,7 @@ namespace TSQR { // Compute a random nrowsRemote * ncols orthogonal // matrix Q_local, for the current receiving processor. - matGen.explicit_Q (nrowsRemote, ncols, Q_local.data(), Q_local.lda()); + matGen.explicit_Q (nrowsRemote, ncols, Q_local.data(), Q_local.stride(1)); // Send Q_local to the current receiving processor. 
scalarMessenger->send (Q_local.data(), nrowsRemote*ncols, recvProc, 0); @@ -201,9 +201,9 @@ namespace TSQR { // A_local := Q_local * R blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols, - scalar_type(1), Q_local.data(), Q_local.lda(), - R.data(), R.lda(), - scalar_type(0), A_local.data(), A_local.lda()); + scalar_type(1), Q_local.data(), Q_local.stride(1), + R.data(), R.stride(1), + scalar_type(0), A_local.data(), A_local.stride(1)); } } } // namespace Random diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp index e2293cc2b21a..513f7816a090 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp @@ -160,7 +160,7 @@ namespace TSQR { implicit_Q (MatrixViewType& Q, typename MatrixViewType::scalar_type tau[]) { - implicit_Q (Q.extent(0), Q.extent(1), Q.data(), Q.lda(), tau); + implicit_Q (Q.extent(0), Q.extent(1), Q.data(), Q.stride(1), tau); } void @@ -195,16 +195,16 @@ namespace TSQR { Scalar _lwork1, _lwork2; Impl::Lapack lapack; lapack.apply_Q_factor ('L', 'N', nrows, ncols, ncols, - U.data(), U.lda(), tau_U.data(), + U.data(), U.stride(1), tau_U.data(), A, lda, &_lwork1, -1); if (STS::isComplex) { lapack.apply_Q_factor ('R', 'C', nrows, ncols, ncols, - V.data(), V.lda(), tau_V.data(), + V.data(), V.stride(1), tau_V.data(), A, lda, &_lwork2, -1); } else { lapack.apply_Q_factor ('R', 'T', nrows, ncols, ncols, - V.data(), V.lda(), tau_V.data(), + V.data(), V.stride(1), tau_V.data(), A, lda, &_lwork2, -1); } @@ -215,16 +215,16 @@ namespace TSQR { // Apply U to the left side of A, and V^H to the right side of A. 
lapack.apply_Q_factor ('L', 'N', nrows, ncols, ncols, - U.data(), U.lda(), tau_U.data(), + U.data(), U.stride(1), tau_U.data(), A, lda, work.data(), lwork); if (STS::isComplex) { lapack.apply_Q_factor ('R', 'C', nrows, ncols, ncols, - V.data(), V.lda(), tau_V.data(), + V.data(), V.stride(1), tau_V.data(), A, lda, work.data(), lwork); } else { lapack.apply_Q_factor ('R', 'T', nrows, ncols, ncols, - V.data(), V.lda(), tau_V.data(), + V.data(), V.stride(1), tau_V.data(), A, lda, work.data(), lwork); } } diff --git a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp b/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp index 32d907da714b..727c50019482 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp @@ -167,7 +167,7 @@ namespace TSQR { const Ordinal ldr = ncols; // Create a test problem - nodeTestProblem (generator, nrows, ncols, A.data(), A.lda(), true); + nodeTestProblem (generator, nrows, ncols, A.data(), A.stride(1), true); if (save_matrices) { string filename = "A_" + shortDatatype + ".txt"; @@ -175,7 +175,7 @@ namespace TSQR { cerr << "-- Saving test problem to \"" << filename << "\"" << endl; } std::ofstream fileOut (filename.c_str()); - print_local_matrix (fileOut, nrows, ncols, A.data(), A.lda()); + print_local_matrix (fileOut, nrows, ncols, A.data(), A.stride(1)); fileOut.close(); } @@ -193,7 +193,7 @@ namespace TSQR { } } else { - actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.lda()); + actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.stride(1)); if (b_debug) { cerr << "-- Reorganized test matrix to have contiguous " "cache blocks" << endl; @@ -205,7 +205,7 @@ namespace TSQR { if (std::numeric_limits::has_quiet_NaN) { deep_copy (A2, std::numeric_limits::quiet_NaN ()); } - actor.un_cache_block (nrows, ncols, A2.data (), A2.lda (), + actor.un_cache_block (nrows, ncols, A2.data (), A2.stride (1), A_copy.data ()); if (matrix_equal (A, A2)) { if (b_debug) { @@ -226,7 +226,7 @@ namespace TSQR { // This is 
only for diagnostic purposes. numCacheBlocks = actor.factor_num_cache_blocks (nrows, ncols, A_copy.data(), - A_copy.lda(), contiguous_cache_blocks); + A_copy.stride(1), contiguous_cache_blocks); // In debug mode, report how many cache blocks factor() will use. if (b_debug) { cerr << "-- Number of cache blocks factor() will use: " @@ -237,8 +237,8 @@ namespace TSQR { typedef typename SequentialTsqr::FactorOutput factor_output_type; factor_output_type factorOutput = - actor.factor (nrows, ncols, A_copy.data(), A_copy.lda(), - R.data(), R.lda(), contiguous_cache_blocks); + actor.factor (nrows, ncols, A_copy.data(), A_copy.stride(1), + R.data(), R.stride(1), contiguous_cache_blocks); if (b_debug) { cerr << "-- Finished SequentialTsqr::factor" << endl; } @@ -248,12 +248,12 @@ namespace TSQR { cerr << "-- Saving R factor to \"" << filename << "\"" << endl; } std::ofstream fileOut (filename.c_str ()); - print_local_matrix (fileOut, ncols, ncols, R.data (), R.lda ()); + print_local_matrix (fileOut, ncols, ncols, R.data (), R.stride (1)); fileOut.close (); } actor.explicit_Q (nrows, ncols, A_copy.data(), lda, factorOutput, - ncols, Q.data(), Q.lda(), contiguous_cache_blocks); + ncols, Q.data(), Q.stride(1), contiguous_cache_blocks); if (b_debug) { cerr << "-- Finished SequentialTsqr::explicit_Q" << endl; } @@ -263,7 +263,7 @@ namespace TSQR { // currently support contiguous cache blocks. if (contiguous_cache_blocks) { // Use A_copy as temporary storage for un-cache-blocking Q. 
- actor.un_cache_block (nrows, ncols, A_copy.data(), A_copy.lda(), Q.data()); + actor.un_cache_block (nrows, ncols, A_copy.data(), A_copy.stride(1), Q.data()); deep_copy (Q, A_copy); if (b_debug) { cerr << "-- Un-cache-blocked output Q factor" << endl; @@ -276,14 +276,14 @@ namespace TSQR { cerr << "-- Saving Q factor to \"" << filename << "\"" << endl; } std::ofstream fileOut (filename.c_str()); - print_local_matrix (fileOut, nrows, ncols, Q.data(), Q.lda()); + print_local_matrix (fileOut, nrows, ncols, Q.data(), Q.stride(1)); fileOut.close(); } // Print out the R factor if (false && b_debug) { cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.data(), R.lda()); + print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1)); cerr << endl; } @@ -466,8 +466,8 @@ namespace TSQR { const Ordinal ldr = ncols; // Create a test problem - nodeTestProblem (generator, nrows, ncols, A.data (), A.lda (), true); - + nodeTestProblem (generator, nrows, ncols, + A.data (), A.stride (1), true); if (b_debug) { cerr << "-- Generated test problem" << endl; } @@ -480,7 +480,7 @@ namespace TSQR { // Now determine the required workspace for the factorization. const Ordinal lwork = - lworkQueryLapackQr (lapack, nrows, ncols, A_copy.lda ()); + lworkQueryLapackQr (lapack, nrows, ncols, A_copy.stride (1)); std::vector work (lwork); std::vector tau (ncols); @@ -488,7 +488,7 @@ namespace TSQR { // the strict lower triangle of R. deep_copy (R, Scalar {}); - lapack.compute_QR (nrows, ncols, A_copy.data(), A_copy.lda(), + lapack.compute_QR (nrows, ncols, A_copy.data(), A_copy.stride(1), tau.data(), work.data(), lwork); // Copy out the R factor from A_copy (where we computed the QR // factorization in place) into R. 
@@ -496,7 +496,7 @@ namespace TSQR { if (b_debug) { cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.data(), R.lda()); + print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1)); cerr << endl; } @@ -932,11 +932,9 @@ namespace TSQR { timer.start(); for (int trialNum = 0; trialNum < numTrials; ++trialNum) { // Factor the matrix and extract the resulting R factor - typedef typename SequentialTsqr::FactorOutput - factor_output_type; - factor_output_type factorOutput = + auto factorOutput = actor.factor (numRows, numCols, A_copy.data(), lda, - R.data(), R.lda(), contiguousCacheBlocks); + R.data(), R.stride(1), contiguousCacheBlocks); // Compute the explicit Q factor. Unlike with LAPACK QR, // this doesn't happen in place: the implicit Q factor is // stored in A_copy, and the explicit Q factor is written to diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp index e0a09da84780..6cfc31e84e05 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp @@ -144,8 +144,8 @@ namespace TSQR { // // FIXME (mfh 08 Oct 2014) Shouldn't this be CONJ_TRANS? blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, A_cur.extent (0), - Scalar (1), A_cur.data (), A_cur.lda (), A_cur.data (), - A_cur.lda (), Scalar (0), ATA.data (), ATA.lda ()); + Scalar (1), A_cur.data (), A_cur.stride (1), A_cur.data (), + A_cur.stride (1), Scalar (0), ATA.data (), ATA.stride (1)); // Process the remaining cache blocks in order. while (! A_rest.empty ()) { A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); @@ -153,8 +153,8 @@ namespace TSQR { // // FIXME (mfh 08 Oct 2014) Shouldn't this be CONJ_TRANS? 
blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, A_cur.extent (0), - Scalar (1), A_cur.data (), A_cur.lda (), A_cur.data (), - A_cur.lda (), Scalar (1), ATA.data (), ATA.lda ()); + Scalar (1), A_cur.data (), A_cur.stride (1), A_cur.data (), + A_cur.stride (1), Scalar (1), ATA.data (), ATA.stride (1)); } } else { @@ -163,12 +163,12 @@ namespace TSQR { // FIXME (mfh 08 Oct 2014) Shouldn't this be CONJ_TRANS? blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, nrows, Scalar (1), A, lda, A, lda, - Scalar (0), ATA.data (), ATA.lda ()); + Scalar (0), ATA.data (), ATA.stride (1)); } // Compute the Cholesky factorization of ATA in place, so that // A^T * A = R^T * R, where R is ncols x ncols upper triangular. - lapack.POTRF ('U', ncols, ATA.data(), ATA.lda()); + lapack.POTRF ('U', ncols, ATA.data(), ATA.stride(1)); // FIXME (mfh 22 June 2010, mfh 21 Nov 2019) The right thing to // do on failure of above would be to resort to a rank-revealing // factorization, as Stathopoulos and Wu (2002) do with their @@ -176,7 +176,7 @@ namespace TSQR { // Copy out the R factor fill_matrix (ncols, ncols, R, ldr, Scalar {}); - copy_upper_triangle (ncols, ncols, R, ldr, ATA.data(), ATA.lda()); + copy_upper_triangle (ncols, ncols, R, ldr, ATA.data(), ATA.stride(1)); // Compute A := A * R^{-1}. We do this in place in A, using // BLAS' TRSM with the R factor (form POTRF) stored in the upper @@ -194,15 +194,17 @@ namespace TSQR { // Compute A_cur / R (Matlab notation for A_cur * R^{-1}) in place. blas.TRSM (RIGHT_SIDE, UPPER_TRI, NO_TRANS, NON_UNIT_DIAG, - A_cur.extent (0), ncols, Scalar (1), ATA.data (), ATA.lda (), - A_cur.data (), A_cur.lda ()); + A_cur.extent (0), ncols, + Scalar (1.0), ATA.data (), ATA.stride (1), + A_cur.data (), A_cur.stride (1)); // Process the remaining cache blocks in order. while (! 
A_rest.empty ()) { A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); blas.TRSM (RIGHT_SIDE, UPPER_TRI, NO_TRANS, NON_UNIT_DIAG, - A_cur.extent (0), ncols, Scalar (1), ATA.data (), ATA.lda (), - A_cur.data (), A_cur.lda ()); + A_cur.extent (0), ncols, + Scalar (1.0), ATA.data (), ATA.stride (1), + A_cur.data (), A_cur.stride (1)); } } @@ -294,7 +296,7 @@ namespace TSQR { /// must have at least as many rows as columns. For a square /// ncols by ncols block, as needed in TSQR::Tsqr::apply(), if /// the output is ret, do mat_view_type(ncols, ncols, ret.data(), - /// ret.lda()) to get an ncols by ncols block. + /// ret.stride(1)) to get an ncols by ncols block. template< class MatrixViewType > MatrixViewType top_block (const MatrixViewType& C, diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp index 6754e9011a10..f057fc6d0a38 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp @@ -159,9 +159,9 @@ namespace TSQR { std::vector& work) const { const LocalOrdinal ncols = A_top.extent(1); - combine.factor_first (A_top.extent(0), ncols, A_top.data(), A_top.lda(), + combine.factor_first (A_top.extent(0), ncols, A_top.data(), A_top.stride(1), tau.data(), work.data()); - return mat_view_type(ncols, ncols, A_top.data(), A_top.lda()); + return mat_view_type(ncols, ncols, A_top.data(), A_top.stride(1)); } /// Apply the Q factor of the first (topmost) cache blocks, as @@ -178,8 +178,8 @@ namespace TSQR { { const LocalOrdinal nrowsLocal = Q_first.extent(0); combine.apply_first (applyType, nrowsLocal, C_first.extent(1), - Q_first.extent(1), Q_first.data(), Q_first.lda(), - tau.data(), C_first.data(), C_first.lda(), work.data()); + Q_first.extent(1), Q_first.data(), Q_first.stride(1), + tau.data(), C_first.data(), C_first.stride(1), work.data()); } void @@ -197,9 +197,9 @@ namespace TSQR { combine.apply_inner (apply_type, nrows_local, ncols_C, 
ncols_Q, - Q_cur.data(), C_cur.lda(), tau.data(), - C_top.data(), C_top.lda(), - C_cur.data(), C_cur.lda(), work.data()); + Q_cur.data(), C_cur.stride(1), tau.data(), + C_top.data(), C_top.stride(1), + C_cur.data(), C_cur.stride(1), work.data()); } void @@ -212,8 +212,8 @@ namespace TSQR { const LocalOrdinal nrows_local = A_cur.extent(0); const LocalOrdinal ncols = A_cur.extent(1); - combine.factor_inner (nrows_local, ncols, R.data(), R.lda(), - A_cur.data(), A_cur.lda(), tau.data(), + combine.factor_inner (nrows_local, ncols, R.data(), R.stride(1), + A_cur.data(), A_cur.stride(1), tau.data(), work.data()); } @@ -487,7 +487,7 @@ namespace TSQR { fill_matrix (ncols, ncols, R, ldr, Teuchos::ScalarTraits::zero()); // Copy out the upper triangle of the R factor from A into R. - copy_upper_triangle (ncols, ncols, R, ldr, A_top.data(), A_top.lda()); + copy_upper_triangle (ncols, ncols, R, ldr, A_top.data(), A_top.stride(1)); } /// \brief Compute the QR factorization of the matrix A. @@ -541,7 +541,7 @@ namespace TSQR { // R_view (a view of the topmost cache block of A) into the R // output argument. fill_matrix (ncols, ncols, R, ldr, Scalar(0)); - copy_upper_triangle (ncols, ncols, R, ldr, R_view.data(), R_view.lda()); + copy_upper_triangle (ncols, ncols, R, ldr, R_view.data(), R_view.stride(1)); return tau_arrays; } @@ -762,8 +762,8 @@ namespace TSQR { deep_copy (Q_cur_copy, Q_cur); // Q_cur := Q_cur_copy * B. 
blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.extent (0), ncols, ncols, - Scalar (1.0), Q_cur_copy.data (), Q_cur_copy.lda (), - B, ldb, Scalar {}, Q_cur.data (), Q_cur.lda ()); + Scalar (1.0), Q_cur_copy.data (), Q_cur_copy.stride (1), + B, ldb, Scalar {}, Q_cur.data (), Q_cur.stride (1)); } } diff --git a/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp index 2d333e266b81..ad86d8c3d206 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp @@ -121,7 +121,7 @@ namespace TSQR { const Ordinal ldr = ncols; // Create a test problem - nodeTestProblem (generator, nrows, ncols, A.data(), A.lda(), true); + nodeTestProblem (generator, nrows, ncols, A.data(), A.stride(1), true); if (b_debug) { cerr << "-- Generated test problem" << endl; @@ -137,7 +137,7 @@ namespace TSQR { } } else { - actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.lda()); + actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.stride(1)); if (b_debug) { cerr << "-- Reorganized test matrix to have contiguous " "cache blocks" << endl; @@ -148,7 +148,7 @@ namespace TSQR { if (std::numeric_limits< Scalar >::has_quiet_NaN) { deep_copy (A2, std::numeric_limits< Scalar >::quiet_NaN()); } - actor.un_cache_block (nrows, ncols, A2.data(), A2.lda(), A_copy.data()); + actor.un_cache_block (nrows, ncols, A2.data(), A2.stride(1), A_copy.data()); if (matrix_equal (A, A2)) { if (b_debug) { cerr << "-- Cache blocking test succeeded!" 
<< endl; @@ -166,13 +166,13 @@ namespace TSQR { // Factor the matrix and compute the explicit Q factor factor_output_type factor_output = - actor.factor (nrows, ncols, A_copy.data(), A_copy.lda(), R.data(), - R.lda(), contiguous_cache_blocks); + actor.factor (nrows, ncols, A_copy.data(), A_copy.stride(1), R.data(), + R.stride(1), contiguous_cache_blocks); if (b_debug) { cerr << "-- Finished TbbTsqr::factor" << endl; } - actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.lda(), factor_output, - ncols, Q.data(), Q.lda(), contiguous_cache_blocks); + actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.stride(1), factor_output, + ncols, Q.data(), Q.stride(1), contiguous_cache_blocks); if (b_debug) { cerr << "-- Finished TbbTsqr::explicit_Q" << endl; } @@ -183,7 +183,7 @@ namespace TSQR { // cache blocks. if (contiguous_cache_blocks) { // Use A_copy as temporary storage for un-cache-blocking Q. - actor.un_cache_block (nrows, ncols, A_copy.data(), A_copy.lda(), Q.data()); + actor.un_cache_block (nrows, ncols, A_copy.data(), A_copy.stride(1), Q.data()); deep_copy (Q, A_copy); if (b_debug) { cerr << "-- Un-cache-blocked output Q factor" << endl; @@ -193,7 +193,7 @@ namespace TSQR { // Print out the R factor if (b_debug) { cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.data(), R.lda()); + print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1)); cerr << endl; } @@ -298,13 +298,13 @@ namespace TSQR { deep_copy (R, scalar_type {}); // Create a test problem - nodeTestProblem (generator, nrows, ncols, A.data(), A.lda(), false); + nodeTestProblem (generator, nrows, ncols, A.data(), A.stride(1), false); // Copy A into A_copy, since TSQR overwrites the input. If // specified, rearrange the data in A_copy so that the data in // each cache block is contiguously stored. 
if (contiguous_cache_blocks) { - actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.lda()); + actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.stride(1)); } else { deep_copy (A_copy, A); @@ -318,14 +318,14 @@ namespace TSQR { // resulting R factor into R. typedef typename node_tsqr_type::FactorOutput factor_output_type; factor_output_type factor_output = - actor.factor (nrows, ncols, A_copy.data(), A_copy.lda(), - R.data(), R.lda(), contiguous_cache_blocks); + actor.factor (nrows, ncols, A_copy.data(), A_copy.stride(1), + R.data(), R.stride(1), contiguous_cache_blocks); // Compute the explicit Q factor (which was stored // implicitly in A_copy and factor_output) and store in Q. // We don't need to un-cache-block the output, because we // aren't verifying it here. - actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.lda(), - factor_output, ncols, Q.data(), Q.lda(), + actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.stride(1), + factor_output, ncols, Q.data(), Q.stride(1), contiguous_cache_blocks); } @@ -339,14 +339,14 @@ namespace TSQR { // resulting R factor into R. typedef typename node_tsqr_type::FactorOutput factor_output_type; factor_output_type factor_output = - actor.factor (nrows, ncols, A_copy.data(), A_copy.lda(), - R.data(), R.lda(), contiguous_cache_blocks); + actor.factor (nrows, ncols, A_copy.data(), A_copy.stride(1), + R.data(), R.stride(1), contiguous_cache_blocks); // Compute the explicit Q factor (which was stored // implicitly in A_copy and factor_output) and store in Q. // We don't need to un-cache-block the output, because we // aren't verifying it here. 
- actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.lda(), - factor_output, ncols, Q.data(), Q.lda(), + actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.stride(1), + factor_output, ncols, Q.data(), Q.stride(1), contiguous_cache_blocks); } const double tbb_tsqr_timing = timer.stop(); diff --git a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp index c52a485c79d5..fb6c532b3e54 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp @@ -86,7 +86,7 @@ namespace TSQR { // If specified, rearrange cache blocks in the copy. if (contiguousCacheBlocks) { tsqr.cache_block (nrows_local, ncols, A_copy.data(), - A_local.data(), A_local.lda()); + A_local.data(), A_local.stride(1)); if (b_debug) { scalarComm->barrier (); if (scalarComm->rank () == 0) @@ -110,8 +110,8 @@ namespace TSQR { else { // Factor the (copy of the) matrix. factor_output_type factorOutput = - tsqr.factor (nrows_local, ncols, A_copy.data(), A_copy.lda(), - R.data(), R.lda(), contiguousCacheBlocks); + tsqr.factor (nrows_local, ncols, A_copy.data(), A_copy.stride(1), + R.data(), R.stride(1), contiguousCacheBlocks); if (b_debug) { scalarComm->barrier (); if (scalarComm->rank () == 0) @@ -120,8 +120,8 @@ namespace TSQR { // Compute the explicit Q factor in Q_local tsqr.explicit_Q (nrows_local, - ncols, A_copy.data(), A_copy.lda(), factorOutput, - ncols, Q_local.data(), Q_local.lda(), + ncols, A_copy.data(), A_copy.stride(1), factorOutput, + ncols, Q_local.data(), Q_local.stride(1), contiguousCacheBlocks); if (b_debug) { scalarComm->barrier (); @@ -137,7 +137,7 @@ namespace TSQR { // We can use A_copy as scratch space for un-cache-blocking // Q_local, since we're done using A_copy for other things. tsqr.un_cache_block (nrows_local, ncols, A_copy.data(), - A_copy.lda(), Q_local.data()); + A_copy.stride(1), Q_local.data()); // Overwrite Q_local with the un-cache-blocked Q factor. 
deep_copy (Q_local, A_copy); @@ -321,7 +321,7 @@ namespace TSQR { scalarComm->barrier (); if (my_rank == 0) { cerr << endl << "R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.data(), R.lda()); + print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1)); cerr << endl; } scalarComm->barrier (); @@ -329,8 +329,8 @@ namespace TSQR { // Test accuracy of the resulting factorization std::vector< magnitude_type > results = - global_verify (nrows_local, ncols, A_local.data(), A_local.lda(), - Q_local.data(), Q_local.lda(), R.data(), R.lda(), + global_verify (nrows_local, ncols, A_local.data(), A_local.stride(1), + Q_local.data(), Q_local.stride(1), R.data(), R.stride(1), scalarComm.get()); if (b_debug) { scalarComm->barrier (); @@ -443,7 +443,7 @@ namespace TSQR { if (contiguousCacheBlocks) { tsqr.cache_block (nrows_local, ncols, A_copy.data(), - A_local.data(), A_local.lda()); + A_local.data(), A_local.stride(1)); if (b_debug) { messenger->barrier (); if (messenger->rank () == 0) { @@ -475,50 +475,46 @@ namespace TSQR { const bool testFactorExplicit = true; double tsqr_timing; - if (testFactorExplicit) - { - timer.start(); - for (int trial_num = 0; trial_num < ntrials; ++trial_num) - tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(), - contiguousCacheBlocks); - tsqr_timing = timer.stop(); - } - else - { - timer.start(); - for (int trial_num = 0; trial_num < ntrials; ++trial_num) - { - // Factor the matrix and compute the explicit Q factor. - // Don't worry about the fact that we're overwriting the - // input; this is a benchmark, not a numerical verification - // test. (We have the latter implemented as tsqr_verify() - // in this file.) For the same reason, don't worry about - // un-cache-blocking the output (when cache blocks are - // stored contiguously). 
- factor_output_type factor_output = - tsqr.factor (nrows_local, ncols, A_copy.data(), A_copy.lda(), - R.data(), R.lda(), contiguousCacheBlocks); - tsqr.explicit_Q (nrows_local, - ncols, A_copy.data(), A_copy.lda(), factor_output, - ncols, Q_local.data(), Q_local.lda(), + if (testFactorExplicit) { + timer.start(); + for (int trial_num = 0; trial_num < ntrials; ++trial_num) + tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(), contiguousCacheBlocks); - // Timings in debug mode likely won't make sense, because - // Proc 0 is outputting the debug messages to cerr. - // Nevertheless, we don't put any "if(b_debug)" calls in the - // timing loop. - } - // Compute the resulting total time (in seconds) to execute - // ntrials runs of Tsqr::factor() and Tsqr::explicit_Q(). The - // time may differ on different MPI processes. - tsqr_timing = timer.stop(); + tsqr_timing = timer.stop(); + } + else { + timer.start(); + for (int trial_num = 0; trial_num < ntrials; ++trial_num) { + // Factor the matrix and compute the explicit Q factor. + // Don't worry about the fact that we're overwriting the + // input; this is a benchmark, not a numerical verification + // test. (We have the latter implemented as tsqr_verify() + // in this file.) For the same reason, don't worry about + // un-cache-blocking the output (when cache blocks are + // stored contiguously). + factor_output_type factor_output = + tsqr.factor (nrows_local, ncols, A_copy.data(), A_copy.stride(1), + R.data(), R.stride(1), contiguousCacheBlocks); + tsqr.explicit_Q (nrows_local, + ncols, A_copy.data(), A_copy.stride(1), factor_output, + ncols, Q_local.data(), Q_local.stride(1), + contiguousCacheBlocks); + // Timings in debug mode likely won't make sense, because + // Proc 0 is outputting the debug messages to cerr. + // Nevertheless, we don't put any "if(b_debug)" calls in the + // timing loop. 
} + // Compute the resulting total time (in seconds) to execute + // ntrials runs of Tsqr::factor() and Tsqr::explicit_Q(). The + // time may differ on different MPI processes. + tsqr_timing = timer.stop(); + } - if (b_debug) - { - messenger->barrier(); - if (messenger->rank() == 0) - cerr << "-- Finished timing loop" << endl; - } + if (b_debug) { + messenger->barrier(); + if (messenger->rank() == 0) + cerr << "-- Finished timing loop" << endl; + } return tsqr_timing; } diff --git a/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp b/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp index e907b5fccf5c..c1caa7adfb9f 100644 --- a/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp @@ -86,7 +86,7 @@ namespace TSQR { // Print the remote matrix data // out << "Processor " << my_rank << ":" << endl; print_local_matrix (out, A_local.extent(0), A_local.extent(1), - A_local.data(), A_local.lda()); + A_local.data(), A_local.stride(1)); // Space for remote matrix data. Other processors are allowed // to have different nrows_local values; we make space as @@ -130,7 +130,7 @@ namespace TSQR { // Print the remote matrix data // out << "Processor " << proc << ":" << endl; - print_local_matrix (out, dims[0], dims[0], A_remote.data(), A_remote.lda()); + print_local_matrix (out, dims[0], dims[0], A_remote.data(), A_remote.stride(1)); } } else From e8fc0b606cb065c1a35894bad203c876467a35dc Mon Sep 17 00:00:00 2001 From: Alexander Heinlein Date: Tue, 26 Nov 2019 14:40:25 +0100 Subject: [PATCH 16/50] std::cerr for warnings. 
--- .../frosch/src/Adapters/Thyra_FROSchFactory_def.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/shylu/shylu_dd/frosch/src/Adapters/Thyra_FROSchFactory_def.hpp b/packages/shylu/shylu_dd/frosch/src/Adapters/Thyra_FROSchFactory_def.hpp index 7f461d67484c..3185596f2b91 100644 --- a/packages/shylu/shylu_dd/frosch/src/Adapters/Thyra_FROSchFactory_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/Adapters/Thyra_FROSchFactory_def.hpp @@ -422,7 +422,7 @@ namespace Thyra { repeatedMap = rcp_dynamic_cast(xTpetraRepeatedMap); } else { #ifdef HAVE_SHYLU_DDFROSCH_EPETRA - if (comm->getRank()==0) std::cout << "FROSch::FROSchFactory : WARNING: Cannot retrieve Epetra objects from ParameterList. Use Xpetra instead." << std::endl; + if (comm->getRank()==0) std::cerr << "FROSch::FROSchFactory : WARNING: Cannot retrieve Epetra objects from ParameterList. Use Xpetra instead." << std::endl; #endif } } @@ -447,7 +447,7 @@ namespace Thyra { } else { #ifdef HAVE_SHYLU_DDFROSCH_EPETRA if (comm->getRank()==0) { - std::cout << "FROSch::FROSchFactory : WARNING: Cannot retrieve Epetra objects from ParameterList. Use Xpetra instead." << std::endl; + std::cerr << "FROSch::FROSchFactory : WARNING: Cannot retrieve Epetra objects from ParameterList. Use Xpetra instead." << std::endl; } #endif } @@ -473,7 +473,7 @@ namespace Thyra { } else { #ifdef HAVE_SHYLU_DDFROSCH_EPETRA if (comm->getRank()==0) { - std::cout << "FROSch::FROSchFactory : WARNING: Cannot retrieve Epetra objects from ParameterList. Use Xpetra instead." << std::endl; + std::cerr << "FROSch::FROSchFactory : WARNING: Cannot retrieve Epetra objects from ParameterList. Use Xpetra instead." << std::endl; } #endif } From 7531aee30ca6f87010124f404cbc67b31b919341 Mon Sep 17 00:00:00 2001 From: Alexander Heinlein Date: Tue, 26 Nov 2019 16:37:47 +0100 Subject: [PATCH 17/50] Move getData() out of for loops. 
--- .../src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp | 1 + .../frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp | 3 ++- .../FROSch_HarmonicCoarseOperator_decl.hpp | 3 +++ .../FROSch_HarmonicCoarseOperator_def.hpp | 10 +++++----- .../FROSch_IPOUHarmonicCoarseOperator_decl.hpp | 1 + .../FROSch_IPOUHarmonicCoarseOperator_def.hpp | 4 +++- .../FROSch_OverlappingOperator_def.hpp | 3 ++- 7 files changed, 17 insertions(+), 8 deletions(-) diff --git a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp index 6d2c4975068e..2151d49cbdd3 100644 --- a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp @@ -90,6 +90,7 @@ namespace FROSch { using GOVec = Array; using SCVec = Array; + using ConstSCVecPtr = ArrayRCP; public: diff --git a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp index 0040fe58fc31..868c86644248 100644 --- a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp @@ -137,10 +137,11 @@ namespace FROSch { for (UN i=0; igetNumVectors(); j++) { + ConstSCVecPtr unassembledSubspaceBasesData = UnassembledSubspaceBases_[i]->getData(j); for (UN k=0; kgetLocalLength(); k++) { FROSCH_ASSERT(itmpgetNumVectors(),"FROSch::CoarseSpace : ERROR: itmp>=AssembledBasis_->getNumVectors()"); FROSCH_ASSERT(k+Offsets_[i]getLocalLength(),"FROSch::CoarseSpace : ERROR: k+Offsets_[i]>=AssembledBasis_->getLocalLength()"); - AssembledBasis_->replaceLocalValue(k+Offsets_[i],itmp,UnassembledSubspaceBases_[i]->getData(j)[k]); + AssembledBasis_->replaceLocalValue(k+Offsets_[i],itmp,unassembledSubspaceBasesData[k]); } itmp++; } diff --git 
a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp index 41d103608b97..03d3a4108ebb 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp @@ -93,7 +93,10 @@ namespace FROSch { using GOVec = typename SchwarzOperator::GOVec; using GOVecView = typename SchwarzOperator::GOVecView; using GOVec2D = typename SchwarzOperator::GOVec2D; + using SCVec = typename SchwarzOperator::SCVec; + using SCVecPtr = typename SchwarzOperator::SCVecPtr; + using ConstSCVecPtr = typename SchwarzOperator::ConstSCVecPtr; public: diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp index f9f22a0aa87d..cc6f3df2735f 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp @@ -461,9 +461,10 @@ namespace FROSch { XMultiVectorPtr mVPhiGamma = MultiVectorFactory::Build(kIGamma->getDomainMap(),coarseMap->getNodeNumElements()); if (AssembledInterfaceCoarseSpace_->hasAssembledBasis()) { for (UN i=0; igetAssembledBasis()->getNumVectors(); i++) { + ConstSCVecPtr AssembledInterfaceCoarseSpaceData = AssembledInterfaceCoarseSpace_->getAssembledBasis()->getData(i); for (UN j=0; jgetAssembledBasis()->getLocalLength(); j++) { - mVPhiGamma->replaceLocalValue(j,i,AssembledInterfaceCoarseSpace_->getAssembledBasis()->getData(i)[j]); - mVPhi->replaceLocalValue(indicesGammaDofsAll[j],i,AssembledInterfaceCoarseSpace_->getAssembledBasis()->getData(i)[j]); + mVPhiGamma->replaceLocalValue(j,i,AssembledInterfaceCoarseSpaceData[j]); + 
mVPhi->replaceLocalValue(indicesGammaDofsAll[j],i,AssembledInterfaceCoarseSpaceData[j]); } } } @@ -534,11 +535,10 @@ namespace FROSch { } for (UN j=0; jgetData(itmp); for (UN ii=0; iigetData(itmp)[k]; - mVPhi->replaceLocalValue(indicesIDofsAll[k],itmp,mVPhiI->getData(itmp)[k]); + mVPhi->replaceLocalValue(indicesIDofsAll[k],itmp,mVPhiIData[k]); } } itmp++; diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_IPOUHarmonicCoarseOperator_decl.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_IPOUHarmonicCoarseOperator_decl.hpp index 37e485cfb800..eb581727df7c 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_IPOUHarmonicCoarseOperator_decl.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_IPOUHarmonicCoarseOperator_decl.hpp @@ -111,6 +111,7 @@ namespace FROSch { using SCVec = typename SchwarzOperator::SCVec; using SCVecPtr = typename SchwarzOperator::SCVecPtr; + using ConstSCVecPtr = typename SchwarzOperator::ConstSCVecPtr; using BoolVecPtr = typename SchwarzOperator::BoolVecPtr; diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_IPOUHarmonicCoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_IPOUHarmonicCoarseOperator_def.hpp index ada56ed7a570..d417f37bd468 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_IPOUHarmonicCoarseOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_IPOUHarmonicCoarseOperator_def.hpp @@ -271,9 +271,11 @@ namespace FROSch { XMapPtr serialInterfaceMap = MapFactory::Build(nullSpaceBasis->getMap()->lib(),this->GammaDofs_[blockId].size(),this->GammaDofs_[blockId].size(),0,this->SerialComm_); XMultiVectorPtr interfaceNullspaceBasis = MultiVectorFactory::Build(serialInterfaceMap,nullSpaceBasis->getNumVectors()); for (UN i=0; igetNumVectors(); i++) { + SCVecPtr interfaceNullspaceBasisData = interfaceNullspaceBasis->getDataNonConst(i); + ConstSCVecPtr nullSpaceBasisData = 
nullSpaceBasis->getData(i); for (UN k=0; kDofsPerNode_[blockId]; k++) { for (UN j=0; jgetNumNodes(); j++) { - interfaceNullspaceBasis->getDataNonConst(i)[interface->getGammaDofID(j,k)] = nullSpaceBasis->getData(i)[nullSpaceBasis->getMap()->getLocalElement(interface->getGlobalDofID(j,k))]; + interfaceNullspaceBasisData[interface->getGammaDofID(j,k)] = nullSpaceBasisData[nullSpaceBasis->getMap()->getLocalElement(interface->getGlobalDofID(j,k))]; } } } diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_OverlappingOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_OverlappingOperator_def.hpp index 154e7a6d50bd..98b352bb08cf 100644 --- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_OverlappingOperator_def.hpp +++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_OverlappingOperator_def.hpp @@ -137,10 +137,11 @@ namespace FROSch { GO globID = 0; LO localID = 0; for (UN i=0; igetData(i); for (UN j=0; jgetNodeNumElements(); j++) { globID = y.getMap()->getGlobalElement(j); localID = YOverlap_->getMap()->getLocalElement(globID); - XTmp_->getDataNonConst(i)[j] = YOverlap_->getData(i)[localID]; + XTmp_->getDataNonConst(i)[j] = YOverlapData[localID]; } } } else { From db7c7b489b132306518c1f444b4a83e163ebdddd Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Tue, 26 Nov 2019 10:36:50 -0700 Subject: [PATCH 18/50] ML: enabling more experimental maxwell stuff --- packages/ml/src/Coarsen/ml_amg.h | 4 + .../ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp | 233 ++++++++++-------- packages/ml/test/RefMaxwell/cxx_main.cpp | 8 + 3 files changed, 148 insertions(+), 97 deletions(-) diff --git a/packages/ml/src/Coarsen/ml_amg.h b/packages/ml/src/Coarsen/ml_amg.h index 83ba44d344f9..fd1f2b6fd06f 100644 --- a/packages/ml/src/Coarsen/ml_amg.h +++ b/packages/ml/src/Coarsen/ml_amg.h @@ -205,6 +205,10 @@ int ML_AMG_UpdateVertexStates(int N_remaining_vertices, char vertex_state[], int ML_AMG_CompatibleRelaxation(int *CF_array, 
ML_Operator *Amat, int *Ncoarse, int limit); +int ML_AMG_Identity_Getrows(ML_Operator *data, int N_requested_rows, + int requested_rows[], int allocated_space, int columns[], + double values[], int row_lengths[]); + #ifndef ML_CPP #ifdef __cplusplus } diff --git a/packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp b/packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp index 7287560f2f44..4afac9af8c3b 100644 --- a/packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp +++ b/packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp @@ -876,6 +876,7 @@ int ML_Epetra::RefMaxwell_Aggregate_Nodes(const Epetra_CrsMatrix & A, Teuchos::P if(OutputLevel>=15) very_verbose=verbose=true; if(OutputLevel > 5) {very_verbose=false;verbose=true;} else very_verbose=verbose=false; + int printlevel=ML_Get_PrintLevel(); /* Wrap A in a ML_Operator */ ML_Operator* A_ML = ML_Operator_Create(ml_comm); @@ -901,102 +902,149 @@ int ML_Epetra::RefMaxwell_Aggregate_Nodes(const Epetra_CrsMatrix & A, Teuchos::P double MatThreshold = List.get("aggregation: material: threshold",0.0); int MaxMatLevels = List.get("aggregation: material: max levels",10); - // Setup the Fine Coordinates - ML_Aggregate_Viz_Stats fine_grid; - fine_grid.x=0; fine_grid.y=0; fine_grid.z=0; fine_grid.material=0; - RefMaxwell_SetupCoordinates(A_ML,List,fine_grid.x,fine_grid.y,fine_grid.z,fine_grid.material); - - // FIXME: We need to allow this later - TEUCHOS_TEST_FOR_EXCEPTION(UseAux && UseMaterial, std::logic_error,"RefMaxwell_Aggregate_Nodes: Cannot use material and aux aggregation at the same time"); - - ML_Aggregate_Create(&MLAggr); - ML_Aggregate_Set_MaxLevels(MLAggr, 2); - ML_Aggregate_Set_StartLevel(MLAggr, 0); - ML_Aggregate_Set_Threshold(MLAggr, Threshold); - if(RowSum_Threshold > 0.0) ML_Aggregate_Set_RowSum_Threshold(MLAggr, RowSum_Threshold); - ML_Aggregate_Set_MaxCoarseSize(MLAggr,1); - MLAggr->cur_level = 0; - ML_Aggregate_Set_Reuse(MLAggr); - ML_Aggregate_Set_Do_QR(MLAggr,doQR); - - if(DampingFactor > 0.0) { - 
ML_Aggregate_Set_DampingFactor(MLAggr,DampingFactor); - ML_Aggregate_Set_DampingSweeps(MLAggr,PSmSweeps,0); + bool useSA = true; + if(List.isParameter("default values")) { + std::string default_values = List.get("default values","Classical-AMG"); + if(default_values == "Classical-AMG") + useSA = false; } - if( EigType == "cg" ) ML_Operator_Set_SpectralNormScheme_Calc(A_ML); - else if( EigType == "Anorm" ) ML_Operator_Set_SpectralNormScheme_Anorm(A_ML); - else if( EigType == "Anasazi" ) ML_Operator_Set_SpectralNormScheme_Anasazi(A_ML); - else if( EigType == "power-method" ) ML_Operator_Set_SpectralNormScheme_PowerMethod(A_ML); - else { - if(!A.Comm().MyPID()) printf("%s Unsupported (1,1) block eigenvalue type(%s), resetting to cg\n",PrintMsg.c_str(),EigType.c_str()); - ML_Operator_Set_SpectralNormScheme_Calc(A_ML); - } - ML_Operator_Set_SpectralNorm_Iterations(A_ML, NumEigenIts); - - MLAggr->keep_agg_information = 1; - P = ML_Operator_Create(ml_comm); - - /* Process Teuchos Options */ - if (CoarsenType == "Uncoupled") - ML_Aggregate_Set_CoarsenScheme_Uncoupled(MLAggr); - else if (CoarsenType == "Uncoupled-MIS"){ - ML_Aggregate_Set_CoarsenScheme_UncoupledMIS(MLAggr); - } - else if (CoarsenType == "METIS"){ - ML_Aggregate_Set_CoarsenScheme_METIS(MLAggr); - ML_Aggregate_Set_NodesPerAggr(0, MLAggr, 0, NodesPerAggr); - }/*end if*/ - else { - if(!A.Comm().MyPID()) printf("%s Unsupported (1,1) block aggregation type(%s), resetting to uncoupled-mis\n",PrintMsg.c_str(),CoarsenType.c_str()); - ML_Aggregate_Set_CoarsenScheme_UncoupledMIS(MLAggr); - } + if(useSA) { + /* Use SA */ - /* Setup Aux Data (aux) */ - if(UseAux) { - A_ML->aux_data->enable=1; - A_ML->aux_data->threshold=AuxThreshold; - A_ML->aux_data->max_level=MaxAuxLevels; - ML_Init_Aux(A_ML,fine_grid.x,fine_grid.y,fine_grid.z); - if(verbose && !A.Comm().MyPID()) { - printf("%s Using auxiliary matrix\n",PrintMsg.c_str()); - printf("%s aux threshold = %e\n",PrintMsg.c_str(),A_ML->aux_data->threshold); + // Setup the 
Fine Coordinates + ML_Aggregate_Viz_Stats fine_grid; + fine_grid.x=0; fine_grid.y=0; fine_grid.z=0; fine_grid.material=0; + RefMaxwell_SetupCoordinates(A_ML,List,fine_grid.x,fine_grid.y,fine_grid.z,fine_grid.material); + + // FIXME: We need to allow this later + TEUCHOS_TEST_FOR_EXCEPTION(UseAux && UseMaterial, std::logic_error,"RefMaxwell_Aggregate_Nodes: Cannot use material and aux aggregation at the same time"); + + ML_Aggregate_Create(&MLAggr); + ML_Aggregate_Set_MaxLevels(MLAggr, 2); + ML_Aggregate_Set_StartLevel(MLAggr, 0); + ML_Aggregate_Set_Threshold(MLAggr, Threshold); + if(RowSum_Threshold > 0.0) ML_Aggregate_Set_RowSum_Threshold(MLAggr, RowSum_Threshold); + ML_Aggregate_Set_MaxCoarseSize(MLAggr,1); + MLAggr->cur_level = 0; + ML_Aggregate_Set_Reuse(MLAggr); + ML_Aggregate_Set_Do_QR(MLAggr,doQR); + + if(DampingFactor > 0.0) { + ML_Aggregate_Set_DampingFactor(MLAggr,DampingFactor); + ML_Aggregate_Set_DampingSweeps(MLAggr,PSmSweeps,0); } - } - - /* Setup Aux Data (material) */ - if(UseMaterial) { - A_ML->aux_data->enable=1; - A_ML->aux_data->threshold=MatThreshold; - A_ML->aux_data->max_level=MaxMatLevels; - ML_Init_Material(A_ML,fine_grid.material); - if(verbose && !A.Comm().MyPID()) { - printf("%s Using material matrix\n",PrintMsg.c_str()); - printf("%s material threshold = %e\n",PrintMsg.c_str(),A_ML->aux_data->threshold); + + if( EigType == "cg" ) ML_Operator_Set_SpectralNormScheme_Calc(A_ML); + else if( EigType == "Anorm" ) ML_Operator_Set_SpectralNormScheme_Anorm(A_ML); + else if( EigType == "Anasazi" ) ML_Operator_Set_SpectralNormScheme_Anasazi(A_ML); + else if( EigType == "power-method" ) ML_Operator_Set_SpectralNormScheme_PowerMethod(A_ML); + else { + if(!A.Comm().MyPID()) printf("%s Unsupported (1,1) block eigenvalue type(%s), resetting to cg\n",PrintMsg.c_str(),EigType.c_str()); + ML_Operator_Set_SpectralNormScheme_Calc(A_ML); } - } + ML_Operator_Set_SpectralNorm_Iterations(A_ML, NumEigenIts); + + MLAggr->keep_agg_information = 1; + P = 
ML_Operator_Create(ml_comm); + + /* Process Teuchos Options */ + if (CoarsenType == "Uncoupled") + ML_Aggregate_Set_CoarsenScheme_Uncoupled(MLAggr); + else if (CoarsenType == "Uncoupled-MIS"){ + ML_Aggregate_Set_CoarsenScheme_UncoupledMIS(MLAggr); + } + else if (CoarsenType == "METIS"){ + ML_Aggregate_Set_CoarsenScheme_METIS(MLAggr); + ML_Aggregate_Set_NodesPerAggr(0, MLAggr, 0, NodesPerAggr); + }/*end if*/ + else { + if(!A.Comm().MyPID()) printf("%s Unsupported (1,1) block aggregation type(%s), resetting to uncoupled-mis\n",PrintMsg.c_str(),CoarsenType.c_str()); + ML_Aggregate_Set_CoarsenScheme_UncoupledMIS(MLAggr); + } + + /* Setup Aux Data (aux) */ + if(UseAux) { + A_ML->aux_data->enable=1; + A_ML->aux_data->threshold=AuxThreshold; + A_ML->aux_data->max_level=MaxAuxLevels; + ML_Init_Aux(A_ML,fine_grid.x,fine_grid.y,fine_grid.z); + if(verbose && !A.Comm().MyPID()) { + printf("%s Using auxiliary matrix\n",PrintMsg.c_str()); + printf("%s aux threshold = %e\n",PrintMsg.c_str(),A_ML->aux_data->threshold); + } + } + + /* Setup Aux Data (material) */ + if(UseMaterial) { + A_ML->aux_data->enable=1; + A_ML->aux_data->threshold=MatThreshold; + A_ML->aux_data->max_level=MaxMatLevels; + ML_Init_Material(A_ML,fine_grid.material); + if(verbose && !A.Comm().MyPID()) { + printf("%s Using material matrix\n",PrintMsg.c_str()); + printf("%s material threshold = %e\n",PrintMsg.c_str(),A_ML->aux_data->threshold); + } + } + + /* Aggregate Nodes */ + if(verbose) ML_Set_PrintLevel(10); + NumAggregates = ML_Aggregate_Coarsen(MLAggr,A_ML, &P, ml_comm); + if(verbose) ML_Set_PrintLevel(printlevel); + + /* Project down the coordinates, if we need to, using Ptent. 
Note NumPDEs always = 1 */ + if(fine_grid.x || fine_grid.y || fine_grid.z || fine_grid.material) + RefMaxwell_Project_Coordinates(1,P,&fine_grid,pack); + + /* Do prolongator smoothing, if requested */ + if(PSmSweeps && DampingFactor != 0.0) { + if(verbose && !A.Comm().MyPID()) printf("%s Smoothing Prolongator w/ Damping Factor %e\n",PrintMsg.c_str(),DampingFactor); + ML_Operator * smooP = ML_Operator_Create(ml_comm); + ML_Smooth_Prolongator(A_ML,MLAggr,PSmSweeps,P,smooP); + ML_Operator_Destroy(&P); + P = smooP; + } + if(very_verbose) printf("[%d] %s %d aggregates created invec_leng=%d\n",A.Comm().MyPID(),PrintMsg.c_str(),NumAggregates,P->invec_leng); - /* Aggregate Nodes */ - int printlevel=ML_Get_PrintLevel(); - if(verbose) ML_Set_PrintLevel(10); - NumAggregates = ML_Aggregate_Coarsen(MLAggr,A_ML, &P, ml_comm); - - /* Project down the coordinates, if we need to, using Ptent. Note NumPDEs always = 1 */ - if(fine_grid.x || fine_grid.y || fine_grid.z || fine_grid.material) - RefMaxwell_Project_Coordinates(1,P,&fine_grid,pack); - - /* Do prolongator smoothing, if requested */ - if(PSmSweeps && DampingFactor != 0.0) { - if(verbose && !A.Comm().MyPID()) printf("%s Smoothing Prolongator w/ Damping Factor %e\n",PrintMsg.c_str(),DampingFactor); - ML_Operator * smooP = ML_Operator_Create(ml_comm); - ML_Smooth_Prolongator(A_ML,MLAggr,PSmSweeps,P,smooP); - ML_Operator_Destroy(&P); - P = smooP; + /* Cleanup */ + if(fine_grid.x) ML_free(fine_grid.x); + if(fine_grid.y) ML_free(fine_grid.y); + if(fine_grid.z) ML_free(fine_grid.z); + if(fine_grid.material) ML_free(fine_grid.material); + + ML_qr_fix_Destroy(); + if(UseAux) ML_Finalize_Aux(A_ML); + if(UseMaterial) ML_Finalize_Aux(A_ML); + + } + else { + /* Use Classical */ + ML_AMG *ml_amg; + ML_AMG_Create( &ml_amg ); + ML_AMG_Set_Threshold(ml_amg,Threshold); + ML_AMG_Set_MaxLevels(ml_amg,2); + ML_AMG_Set_MaxCoarseSize(ml_amg,1); + P = ML_Operator_Create(ml_comm); + ML_Operator * Pmatrix = ML_Operator_Create(ml_comm); + + 
if(verbose) ML_Set_PrintLevel(10); + NumAggregates = ML_AMG_Coarsen(ml_amg, A_ML, &Pmatrix, ml_comm); + if(verbose) ML_Set_PrintLevel(printlevel); + + ML_Operator* AMGIdentity = ML_Operator_Create(ml_comm); + ML_Operator_Set_ApplyFuncData(AMGIdentity, A_ML->invec_leng, + A_ML->outvec_leng, (void*) A_ML, + A_ML->matvec->Nrows, NULL, 0); + ML_Operator_Set_Getrow(AMGIdentity, A_ML->getrow->Nrows, + ML_AMG_Identity_Getrows); + ML_CommInfoOP_Clone(&(AMGIdentity->getrow->pre_comm),A_ML->getrow->pre_comm); + ML_2matmult(AMGIdentity, Pmatrix, P, ML_CSR_MATRIX ); + + /* Cleanup */ + ML_Operator_Destroy(&AMGIdentity); + ML_Operator_Destroy(&Pmatrix); } - - if(verbose) ML_Set_PrintLevel(printlevel); - if(very_verbose) printf("[%d] %s %d aggregates created invec_leng=%d\n",A.Comm().MyPID(),PrintMsg.c_str(),NumAggregates,P->invec_leng); if(verbose){ int globalAggs=0; @@ -1007,15 +1055,6 @@ int ML_Epetra::RefMaxwell_Aggregate_Nodes(const Epetra_CrsMatrix & A, Teuchos::P } } - /* Cleanup */ - if(fine_grid.x) ML_free(fine_grid.x); - if(fine_grid.y) ML_free(fine_grid.y); - if(fine_grid.z) ML_free(fine_grid.z); - if(fine_grid.material) ML_free(fine_grid.material); - - ML_qr_fix_Destroy(); - if(UseAux) ML_Finalize_Aux(A_ML); - if(UseMaterial) ML_Finalize_Aux(A_ML); ML_Operator_Destroy(&A_ML); return 0; diff --git a/packages/ml/test/RefMaxwell/cxx_main.cpp b/packages/ml/test/RefMaxwell/cxx_main.cpp index d0df35afb7c5..b53946295d3d 100644 --- a/packages/ml/test/RefMaxwell/cxx_main.cpp +++ b/packages/ml/test/RefMaxwell/cxx_main.cpp @@ -369,6 +369,9 @@ bool matrix_read(Epetra_ActiveComm &Comm){ Teuchos::ParameterList List_AMG = Build_Teuchos_List(N,coord_ptr,"coarse: type","Amesos-KLU","max levels",1); List_AMG.sublist("refmaxwell: 11list").sublist("edge matrix free: coarse").set("default values","Classical-AMG"); + Teuchos::ParameterList List_AMG_sp = Build_Teuchos_List(N,coord_ptr,"coarse: type","Amesos-KLU","max levels",1); + List_AMG_sp.sublist("refmaxwell: 11list").set("default 
values","Classical-AMG"); + /* Do Tests */ Epetra_Vector lhs(EdgeMap,true); int status1, status2 = 0; @@ -444,6 +447,11 @@ bool matrix_read(Epetra_ActiveComm &Comm){ if(!Comm.MyPID()) printf("*** Test 18 ***\n"); rpc_test_additive_newconstructor(Comm,List_AMG,*SM,*M1,*M0inv,*D0,x_exact,lhs,rhs,false); + /* Test w/ classical special prolongator */ + if(!Comm.MyPID()) printf("*** Test 19 ***\n"); + rpc_test_additive_newconstructor(Comm,List_AMG_sp,*SM,*M1,*M0inv,*D0,x_exact,lhs,rhs,false); + + delete M0; delete M1e; From 9c21a70bf1eba1af923a1b167886cd2e45a06ed2 Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Tue, 26 Nov 2019 10:38:40 -0700 Subject: [PATCH 19/50] ML: enabling more experimental maxwell stuff --- .../ml/src/RefMaxwell/ml_EdgeMatrixFreePreconditioner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/ml/src/RefMaxwell/ml_EdgeMatrixFreePreconditioner.cpp b/packages/ml/src/RefMaxwell/ml_EdgeMatrixFreePreconditioner.cpp index 7c3518a5a46c..c2b64b8a681f 100644 --- a/packages/ml/src/RefMaxwell/ml_EdgeMatrixFreePreconditioner.cpp +++ b/packages/ml/src/RefMaxwell/ml_EdgeMatrixFreePreconditioner.cpp @@ -186,7 +186,7 @@ int ML_Epetra::EdgeMatrixFreePreconditioner::BuildProlongator(const Epetra_Multi if(!mcoord && dim!=(xcoord!=0) + (ycoord!=0) + (zcoord!=0) ) build_coarse_coords=false; /* Do the aggregation */ - ML_Aggregate_Struct * MLAggr; + ML_Aggregate_Struct * MLAggr=0; ML_Operator *P; int NumAggregates; int rv=ML_Epetra::RefMaxwell_Aggregate_Nodes(*TMT_Matrix_,List_,ml_comm_,std::string("EMFP (level 0) :"), @@ -283,7 +283,7 @@ int ML_Epetra::EdgeMatrixFreePreconditioner::BuildProlongator(const Epetra_Multi /* Cleanup */ - ML_Aggregate_Destroy(&MLAggr); + if(MLAggr) ML_Aggregate_Destroy(&MLAggr); ML_Operator_Destroy(&P); ML_Operator_Destroy(&AbsD0_ML); ML_Operator_Destroy(&AbsD0P); From c618d7467182b24e9a7917019572e03840344307 Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Tue, 26 Nov 2019 11:07:46 -0700 Subject: [PATCH 
20/50] ML: enabling more experimental maxwell stuff --- packages/ml/src/Coarsen/ml_amg.c | 19 +++++++++++++++++++ packages/ml/src/Coarsen/ml_amg.h | 5 +++++ packages/ml/src/Coarsen/ml_amg_MIS.c | 26 ++++++++++++++++++++++---- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/packages/ml/src/Coarsen/ml_amg.c b/packages/ml/src/Coarsen/ml_amg.c index ca8d58047136..381709457c24 100644 --- a/packages/ml/src/Coarsen/ml_amg.c +++ b/packages/ml/src/Coarsen/ml_amg.c @@ -76,6 +76,8 @@ int ML_AMG_Create( ML_AMG **amg ) (*amg)->post_aztec_proc_config = NULL; (*amg)->post_aztec_status = NULL; (*amg)->post_function = NULL; + /*cms*/ + (*amg)->rowsum_threshold = -1.0; /* defaults to off */ return 0; } @@ -200,6 +202,23 @@ int ML_AMG_Set_Threshold( ML_AMG *amg, double thresh ) return 0; } +/* ************************************************************************* */ +/* set/reset rowsum threshold */ +/* ------------------------------------------------------------------------- */ + +int ML_AMG_Set_RowSum_Threshold( ML_AMG *amg, double epsilon ) +{ + if ( amg->ML_id != ML_ID_AMG ) + { + printf("ML_AMG_Set_RowSum_Threshold : wrong object. 
\n"); + exit(-1); + } + if ( epsilon > 0.0 ) amg->rowsum_threshold = epsilon; + else amg->rowsum_threshold = -1.0; + + return 0; +} + /* ************************************************************************* */ /* set max number of levels and other level information */ /* ------------------------------------------------------------------------- */ diff --git a/packages/ml/src/Coarsen/ml_amg.h b/packages/ml/src/Coarsen/ml_amg.h index fd1f2b6fd06f..45f6feaa6818 100644 --- a/packages/ml/src/Coarsen/ml_amg.h +++ b/packages/ml/src/Coarsen/ml_amg.h @@ -96,6 +96,9 @@ typedef struct ML_AMG_Struct struct AZ_MATRIX_STRUCT *, struct AZ_PREC_STRUCT *); */ + /*cms*/ + double rowsum_threshold; /**< for dropping sub-CFL rows in reaction-diffusion */ + } ML_AMG; /* ************************************************************************* */ @@ -149,6 +152,8 @@ extern int ML_AMG_Set_CoarsenScheme_MIS( ML_AMG *amg ); extern int ML_AMG_Set_Threshold( ML_AMG *amg, double epsilon ); +extern int ML_AMG_Set_RowSum_Threshold( ML_AMG *, double epsilon ); + /* ------------------------------------------------------------------------- */ /* functions for performing coarsening */ /* ------------------------------------------------------------------------- */ diff --git a/packages/ml/src/Coarsen/ml_amg_MIS.c b/packages/ml/src/Coarsen/ml_amg_MIS.c index 851e78df0d3a..7bb1552db35a 100644 --- a/packages/ml/src/Coarsen/ml_amg_MIS.c +++ b/packages/ml/src/Coarsen/ml_amg_MIS.c @@ -56,6 +56,7 @@ int ML_AMG_CoarsenMIS( ML_AMG *ml_amg, ML_Operator *Amatrix, ML_CommInfoOP *mat_comm; struct ML_CSR_MSRdata *csr_data; ML_Aggregate_Comm *aggr_comm; + double rowsum_threshold; /* ============================================================= */ /* get the machine information and matrix references */ @@ -68,6 +69,7 @@ int ML_AMG_CoarsenMIS( ML_AMG *ml_amg, ML_Operator *Amatrix, Nrows = Amatrix->outvec_leng; sys_unk_filter = 0; mat_comm = Amatrix->getrow->pre_comm; + rowsum_threshold = ml_amg->rowsum_threshold; 
/* ============================================================= */ /* if system AMG (unknown approach) is requested, communicate */ @@ -235,6 +237,8 @@ int ML_AMG_CoarsenMIS( ML_AMG *ml_amg, ML_Operator *Amatrix, rowptr[0] = 0; for (i = 0; i < Nrows; i++) { + int itmp = total_nnz; + rowsum = 0; ML_get_matrix_row(Amatrix, 1, &i, &allocated, &rowi_col, &rowi_val, &rowi_N, 0); if ( sys_unk_filter ) @@ -248,13 +252,17 @@ int ML_AMG_CoarsenMIS( ML_AMG *ml_amg, ML_Operator *Amatrix, rowmax = 0.0; if ( diag >= 0. ) { - for (j = 0; j < rowi_N; j++) - if (rowi_col[j] != i) rowmax = ML_min(rowmax, rowi_val[j]); + for (j = 0; j < rowi_N; j++) { + if (rowi_col[j] != i) rowmax = ML_min(rowmax, rowi_val[j]); + rowsum+=rowi_col[j]; + } } else { - for (j = 0; j < rowi_N; j++) - if (rowi_col[j] != i) rowmax = ML_max(rowmax, rowi_val[j]); + for (j = 0; j < rowi_N; j++) { + if (rowi_col[j] != i) rowmax = ML_max(rowmax, rowi_val[j]); + rowsum+=rowi_col[j]; + } } rowmax *= epsilon; if ( diag >= 0. ) @@ -279,6 +287,16 @@ int ML_AMG_CoarsenMIS( ML_AMG *ml_amg, ML_Operator *Amatrix, } } } + /* Reaction-diffusion dropping. If the rowsum is sufficiently than the diagonal, then + we should be in a reaction-limited regime at this node and can afford to drop *all* + connections, effectively turning this guy into a Dirichlet unknown. We trust that + the smoother is sufficient for these unknowns - CMS 7/23/19 */ + if(rowsum_threshold > 0.0 && fabs(rowsum) > fabs(diag) * rowsum_threshold) { + column[itmp]=i; + values[itmp]=diag; + total_nnz = itmp; + } + rowptr[i+1] = total_nnz; } ML_free( rowi_col ); From 85f12a06fac4a0b8d3b6ecfec283088ac469f97e Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 11:38:41 -0700 Subject: [PATCH 21/50] TSQR: Replace ConstMatView with MatView of const 1. Replace ConstMatView with MatView. 2. Replace scalar_type typedef in MatView and Matrix with non_const_value_type and const_value_type, as in Kokkos::View. 
The goal is to replace MatView with Kokkos::View. --- .../tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp | 2 +- .../tsqr/src/TbbTsqr_CacheBlockTask.hpp | 2 +- .../tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp | 2 +- .../tpetra/tsqr/src/TbbTsqr_FactorTask.hpp | 2 +- .../tsqr/src/TbbTsqr_RevealRankTask.hpp | 2 +- .../tsqr/src/TbbTsqr_TbbParallelTsqr.hpp | 2 +- .../tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp | 2 +- .../tsqr/src/TbbTsqr_UnCacheBlockTask.hpp | 4 +- packages/tpetra/tsqr/src/Tsqr.hpp | 2 +- .../tpetra/tsqr/src/Tsqr_CacheBlocker.hpp | 65 +++--- .../tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp | 32 +-- .../tpetra/tsqr/src/Tsqr_CombineDefault.hpp | 2 +- packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 7 +- .../tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp | 26 +-- packages/tpetra/tsqr/src/Tsqr_MatView.hpp | 206 +----------------- packages/tpetra/tsqr/src/Tsqr_Matrix.hpp | 31 ++- packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp | 26 ++- packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp | 10 +- packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp | 33 ++- .../tsqr/src/Tsqr_Random_GlobalMatrix.hpp | 34 ++- .../tsqr/src/Tsqr_Random_MatrixGenerator.hpp | 2 +- .../tsqr/src/Tsqr_SequentialCholeskyQR.hpp | 2 +- .../tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp | 4 +- packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp | 24 +- packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp | 53 +++-- .../tsqr/src/Tsqr_printGlobalMatrix.hpp | 147 ++++++------- 26 files changed, 267 insertions(+), 457 deletions(-) diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp index 4edacbee0255..0caff734b512 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp @@ -54,7 +54,7 @@ namespace TSQR { class ApplyTask : public tbb::task { public: typedef MatView mat_view_type; - typedef ConstMatView const_mat_view_type; + typedef MatView const_mat_view_type; typedef std::pair split_t; typedef std::pair const_split_t; typedef std::pair top_blocks_t; diff 
--git a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp index a70ae206da55..8827a1ce4091 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp @@ -56,7 +56,7 @@ namespace TSQR { class CacheBlockTask : public tbb::task { public: typedef MatView mat_view_type; - typedef ConstMatView const_mat_view_type; + typedef MatView const_mat_view_type; typedef std::pair split_t; typedef std::pair const_split_t; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp index 0a93f2723ac5..b0ce1e40f6c2 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp @@ -52,7 +52,7 @@ namespace TSQR { class ExplicitQTask : public tbb::task { public: typedef MatView mat_view_type; - typedef ConstMatView const_mat_view_type; + typedef MatView const_mat_view_type; private: typedef std::pair split_t; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp index 8072f55ab4aa..19b4372ccc2f 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp @@ -54,7 +54,7 @@ namespace TSQR { class FactorTask : public tbb::task { public: typedef MatView mat_view_type; - typedef ConstMatView const_mat_view_type; + typedef MatView const_mat_view_type; typedef std::pair split_t; typedef std::pair const_split_t; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp index e03ea15f5660..7a3162b2f9a4 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp @@ -59,7 +59,7 @@ namespace TSQR { class RevealRankTask : public tbb::task { public: typedef MatView mat_view_type; - typedef ConstMatView const_mat_view_type; + typedef 
MatView const_mat_view_type; typedef std::pair split_type; typedef SequentialTsqr seq_tsqr_type; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp index db8a354f09ac..c86123c42d8b 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp @@ -72,7 +72,7 @@ namespace TSQR { class TbbParallelTsqr { private: typedef MatView mat_view_type; - typedef ConstMatView const_mat_view_type; + typedef MatView const_mat_view_type; typedef std::pair split_t; typedef std::pair const_split_t; typedef std::pair top_blocks_t; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp index a18e0c643509..e7f79fb0c15d 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp @@ -163,7 +163,7 @@ namespace TSQR { Partitioner partitioner_; typedef MatView mat_view_type; - typedef ConstMatView const_mat_view_type; + typedef MatView const_mat_view_type; typedef std::pair const_split_t; typedef std::pair split_t; typedef std::pair top_blocks_t; diff --git a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp index dffc07743d5c..dc8068c2d9eb 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp @@ -55,8 +55,8 @@ namespace TSQR { template class UnCacheBlockTask : public tbb::task { public: - typedef MatView< LocalOrdinal, Scalar > mat_view_type; - typedef ConstMatView< LocalOrdinal, Scalar > const_mat_view_type; + typedef MatView mat_view_type; + typedef MatView const_mat_view_type; typedef std::pair< mat_view_type, mat_view_type > split_t; typedef std::pair< const_mat_view_type, const_mat_view_type > const_split_t; diff --git a/packages/tpetra/tsqr/src/Tsqr.hpp b/packages/tpetra/tsqr/src/Tsqr.hpp 
index 7b76a8b74fa0..31d1be6b9d01 100644 --- a/packages/tpetra/tsqr/src/Tsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr.hpp @@ -95,7 +95,7 @@ namespace TSQR { class Tsqr { public: typedef MatView mat_view_type; - typedef ConstMatView const_mat_view_type; + typedef MatView const_mat_view_type; typedef Matrix matrix_type; typedef Scalar scalar_type; diff --git a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp index b0087ef96193..fbd423b49d6a 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp @@ -69,21 +69,20 @@ namespace TSQR { template class CacheBlocker { private: - typedef MatView mat_view_type; - typedef ConstMatView const_mat_view_type; + using mat_view_type = MatView; + using const_mat_view_type = MatView; void validate () { - if (nrows_cache_block_ < ncols_) - { - std::ostringstream os; - os << "The typical cache block size is too small. Only " - << nrows_cache_block_ << " rows fit, but every cache block needs " - "at least as many rows as the number of columns " << ncols_ - << " in the matrix."; - throw std::logic_error (os.str()); - } + if (nrows_cache_block_ < ncols_) { + std::ostringstream os; + os << "The typical cache block size is too small. Only " + << nrows_cache_block_ << " rows fit, but every cache block needs " + "at least as many rows as the number of columns " << ncols_ + << " in the matrix."; + throw std::logic_error (os.str()); + } } public: @@ -105,17 +104,13 @@ namespace TSQR { nrows_ (num_rows), ncols_ (num_cols), strategy_ (strategy), - nrows_cache_block_ (strategy_.cache_block_num_rows (extent(1))) + nrows_cache_block_ (strategy_.cache_block_num_rows (ncols_)) { validate (); } //! Default constructor, so that CacheBlocker is DefaultConstructible. - CacheBlocker () : - nrows_ (0), - ncols_ (0), - nrows_cache_block_ (strategy_.cache_block_num_rows (extent(1))) - {} + CacheBlocker () = default; //! 
Copy constructor CacheBlocker (const CacheBlocker& rhs) : @@ -444,10 +439,10 @@ namespace TSQR { private: //! Number of rows in the matrix to block. - Ordinal nrows_; + Ordinal nrows_ = 0; //! Number of columns in the matrix to block. - Ordinal ncols_; + Ordinal ncols_ = 0; //! Strategy used to break the matrix into cache blocks. CacheBlockingStrategy strategy_; @@ -458,7 +453,7 @@ namespace TSQR { /// quantity each time, but we choose to cache the computed value /// here. For an explanation of "typical," see the documentation /// of \c nrows_cache_block(). - Ordinal nrows_cache_block_; + Ordinal nrows_cache_block_ = 0; /// \brief Number of rows in a "typical" cache block. /// @@ -483,21 +478,16 @@ namespace TSQR { public std::iterator { public: - typedef MatrixViewType view_type; - typedef typename MatrixViewType::ordinal_type ordinal_type; - typedef typename MatrixViewType::scalar_type scalar_type; + using view_type = MatrixViewType; + using ordinal_type = typename MatrixViewType::ordinal_type; + using scalar_type = typename MatrixViewType::non_const_value_type; /// \brief Default constructor. /// /// \note To implementers: We only implement a default constructor /// because all iterators (e.g., TrivialIterator) must be /// DefaultConstructible. - CacheBlockRangeIterator () : - A_ (0, 0, NULL, 0), - curInd_ (0), - reverse_ (false), - contiguousCacheBlocks_ (false) - {} + CacheBlockRangeIterator () = default; /// \brief Standard constructor. 
/// @@ -593,9 +583,9 @@ namespace TSQR { private: MatrixViewType A_; CacheBlocker blocker_; - ordinal_type curInd_; - bool reverse_; - bool contiguousCacheBlocks_; + ordinal_type curInd_ = 0; + bool reverse_ = false; + bool contiguousCacheBlocks_ = false; }; /// \class CacheBlockRange @@ -618,13 +608,12 @@ namespace TSQR { template class CacheBlockRange { public: - typedef MatrixViewType view_type; - typedef typename MatrixViewType::ordinal_type ordinal_type; - typedef typename MatrixViewType::scalar_type scalar_type; + using view_type = MatrixViewType; + using ordinal_type = typename MatrixViewType::ordinal_type; + using scalar_type = typename MatrixViewType::non_const_value_type; - /// \typedef iterator - /// \brief Type of an iterator over the range of cache blocks. - typedef CacheBlockRangeIterator iterator; + //! Type of an iterator over the range of cache blocks. + using iterator = CacheBlockRangeIterator; /// \brief Constructor /// diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp index c4575b98b3ac..af18ad5cee10 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp @@ -145,25 +145,25 @@ namespace TSQR { const std::string& additionalData) { using std::endl; - - typedef typename CombineType::ordinal_type ordinal_type; - typedef typename CombineType::scalar_type scalar_type; - typedef CombineBenchmarker - benchmarker_type; - - TEUCHOS_TEST_FOR_EXCEPTION(cacheBlockNumTrials < 1, std::invalid_argument, - "The number of trials for the cache block benchmark " - "must be positive, but you specified cacheBlockNum" - "Trials = " << cacheBlockNumTrials << "."); - TEUCHOS_TEST_FOR_EXCEPTION(pairNumTrials < 1, std::invalid_argument, - "The number of trials for the pair benchmark must be " - "positive, but you specified pairNumTrials = " - << pairNumTrials << "."); + using ordinal_type = typename CombineType::ordinal_type; + using 
scalar_type = typename CombineType::scalar_type; + using benchmarker_type = + CombineBenchmarker; + + TEUCHOS_TEST_FOR_EXCEPTION + (cacheBlockNumTrials < 1, std::invalid_argument, + "The number of trials for the cache block benchmark must be " + "positive, but you specified cacheBlockNumTrials=" + << cacheBlockNumTrials << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (pairNumTrials < 1, std::invalid_argument, + "The number of trials for the pair benchmark must be " + "positive, but you specified pairNumTrials=" + << pairNumTrials << "."); benchmarker_type b (iseed); std::pair results; - results.first = - b.benchmarkPair (numCols, pairNumTrials); + results.first = b.benchmarkPair (numCols, pairNumTrials); results.second = b.benchmarkCacheBlock (numRows, numCols, cacheBlockNumTrials); diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp index f7e9242ac921..9295f1499122 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp @@ -67,7 +67,7 @@ namespace TSQR { typedef Ordinal ordinal_type; typedef Scalar scalar_type; typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type; - typedef ConstMatView const_mat_view_type; + typedef MatView const_mat_view_type; typedef MatView mat_view_type; /// \brief Does the R factor have a nonnegative diagonal? 
diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index 2111ab4d6d16..56515d6ec629 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -56,7 +56,6 @@ #include #include - namespace TSQR { namespace Test { @@ -210,7 +209,11 @@ namespace TSQR { template static - std::vector::magnitudeType> + std::vector< + typename Teuchos::ScalarTraits< + typename MatrixViewType::non_const_value_type + >::magnitudeType + > localVerify (const MatrixViewType& A, const MatrixViewType& Q, const MatrixViewType& R) diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp index 87a9aaf37d0f..c5ddab5cfc6c 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp @@ -364,8 +364,8 @@ namespace TSQR { template class ApplyFirstPass { public: - typedef ConstMatView const_mat_view_type; - typedef MatView mat_view_type; + using const_mat_view_type = MatView; + using mat_view_type = MatView; private: ApplyType applyType_; @@ -439,8 +439,8 @@ namespace TSQR { const std::pair cbIndices, const int partitionIndex) const { - typedef CacheBlockRange const_range_type; - typedef CacheBlockRange range_type; + using const_range_type = CacheBlockRange; + using range_type = CacheBlockRange; const char suffix[] = " Please report this bug to the Tpetra developers."; if (cbIndices.first >= cbIndices.second) { @@ -695,10 +695,10 @@ namespace TSQR { template class CacheBlockFunctor { private: - typedef ConstMatView const_mat_view_type; - typedef MatView mat_view_type; - typedef CacheBlockRange const_range_type; - typedef CacheBlockRange range_type; + using const_mat_view_type = MatView; + using mat_view_type = MatView; + using const_range_type = CacheBlockRange; + using range_type = CacheBlockRange; const_mat_view_type A_in_; mat_view_type A_out_; @@ -818,9 +818,9 @@ namespace TSQR { 
template class MultFunctor { private: - typedef ConstMatView const_mat_view_type; - typedef MatView mat_view_type; - typedef CacheBlockRange range_type; + using const_mat_view_type = MatView; + using mat_view_type = MatView; + using range_type = CacheBlockRange; mat_view_type Q_; const_mat_view_type B_; @@ -1103,8 +1103,8 @@ namespace TSQR { typedef LocalOrdinal local_ordinal_type; typedef Scalar scalar_type; - typedef ConstMatView const_mat_view_type; - typedef MatView mat_view_type; + using const_mat_view_type = MatView; + using mat_view_type = MatView; /// \typedef FactorOutput /// \brief Part of the implicit Q representation returned by factor(). diff --git a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp index 62e538a6d0cf..761c175d4ab6 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp @@ -48,6 +48,7 @@ #endif // TSQR_MATVIEW_DEBUG #include #include +#include namespace TSQR { @@ -134,11 +135,6 @@ namespace TSQR { }; #endif // TSQR_MATVIEW_DEBUG - - // Forward declaration - template - class ConstMatView; - // Forward declaration template class Matrix; @@ -149,10 +145,13 @@ namespace TSQR { template class MatView { public: - using scalar_type = Scalar; + using non_const_value_type = typename std::remove_const::type; + using const_value_type = const non_const_value_type; using ordinal_type = Ordinal; using pointer = Scalar*; + using const_pointer = const Scalar*; using reference = Scalar&; + using const_reference = const Scalar&; MatView () = default; @@ -166,7 +165,7 @@ namespace TSQR { A_(A) { #ifdef TSQR_MATVIEW_DEBUG - MatViewVerify:: + MatViewVerify:: verify (num_rows, num_cols, A, leading_dim); #endif // TSQR_MATVIEW_DEBUG } @@ -344,199 +343,6 @@ namespace TSQR { pointer A_ = nullptr; }; - /// \class ConstMatView - /// - /// A read-only view of a column-oriented matrix. 
- template - class ConstMatView { - public: - using scalar_type = Scalar; - using ordinal_type = Ordinal; - using pointer = const Scalar*; - using reference = const Scalar&; - - ConstMatView () = default; - - /// \note g++ with -Wall wants A_ to be initialized after lda_, - /// otherwise it emits a compiler warning. - ConstMatView (const ordinal_type num_rows, - const ordinal_type num_cols, - const scalar_type* const A, - const ordinal_type leading_dim) : - nrows_(num_rows), - ncols_(num_cols), - lda_(leading_dim), - A_(A) - { -#ifdef TSQR_MATVIEW_DEBUG - MatViewVerify:: - verify (num_rows, num_cols, A, leading_dim); -#endif // TSQR_MATVIEW_DEBUG - } - - ConstMatView (const ConstMatView&) = default; - ConstMatView& operator= (const ConstMatView&) = default; - ConstMatView (ConstMatView&&) = default; - ConstMatView& operator= (ConstMatView&&) = default; - - constexpr ordinal_type extent(const int r) const noexcept { - return r == 0 ? nrows_ : (r == 1 ? ncols_ : ordinal_type(0)); - } - - constexpr ordinal_type stride(const int r) const noexcept { - return r == 0 ? ordinal_type(1) : (r == 1 ? 
lda_ : ordinal_type(0)); - } - - reference - operator() (const ordinal_type i, - const ordinal_type j) const - { -#ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits::is_signed) { - if (i < 0 || i >= extent(0)) { - throw std::invalid_argument("Row range invalid"); - } - else if (j < 0 || j >= extent(1)) { - throw std::invalid_argument("Column range invalid"); - } - } - else { - if (i >= extent(0)) { - throw std::invalid_argument("Row range invalid"); - } - else if (j >= extent(1)) { - throw std::invalid_argument("Column range invalid"); - } - } - if (A_ == nullptr) { - throw std::logic_error("Attempt to reference NULL data"); - } -#endif // TSQR_MATVIEW_DEBUG - return A_[i + j * this->stride(1)]; - } - - pointer data() const { return A_; } - - bool empty() const { return extent(0) == 0 || extent(1) == 0; } - - /// Return a "row block" (submatrix of consecutive rows in the - /// inclusive range [firstRow,lastRow]). - ConstMatView - rowBlock (const ordinal_type firstRow, - const ordinal_type lastRow) const - { -#ifdef TSQR_MATVIEW_DEBUG - if (firstRow < 0 || lastRow >= extent(0)) { - throw std::invalid_argument ("Row range invalid"); - } -#endif // TSQR_MATVIEW_DEBUG - return ConstMatView (lastRow - firstRow + 1, extent(1), - data() + firstRow, stride(1)); - } - - /// \brief Split off and return the top block. Modify *this to be - /// the "rest" of the matrix. - /// - /// \note Only use this method to split off a single cache block. - /// It breaks if you try to use it otherwise. - /// - /// \param nrows_top [in] Number of rows in the top block (which - /// this method returns) - /// - /// \param b_contiguous_blocks [in] Whether or not the entries of - /// the top block are stored contiguously in *this. The default - /// is no (false). - /// - /// \return The top block of nrows_top rows. Data is a shallow - /// copy of the data in *this. 
- ConstMatView split_top (const ordinal_type nrows_top, - const bool b_contiguous_blocks = false) - { -#ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits::is_signed && nrows_top < 0) { - throw std::invalid_argument ("nrows_top < 0"); - } - if (nrows_top > extent(0)) { - throw std::invalid_argument ("nrows_top > nrows"); - } -#endif // TSQR_MATVIEW_DEBUG - - pointer const A_top_ptr = data(); - pointer A_rest_ptr; - const ordinal_type nrows_rest = extent(0) - nrows_top; - ordinal_type lda_top, lda_rest; - if (b_contiguous_blocks) { - lda_top = nrows_top; - lda_rest = nrows_rest; - A_rest_ptr = A_top_ptr + nrows_top * extent(1); - } - else { - lda_top = stride(1); - lda_rest = stride(1); - A_rest_ptr = A_top_ptr + nrows_top; - } - ConstMatView A_top (nrows_top, extent(1), data(), lda_top); - A_ = A_rest_ptr; - nrows_ = nrows_rest; - lda_ = lda_rest; - - return A_top; - } - - /// \brief Split off and return the bottom block. Modify *this to - /// be the "rest" of the matrix. - ConstMatView - split_bottom (const ordinal_type nrows_bottom, - const bool b_contiguous_blocks = false) - { -#ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits::is_signed && nrows_bottom < 0) { - throw std::invalid_argument ("nrows_bottom < 0"); - } - if (nrows_bottom > extent(0)) { - throw std::invalid_argument ("nrows_bottom > nrows"); - } -#endif // TSQR_MATVIEW_DEBUG - - pointer const A_rest_ptr = data(); - pointer A_bottom_ptr; - const ordinal_type nrows_rest = extent(0) - nrows_bottom; - ordinal_type lda_bottom, lda_rest; - if (b_contiguous_blocks) { - lda_bottom = nrows_bottom; - lda_rest = extent(0) - nrows_bottom; - A_bottom_ptr = A_rest_ptr + nrows_rest * extent(1); - } - else { - lda_bottom = stride(1); - lda_rest = stride(1); - A_bottom_ptr = A_rest_ptr + nrows_rest; - } - ConstMatView A_bottom (nrows_bottom, extent(1), A_bottom_ptr, lda_bottom); - A_ = A_rest_ptr; - nrows_ = nrows_rest; - lda_ = lda_rest; - - return A_bottom; - } - - bool operator== (const ConstMatView& rhs) const 
{ - return extent(0) == rhs.extent(0) && extent(1) == rhs.extent(1) && - stride(1) == rhs.stride(1) && data() == rhs.data(); - } - - bool operator!= (const ConstMatView& rhs) const { - return extent(0) != rhs.extent(0) || extent(1) != rhs.extent(1) || - stride(1) != rhs.stride(1) || data() != rhs.data(); - } - - private: - ordinal_type nrows_ = 0; - ordinal_type ncols_ = 0; - ordinal_type lda_ = 0; - pointer A_ = nullptr; - }; - template void deep_copy (const MatView& tgt, const SourceScalar& src) diff --git a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp index 42789bad14a8..5380c7023711 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp @@ -64,13 +64,18 @@ namespace TSQR { template class Matrix { public: - using scalar_type = Scalar; + using non_const_value_type = typename std::remove_const::type; + static_assert (std::is_same::value, + "Scalar must be nonconst."); + using const_value_type = const non_const_value_type; using ordinal_type = Ordinal; - using pointer = scalar_type*; - using const_pointer = const scalar_type*; + using pointer = Scalar*; + using const_pointer = const Scalar*; + using reference = Scalar&; + using const_reference = const Scalar&; - using mat_view_type = MatView; - using const_mat_view_type = ConstMatView; + using mat_view_type = MatView; + using const_mat_view_type = MatView; private: static bool @@ -169,7 +174,7 @@ namespace TSQR { //! Constructor with dimensions and fill datum. Matrix (const ordinal_type num_rows, const ordinal_type num_cols, - const scalar_type& value) : + const non_const_value_type& value) : nrows_ (num_rows), ncols_ (num_cols), A_ (verified_alloc_size (num_rows, num_cols), value) @@ -216,7 +221,8 @@ namespace TSQR { /// /// \param i [in] Zero-based row index of the matrix. /// \param j [in] Zero-based column index of the matrix. 
- scalar_type& operator() (const ordinal_type i, const ordinal_type j) { + reference operator() (const ordinal_type i, + const ordinal_type j) { return A_[i + j*stride(1)]; } @@ -224,12 +230,13 @@ namespace TSQR { /// /// \param i [in] Zero-based row index of the matrix. /// \param j [in] Zero-based column index of the matrix. - const scalar_type& operator() (const ordinal_type i, const ordinal_type j) const { + const_reference operator() (const ordinal_type i, + const ordinal_type j) const { return A_[i + j*stride(1)]; } //! 1-D std::vector - style access. - scalar_type& operator[] (const ordinal_type i) { + reference operator[] (const ordinal_type i) { return A_[i]; } @@ -276,7 +283,7 @@ namespace TSQR { //! A const view of the matrix. const_mat_view_type const_view () const { return const_mat_view_type (extent(0), extent(1), - const_cast (data()), stride(1)); + const_cast (data()), stride(1)); } /// Change the dimensions of the matrix. Reallocate if necessary. @@ -311,7 +318,7 @@ namespace TSQR { /// The matrix is stored using one-dimensional storage with /// column-major (Fortran-style) indexing. This makes Matrix /// compatible with the BLAS and LAPACK. 
- std::vector A_; + std::vector A_; }; template @@ -319,7 +326,7 @@ namespace TSQR { deep_copy (Matrix& tgt, const SourceScalar& src) { MatView tgt_view (tgt.extent(0), tgt.extent(1), - tgt.data(), tgt.stride(1)); + tgt.data(), tgt.stride(1)); deep_copy (tgt_view, src); } diff --git a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp index 0114982140a5..6bd5406e13eb 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp @@ -83,12 +83,12 @@ namespace TSQR { template< class MgsType > class MgsVerifier { public: - typedef MgsType mgs_type; - typedef typename MgsType::ordinal_type ordinal_type; - typedef typename MgsType::scalar_type scalar_type; - typedef Matrix< ordinal_type, scalar_type > matrix_type; - typedef MessengerBase< scalar_type > messenger_type; - typedef Teuchos::RCP< messenger_type > messenger_ptr; + using mgs_type = MgsType; + using ordinal_type = typename MgsType::ordinal_type; + using scalar_type = typename MgsType::scalar_type; + using matrix_type = Matrix; + using messenger_type = MessengerBase; + using messenger_ptr = Teuchos::RCP; static void verify (mgs_type& orthogonalizer, @@ -107,13 +107,14 @@ namespace TSQR { R.data(), R.stride(1)); if (b_debug) { messenger->barrier(); - if (messenger->rank() == 0) + if (messenger->rank() == 0) { cerr << "-- Finished MGS::mgs" << endl; + } } } }; - template< class Ordinal, class Scalar, class Generator > + template void verifyMgs (const std::string& which, Generator& generator, @@ -267,14 +268,15 @@ namespace TSQR { template static double // returns timing in s do_mgs_benchmark (MgsBase& orthogonalizer, - Matrix< typename MgsBase::ordinal_type, typename MgsBase::scalar_type >& Q_local, - Matrix< typename MgsBase::ordinal_type, typename MgsBase::scalar_type >& R, + Matrix& Q_local, + Matrix& R, const int num_trials, const bool human_readable) { - typedef typename MgsBase::ordinal_type ordinal_type; using std::cout; - + using 
ordinal_type = typename MgsBase::ordinal_type; TSQR::Test::verifyTimerConcept(); const ordinal_type nrows_local = Q_local.extent(0); diff --git a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp index f27745ad7c07..155081ca8d38 100644 --- a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp @@ -93,7 +93,7 @@ namespace TSQR { typedef Scalar scalar_type; typedef FactorOutputType factor_output_type; typedef MatView mat_view_type; - typedef ConstMatView const_mat_view_type; + typedef MatView const_mat_view_type; //! Constructor NodeTsqr() = default; @@ -358,10 +358,10 @@ namespace TSQR { /// mat_view_type square (ncols, ncols, top.data(), top.stride(1)); /// \endcode /// - /// Models for MatrixViewType are MatView and ConstMatView. - /// MatrixViewType must have member functions extent(0), extent(1), - /// data(), and stride(1), and its constructor must take the same four - /// arguments as the constructor of ConstMatView. + /// A model for MatrixViewType is MatView. MatrixViewType must + /// have member functions extent(0), extent(1), data(), and + /// stride(1), and its constructor must take the same four + /// arguments as the constructor of MatView. 
template MatrixViewType top_block (const MatrixViewType& C, diff --git a/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp b/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp index 60b043d09e9e..23e2d61c1d31 100644 --- a/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_RMessenger.hpp @@ -134,14 +134,13 @@ namespace TSQR { void pack (const ConstMatrixViewType& R) { - typedef typename ConstMatrixViewType::scalar_type view_scalar_type; - typedef typename ConstMatrixViewType::ordinal_type view_ordinal_type; - typedef typename std::vector< Scalar >::iterator iter_type; + using view_scalar_type = typename ConstMatrixViewType::non_const_value_type; + using view_ordinal_type = typename ConstMatrixViewType::ordinal_type; const view_ordinal_type ncols = R.extent(1); const Ordinal buf_length = buffer_length (ncols); buffer_.resize (buf_length); - iter_type iter = buffer_.begin(); + auto iter = buffer_.begin(); for (view_ordinal_type j = 0; j < ncols; ++j) { const view_scalar_type* const R_j = &R(0,j); std::copy (R_j, R_j + (j+1), iter); @@ -181,18 +180,17 @@ namespace TSQR { void scatterStack (const ConstMatrixViewType& R_stack, MatrixViewType& R_local, - const Teuchos::RCP >& messenger) + const Teuchos::RCP >& messenger) { - typedef typename MatrixViewType::ordinal_type ordinal_type; - typedef typename MatrixViewType::scalar_type scalar_type; - typedef ConstMatView< ordinal_type, scalar_type > const_view_type; + using ordinal_type = typename MatrixViewType::ordinal_type; + using scalar_type = typename MatrixViewType::non_const_value_type; + using const_view_type = MatView; const int nprocs = messenger->size(); const int my_rank = messenger->rank(); if (my_rank == 0) { const ordinal_type ncols = R_stack.extent(1); - // Copy data from top ncols x ncols block of R_stack into R_local. 
const_view_type R_stack_view_first (ncols, ncols, R_stack.data(), R_stack.stride(1)); @@ -200,9 +198,9 @@ namespace TSQR { // Loop through all other processors, sending each the next // ncols x ncols block of R_stack. - RMessenger< ordinal_type, scalar_type > sender (messenger); + RMessenger sender (messenger); for (int destProc = 1; destProc < nprocs; ++destProc) { - const scalar_type* const R_ptr = R_stack.data() + destProc*ncols; + auto R_ptr = R_stack.data() + destProc*ncols; const_view_type R_stack_view_cur (ncols, ncols, R_ptr, R_stack.stride(1)); sender.send (R_stack_view_cur, destProc); } @@ -222,27 +220,26 @@ namespace TSQR { void gatherStack (MatrixViewType& R_stack, ConstMatrixViewType& R_local, - const Teuchos::RCP >& messenger) + const Teuchos::RCP>& messenger) { - typedef typename MatrixViewType::ordinal_type ordinal_type; - typedef typename MatrixViewType::scalar_type scalar_type; - typedef MatView mat_view_type; + using ordinal_type = typename MatrixViewType::ordinal_type; + using scalar_type = typename MatrixViewType::non_const_value_type; + using mat_view_type = MatView; const int nprocs = messenger->size(); const int my_rank = messenger->rank(); if (my_rank == 0) { const ordinal_type ncols = R_stack.extent(1); - // Copy data from R_local into top ncols x ncols block of R_stack. mat_view_type R_stack_view_first (ncols, ncols, R_stack.data(), R_stack.stride(1)); deep_copy (R_stack_view_first, R_local); // Loop through all other processors, fetching their matrix data. 
- RMessenger< ordinal_type, scalar_type > receiver (messenger); + RMessenger receiver (messenger); for (int srcProc = 1; srcProc < nprocs; ++srcProc) { - const scalar_type* const R_ptr = R_stack.data() + srcProc*ncols; + auto R_ptr = R_stack.data() + srcProc*ncols; mat_view_type R_stack_view_cur (ncols, ncols, R_ptr, R_stack.stride(1)); // Fill (the lower triangle) with zeros, since diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp index 75db87ce86aa..4f082b04bac3 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_GlobalMatrix.hpp @@ -56,43 +56,41 @@ namespace TSQR { template static void scaleMatrix (MatrixViewType& A, - const typename MatrixViewType::scalar_type& denom) + const typename MatrixViewType::non_const_value_type& denom) { - typedef typename MatrixViewType::ordinal_type ordinal_type; - typedef typename MatrixViewType::scalar_type scalar_type; - - const ordinal_type nrows = A.extent(0); - const ordinal_type ncols = A.extent(1); - const ordinal_type lda = A.stride(1); + using LO = typename MatrixViewType::ordinal_type; + const LO nrows = A.extent(0); + const LO ncols = A.extent(1); + const LO lda = A.stride(1); if (nrows == lda) { // A is stored contiguously. - const ordinal_type nelts = nrows * ncols; - scalar_type* const A_ptr = A.data (); - for (ordinal_type k = 0; k < nelts; ++k) { + const LO nelts = nrows * ncols; + auto A_ptr = A.data (); + for (LO k = 0; k < nelts; ++k) { A_ptr[k] /= denom; } } else { // Each column of A is stored contiguously. 
- for (ordinal_type j = 0; j < ncols; ++j) { - scalar_type* const A_j = &A(0,j); - for (ordinal_type i = 0; i < nrows; ++i) { + for (LO j = 0; j < ncols; ++j) { + auto A_j = &A(0,j); + for (LO i = 0; i < nrows; ++i) { A_j[i] /= denom; } } } } - template< class MatrixViewType, class Generator > + template void randomGlobalMatrix (Generator* const pGenerator, MatrixViewType& A_local, - const typename Teuchos::ScalarTraits< typename MatrixViewType::scalar_type >::magnitudeType singular_values[], - MessengerBase< typename MatrixViewType::ordinal_type >* const ordinalMessenger, - MessengerBase< typename MatrixViewType::scalar_type >* const scalarMessenger) + const typename Teuchos::ScalarTraits::magnitudeType singular_values[], + MessengerBase< typename MatrixViewType::ordinal_type>* const ordinalMessenger, + MessengerBase< typename MatrixViewType::non_const_value_type>* const scalarMessenger) { using Teuchos::NO_TRANS; using ordinal_type = typename MatrixViewType::ordinal_type; - using scalar_type = typename MatrixViewType::scalar_type; + using scalar_type = typename MatrixViewType::non_const_value_type; using STS = Teuchos::ScalarTraits; const int rootProc = 0; diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp index 513f7816a090..399f13fa8fde 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp @@ -158,7 +158,7 @@ namespace TSQR { template< class MatrixViewType > void implicit_Q (MatrixViewType& Q, - typename MatrixViewType::scalar_type tau[]) + typename MatrixViewType::non_const_value_type tau[]) { implicit_Q (Q.extent(0), Q.extent(1), Q.data(), Q.stride(1), tau); } diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp index 6cfc31e84e05..9a84fa299ad3 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp +++ 
b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp @@ -65,7 +65,7 @@ namespace TSQR { class SequentialCholeskyQR { private: using mat_view_type = MatView; - using const_mat_view_type = ConstMatView; + using const_mat_view_type = MatView; using blas_type = Impl::SystemBlas; public: diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp index f057fc6d0a38..d8c604dad86f 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp @@ -119,10 +119,10 @@ namespace TSQR { using ordinal_type = LocalOrdinal; using scalar_type = Scalar; using mat_view_type = MatView; - using const_mat_view_type = ConstMatView; + using const_mat_view_type = MatView; using magnitude_type = typename Teuchos::ScalarTraits::magnitudeType; using FactorOutput = typename NodeTsqr>>::factor_output_type; + std::vector>>::factor_output_type; private: /// \brief Factor the first cache block of the matrix. diff --git a/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp b/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp index c61f4051e08a..f18b66f897fe 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp @@ -73,28 +73,30 @@ namespace TSQR { distributedTestProblem (Generator& generator, MatrixViewType& A_local, MessengerBase* const ordinalComm, - MessengerBase* const scalarComm) + MessengerBase* const scalarComm) { - typedef typename MatrixViewType::ordinal_type ordinal_type; - typedef typename MatrixViewType::scalar_type scalar_type; - typedef typename Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type; + using ordinal_type = typename MatrixViewType::ordinal_type; + using scalar_type = + typename MatrixViewType::non_const_value_type; + using magnitude_type = + typename Teuchos::ScalarTraits::magnitudeType; const int myRank = scalarComm->rank(); const ordinal_type ncols = A_local.extent(1); - if (myRank == 0) { // Generate some 
singular values for the test problem. std::vector singular_values (ncols); singular_values[0] = 1.0; - for (ordinal_type k = 1; k < ncols; ++k) - singular_values[k] = singular_values[k-1] / double(2); + for (ordinal_type k = 1; k < ncols; ++k) { + singular_values[k] = singular_values[k-1] / magnitude_type(2.0); + } // Generate the test problem. All MPI processes // participate, but only Proc 0 generates the (pseudo)random // numbers. TSQR::Random::randomGlobalMatrix (&generator, A_local, - &singular_values[0], ordinalComm, - scalarComm); + singular_values.data (), + ordinalComm, scalarComm); } else { // This helps C++ deduce the type; the values aren't read on @@ -104,8 +106,8 @@ namespace TSQR { // All MPI processes participate in the distribution of the // test matrix. TSQR::Random::randomGlobalMatrix (&generator, A_local, - &singular_values[0], ordinalComm, - scalarComm); + singular_values.data (), + ordinalComm, scalarComm); } } } // namespace Test diff --git a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp index fb6c532b3e54..dea7317ad040 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp @@ -59,13 +59,13 @@ namespace TSQR { template class TsqrVerifier { public: - typedef TsqrType tsqr_type; - typedef typename tsqr_type::scalar_type scalar_type; - typedef typename tsqr_type::ordinal_type ordinal_type; - typedef Matrix matrix_type; - typedef typename tsqr_type::FactorOutput factor_output_type; - typedef MessengerBase messenger_type; - typedef Teuchos::RCP messenger_ptr; + using tsqr_type = TsqrType; + using scalar_type = typename tsqr_type::scalar_type; + using ordinal_type = typename tsqr_type::ordinal_type; + using matrix_type = Matrix; + using factor_output_type = typename tsqr_type::FactorOutput; + using messenger_type = MessengerBase; + using messenger_ptr = Teuchos::RCP; static void verify (tsqr_type& tsqr, @@ -99,34 +99,40 @@ namespace TSQR { const bool 
testFactorExplicit = true; if (testFactorExplicit) { - tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(), - contiguousCacheBlocks); + tsqr.factorExplicit (A_copy.view(), Q_local.view(), + R.view(), contiguousCacheBlocks); if (b_debug) { scalarComm->barrier (); - if (scalarComm->rank () == 0) + if (scalarComm->rank () == 0) { cerr << "-- Finished Tsqr::factorExplicit" << endl; + } } } else { // Factor the (copy of the) matrix. factor_output_type factorOutput = - tsqr.factor (nrows_local, ncols, A_copy.data(), A_copy.stride(1), - R.data(), R.stride(1), contiguousCacheBlocks); + tsqr.factor (nrows_local, ncols, + A_copy.data(), A_copy.stride(1), + R.data(), R.stride(1), + contiguousCacheBlocks); if (b_debug) { scalarComm->barrier (); - if (scalarComm->rank () == 0) + if (scalarComm->rank () == 0) { cerr << "-- Finished Tsqr::factor" << endl; + } } // Compute the explicit Q factor in Q_local tsqr.explicit_Q (nrows_local, - ncols, A_copy.data(), A_copy.stride(1), factorOutput, + ncols, A_copy.data(), A_copy.stride(1), + factorOutput, ncols, Q_local.data(), Q_local.stride(1), contiguousCacheBlocks); if (b_debug) { scalarComm->barrier (); - if (scalarComm->rank () == 0) + if (scalarComm->rank () == 0) { cerr << "-- Finished Tsqr::explicit_Q" << endl; + } } } @@ -143,8 +149,9 @@ namespace TSQR { if (b_debug) { scalarComm->barrier (); - if (scalarComm->rank () == 0) + if (scalarComm->rank () == 0) { cerr << "-- Un-cache-blocked output Q factor" << endl; + } } } } @@ -422,11 +429,15 @@ namespace TSQR { double do_tsqr_benchmark (const std::string& which, TsqrBase& tsqr, - const Teuchos::RCP< MessengerBase< typename TsqrBase::scalar_type > >& messenger, - const Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& A_local, - Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& A_copy, - Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& Q_local, - Matrix< typename TsqrBase::ordinal_type, typename 
TsqrBase::scalar_type >& R, + const Teuchos::RCP>& messenger, + const Matrix& A_local, + Matrix& A_copy, + Matrix& Q_local, + Matrix& R, const int ntrials, const bool contiguousCacheBlocks, const bool human_readable, diff --git a/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp b/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp index c1caa7adfb9f..ee33481e952e 100644 --- a/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_printGlobalMatrix.hpp @@ -59,100 +59,95 @@ namespace TSQR { /// \param out [out] Output stream to which to write the matrix (on /// MPI Proc 0 only, relative to the underlying communicator). /// \param A_local [in] Each MPI process' part of the matrix. - /// \param scalarComm [in/out] Communicator wrapper for - /// ConstMatrixViewType::scalar_type objects. + /// \param scalarComm [in/out] Communicator wrapper for scalar + /// objects. /// \param ordinalComm [in/out] Communicator wrapper for /// ConstMatrixViewType::ordinal_type objects. 
template void printGlobalMatrix (std::ostream& out, const ConstMatrixViewType& A_local, - MessengerBase* const scalarComm, + MessengerBase* const scalarComm, MessengerBase* const ordinalComm) - { - typedef typename ConstMatrixViewType::ordinal_type LocalOrdinal; - typedef typename ConstMatrixViewType::scalar_type Scalar; - typedef Teuchos::ScalarTraits STS; - using std::endl; + { + using LocalOrdinal = typename ConstMatrixViewType::ordinal_type; + using Scalar = typename ConstMatrixViewType::non_const_value_type; + using STS = Teuchos::ScalarTraits; + using std::endl; - const int myRank = scalarComm->rank (); - const int nprocs = scalarComm->size (); - const LocalOrdinal nrowsLocal = A_local.extent(0); - const LocalOrdinal ncols = A_local.extent(1); - const Scalar quiet_NaN = STS::nan(); + const int myRank = scalarComm->rank (); + const int nprocs = scalarComm->size (); + const LocalOrdinal nrowsLocal = A_local.extent(0); + const LocalOrdinal ncols = A_local.extent(1); + const Scalar quiet_NaN = STS::nan(); - if (myRank == 0) - { - // Print the remote matrix data - // out << "Processor " << my_rank << ":" << endl; - print_local_matrix (out, A_local.extent(0), A_local.extent(1), - A_local.data(), A_local.stride(1)); + if (myRank == 0) { + // Print the remote matrix data + print_local_matrix (out, A_local.extent(0), A_local.extent(1), + A_local.data(), A_local.stride(1)); - // Space for remote matrix data. Other processors are allowed - // to have different nrows_local values; we make space as - // necessary. - Matrix A_remote (nrowsLocal, ncols, quiet_NaN); + // Space for remote matrix data. Other processes are allowed to + // have different nrows_local values; we make space as needed. + Matrix A_remote (nrowsLocal, ncols, quiet_NaN); - // Loop through all the other processors in order. - // Fetch their matrix data and print it. 
- for (int srcProc = 1; srcProc < nprocs; ++srcProc) - { - // Get processor proc's local matrix dimensions - LocalOrdinal dims[2]; - ordinalComm->recv (&dims[0], 2, srcProc, 0); + // Loop through all the other processes in order. Fetch their + // matrix data and print it. + for (int srcProc = 1; srcProc < nprocs; ++srcProc) { + // Get local matrix dimensions + LocalOrdinal dims[2]; + ordinalComm->recv (&dims[0], 2, srcProc, 0); - // Make space for the remote matrix data. - // - // mfh 13 Oct 2010: Teuchos::OrdinalTraits does not - // currently have this feature. It's OK to use - // std::numeric_limits, since ordinal types in Trilinos - // are intended to be built-in types (like int or long - // long int). std::numeric_limits only promises to work - // for built-in types, unless someone has defined an - // appropriate specialization. Teuchos::ScalarTraits, - // in contrast, has to work for non-built-in Scalar - // types, like ARPREC or QD floating-point numbers. - if (std::numeric_limits::is_signed) - { - if (dims[0] <= 0 || dims[1] <= 0) - throw std::runtime_error ("Invalid dimensions of remote matrix"); - } - else - { - if (dims[0] == 0 || dims[1] == 0) - throw std::runtime_error ("Invalid dimensions of remote matrix"); - } - A_remote.reshape (dims[0], dims[1]); + // Make space for the remote matrix data. + // + // mfh 13 Oct 2010: Teuchos::OrdinalTraits does not currently + // have this feature. It's OK to use std::numeric_limits, + // since ordinal types in Trilinos are intended to be built-in + // types (like int or long long int). std::numeric_limits + // only promises to work for built-in types, unless someone + // has defined an appropriate specialization. + // Teuchos::ScalarTraits, in contrast, has to work for + // non-built-in Scalar types, like ARPREC or QD floating-point + // numbers. 
+ if (std::numeric_limits::is_signed) { + if (dims[0] <= 0 || dims[1] <= 0) { + throw std::runtime_error ("Invalid dimensions of remote matrix"); + } + } + else { + if (dims[0] == 0 || dims[1] == 0) { + throw std::runtime_error ("Invalid dimensions of remote matrix"); + } + } + A_remote.reshape (dims[0], dims[1]); - // Receive the remote matrix data, which we assume is - // stored contiguously. - scalarComm->recv (A_remote.data(), dims[0]*dims[1], srcProc, 0); + // Receive the remote matrix data, which we assume is + // stored contiguously. + scalarComm->recv (A_remote.data(), dims[0]*dims[1], srcProc, 0); - // Print the remote matrix data - // out << "Processor " << proc << ":" << endl; - print_local_matrix (out, dims[0], dims[0], A_remote.data(), A_remote.stride(1)); - } - } - else - { - // Send my local matrix dimensions to proc 0. - int rootProc = 0; - LocalOrdinal dims[2]; + // Print the remote matrix data + // out << "Processor " << proc << ":" << endl; + print_local_matrix (out, dims[0], dims[0], A_remote.data(), + A_remote.stride(1)); + } + } + else { + // Send my local matrix dimensions to proc 0. + int rootProc = 0; + LocalOrdinal dims[2]; - dims[0] = nrowsLocal; - dims[1] = ncols; - ordinalComm->send (dims, 2, rootProc, 0); + dims[0] = nrowsLocal; + dims[1] = ncols; + ordinalComm->send (dims, 2, rootProc, 0); - // Create a (contiguous) buffer and copy the data into it. - Matrix< LocalOrdinal, Scalar > A_buf (nrowsLocal, ncols, quiet_NaN); - deep_copy (A_buf, A_local); + // Create a (contiguous) buffer and copy the data into it. + Matrix< LocalOrdinal, Scalar > A_buf (nrowsLocal, ncols, quiet_NaN); + deep_copy (A_buf, A_local); - // Send the actual data to proc 0. - scalarComm->send (A_buf.data(), nrowsLocal*ncols, rootProc, 0); - } - scalarComm->barrier (); + // Send the actual data to proc 0. 
+ scalarComm->send (A_buf.data(), nrowsLocal*ncols, rootProc, 0); } - + scalarComm->barrier (); + } } // namespace TSQR #endif // __Tsqr_printGlobalMatrix_hpp From 48e09ec7d298363da217ffc17604d2a640ed967b Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 12:11:55 -0700 Subject: [PATCH 22/50] TSQR: Clean up Combine test a bit --- packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 395 +++++++++--------- 1 file changed, 198 insertions(+), 197 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index 56515d6ec629..9bf1d50a25b5 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -328,96 +328,94 @@ namespace TSQR { // Fill the explicit Q factor matrices with the first numCols // columns of the identity matrix. - for (Ordinal k = 0; k < numCols; ++k) - { - Q_R1R2(k, k) = Scalar(1); - Q_R3A(k, k) = Scalar(1); - } + for (Ordinal k = 0; k < numCols; ++k) { + Q_R1R2(k, k) = Scalar(1.0); + Q_R3A(k, k) = Scalar(1.0); + } // tau factor arrays, one for each factorization test. - vector< Scalar > tau_R1R2 (numCols); - vector< Scalar > tau_R3A (numCols); + vector tau_R1R2 (numCols); + vector tau_R3A (numCols); // Workspace array for factorization and applying the Q factor. // We recycle this workspace for all tests. 
- vector< Scalar > work (numCols); + vector work (numCols); - if (debug) + if (debug) { cerr << endl << "----------------------------------------" << endl << "TSQR::Combine first test problem:" << endl << "qr( [R1; R2] ), with R1 and R2 " << numCols << " by " << numCols << endl << endl; - - Combine< Ordinal, Scalar > combiner; - combiner.factor_pair (numCols, R1.data(), R1.stride(1), R2.data(), R2.stride(1), - &tau_R1R2[0], work.data()); + } + Combine combiner; + combiner.factor_pair (numCols, R1.data(), R1.stride(1), + R2.data(), R2.stride(1), + tau_R1R2.data(), work.data()); combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.data(), R2.stride(1), &tau_R1R2[0], + R2.data(), R2.stride(1), tau_R1R2.data(), &Q_R1R2(0, 0), Q_R1R2.stride(1), &Q_R1R2(numCols, 0), Q_R1R2.stride(1), work.data()); - if (debug) - { - cerr << "Results of first test problem:" << endl; - cerr << "-- Copy of test problem:" << endl; - print_local_matrix (cerr, A_R1R2.extent(0), A_R1R2.extent(1), - A_R1R2.data(), A_R1R2.stride(1)); - cerr << endl << "-- Q factor:" << endl; - print_local_matrix (cerr, Q_R1R2.extent(0), Q_R1R2.extent(1), - Q_R1R2.data(), Q_R1R2.stride(1)); - cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, R1.extent(0), R1.extent(1), - R1.data(), R1.stride(1)); - cerr << endl; - } + if (debug) { + cerr << "Results of first test problem:" << endl; + cerr << "-- Copy of test problem:" << endl; + print_local_matrix (cerr, A_R1R2.extent(0), A_R1R2.extent(1), + A_R1R2.data(), A_R1R2.stride(1)); + cerr << endl << "-- Q factor:" << endl; + print_local_matrix (cerr, Q_R1R2.extent(0), Q_R1R2.extent(1), + Q_R1R2.data(), Q_R1R2.stride(1)); + cerr << endl << "-- R factor:" << endl; + print_local_matrix (cerr, R1.extent(0), R1.extent(1), + R1.data(), R1.stride(1)); + cerr << endl; + } const results_type firstResults = local_verify (A_R1R2.extent(0), A_R1R2.extent(1), A_R1R2.data(), A_R1R2.stride(1), Q_R1R2.data(), Q_R1R2.stride(1), R1.data(), R1.stride(1)); - if 
(debug) + if (debug) { cerr << "\\| A - Q*R \\|_F = " << firstResults[0] << endl << "\\| I - Q'*Q \\|_F = " << firstResults[1] << endl << "\\| A \\|_A = " << firstResults[2] << endl; - - if (debug) cerr << endl << "----------------------------------------" << endl << "TSQR::Combine second test problem:" << endl << "qr( [R3; A] ), with R3 " << numCols << " by " << numCols << " and A " << numRows << " by " << numCols << endl << endl; - - combiner.factor_inner (numRows, numCols, R3.data(), R3.stride(1), - A.data(), A.stride(1), &tau_R3A[0], work.data()); + } + combiner.factor_inner (numRows, numCols, + R3.data(), R3.stride(1), + A.data(), A.stride(1), + tau_R3A.data(), work.data()); combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.stride(1), &tau_R3A[0], + A.data(), A.stride(1), tau_R3A.data(), &Q_R3A(0, 0), Q_R3A.stride(1), &Q_R3A(numCols, 0), Q_R3A.stride(1), work.data()); - if (debug) - { - cerr << "Results of second test problem:" << endl; - cerr << "-- Copy of test problem:" << endl; - print_local_matrix (cerr, A_R3A.extent(0), A_R3A.extent(1), - A_R3A.data(), A_R3A.stride(1)); - cerr << endl << "-- Q factor:" << endl; - print_local_matrix (cerr, Q_R3A.extent(0), Q_R3A.extent(1), - Q_R3A.data(), Q_R3A.stride(1)); - cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, R3.extent(0), R3.extent(1), - R3.data(), R3.stride(1)); - cerr << endl; - } + if (debug) { + cerr << "Results of second test problem:" << endl; + cerr << "-- Copy of test problem:" << endl; + print_local_matrix (cerr, A_R3A.extent(0), A_R3A.extent(1), + A_R3A.data(), A_R3A.stride(1)); + cerr << endl << "-- Q factor:" << endl; + print_local_matrix (cerr, Q_R3A.extent(0), Q_R3A.extent(1), + Q_R3A.data(), Q_R3A.stride(1)); + cerr << endl << "-- R factor:" << endl; + print_local_matrix (cerr, R3.extent(0), R3.extent(1), + R3.data(), R3.stride(1)); + cerr << endl; + } const results_type secondResults = local_verify (A_R3A.extent(0), A_R3A.extent(1), 
A_R3A.data(), A_R3A.stride(1), Q_R3A.data(), Q_R3A.stride(1), R3.data(), R3.stride(1)); - if (debug) + if (debug) { cerr << "\\| A - Q*R \\|_F = " << secondResults[0] << endl << "\\| I - Q'*Q \\|_F = " << secondResults[1] << endl << "\\| A \\|_A = " << secondResults[2] << endl; - - vector< magnitude_type > finalResults; + } + vector finalResults; finalResults.push_back (firstResults[0]); finalResults.push_back (firstResults[1]); finalResults.push_back (firstResults[2]); @@ -428,11 +426,7 @@ namespace TSQR { return finalResults; } - - - - /// \brief Simulate one combine step of Sequential TSQR - /// + //! Simulate one combine step of Sequential TSQR template static std::vector::magnitudeType> verifyCombineSeqTemplate (TSQR::Random::NormalGenerator& gen, @@ -485,17 +479,16 @@ namespace TSQR { matgen.fill_random_svd (numRows, numCols, A1.data(), A1.stride(1), &sigma_A1[0]); matgen.fill_random_svd (numRows, numCols, A2.data(), A2.stride(1), &sigma_A2[0]); - if (false && debug) - { - cerr << endl << "Test problem:" << endl; - cerr << endl << "Original matrix:" << endl; - printMatrix (cerr, A); - cerr << endl << "First cache block:" << endl; - printMatrix (cerr, A1); - cerr << endl << "Second cache block:" << endl; - printMatrix (cerr, A2); - cerr << endl; - } + if (false && debug) { + cerr << endl << "Test problem:" << endl; + cerr << endl << "Original matrix:" << endl; + printMatrix (cerr, A); + cerr << endl << "First cache block:" << endl; + printMatrix (cerr, A1); + cerr << endl << "Second cache block:" << endl; + printMatrix (cerr, A2); + cerr << endl; + } // Copy of the resulting test problem, stored as one dense // matrix rather than as two blocks. We will use A_copy to @@ -507,42 +500,56 @@ namespace TSQR { matrix_type Q (Ordinal(2) * numRows, numCols, Scalar(0)); // Fill Q with the first numCols columns of the identity matrix. 
- for (Ordinal k = 0; k < numCols; ++k) - Q(k, k) = Scalar(1); + for (Ordinal k = 0; k < numCols; ++k) { + // FIXME (mfh 26 Nov 2019) I'm assuming I can write to the + // Matrix or MatView on host, outside of Kokkos. TSQR always + // assumed this, but if we want to use Kokkos, we'll need to + // get rid of that assumption. + Q(k, k) = Scalar(1.0); + } // Two cache blocks (as views) of Q. mat_view_type Q1 (numRows, numCols, &Q(0,0), Q.stride(1)); mat_view_type Q2 (numRows, numCols, &Q(numRows,0), Q.stride(1)); // Two tau factor arrays, one for each cache block. - vector< Scalar > tau1 (numCols); - vector< Scalar > tau2 (numCols); + vector tau1 (numCols); + vector tau2 (numCols); // Workspace array for factorization and applying the Q factor. // We recycle this workspace for all tests. - vector< Scalar > work (numCols); + vector work (numCols); - if (debug) + if (debug) { cerr << endl << "----------------------------------------" << endl << "TSQR::Combine SequentialTsqr simulation with 2 cache blocks:" << endl << "qr( [A1; A2] ), with A1 and A2 being each " << numRows << " by " << numCols << endl << endl; - - Combine< Ordinal, Scalar > combiner; + } + Combine combiner; // qr( A1 ) - combiner.factor_first (numRows, numCols, A1.data(), A1.stride(1), - &tau1[0], work.data()); + combiner.factor_first (numRows, numCols, + A1.data(), A1.stride(1), + tau1.data(), work.data()); // View of numCols by numCols upper triangle of A1. mat_view_type R1 (numCols, numCols, A1.data(), A1.stride(1)); // qr( [R1; A2] ) - combiner.factor_inner (numRows, numCols, R1.data(), R1.stride(1), - A2.data(), A2.stride(1), &tau2[0], work.data()); + combiner.factor_inner (numRows, numCols, + R1.data(), R1.stride(1), + A2.data(), A2.stride(1), + tau2.data(), work.data()); // Extract (a deep copy of) the R factor. matrix_type R (R1); // Zero out everything below the diagonal of R. 
- for (Ordinal j = 0; j < numCols; ++j) - for (Ordinal i = j+1; i < numCols; ++i) - R(i,j) = Scalar(0); + for (Ordinal j = 0; j < numCols; ++j) { + for (Ordinal i = j+1; i < numCols; ++i) { + // FIXME (mfh 26 Nov 2019) I'm assuming I can write to the + // Matrix or MatView on host, outside of Kokkos. TSQR + // always assumed this in the past, but if we want to use + // Kokkos, we'll need to get rid of that assumption. + R(i,j) = Scalar {}; + } + } // Compute the explicit Q factor, by starting with A2 and // (working up the matrix A,) finishing with A1. @@ -575,7 +582,6 @@ namespace TSQR { return results; } - void verifyCombine (const int numRows, const int numCols, @@ -616,130 +622,125 @@ namespace TSQR { // output data. bool doPrintFieldNames = printFieldNames; - if (! simulateSequentialTsqr) - { - if (testReal) + if (! simulateSequentialTsqr) { + if (testReal) { + { + NormalGenerator normgenS (iseed); + const vector resultsS = + verifyCombineTemplate (normgenS, normgenS, numRows, + numCols, debug); + // Only print field names (if at all) once per run, for + // the first data type. + printResults (string("float"), numRows, numCols, + resultsS, doPrintFieldNames); + // Print field names at most once. + doPrintFieldNames = false; + // Fetch the pseudorandom seed from the previous test. + normgenS.getSeed (iseed); + } + { + NormalGenerator normgenD (iseed); + const vector resultsD = + verifyCombineTemplate (normgenD, normgenD, numRows, + numCols, debug); + printResults (string("double"), numRows, numCols, + resultsD, doPrintFieldNames); + doPrintFieldNames = false; + normgenD.getSeed (iseed); + } + } + + if (testComplex) + { +#ifdef HAVE_KOKKOSTSQR_COMPLEX { - { - NormalGenerator normgenS (iseed); - const vector resultsS = - verifyCombineTemplate (normgenS, normgenS, numRows, - numCols, debug); - // Only print field names (if at all) once per run, for - // the first data type. 
- printResults (string("float"), numRows, numCols, - resultsS, doPrintFieldNames); - // Print field names at most once. - doPrintFieldNames = false; - // Fetch the pseudorandom seed from the previous test. - normgenS.getSeed (iseed); - } - { - NormalGenerator normgenD (iseed); - const vector resultsD = - verifyCombineTemplate (normgenD, normgenD, numRows, - numCols, debug); - printResults (string("double"), numRows, numCols, - resultsD, doPrintFieldNames); - doPrintFieldNames = false; - normgenD.getSeed (iseed); - } + NormalGenerator > normgenC (iseed); + NormalGenerator normgenS (iseed); + const vector resultsC = + verifyCombineTemplate (normgenC, normgenS, numRows, + numCols, debug); + printResults (string("complex"), numRows, numCols, + resultsC, doPrintFieldNames); + doPrintFieldNames = false; + // Even though normgenC and normgenS each updated the + // random seed independently, for now we just fetch the + // updated seed from normgenC. This should still + // produce reproducible results. + normgenC.getSeed (iseed); } - - if (testComplex) { -#ifdef HAVE_KOKKOSTSQR_COMPLEX - { - NormalGenerator > normgenC (iseed); - NormalGenerator normgenS (iseed); - const vector resultsC = - verifyCombineTemplate (normgenC, normgenS, numRows, - numCols, debug); - printResults (string("complex"), numRows, numCols, - resultsC, doPrintFieldNames); - doPrintFieldNames = false; - // Even though normgenC and normgenS each updated the - // random seed independently, for now we just fetch the - // updated seed from normgenC. This should still - // produce reproducible results. 
- normgenC.getSeed (iseed); - } - { - NormalGenerator > normgenZ (iseed); - NormalGenerator normgenD (iseed); - const vector resultsZ = - verifyCombineTemplate (normgenZ, normgenD, numRows, - numCols, debug); - printResults (string("complex"), numRows, numCols, - resultsZ, doPrintFieldNames); - doPrintFieldNames = false; - normgenZ.getSeed (iseed); - } + NormalGenerator > normgenZ (iseed); + NormalGenerator normgenD (iseed); + const vector resultsZ = + verifyCombineTemplate (normgenZ, normgenD, numRows, + numCols, debug); + printResults (string("complex"), numRows, numCols, + resultsZ, doPrintFieldNames); + doPrintFieldNames = false; + normgenZ.getSeed (iseed); + } #else // NOT HAVE_KOKKOSTSQR_COMPLEX - TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, - "Trilinos was not built with " - "complex arithmetic support"); + TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, + "Trilinos was not built with " + "complex arithmetic support"); #endif // HAVE_KOKKOSTSQR_COMPLEX - } + } + } + else { // simulateSequentialTsqr + if (testReal) { + { + NormalGenerator normgenS (iseed); + const vector resultsS = + verifyCombineSeqTemplate (normgenS, normgenS, numRows, + numCols, debug); + printSimSeqTsqrResults (string("float"), numRows, numCols, + resultsS, doPrintFieldNames); + doPrintFieldNames = false; + normgenS.getSeed (iseed); + } + { + NormalGenerator normgenD (iseed); + const vector resultsD = + verifyCombineSeqTemplate (normgenD, normgenD, numRows, + numCols, debug); + printSimSeqTsqrResults (string("double"), numRows, numCols, + resultsD, doPrintFieldNames); + doPrintFieldNames = false; + normgenD.getSeed (iseed); + } } - else // simulateSequentialTsqr - { - if (testReal) - { - { - NormalGenerator normgenS (iseed); - const vector resultsS = - verifyCombineSeqTemplate (normgenS, normgenS, numRows, - numCols, debug); - printSimSeqTsqrResults (string("float"), numRows, numCols, - resultsS, doPrintFieldNames); - doPrintFieldNames = false; - normgenS.getSeed (iseed); - } - { 
- NormalGenerator normgenD (iseed); - const vector resultsD = - verifyCombineSeqTemplate (normgenD, normgenD, numRows, - numCols, debug); - printSimSeqTsqrResults (string("double"), numRows, numCols, - resultsD, doPrintFieldNames); - doPrintFieldNames = false; - normgenD.getSeed (iseed); - } - } - if (testComplex) - { + if (testComplex) { #ifdef HAVE_KOKKOSTSQR_COMPLEX - { - NormalGenerator > normgenC (iseed); - NormalGenerator normgenS (iseed); - const vector resultsC = - verifyCombineSeqTemplate (normgenC, normgenS, numRows, - numCols, debug); - printSimSeqTsqrResults (string("complex"), numRows, numCols, - resultsC, doPrintFieldNames); - doPrintFieldNames = false; - normgenC.getSeed (iseed); - } - { - NormalGenerator > normgenZ (iseed); - NormalGenerator normgenD (iseed); - const vector resultsZ = - verifyCombineSeqTemplate (normgenZ, normgenD, numRows, - numCols, debug); - printSimSeqTsqrResults (string("complex"), numRows, - numCols, resultsZ, doPrintFieldNames); - doPrintFieldNames = false; - normgenZ.getSeed (iseed); - } + { + NormalGenerator > normgenC (iseed); + NormalGenerator normgenS (iseed); + const vector resultsC = + verifyCombineSeqTemplate (normgenC, normgenS, numRows, + numCols, debug); + printSimSeqTsqrResults (string("complex"), numRows, numCols, + resultsC, doPrintFieldNames); + doPrintFieldNames = false; + normgenC.getSeed (iseed); + } + { + NormalGenerator > normgenZ (iseed); + NormalGenerator normgenD (iseed); + const vector resultsZ = + verifyCombineSeqTemplate (normgenZ, normgenD, numRows, + numCols, debug); + printSimSeqTsqrResults (string("complex"), numRows, + numCols, resultsZ, doPrintFieldNames); + doPrintFieldNames = false; + normgenZ.getSeed (iseed); + } #else // NOT HAVE_KOKKOSTSQR_COMPLEX - TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, - "Trilinos was not built with " - "complex arithmetic support"); + TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, + "Trilinos was not built with " + "complex arithmetic support"); #endif 
// HAVE_KOKKOSTSQR_COMPLEX - } } + } } } // namespace Test } // namespace TSQR From 32790f3f9c65d300899af7f67009e5c1633befe5 Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Tue, 26 Nov 2019 12:51:13 -0700 Subject: [PATCH 23/50] ML: More experimental coarsening --- packages/ml/src/Coarsen/ml_amg_MIS.c | 33 +++++++++++++------ .../ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp | 25 ++++++++------ packages/ml/test/RefMaxwell/cxx_main.cpp | 8 ++++- 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/packages/ml/src/Coarsen/ml_amg_MIS.c b/packages/ml/src/Coarsen/ml_amg_MIS.c index 7bb1552db35a..3d938b9f1f57 100644 --- a/packages/ml/src/Coarsen/ml_amg_MIS.c +++ b/packages/ml/src/Coarsen/ml_amg_MIS.c @@ -58,6 +58,9 @@ int ML_AMG_CoarsenMIS( ML_AMG *ml_amg, ML_Operator *Amatrix, ML_Aggregate_Comm *aggr_comm; double rowsum_threshold; +#ifdef ML_ROWSUM_DEBUG + int num_points_reset_for_rowsum=0; +#endif /* ============================================================= */ /* get the machine information and matrix references */ /* ============================================================= */ @@ -104,6 +107,8 @@ int ML_AMG_CoarsenMIS( ML_AMG *ml_amg, ML_Operator *Amatrix, { printf("ML_AMG_CoarsenMIS : current level = %d\n", ml_amg->cur_level); printf("ML_AMG_CoarsenMIS : current eps = %e\n", epsilon); + if(rowsum_threshold > 0.0) + printf("ML_AMG_CoarsenMIS : current rowsum = %e\n", rowsum_threshold); } /* ============================================================= */ @@ -237,8 +242,7 @@ int ML_AMG_CoarsenMIS( ML_AMG *ml_amg, ML_Operator *Amatrix, rowptr[0] = 0; for (i = 0; i < Nrows; i++) { - int itmp = total_nnz; - rowsum = 0; + int itmp = total_nnz; ML_get_matrix_row(Amatrix, 1, &i, &allocated, &rowi_col, &rowi_val, &rowi_N, 0); if ( sys_unk_filter ) @@ -247,24 +251,25 @@ int ML_AMG_CoarsenMIS( ML_AMG *ml_amg, ML_Operator *Amatrix, if (sys_info[rowi_col[j]] != sys_info[i]) rowi_val[j] = 0.0; } diag = 0.0; - for (j = 0; j < rowi_N; j++) + rowsum = 0.0; + for (j = 
0; j < rowi_N; j++) { if ( rowi_col[j] == i ) diag = rowi_val[j]; + rowsum+=rowi_val[j]; + } + rowmax = 0.0; if ( diag >= 0. ) { - for (j = 0; j < rowi_N; j++) { + for (j = 0; j < rowi_N; j++) if (rowi_col[j] != i) rowmax = ML_min(rowmax, rowi_val[j]); - rowsum+=rowi_col[j]; - } } else { - for (j = 0; j < rowi_N; j++) { + for (j = 0; j < rowi_N; j++) if (rowi_col[j] != i) rowmax = ML_max(rowmax, rowi_val[j]); - rowsum+=rowi_col[j]; - } } rowmax *= epsilon; + if ( diag >= 0. ) { for (j = 0; j < rowi_N; j++) @@ -294,11 +299,19 @@ int ML_AMG_CoarsenMIS( ML_AMG *ml_amg, ML_Operator *Amatrix, if(rowsum_threshold > 0.0 && fabs(rowsum) > fabs(diag) * rowsum_threshold) { column[itmp]=i; values[itmp]=diag; - total_nnz = itmp; + total_nnz = itmp+1; +#ifdef ML_ROWSUM_DEBUG + num_points_reset_for_rowsum++; +#endif } rowptr[i+1] = total_nnz; } +#ifdef ML_ROWSUM_DEBUG + printf("ML_ROWSUM_DEBUG: # points reset via rowsum = %d\n",num_points_reset_for_rowsum); +#endif + + ML_free( rowi_col ); ML_free( rowi_val ); dtemp = A_nnz; diff --git a/packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp b/packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp index 4afac9af8c3b..07a562ed0665 100644 --- a/packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp +++ b/packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp @@ -909,14 +909,14 @@ int ML_Epetra::RefMaxwell_Aggregate_Nodes(const Epetra_CrsMatrix & A, Teuchos::P useSA = false; } + // Setup the Fine Coordinates + ML_Aggregate_Viz_Stats fine_grid; + fine_grid.x=0; fine_grid.y=0; fine_grid.z=0; fine_grid.material=0; + RefMaxwell_SetupCoordinates(A_ML,List,fine_grid.x,fine_grid.y,fine_grid.z,fine_grid.material); + if(useSA) { /* Use SA */ - // Setup the Fine Coordinates - ML_Aggregate_Viz_Stats fine_grid; - fine_grid.x=0; fine_grid.y=0; fine_grid.z=0; fine_grid.material=0; - RefMaxwell_SetupCoordinates(A_ML,List,fine_grid.x,fine_grid.y,fine_grid.z,fine_grid.material); - // FIXME: We need to allow this later TEUCHOS_TEST_FOR_EXCEPTION(UseAux && 
UseMaterial, std::logic_error,"RefMaxwell_Aggregate_Nodes: Cannot use material and aux aggregation at the same time"); @@ -1007,11 +1007,6 @@ int ML_Epetra::RefMaxwell_Aggregate_Nodes(const Epetra_CrsMatrix & A, Teuchos::P if(very_verbose) printf("[%d] %s %d aggregates created invec_leng=%d\n",A.Comm().MyPID(),PrintMsg.c_str(),NumAggregates,P->invec_leng); /* Cleanup */ - if(fine_grid.x) ML_free(fine_grid.x); - if(fine_grid.y) ML_free(fine_grid.y); - if(fine_grid.z) ML_free(fine_grid.z); - if(fine_grid.material) ML_free(fine_grid.material); - ML_qr_fix_Destroy(); if(UseAux) ML_Finalize_Aux(A_ML); if(UseMaterial) ML_Finalize_Aux(A_ML); @@ -1022,6 +1017,7 @@ int ML_Epetra::RefMaxwell_Aggregate_Nodes(const Epetra_CrsMatrix & A, Teuchos::P ML_AMG *ml_amg; ML_AMG_Create( &ml_amg ); ML_AMG_Set_Threshold(ml_amg,Threshold); + if(RowSum_Threshold > 0.0) ML_AMG_Set_RowSum_Threshold(ml_amg, RowSum_Threshold); ML_AMG_Set_MaxLevels(ml_amg,2); ML_AMG_Set_MaxCoarseSize(ml_amg,1); P = ML_Operator_Create(ml_comm); @@ -1040,6 +1036,10 @@ int ML_Epetra::RefMaxwell_Aggregate_Nodes(const Epetra_CrsMatrix & A, Teuchos::P ML_CommInfoOP_Clone(&(AMGIdentity->getrow->pre_comm),A_ML->getrow->pre_comm); ML_2matmult(AMGIdentity, Pmatrix, P, ML_CSR_MATRIX ); + /* Project down the coordinates, if we need to, using Ptent. 
Note NumPDEs always = 1 */ + if(fine_grid.x || fine_grid.y || fine_grid.z || fine_grid.material) + RefMaxwell_Project_Coordinates(1,P,&fine_grid,pack); + /* Cleanup */ ML_Operator_Destroy(&AMGIdentity); ML_Operator_Destroy(&Pmatrix); @@ -1055,6 +1055,11 @@ int ML_Epetra::RefMaxwell_Aggregate_Nodes(const Epetra_CrsMatrix & A, Teuchos::P } } + /* Cleanup */ + if(fine_grid.x) ML_free(fine_grid.x); + if(fine_grid.y) ML_free(fine_grid.y); + if(fine_grid.z) ML_free(fine_grid.z); + if(fine_grid.material) ML_free(fine_grid.material); ML_Operator_Destroy(&A_ML); return 0; diff --git a/packages/ml/test/RefMaxwell/cxx_main.cpp b/packages/ml/test/RefMaxwell/cxx_main.cpp index b53946295d3d..c0be58d53cdc 100644 --- a/packages/ml/test/RefMaxwell/cxx_main.cpp +++ b/packages/ml/test/RefMaxwell/cxx_main.cpp @@ -372,6 +372,10 @@ bool matrix_read(Epetra_ActiveComm &Comm){ Teuchos::ParameterList List_AMG_sp = Build_Teuchos_List(N,coord_ptr,"coarse: type","Amesos-KLU","max levels",1); List_AMG_sp.sublist("refmaxwell: 11list").set("default values","Classical-AMG"); + Teuchos::ParameterList List_AMG_sprs = Build_Teuchos_List(N,coord_ptr,"coarse: type","Amesos-KLU","max levels",1); + List_AMG_sprs.sublist("refmaxwell: 11list").set("default values","Classical-AMG"); + List_AMG_sprs.sublist("refmaxwell: 11list").set("aggregation: rowsum threshold",0.9); + /* Do Tests */ Epetra_Vector lhs(EdgeMap,true); int status1, status2 = 0; @@ -433,7 +437,6 @@ bool matrix_read(Epetra_ActiveComm &Comm){ if(!Comm.MyPID()) printf("*** Test 15 ***\n"); rpc_test_additive_newconstructor(Comm,List_Rowsum,*SM,*M1,*M0inv,*D0,x_exact,lhs,rhs,false); - /* Test w/ material */ lhs.PutScalar(0.0); if(!Comm.MyPID()) printf("*** Test 16 ***\n"); @@ -451,6 +454,9 @@ bool matrix_read(Epetra_ActiveComm &Comm){ if(!Comm.MyPID()) printf("*** Test 19 ***\n"); rpc_test_additive_newconstructor(Comm,List_AMG_sp,*SM,*M1,*M0inv,*D0,x_exact,lhs,rhs,false); + /* Test w/ classical special prolongator and rowsum */ + if(!Comm.MyPID()) 
printf("*** Test 20 ***\n"); + rpc_test_additive_newconstructor(Comm,List_AMG_sprs,*SM,*M1,*M0inv,*D0,x_exact,lhs,rhs,false); From a0fa31355ec72db195def24c06f221a7854ccafa Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Tue, 26 Nov 2019 13:36:02 -0700 Subject: [PATCH 24/50] ML: Default fix --- packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp b/packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp index 07a562ed0665..626aa21e16d0 100644 --- a/packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp +++ b/packages/ml/src/RefMaxwell/ml_RefMaxwell_Utils.cpp @@ -904,7 +904,7 @@ int ML_Epetra::RefMaxwell_Aggregate_Nodes(const Epetra_CrsMatrix & A, Teuchos::P bool useSA = true; if(List.isParameter("default values")) { - std::string default_values = List.get("default values","Classical-AMG"); + std::string default_values = List.get("default values","SA"); if(default_values == "Classical-AMG") useSA = false; } From cdc3fc5844b3c31381d648c3f5036580e6ce4969 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 14:10:24 -0700 Subject: [PATCH 25/50] TSQR: Start replacing copy_matrix Add new partition_2x1 function to help with a particular use case. 
--- packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 40 ++++++++++--------- packages/tpetra/tsqr/src/Tsqr_MatView.hpp | 32 +++++++++++++++ packages/tpetra/tsqr/src/Tsqr_Matrix.hpp | 26 ++++++++++++ 3 files changed, 80 insertions(+), 18 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index 9bf1d50a25b5..1efb3d6cfd36 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -290,31 +290,35 @@ namespace TSQR { matgen.fill_random_R (numCols, R3.data(), R3.stride(1), &sigma_R3[0]); matgen.fill_random_svd (numRows, numCols, A.data(), A.stride(1), &sigma_A[0]); - if (false && debug) - { - cerr << endl << "First test problem:" << endl; - print_local_matrix (cerr, numCols, numCols, R1.data(), R1.stride(1)); - print_local_matrix (cerr, numCols, numCols, R2.data(), R2.stride(1)); - cerr << endl; - - cerr << endl << "Second test problem:" << endl; - print_local_matrix (cerr, numCols, numCols, R3.data(), R3.stride(1)); - print_local_matrix (cerr, numRows, numCols, A.data(), A.stride(1)); - cerr << endl; - } + if (false && debug) { + cerr << endl << "First test problem:" << endl; + print_local_matrix (cerr, numCols, numCols, R1.data(), R1.stride(1)); + print_local_matrix (cerr, numCols, numCols, R2.data(), R2.stride(1)); + cerr << endl; + + cerr << endl << "Second test problem:" << endl; + print_local_matrix (cerr, numCols, numCols, R3.data(), R3.stride(1)); + print_local_matrix (cerr, numRows, numCols, A.data(), A.stride(1)); + cerr << endl; + } // Space to put the original test problem, expressed as one // dense matrix rather than in two blocks. These will be deep // copies of the test problems, since the test problem matrices // will be overwritten by the factorizations. 
- matrix_type A_R1R2 (Ordinal(2) * numCols, numCols, Scalar(0)); - matrix_type A_R3A (numRows + numCols, numCols, Scalar(0)); + matrix_type A_R1R2 (Ordinal(2) * numCols, numCols, Scalar {}); + matrix_type A_R3A (numRows + numCols, numCols, Scalar {}); // Copy [R1; R2] into A_R1R2. - copy_matrix (numCols, numCols, &A_R1R2(0, 0), A_R1R2.stride(1), - R1.data(), R1.stride(1)); - copy_matrix (numCols, numCols, &A_R1R2(numCols, 0), A_R1R2.stride(1), - R2.data(), R2.stride(1)); + { + auto A_R1R2_views = partition_2x1 (A_R1R2, numCols); + // copy_matrix (numCols, numCols, &A_R1R2(0, 0), A_R1R2.stride(1), + // R1.data(), R1.stride(1)); + // copy_matrix (numCols, numCols, &A_R1R2(numCols, 0), A_R1R2.stride(1), + // R2.data(), R2.stride(1)); + deep_copy (A_R1R2_views.first, R1); + deep_copy (A_R1R2_views.second, R2); + } // Copy [R3; A] into A_R3A. copy_matrix (numCols, numCols, &A_R3A(0, 0), A_R3A.stride(1), diff --git a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp index 761c175d4ab6..b9314d462d60 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp @@ -385,6 +385,38 @@ namespace TSQR { } } } + + template + std::pair + partition_2x1 (const MatViewType& A, + const typename MatViewType::ordinal_type nrows_top, + const bool b_contiguous_blocks = false) + { + using ordinal_type = typename MatViewType::ordinal_type; + using pointer = typename MatViewType::pointer; + + const ordinal_type ncols = A.extent(1); + pointer const A_top_ptr = A.data(); + const ordinal_type nrows_bot = A.extent(0) - nrows_top; + + pointer A_bot_ptr; + ordinal_type lda_top, lda_bot; + if (b_contiguous_blocks) { + lda_top = nrows_top; + lda_bot = nrows_bot; + A_bot_ptr = A_top_ptr + nrows_top * A.extent(1); + } + else { // assume column major (LayoutLeft, in Kokkos terms) + lda_top = A.stride(1); + lda_bot = A.stride(1); + A_bot_ptr = A_top_ptr + nrows_top; + } + + MatViewType A_top (nrows_top, ncols, A_top_ptr, 
lda_top); + MatViewType A_bot (nrows_bot, ncols, A_bot_ptr, lda_bot); + return {A_top, A_bot}; + } + } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp index 5380c7023711..e54a91601808 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp @@ -342,6 +342,32 @@ namespace TSQR { tgt.data(), tgt.stride(1)); deep_copy (tgt_view, src); } + + // Matrix is a container, so the version of data() that returns a + // nonconst pointer must be nonconst. + template + std::pair, MatView> + partition_2x1 (Matrix& A, + const typename Matrix::ordinal_type nrows_top, + const bool b_contiguous_blocks = false) + { + MatView A_view (A.extent(0), A.extent(1), + A.data(), A.stride(1)); + return partition_2x1 (A_view, nrows_top, b_contiguous_blocks); + } + + // Matrix is a container, so the version of data() that returns a + // nonconst pointer must be nonconst. + template + std::pair, MatView> + partition_2x1 (const Matrix& A, + const typename Matrix::ordinal_type nrows_top, + const bool b_contiguous_blocks = false) + { + MatView A_view (A.extent(0), A.extent(1), + A.data(), A.stride(1)); + return partition_2x1 (A_view, nrows_top, b_contiguous_blocks); + } } // namespace TSQR #endif // __TSQR_Tsqr_Matrix_hpp From 3d41d906a84aa0e6e42b9074153a4345ac0aa7cf Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 14:18:20 -0700 Subject: [PATCH 26/50] TSQR: Split SequentialTsqr test to exercise more cases We now test both the contiguous cache blocks and noncontiguous cache blocks cases. 
--- packages/tpetra/tsqr/test/CMakeLists.txt | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/packages/tpetra/tsqr/test/CMakeLists.txt b/packages/tpetra/tsqr/test/CMakeLists.txt index efdd959bb082..26bc2e6a0cb6 100644 --- a/packages/tpetra/tsqr/test/CMakeLists.txt +++ b/packages/tpetra/tsqr/test/CMakeLists.txt @@ -16,13 +16,27 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( NUM_MPI_PROCS 1 ) -# Performance and accuracy test suite for TSQR::SequentialTsqr -# (sequential cache-blocked TSQR). -TRIBITS_ADD_EXECUTABLE_AND_TEST( +# Test TSQR::SequentialTsqr (sequential cache-blocked TSQR). +TRIBITS_ADD_EXECUTABLE( SequentialTsqr SOURCES Tsqr_TestSeqTsqr.cpp COMM serial mpi - ARGS "--verify --nrows=100000 --ncols=10 --cache-block-size=50000 --contiguous-cache-blocks" + ) + +TRIBITS_ADD_TEST( + SequentialTsqr + NAME SequentialTsqr_contiguousCacheBlocks + COMM serial mpi + ARGS "--verify --nrows=100000 --ncols=10 --cache-block-size=5000 --contiguous-cache-blocks" + STANDARD_PASS_OUTPUT + NUM_MPI_PROCS 1 + ) + +TRIBITS_ADD_TEST( + SequentialTsqr + NAME SequentialTsqr_noncontiguousCacheBlocks + COMM serial mpi + ARGS "--verify --nrows=100000 --ncols=10 --cache-block-size=5000" STANDARD_PASS_OUTPUT NUM_MPI_PROCS 1 ) From a040d586585fd4c542029a3f14700fa95a667e99 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 14:31:20 -0700 Subject: [PATCH 27/50] TSQR: Replace more uses of copy_matrix --- packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp | 14 ++------------ packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 4 ---- packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp | 14 ++++++++------ 3 files changed, 10 insertions(+), 22 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp index fbd423b49d6a..56c9ef51f076 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp @@ -326,16 +326,11 @@ namespace TSQR { if 
(A_out_rest.empty()) { throw std::logic_error("A_out_rest is empty, but A_in_rest is not"); } - // This call modifies A_in_rest. const_mat_view_type A_in_cur = split_top_block (A_in_rest, false); - // This call modifies A_out_rest. mat_view_type A_out_cur = split_top_block (A_out_rest, true); - - copy_matrix (A_in_cur.extent(0), num_cols, - A_out_cur.data(), A_out_cur.stride(1), - A_in_cur.data(), A_in_cur.stride(1)); + deep_copy (A_out_cur, A_in_cur); } } @@ -360,16 +355,11 @@ namespace TSQR { if (A_out_rest.empty()) { throw std::logic_error("A_out_rest is empty, but A_in_rest is not"); } - // This call modifies A_in_rest. const_mat_view_type A_in_cur = split_top_block (A_in_rest, true); - // This call modifies A_out_rest. mat_view_type A_out_cur = split_top_block (A_out_rest, false); - - copy_matrix (A_in_cur.extent(0), num_cols, - A_out_cur.data(), A_out_cur.stride(1), - A_in_cur.data(), A_in_cur.stride(1)); + deep_copy (A_out_cur, A_in_cur); } } diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index 1efb3d6cfd36..6c44bebc1ed4 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -312,10 +312,6 @@ namespace TSQR { // Copy [R1; R2] into A_R1R2. 
{ auto A_R1R2_views = partition_2x1 (A_R1R2, numCols); - // copy_matrix (numCols, numCols, &A_R1R2(0, 0), A_R1R2.stride(1), - // R1.data(), R1.stride(1)); - // copy_matrix (numCols, numCols, &A_R1R2(numCols, 0), A_R1R2.stride(1), - // R2.data(), R2.stride(1)); deep_copy (A_R1R2_views.first, R1); deep_copy (A_R1R2_views.second, R2); } diff --git a/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp b/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp index 93e0cdeff89d..91fcb13f10e1 100644 --- a/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp @@ -42,6 +42,7 @@ #include "Tsqr_Util.hpp" #include "Tsqr_Impl_SystemBlas.hpp" +#include "Tsqr_Matrix.hpp" #include #include #include // std::pair, std::make_pair @@ -214,18 +215,19 @@ namespace TSQR { typedef Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType magnitude_type; - std::vector AbsResid (nrows * ncols, - std::numeric_limits::quiet_NaN ()); - const Ordinal AbsResid_stride = nrows; + MatView A_view (nrows, ncols, A, lda); + Matrix AbsResid (nrows, ncols, + std::numeric_limits::quiet_NaN ()); Impl::SystemBlas blas; const magnitude_type ONE (1); // A_copy := A_copy - Q * R - copy_matrix (nrows, ncols, &AbsResid[0], AbsResid_stride, A, lda); + deep_copy (AbsResid, A_view); blas.GEMM (NO_TRANS, NO_TRANS, nrows, ncols, ncols, -ONE, Q, ldq, R, ldr, - ONE, &AbsResid[0], AbsResid_stride); + ONE, AbsResid.data(), AbsResid.stride(1)); - return local_frobenius_norm (nrows, ncols, &AbsResid[0], AbsResid_stride); + return local_frobenius_norm (nrows, ncols, AbsResid.data(), + AbsResid.stride(1)); } From e24a57d90e31aef3fa8f09dcbe1653570970e17c Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 14:38:48 -0700 Subject: [PATCH 28/50] TSQR: Fix build for when complex Scalar types are disabled --- packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp | 3 +++ packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp | 4 ++++ packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp | 3 +++ 
packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp | 4 ++++ 4 files changed, 14 insertions(+) diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp index 3bee59a96325..51d105b6bc68 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp @@ -114,8 +114,11 @@ compute_explicit_Q(const int m, const int n, const int k, \ TSQR_IMPL_LAPACK_IMPL( float ) TSQR_IMPL_LAPACK_IMPL( double ) + +#ifdef HAVE_KOKKOSTSQR_COMPLEX TSQR_IMPL_LAPACK_IMPL( std::complex ) TSQR_IMPL_LAPACK_IMPL( std::complex ) +#endif // HAVE_KOKKOSTSQR_COMPLEX } // namespace Impl } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp index 88e615ab637d..392f2aa4f6c4 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp @@ -1,6 +1,7 @@ #ifndef TSQR_IMPL_LAPACK_HPP #define TSQR_IMPL_LAPACK_HPP +#include "Tsqr_ConfigDefs.hpp" #include "Tsqr_Impl_RawQR.hpp" #include @@ -69,8 +70,11 @@ public: \ TSQR_IMPL_LAPACK_DECL( float ) TSQR_IMPL_LAPACK_DECL( double ) + +#ifdef HAVE_KOKKOSTSQR_COMPLEX TSQR_IMPL_LAPACK_DECL( std::complex ) TSQR_IMPL_LAPACK_DECL( std::complex ) +#endif // HAVE_KOKKOSTSQR_COMPLEX } // namespace Impl } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp index c93b6e53c219..bc19ef78be03 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp @@ -89,8 +89,11 @@ TRSM(const Teuchos::ESide side, const Teuchos::EUplo uplo, \ TSQR_IMPL_SYSTEMBLAS_IMPL( float ) TSQR_IMPL_SYSTEMBLAS_IMPL( double ) + +#ifdef HAVE_KOKKOSTSQR_COMPLEX TSQR_IMPL_SYSTEMBLAS_IMPL( std::complex ) TSQR_IMPL_SYSTEMBLAS_IMPL( std::complex ) +#endif // HAVE_KOKKOSTSQR_COMPLEX } // namespace Impl } // namespace TSQR diff --git 
a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp index b4156adf9e79..1e49ddc266c8 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp @@ -1,6 +1,7 @@ #ifndef TSQR_IMPL_SYSTEMBLAS_HPP #define TSQR_IMPL_SYSTEMBLAS_HPP +#include "Tsqr_ConfigDefs.hpp" #include "Tsqr_Impl_RawBlas.hpp" #include "Teuchos_BLAS_types.hpp" #include @@ -56,8 +57,11 @@ public: \ TSQR_IMPL_SYSTEMBLAS_DECL( float ) TSQR_IMPL_SYSTEMBLAS_DECL( double ) + +#ifdef HAVE_KOKKOSTSQR_COMPLEX TSQR_IMPL_SYSTEMBLAS_DECL( std::complex ) TSQR_IMPL_SYSTEMBLAS_DECL( std::complex ) +#endif // HAVE_KOKKOSTSQR_COMPLEX } // namespace Impl } // namespace TSQR From 70b58e264eefc902fa56c2976099ad263327b256 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 14:50:19 -0700 Subject: [PATCH 29/50] TSQR: Remove more uses of copy_matrix --- .../tpetra/tsqr/src/Tsqr_CombineDefault.hpp | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp index 9295f1499122..426ac07755c8 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp @@ -178,8 +178,15 @@ namespace TSQR { // we only want to include the upper triangle in the // factorization. Thus, only copy the upper triangle of R into // the appropriate place in the buffer. 
- copy_upper_triangle (n, n, &A_buf_(0, 0), A_buf_.stride(1), R, ldr); - copy_matrix (m, n, &A_buf_(n, 0), A_buf_.stride(1), A, lda); + MatView R_view (n, n, R, ldr); + MatView A_buf_top (n, n, A_buf_.data(), + A_buf_.stride(1)); + deep_copy (A_buf_top, R_view); + + MatView A_view (m, n, A, lda); + MatView A_buf_bot (m, n, &A_buf_(n, 0), + A_buf_.stride(1)); + deep_copy (A_buf_bot, A_view); const int lwork = n; lapack_.compute_QR (numRows, n, A_buf_.data(), A_buf_.stride(1), @@ -187,8 +194,9 @@ namespace TSQR { // Copy back the results. R might be a view of the upper // triangle of a cache block, so only copy into the upper // triangle of R. - copy_upper_triangle (n, n, R, ldr, &A_buf_(0, 0), A_buf_.stride(1)); - copy_matrix (m, n, A, lda, &A_buf_(n, 0), A_buf_.stride(1)); + copy_upper_triangle (n, n, R, ldr, A_buf_top.data(), + A_buf_top.stride(1)); + deep_copy (A_view, A_buf_bot); } void @@ -244,8 +252,17 @@ namespace TSQR { &A_buf_(ncols_Q, 0), A_buf_.stride(1), R_bot, ldr_bot); C_buf_.reshape (numRows, ncols_C); - copy_matrix (ncols_Q, ncols_C, &C_buf_(0, 0), C_buf_.stride(1), C_top, ldc_top); - copy_matrix (ncols_Q, ncols_C, &C_buf_(ncols_Q, 0), C_buf_.stride(1), C_bot, ldc_bot); + + using view_type = MatView; + view_type C_top_view (ncols_Q, ncols_C, C_top, ldc_top); + view_type C_buf_top (ncols_Q, ncols_C, + C_buf_.data (), C_buf_.stride (1)); + deep_copy (C_buf_top, C_top_view); + + view_type C_bot_view (ncols_Q, ncols_C, C_bot, ldc_bot); + view_type C_buf_bot (ncols_Q, ncols_C, + &C_buf_(ncols_Q, 0), C_buf_.stride (1)); + deep_copy (C_buf_bot, C_bot_view); const int lwork = ncols_Q; const std::string trans = apply_type.toString (); @@ -254,10 +271,8 @@ namespace TSQR { C_buf_.data(), C_buf_.stride(1), work, lwork); // Copy back the results. 
- copy_matrix (ncols_Q, ncols_C, C_top, ldc_top, - &C_buf_(0, 0), C_buf_.stride(1)); - copy_matrix (ncols_Q, ncols_C, C_bot, ldc_bot, - &C_buf_(ncols_Q, 0), C_buf_.stride(1)); + deep_copy (C_top_view, C_buf_top); + deep_copy (C_bot_view, C_buf_bot); } private: From d2a261930f2bf670e17121b6ec70b75640999953 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 15:02:46 -0700 Subject: [PATCH 30/50] TSQR: Clean up in prep for removing uses of copy_matrix --- packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 12 ++-- .../tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp | 59 +++++++++---------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index 6c44bebc1ed4..5914ff99ef55 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -317,10 +317,11 @@ namespace TSQR { } // Copy [R3; A] into A_R3A. - copy_matrix (numCols, numCols, &A_R3A(0, 0), A_R3A.stride(1), - R3.data(), R3.stride(1)); - copy_matrix (numRows, numCols, &A_R3A(numCols, 0), A_R3A.stride(1), - A.data(), A.stride(1)); + { + auto A_R3A_views = partition_2x1 (A_R3A, numCols); + deep_copy (A_R3A_views.first, R3); + deep_copy (A_R3A_views.second, A); + } // Space to put the explicit Q factors. matrix_type Q_R1R2 (Ordinal(2) * numCols, numCols, Scalar(0)); @@ -329,6 +330,9 @@ namespace TSQR { // Fill the explicit Q factor matrices with the first numCols // columns of the identity matrix. for (Ordinal k = 0; k < numCols; ++k) { + // FIXME (mfh 26 Nov 2019) Eventually we want to get away from + // direct modification of the entries of a Matrix or MatView, + // in favor of only doing so with a Kokkos kernel or TPL. 
Q_R1R2(k, k) = Scalar(1.0); Q_R3A(k, k) = Scalar(1.0); } diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp index d6ad83b7f57e..052ed9598d3b 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp @@ -89,37 +89,34 @@ namespace TSQR { vector< Scalar > tau (ncols); // Send and receive R factor. - messenger->swapData (&R_mine[0], &R_other[0], nelts, P_other, tag); - - Combine< LocalOrdinal, Scalar > combine; - if (P_mine == P_top) - { - combine.factor_pair (ncols, &R_mine[0], ldr, &R_other[0], ldr, &tau[0], &work[0]); - Q_factors.push_back (R_other); - tau_arrays.push_back (tau); - } - else if (P_mine == P_bot) - { - combine.factor_pair (ncols, &R_other[0], ldr, &R_mine[0], ldr, &tau[0], &work[0]); - Q_factors.push_back (R_mine); - // Make sure that the "bottom" processor gets the current R - // factor, which is returned in R_mine. - copy_matrix (ncols, ncols, &R_mine[0], ldr, &R_other[0], ldr); - tau_arrays.push_back (tau); - } - else - { - // mfh 16 Apr 2010: the troubles with assert statements are as follows: - // - // 1. They go away in a release build. - // 2. They don't often print out useful diagnostic information. - // 3. If you mistype the assert, like "assert(errcode = 1);" instead of - // "assert(errcode == 1)", you'll get false positives. 
- ostringstream os; - os << "Should never get here: P_mine (= " << P_mine - << ") not one of P_top, P_bot = " << P_top << ", " << P_bot; - throw std::logic_error (os.str()); - } + messenger->swapData (R_mine.data(), R_other.data(), + nelts, P_other, tag); + + Combine combine; + if (P_mine == P_top) { + combine.factor_pair (ncols, R_mine.data(), ldr, + R_other.data(), ldr, + tau.data(), work.data()); + Q_factors.push_back (R_other); + tau_arrays.push_back (tau); + } + else if (P_mine == P_bot) { + combine.factor_pair (ncols, R_other.data(), ldr, + R_mine.data(), ldr, + tau.data(), work.data()); + Q_factors.push_back (R_mine); + // Make sure that the "bottom" processor gets the current R + // factor, which is returned in R_mine. + copy_matrix (ncols, ncols, R_mine.data(), ldr, + R_other.data(), ldr); + tau_arrays.push_back (tau); + } + else { + ostringstream os; + os << "Should never get here: P_mine (= " << P_mine + << ") not one of P_top, P_bot = " << P_top << ", " << P_bot; + throw std::logic_error (os.str()); + } } void From c78e06ad868bad97d9ae019500ae6463088e32bb Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 15:11:00 -0700 Subject: [PATCH 31/50] TSQR: Remove more uses of copy_matrix --- packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp | 9 +++++---- packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp | 9 ++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp index 05cc1afc71e4..dcc60458eda9 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp @@ -276,8 +276,10 @@ namespace TSQR { DistTsqrHelper helper; const ordinal_type ncols = R_mine.extent(1); - std::vector R_local (ncols*ncols); - copy_matrix (ncols, ncols, R_local.data(), ncols, R_mine.data(), R_mine.stride(1)); + std::vector R_local (ncols * ncols); + MatView R_local_view + (ncols, ncols, R_local.data(), ncols); + deep_copy (R_local_view, 
R_mine); const int P = messenger_->size(); const int my_rank = messenger_->rank(); @@ -285,8 +287,7 @@ namespace TSQR { std::vector work (ncols); helper.factor_helper (ncols, R_local, my_rank, 0, P-1, first_tag, messenger_.get(), Q_factors, tau_arrays, work); - copy_matrix (ncols, ncols, R_mine.data(), R_mine.stride(1), - R_local.data(), ncols); + deep_copy (R_mine, R_local_view); return std::make_pair (Q_factors, tau_arrays); } diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp index 052ed9598d3b..b1990fda4567 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp @@ -105,10 +105,13 @@ namespace TSQR { R_mine.data(), ldr, tau.data(), work.data()); Q_factors.push_back (R_mine); + // Make sure that the "bottom" processor gets the current R // factor, which is returned in R_mine. - copy_matrix (ncols, ncols, R_mine.data(), ldr, - R_other.data(), ldr); + using view_type = MatView; + view_type R_mine_view (ncols, ncols, R_mine.data(), ldr); + view_type R_other_view (ncols, ncols, R_other.data(), ldr); + deep_copy (R_mine_view, R_other_view); tau_arrays.push_back (tau); } else { @@ -197,7 +200,7 @@ namespace TSQR { if (! 
b_even) { const int theTag = 142; // magic constant - messenger->send (&R_mine[0], ncols*ncols, P_mid-1, theTag); + messenger->send (R_mine.data(), ncols*ncols, P_mid-1, theTag); } } } From a965992115b8f6869dab368922ac7e07dfebfea3 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 15:20:14 -0700 Subject: [PATCH 32/50] TSQR: Remove all uses of copy_matrix --- packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp | 14 ++++++++------ packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp | 17 +++++++++-------- packages/tpetra/tsqr/src/Tsqr_Matrix.hpp | 14 ++++++++++---- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp b/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp index f666e6ce70f8..02f7ab39d61a 100644 --- a/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_GlobalVerify.hpp @@ -225,18 +225,20 @@ namespace TSQR { vector Resid (nrows_local * ncols, STS::nan()); const LocalOrdinal ld_resid = nrows_local; - - // Resid := A (deep copy) - copy_matrix (nrows_local, ncols, &Resid[0], ld_resid, A_local, lda_local); + MatView Resid_view + (nrows_local, ncols, Resid.data (), ld_resid); + MatView A_view + (nrows_local, ncols, A_local, lda_local); + deep_copy (Resid_view, A_view); // Resid := Resid - Q*R blas.GEMM (NO_TRANS, NO_TRANS, nrows_local, ncols, ncols, -ONE, Q_local, ldq_local, R, ldr, - ONE, &Resid[0], ld_resid); + ONE, Resid.data(), ld_resid); const magnitude_type Resid_F = - global_frobenius_norm (nrows_local, ncols, &Resid[0], ld_resid, messenger); - + global_frobenius_norm (nrows_local, ncols, Resid.data(), + ld_resid, messenger); vector results (3); results[0] = Resid_F; results[1] = Orthog_F; diff --git a/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp b/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp index 91fcb13f10e1..d9fd984c109a 100644 --- a/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_LocalVerify.hpp @@ -247,20 +247,21 @@ 
namespace TSQR { typedef Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType magnitude_type; - std::vector AbsResid (nrows * ncols, std::numeric_limits::quiet_NaN ()); - const Ordinal AbsResid_stride = nrows; - Impl::SystemBlas blas; - const magnitude_type ONE (1.0); + MatView A_view (nrows, ncols, A, lda); + Matrix AbsResid + (nrows, ncols, std::numeric_limits::quiet_NaN ()); + deep_copy (AbsResid, A); // A_copy := A_copy - Q * R - copy_matrix (nrows, ncols, AbsResid.data(), - AbsResid_stride, A, lda); + Impl::SystemBlas blas; + const magnitude_type ONE (1.0); blas.GEMM (NO_TRANS, NO_TRANS, nrows, ncols, ncols, -ONE, Q, ldq, R, ldr, - ONE, AbsResid.data(), AbsResid_stride); + ONE, AbsResid.data(), AbsResid.stride(1)); const magnitude_type absolute_residual = - local_frobenius_norm (nrows, ncols, &AbsResid[0], AbsResid_stride); + local_frobenius_norm (nrows, ncols, AbsResid.data(), + AbsResid.stride(1)); return absolute_residual / A_norm_F; } diff --git a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp index e54a91601808..43421a50b89f 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp @@ -191,8 +191,11 @@ namespace TSQR { A_ (verified_alloc_size (in.extent(0), in.extent(1))) { if (! 
in.empty()) { - copy_matrix (extent(0), extent(1), data(), stride(1), - in.data(), in.stride(1)); + MatView this_view + (extent(0), extent(1), data(), stride(1)); + MatView in_view + (in.extent(0), in.extent(1), in.data(), in.stride(1)); + deep_copy (this_view, in_view); } } @@ -212,8 +215,11 @@ namespace TSQR { A_ (verified_alloc_size (in.extent(0), in.extent(1))) { if (A_.size() != 0) { - copy_matrix (extent(0), extent(1), data(), stride(1), - in.data(), in.stride(1)); + MatView this_view + (extent(0), extent(1), data(), stride(1)); + MatView in_view + (in.extent(0), in.extent(1), in.data(), in.stride(1)); + deep_copy (this_view, in_view); } } From fb7f1b83f2bff0610674400dd231c2ace635f1ff Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 15:20:34 -0700 Subject: [PATCH 33/50] TSQR: Remove copy_matrix itself --- packages/tpetra/tsqr/src/Tsqr_Util.hpp | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_Util.hpp b/packages/tpetra/tsqr/src/Tsqr_Util.hpp index 2063ae9ddf5f..bb118497b1dc 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Util.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Util.hpp @@ -129,22 +129,6 @@ namespace TSQR { } } - template< class Ordinal, class Scalar > - void - copy_matrix (const Ordinal nrows, - const Ordinal ncols, - Scalar* const A, - const Ordinal lda, - const Scalar* const B, - const Ordinal ldb) - { - for (Ordinal j = 0; j < ncols; ++j) { - Scalar* const A_j = &A[j*lda]; - const Scalar* const B_j = &B[j*ldb]; - std::copy (B_j, B_j + nrows, A_j); - } - } - template< class Ordinal, class Scalar > void fill_matrix (const Ordinal nrows, From bac42030bfa059a5d5a44e08a9102fd351915cd6 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 15:33:02 -0700 Subject: [PATCH 34/50] TSQR::SequentialTsqr: Remove uses of fill_matrix --- packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp index d8c604dad86f..832f66cba844 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp @@ -484,7 +484,8 @@ namespace TSQR { const_mat_view_type A_top = this->top_block (A_view, contiguous_cache_blocks); // Fill R (including lower triangle) with zeros. - fill_matrix (ncols, ncols, R, ldr, Teuchos::ScalarTraits::zero()); + mat_view_type R_view (ncols, ncols, R, ldr); + deep_copy (R_view, Scalar {}); // Copy out the upper triangle of the R factor from A into R. copy_upper_triangle (ncols, ncols, R, ldr, A_top.data(), A_top.stride(1)); @@ -540,12 +541,12 @@ namespace TSQR { // Copy the R factor resulting from the factorization out of // R_view (a view of the topmost cache block of A) into the R // output argument. - fill_matrix (ncols, ncols, R, ldr, Scalar(0)); + mat_view_type R_out (ncols, ncols, R, ldr); + deep_copy (R_out, Scalar {}); copy_upper_triangle (ncols, ncols, R, ldr, R_view.data(), R_view.stride(1)); return tau_arrays; } - /// \brief The number of cache blocks that factor() would use. 
/// /// The \c factor() method breaks the input matrix A into one or From 446b942196fdb1db60dbbf6eaf0b75ad538522db Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 15:34:56 -0700 Subject: [PATCH 35/50] TSQR::SequentialCholeskyTsqr: Remove uses of fill_matrix --- .../tsqr/src/Tsqr_SequentialCholeskyQR.hpp | 57 ++++++++++--------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp index 9a84fa299ad3..f768fe5ae898 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp @@ -127,36 +127,36 @@ namespace TSQR { Matrix ATA (ncols, ncols, Scalar {}); FactorOutput retval (0); - if (contiguous_cache_blocks) - { - // Compute ATA := A^T * A, by iterating through the cache - // blocks of A from top to bottom. - // - // We say "A_rest" because it points to the remaining part of - // the matrix left to process; at the beginning, the "remaining" - // part is the whole matrix, but that will change as the - // algorithm progresses. - mat_view_type A_rest (nrows, ncols, A, lda); - // This call modifies A_rest (but not the actual matrix - // entries; just the dimensions and current position). - mat_view_type A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); - // Process the first cache block: ATA := A_cur^T * A_cur + if (contiguous_cache_blocks) { + // Compute ATA := A^T * A, by iterating through the cache + // blocks of A from top to bottom. + // + // We say "A_rest" because it points to the remaining part of + // the matrix left to process; at the beginning, the + // "remaining" part is the whole matrix, but that will change + // as the algorithm progresses. + mat_view_type A_rest (nrows, ncols, A, lda); + // This call modifies A_rest (but not the actual matrix + // entries; just the dimensions and current position). 
+ mat_view_type A_cur = + blocker.split_top_block (A_rest, contiguous_cache_blocks); + // Process the first cache block: ATA := A_cur^T * A_cur + // + // FIXME (mfh 08 Oct 2014) Shouldn't this be CONJ_TRANS? + blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, A_cur.extent (0), + Scalar (1), A_cur.data (), A_cur.stride (1), A_cur.data (), + A_cur.stride (1), Scalar (0), ATA.data (), ATA.stride (1)); + // Process the remaining cache blocks in order. + while (! A_rest.empty ()) { + A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); + // ATA := ATA + A_cur^T * A_cur // // FIXME (mfh 08 Oct 2014) Shouldn't this be CONJ_TRANS? blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, A_cur.extent (0), Scalar (1), A_cur.data (), A_cur.stride (1), A_cur.data (), - A_cur.stride (1), Scalar (0), ATA.data (), ATA.stride (1)); - // Process the remaining cache blocks in order. - while (! A_rest.empty ()) { - A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); - // ATA := ATA + A_cur^T * A_cur - // - // FIXME (mfh 08 Oct 2014) Shouldn't this be CONJ_TRANS? - blas.GEMM (Teuchos::TRANS, NO_TRANS, ncols, ncols, A_cur.extent (0), - Scalar (1), A_cur.data (), A_cur.stride (1), A_cur.data (), - A_cur.stride (1), Scalar (1), ATA.data (), ATA.stride (1)); - } + A_cur.stride (1), Scalar (1), ATA.data (), ATA.stride (1)); } + } else { // Compute ATA := A^T * A, using a single BLAS call. // @@ -175,8 +175,11 @@ namespace TSQR { // CholeskyQR + symmetric eigensolver factorization. // Copy out the R factor - fill_matrix (ncols, ncols, R, ldr, Scalar {}); - copy_upper_triangle (ncols, ncols, R, ldr, ATA.data(), ATA.stride(1)); + { + mat_view_type R_out (ncols, ncols, R, ldr); + deep_copy (R_out, Scalar {}); + copy_upper_triangle (ncols, ncols, R, ldr, ATA.data(), ATA.stride(1)); + } // Compute A := A * R^{-1}. 
We do this in place in A, using // BLAS' TRSM with the R factor (form POTRF) stored in the upper From 119108ec3f1233dc22dcf2abfdac4968c6ffb728 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 15:36:55 -0700 Subject: [PATCH 36/50] TSQR::DistTsqr: Remove uses of fill_matrix --- packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp index dcc60458eda9..39aba991f8cc 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp @@ -47,7 +47,6 @@ #include "Tsqr_DistTsqrRB.hpp" #include "Teuchos_ParameterList.hpp" #include "Teuchos_ParameterListAcceptorDefaultBase.hpp" -#include "Teuchos_ScalarTraits.hpp" #include // std::pair @@ -72,11 +71,6 @@ namespace TSQR { typedef std::pair FactorOutput; typedef int rank_type; - private: - typedef Teuchos::ScalarTraits STS; - - public: - /// \brief Constructor (that accepts a parameter list). /// /// \param plist [in/out] List of parameters for configuring TSQR. @@ -338,10 +332,16 @@ namespace TSQR { "you must first call init() with a valid " "MessengerBase instance."); const int myRank = messenger_->rank (); - fill_matrix (ncols_Q, ncols_Q, Q_mine, ldq_mine, STS::zero()); + + MatView Q_mine_view + (ncols_Q, ncols_Q, Q_mine, ldq_mine); + deep_copy (Q_mine_view, scalar_type {}); if (myRank == 0) { - for (ordinal_type j = 0; j < ncols_Q; ++j) - Q_mine[j + j*ldq_mine] = STS::one(); + for (ordinal_type j = 0; j < ncols_Q; ++j) { + // FIXME (26 Nov 2019) Eventually, we only want to write to + // a matrix through a Kokkos kernel or a TPL. 
+ Q_mine[j + j*ldq_mine] = scalar_type (1.0); + } } apply (ApplyType::NoTranspose, ncols_Q, ncols_Q, Q_mine, ldq_mine, factor_output); From 9d3ff60214650c411f139f241cd79feaaa0ef51e Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 15:37:16 -0700 Subject: [PATCH 37/50] TSQR: Remove fill_matrix itself --- packages/tpetra/tsqr/src/Tsqr_Util.hpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_Util.hpp b/packages/tpetra/tsqr/src/Tsqr_Util.hpp index bb118497b1dc..ddbe59f4f062 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Util.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Util.hpp @@ -129,20 +129,6 @@ namespace TSQR { } } - template< class Ordinal, class Scalar > - void - fill_matrix (const Ordinal nrows, - const Ordinal ncols, - Scalar* const A, - const Ordinal lda, - const Scalar& default_val) - { - for (Ordinal j = 0; j < ncols; ++j) { - Scalar* const A_j = &A[j*lda]; - std::fill (A_j, A_j + nrows, default_val); - } - } - template< class Ordinal, class Scalar, class Generator > void generate_matrix (const Ordinal nrows, From aa9ad10b8adc1d28f9b1503d799b546f31abf858 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 15:48:36 -0700 Subject: [PATCH 38/50] TSQR::Combine*::factor_first now takes MatView instead of a pointer The goal is conversion to use Kokkos::View. 
--- packages/tpetra/tsqr/src/Tsqr_Combine.hpp | 17 ++++-- .../tsqr/src/Tsqr_CombineBenchmarker.hpp | 34 +++++------- .../tpetra/tsqr/src/Tsqr_CombineDefault.hpp | 21 +++++--- .../tpetra/tsqr/src/Tsqr_CombineNative.hpp | 52 +++++++++++++------ packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 4 +- .../tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp | 3 +- .../tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp | 3 +- 7 files changed, 81 insertions(+), 53 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp index 42857f63e704..e169541b1581 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp @@ -129,14 +129,21 @@ namespace TSQR { /// scaling factors for the Householder reflectors /// \param work [out] Workspace array of length ncols void - factor_first (const Ordinal nrows, - const Ordinal ncols, - Scalar A[], - const Ordinal lda, + factor_first (const MatView& A, + Scalar tau[], + Scalar work[]) const + { + return impl_.factor_first (A, tau, work); + } + + void + factor_first (Matrix& A, Scalar tau[], Scalar work[]) const { - return impl_.factor_first (nrows, ncols, A, lda, tau, work); + MatView A_view + (A.extent (0), A.extent (1), A.data (), A.stride (1)); + return factor_first (A_view, tau, work); } /// \brief Apply the result of \c factor_first(). diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp index 27ebc62b08be..5f081688b3e4 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp @@ -313,14 +313,12 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. 
const int numWarmupRuns = 3; - for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) - { - combiner.factor_first (numRows, numCols, A.data(), A.stride(1), - tau.data(), work.data()); - combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.stride(1), tau.data(), - Q.data(), Q.stride(1), work.data()); - } + for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { + combiner.factor_first (A, tau.data(), work.data()); + combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, + A.data(), A.stride(1), tau.data(), + Q.data(), Q.stride(1), work.data()); + } // How much time numTrials runs must take in order for // numTrials to be considered sufficiently large. @@ -343,14 +341,12 @@ namespace TSQR { do { numTrials *= 2; // First value of numTrials is 4. timer.start(); - for (int trial = 0; trial < numTrials; ++trial) - { - combiner.factor_first (numRows, numCols, A.data(), A.stride(1), - tau.data(), work.data()); - combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.stride(1), tau.data(), - Q.data(), Q.stride(1), work.data()); - } + for (int trial = 0; trial < numTrials; ++trial) { + combiner.factor_first (A, tau.data(), work.data()); + combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, + A.data(), A.stride(1), tau.data(), + Q.data(), Q.stride(1), work.data()); + } theTime = timer.stop(); } while (theTime < minAcceptableTime && numTrials < maxNumTrials); @@ -414,8 +410,7 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. 
const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_first (numRows, numCols, A.data(), A.stride(1), - tau.data(), work.data()); + combiner.factor_first (A, tau.data(), work.data()); combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, A.data(), A.stride(1), tau.data(), Q.data(), Q.stride(1), work.data()); @@ -426,8 +421,7 @@ namespace TSQR { timer_type timer ("Combine first"); timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_first (numRows, numCols, A.data(), A.stride(1), - tau.data(), work.data()); + combiner.factor_first (A, tau.data(), work.data()); combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, A.data(), A.stride(1), tau.data(), Q.data(), Q.stride(1), work.data()); diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp index 426ac07755c8..8ebd5c7849a7 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp @@ -84,15 +84,24 @@ namespace TSQR { } void - factor_first (const Ordinal nrows, - const Ordinal ncols, - Scalar A[], - const Ordinal lda, + factor_first (const MatView& A, + Scalar tau[], + Scalar work[]) + { + const int lwork = A.extent (1); + lapack_.compute_QR (A.extent (0), A.extent (1), + A.data (), A.stride (1), + tau, work, lwork); + } + + void + factor_first (Matrix& A, Scalar tau[], Scalar work[]) { - const int lwork = ncols; - lapack_.compute_QR (nrows, ncols, A, lda, tau, work, lwork); + MatView A_view + (A.extent (0), A.extent (1), A.data (), A.stride (1)); + factor_first (A_view, tau, work); } void diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp index 56d402d19368..f2e8982cfb6f 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp @@ -50,6 +50,7 @@ #include "KokkosBlas2_gemv.hpp" 
#include "Kokkos_ArithTraits.hpp" #include "Tsqr_Impl_Lapack.hpp" +#include "Tsqr_Matrix.hpp" namespace TSQR { @@ -92,14 +93,21 @@ namespace TSQR { } void - factor_first (const Ordinal nrows, - const Ordinal ncols, - Scalar A[], - const Ordinal lda, + factor_first (const MatView& A, + Scalar tau[], + Scalar work[]) const + { + return default_.factor_first (A, tau, work); + } + + void + factor_first (Matrix& A, Scalar tau[], Scalar work[]) const { - return default_.factor_first (nrows, ncols, A, lda, tau, work); + MatView A_view + (A.extent (0), A.extent (1), A.data (), A.stride (1)); + return factor_first (A_view, tau, work); } void @@ -277,14 +285,21 @@ namespace TSQR { } void - factor_first (const Ordinal nrows, - const Ordinal ncols, - Scalar A[], - const Ordinal lda, + factor_first (const MatView& A, + Scalar tau[], + Scalar work[]) const + { + return default_.factor_first (A, tau, work); + } + + void + factor_first (Matrix& A, Scalar tau[], Scalar work[]) const { - return default_.factor_first (nrows, ncols, A, lda, tau, work); + MatView A_view + (A.extent (0), A.extent (1), A.data (), A.stride (1)); + return factor_first (A_view, tau, work); } void @@ -373,14 +388,21 @@ namespace TSQR { } void - factor_first (const Ordinal nrows, - const Ordinal ncols, - Scalar A[], - const Ordinal lda, + factor_first (const MatView& A, + Scalar tau[], + Scalar work[]) const + { + return default_.factor_first (A, tau, work); + } + + void + factor_first (Matrix& A, Scalar tau[], Scalar work[]) const { - return default_.factor_first (nrows, ncols, A, lda, tau, work); + MatView A_view + (A.extent (0), A.extent (1), A.data (), A.stride (1)); + return factor_first (A_view, tau, work); } void diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index 5914ff99ef55..d6fe01c3ce26 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -532,9 +532,7 @@ namespace TSQR { } 
Combine combiner; // qr( A1 ) - combiner.factor_first (numRows, numCols, - A1.data(), A1.stride(1), - tau1.data(), work.data()); + combiner.factor_first (A1, tau1.data(), work.data()); // View of numCols by numCols upper triangle of A1. mat_view_type R1 (numCols, numCols, A1.data(), A1.stride(1)); // qr( [R1; A2] ) diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp index c5ddab5cfc6c..b45383a7c654 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp @@ -173,8 +173,7 @@ namespace TSQR { // We should only call this if A_top.extent(1) > 0 and therefore // work.size() > 0, but we've already checked for that, so we // don't have to check again. - combine.factor_first (A_top.extent(0), A_top.extent(1), A_top.data(), - A_top.stride(1), tau.data(), work.data()); + combine.factor_first (A_top, tau.data(), work.data()); return tau; } diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp index 832f66cba844..11adccd6a1d8 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp @@ -159,8 +159,7 @@ namespace TSQR { std::vector& work) const { const LocalOrdinal ncols = A_top.extent(1); - combine.factor_first (A_top.extent(0), ncols, A_top.data(), A_top.stride(1), - tau.data(), work.data()); + combine.factor_first (A_top, tau.data(), work.data()); return mat_view_type(ncols, ncols, A_top.data(), A_top.stride(1)); } From 4ff038d1984696c5462fbd7c647e8c4b79237fe6 Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Mon, 18 Nov 2019 22:50:00 -0700 Subject: [PATCH 39/50] Ifpack2: Adding unit test for 'reduced' matvec for use in s-step methods --- .../Ifpack2_UnitTestOverlappingRowMatrix.cpp | 215 +++++++++++++++++- 1 file changed, 214 insertions(+), 1 deletion(-) diff --git 
a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp index 4efbc633a50c..89260f529db7 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp @@ -66,6 +66,7 @@ #include #endif +#include #include #include @@ -109,6 +110,145 @@ typedef Tpetra::global_size_t GST; } while (false) + + +/***********************************************************************************/ +template +void localReducedMatvec(const MatrixClass & A_lcl, + const MultiVectorClass & X_lcl, + const int userNumRows, + MultiVectorClass & Y_lcl) { + using Teuchos::NO_TRANS; + + using execution_space = typename MatrixClass::execution_space; + + if (A_lcl.numRows() == 0 || userNumRows ==0 || userNumRows > A_lcl.numRows()) { + return; + } + + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; + + int64_t numLocalRows = userNumRows; + int64_t myNnz = A_lcl.nnz(); + + int64_t rows_per_team = + Tpetra::Details::residual_launch_parameters(numLocalRows, myNnz, rows_per_thread, team_size, vector_length); + int64_t worksets = (X_lcl.extent (0) + rows_per_team - 1) / rows_per_team; + + using policy_type = typename Kokkos::TeamPolicy; + using team_member = typename policy_type::member_type; + + using residual_value_type = typename MultiVectorClass::non_const_value_type; + using KAT = Kokkos::ArithTraits; + using LO = int64_t; + + policy_type policy (1, 1); + if (team_size < 0) { + policy = policy_type (worksets, Kokkos::AUTO, vector_length); + } + else { + policy = policy_type (worksets, team_size, vector_length); + } + + bool is_vector = (X_lcl.extent(1) == 1); + + if(is_vector) { + // Vector case + // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp + Kokkos::parallel_for("reduced-mv-vector",policy,KOKKOS_LAMBDA(const team_member& dev) { + 
Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { + const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; + + if (lclRow >= numLocalRows) { + return; + } + + const auto A_row = A_lcl.rowConst(lclRow); + const LO row_length = static_cast (A_row.length); + residual_value_type A_x = KAT::zero (); + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) { + const auto A_val = A_row.value(iEntry); + lsum += A_val * X_lcl(A_row.colidx(iEntry),0); + }, A_x); + Y_lcl(lclRow,0) = A_x; + + });//end parallel_for TeamThreadRange + });//end parallel_for "residual-vector" + } else { + // MultiVector case + // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp + Kokkos::parallel_for("reduced-mv-multivector",policy,KOKKOS_LAMBDA(const team_member& dev) { + // NOTE: It looks like I should be able to get this data up above, but if I try to + // we get internal compiler errors. Who knew that gcc tried to "gimplify"? 
+ const LO numVectors = static_cast(X_lcl.extent(1)); + Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { + const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; + + if (lclRow >= numLocalRows) { + return; + } + const auto A_row = A_lcl.rowConst(lclRow); + const LO row_length = static_cast (A_row.length); + for(LO v=0; v +void reducedMatvec(const OverlappedMatrixClass & A, + const MultiVectorClass & X, + const int userExtNumRows, + MultiVectorClass & Y) { + using crs_matrix_type = Tpetra::CrsMatrix; + + // Assumes that X& Y are sufficiently overlapped for this to work + RCP undA = Teuchos::rcp_dynamic_cast(A.getUnderlyingMatrix()); + RCP extA = Teuchos::rcp_dynamic_cast(A.getExtMatrix()); + + auto undA_lcl = undA->getLocalMatrix (); + auto extA_lcl = extA->getLocalMatrix (); + auto X_lcl = X.getLocalViewDevice (); + auto Y_lcl = Y.getLocalViewDevice (); + + // Do the "Local part" + auto numLocalRows = undA_lcl.numRows(); + localReducedMatvec(undA_lcl,X_lcl,numLocalRows,Y_lcl); + + // Now, do the "overlapped part" + if(userExtNumRows > 0) { + auto X_ext = Kokkos::subview(X_lcl,std::make_pair(0,numLocalRows+userExtNumRows),Kokkos::ALL()); + auto Y_ext = Kokkos::subview(X_lcl,std::make_pair(numLocalRows,numLocalRows+userExtNumRows),Kokkos::ALL()); + + localReducedMatvec(extA_lcl,X_ext,userExtNumRows,Y_ext); + } + +} + + + + + + + TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2OverlappingRowMatrix, Test0, Scalar, LO, GO) { out << "Ifpack2::OverlappingRowMatrix unit test" << endl; @@ -416,9 +556,82 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2OverlappingRowMatrix, getLocalDiag, Sca TEST_EQUALITY(ldGids[i],ovrmGids[i]); } + +TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2OverlappingRowMatrix, reducedMatvec, Scalar, LocalOrdinal, GlobalOrdinal) +{ + using SC = Scalar; + using LO = LocalOrdinal; + using GO = GlobalOrdinal; + using NO = Tpetra::Map<>::node_type; + using map_type = Tpetra::Map; + using row_matrix_type = 
Tpetra::RowMatrix; + using MV = Tpetra::MultiVector; + using Teuchos::RCP; + + Tpetra::global_size_t num_rows_per_proc = 5; + + const RCP > rowmap = tif_utest::create_tpetra_map(num_rows_per_proc); + + RCP > A = tif_utest::create_test_matrix(rowmap); + + // This needs to be one less than the number of matvecs we test + int overlapLevel = 2; + Ifpack2::OverlappingRowMatrix ovA(A, overlapLevel); + + RCP ExtMatrix = ovA.getExtMatrix(); + SC one = Teuchos::ScalarTraits::one(), zero = Teuchos::ScalarTraits::zero(); + + // Vectors in the non-overlapping space + int numVecs = 2; + MV x(rowmap,numVecs), y_direct(rowmap,numVecs), y_overlap(rowmap,numVecs); + x.putScalar(one); + + { + // Direct approach + MV temp1(rowmap,numVecs), temp2(rowmap,numVecs); + A->apply(x,temp1); + A->apply(temp1,temp2); + A->apply(temp2,y_direct); + } + + { + // Overlap approach + RCP ovRowmap = ovA.getRowMap(); + MV ovX(ovRowmap,numVecs), ovY(ovRowmap,numVecs), temp1(ovRowmap,numVecs), temp2(ovRowmap,numVecs); + ovX.putScalar(zero); + Teuchos::ArrayView hstarts = ovA.getExtHaloStarts(); + ovA.importMultiVector(x,ovX); + + printf("Halo Starts:"); + for(size_t i=0; i< (size_t)hstarts.size(); i++) + printf("%d ",(int) hstarts[i]); + printf("\n"); + + reducedMatvec(ovA,ovX,hstarts[2],temp1); + reducedMatvec(ovA,temp1,hstarts[1],temp2); + reducedMatvec(ovA,temp2,hstarts[0],ovY); + + // This isn't a Kokkos::deep_copy() since the Kokkos view was complaining about invalid ranges + auto ovY_lcl = ovY.getLocalViewDevice(); + auto Y_lcl = y_overlap.getLocalViewDevice(); + Kokkos::parallel_for("copy out",rowmap->getNodeNumElements(),KOKKOS_LAMBDA(const int i) { + for(int j=0; j < numVecs; j++) + Y_lcl(i,j) = ovY_lcl(i,j); + }); + + } + + + // Compare solutions + TEST_COMPARE_FLOATING_ARRAYS( y_direct.get1dView (), y_overlap.get1dView (), 1e4 * Teuchos::ScalarTraits::eps () ); + +} + + #define UNIT_TEST_GROUP_SCALAR_ORDINAL( Scalar, LO, GO ) \ TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( 
Ifpack2OverlappingRowMatrix, Test0, Scalar, LO, GO ) \ - TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2OverlappingRowMatrix, getLocalDiag, Scalar, LO, GO ) + TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2OverlappingRowMatrix, getLocalDiag, Scalar, LO, GO ) \ + TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2OverlappingRowMatrix, reducedMatvec, Scalar, LO, GO ) // mfh 26 Aug 2015: Ifpack2::OverlappingRowMatrix was only getting // tested for Scalar = double, LocalOrdinal = int, GlobalOrdinal = From 7314bdb723e3665275bbcc35d64c1c75fab4aa68 Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Tue, 19 Nov 2019 11:58:52 -0700 Subject: [PATCH 40/50] Ifpack2: OverlappingRowMatrix cleanup --- .../src/Ifpack2_OverlappingRowMatrix_decl.hpp | 2 +- .../src/Ifpack2_OverlappingRowMatrix_def.hpp | 2 +- .../Ifpack2_UnitTestOverlappingRowMatrix.cpp | 45 ++++++++++--------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_decl.hpp b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_decl.hpp index 4cfd464a52d8..682db99e82fb 100644 --- a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_decl.hpp @@ -341,7 +341,7 @@ class OverlappingRowMatrix : Teuchos::RCP getExtMatrix() const; - Teuchos::ArrayView getExtHaloStarts(); + Teuchos::ArrayView getExtHaloStarts() const; private: typedef Tpetra::Map map_type; diff --git a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp index efa518cc9d45..cb0426fe770f 100644 --- a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp +++ b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp @@ -853,7 +853,7 @@ OverlappingRowMatrix::getExtMatrix() const } template -Teuchos::ArrayView OverlappingRowMatrix::getExtHaloStarts() +Teuchos::ArrayView OverlappingRowMatrix::getExtHaloStarts() const { return ExtHaloStarts_(); } diff --git 
a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp index 89260f529db7..c1dc17560b4d 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp @@ -66,7 +66,7 @@ #include #endif -#include +#include "Tpetra_Details_residual.hpp" #include #include @@ -158,7 +158,7 @@ void localReducedMatvec(const MatrixClass & A_lcl, // Vector case // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp Kokkos::parallel_for("reduced-mv-vector",policy,KOKKOS_LAMBDA(const team_member& dev) { - Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { + Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO loop) { const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; if (lclRow >= numLocalRows) { @@ -166,7 +166,7 @@ void localReducedMatvec(const MatrixClass & A_lcl, } const auto A_row = A_lcl.rowConst(lclRow); - const LO row_length = static_cast (A_row.length); + const LO row_length = A_row.length; residual_value_type A_x = KAT::zero (); Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) { @@ -184,14 +184,14 @@ void localReducedMatvec(const MatrixClass & A_lcl, // NOTE: It looks like I should be able to get this data up above, but if I try to // we get internal compiler errors. Who knew that gcc tried to "gimplify"? 
const LO numVectors = static_cast(X_lcl.extent(1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { + Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO loop) { const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; if (lclRow >= numLocalRows) { return; } const auto A_row = A_lcl.rowConst(lclRow); - const LO row_length = static_cast (A_row.length); + const LO row_length = A_row.length; for(LO v=0; v void reducedMatvec(const OverlappedMatrixClass & A, const MultiVectorClass & X, - const int userExtNumRows, + const int overlapLevel, MultiVectorClass & Y) { using crs_matrix_type = Tpetra::CrsMatrix undA = Teuchos::rcp_dynamic_cast(A.getUnderlyingMatrix()); RCP extA = Teuchos::rcp_dynamic_cast(A.getExtMatrix()); + Teuchos::ArrayView hstarts = A.getExtHaloStarts(); + + if(overlapLevel >= (int) hstarts.size()) + throw std::runtime_error("reducedMatvec: Exceeded available overlap"); auto undA_lcl = undA->getLocalMatrix (); auto extA_lcl = extA->getLocalMatrix (); @@ -234,11 +238,14 @@ void reducedMatvec(const OverlappedMatrixClass & A, localReducedMatvec(undA_lcl,X_lcl,numLocalRows,Y_lcl); // Now, do the "overlapped part" - if(userExtNumRows > 0) { - auto X_ext = Kokkos::subview(X_lcl,std::make_pair(0,numLocalRows+userExtNumRows),Kokkos::ALL()); - auto Y_ext = Kokkos::subview(X_lcl,std::make_pair(numLocalRows,numLocalRows+userExtNumRows),Kokkos::ALL()); + if(overlapLevel > 0) { + int yrange = hstarts[overlapLevel]; + auto Y_ext = Kokkos::subview(Y_lcl,std::make_pair(numLocalRows,numLocalRows+yrange),Kokkos::ALL()); + + int xlimit = numLocalRows + ( (overlapLevel == hstarts.size()-1) ? 
X_lcl.extent(0) : hstarts[overlapLevel+1] ); + auto X_ext = Kokkos::subview(X_lcl,std::make_pair(0,xlimit),Kokkos::ALL()); - localReducedMatvec(extA_lcl,X_ext,userExtNumRows,Y_ext); + localReducedMatvec(extA_lcl,X_ext,yrange,Y_ext); } } @@ -333,7 +340,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2OverlappingRowMatrix, Test0, Scalar, LO } IFPACK2OVERLAPPINGROWMATRIX_REPORT_GLOBAL_ERR( "Ifpack2::OverlappingRowMatrix constructor" ); - Teuchos::ArrayView halo = B->getExtHaloStarts(); + Teuchos::ArrayView halo = B->getExtHaloStarts(); printf("Halo Starts:"); for(size_t i=0; i< (size_t)halo.size(); i++) printf("%d ",(int) halo[i]); @@ -599,7 +606,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2OverlappingRowMatrix, reducedMatvec, Sc RCP ovRowmap = ovA.getRowMap(); MV ovX(ovRowmap,numVecs), ovY(ovRowmap,numVecs), temp1(ovRowmap,numVecs), temp2(ovRowmap,numVecs); ovX.putScalar(zero); - Teuchos::ArrayView hstarts = ovA.getExtHaloStarts(); + Teuchos::ArrayView hstarts = ovA.getExtHaloStarts(); ovA.importMultiVector(x,ovX); printf("Halo Starts:"); @@ -607,17 +614,15 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2OverlappingRowMatrix, reducedMatvec, Sc printf("%d ",(int) hstarts[i]); printf("\n"); - reducedMatvec(ovA,ovX,hstarts[2],temp1); - reducedMatvec(ovA,temp1,hstarts[1],temp2); - reducedMatvec(ovA,temp2,hstarts[0],ovY); + reducedMatvec(ovA,ovX,2,temp1); + reducedMatvec(ovA,temp1,1,temp2); + reducedMatvec(ovA,temp2,0,ovY); - // This isn't a Kokkos::deep_copy() since the Kokkos view was complaining about invalid ranges + // And yes, that int cast is really necessary auto ovY_lcl = ovY.getLocalViewDevice(); auto Y_lcl = y_overlap.getLocalViewDevice(); - Kokkos::parallel_for("copy out",rowmap->getNodeNumElements(),KOKKOS_LAMBDA(const int i) { - for(int j=0; j < numVecs; j++) - Y_lcl(i,j) = ovY_lcl(i,j); - }); + auto ovYsub = Kokkos::subview(ovY_lcl,std::make_pair(0,(int)Y_lcl.extent(0)), Kokkos::ALL); + Kokkos::deep_copy(Y_lcl,ovYsub); } From 
018ecba9adb711a062c1c8bb99a4288dbd563085 Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Wed, 20 Nov 2019 16:00:23 -0700 Subject: [PATCH 41/50] Ifpack2: Fixing test --- .../Ifpack2_UnitTestOverlappingRowMatrix.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp index c1dc17560b4d..2019ca9c8749 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp @@ -220,7 +220,7 @@ void reducedMatvec(const OverlappedMatrixClass & A, typename OverlappedMatrixClass::global_ordinal_type, typename OverlappedMatrixClass::node_type>; - // Assumes that X& Y are sufficiently overlapped for this to work + // Assumes that X & Y are sufficiently overlapped for this to work RCP undA = Teuchos::rcp_dynamic_cast(A.getUnderlyingMatrix()); RCP extA = Teuchos::rcp_dynamic_cast(A.getExtMatrix()); Teuchos::ArrayView hstarts = A.getExtHaloStarts(); @@ -234,15 +234,16 @@ void reducedMatvec(const OverlappedMatrixClass & A, auto Y_lcl = Y.getLocalViewDevice (); // Do the "Local part" - auto numLocalRows = undA_lcl.numRows(); + auto numLocalRows = undA->getNodeNumRows(); localReducedMatvec(undA_lcl,X_lcl,numLocalRows,Y_lcl); + // Now, do the "overlapped part" if(overlapLevel > 0) { int yrange = hstarts[overlapLevel]; auto Y_ext = Kokkos::subview(Y_lcl,std::make_pair(numLocalRows,numLocalRows+yrange),Kokkos::ALL()); - int xlimit = numLocalRows + ( (overlapLevel == hstarts.size()-1) ? X_lcl.extent(0) : hstarts[overlapLevel+1] ); + int xlimit = ( (overlapLevel == hstarts.size()-1) ? 
X_lcl.extent(0) : numLocalRows+hstarts[overlapLevel+1] ); auto X_ext = Kokkos::subview(X_lcl,std::make_pair(0,xlimit),Kokkos::ALL()); localReducedMatvec(extA_lcl,X_ext,yrange,Y_ext); @@ -341,11 +342,12 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2OverlappingRowMatrix, Test0, Scalar, LO IFPACK2OVERLAPPINGROWMATRIX_REPORT_GLOBAL_ERR( "Ifpack2::OverlappingRowMatrix constructor" ); Teuchos::ArrayView halo = B->getExtHaloStarts(); +#if 0 printf("Halo Starts:"); for(size_t i=0; i< (size_t)halo.size(); i++) printf("%d ",(int) halo[i]); printf("\n"); - +#endif size_t NumGlobalRowsB = B->getGlobalNumRows (); size_t NumGlobalNonzerosB = B->getGlobalNumEntries (); @@ -604,16 +606,21 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2OverlappingRowMatrix, reducedMatvec, Sc { // Overlap approach RCP ovRowmap = ovA.getRowMap(); + RCP ovColmap = ovA.getColMap(); MV ovX(ovRowmap,numVecs), ovY(ovRowmap,numVecs), temp1(ovRowmap,numVecs), temp2(ovRowmap,numVecs); ovX.putScalar(zero); Teuchos::ArrayView hstarts = ovA.getExtHaloStarts(); ovA.importMultiVector(x,ovX); - +#if 0 printf("Halo Starts:"); for(size_t i=0; i< (size_t)hstarts.size(); i++) printf("%d ",(int) hstarts[i]); printf("\n"); - +#endif + // printf("Before matvec A is (locally)%dx%d x is of size %d, ovX is ov size %d\n",(int)A->getNodeNumRows(),(int)A->getNodeNumCols(), + // (int)x.getMap()->getNodeNumElements(),(int)ovX.getMap()->getNodeNumElements()); + // printf("ovA->getUnderlyingMatrix() is (locally) %dx%d\n",(int)ovA.getUnderlyingMatrix()->getNodeNumRows(),(int)ovA.getUnderlyingMatrix()->getNodeNumCols()); + reducedMatvec(ovA,ovX,2,temp1); reducedMatvec(ovA,temp1,1,temp2); reducedMatvec(ovA,temp2,0,ovY); From 0832da8909df2d2aef037b96e08aecf9669055b5 Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Thu, 21 Nov 2019 08:52:47 -0700 Subject: [PATCH 42/50] Ifpack2: Fixing test to only run in parallel --- .../test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff 
--git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp index 2019ca9c8749..ff13831c90cb 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp @@ -576,11 +576,12 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2OverlappingRowMatrix, reducedMatvec, Sc using row_matrix_type = Tpetra::RowMatrix; using MV = Tpetra::MultiVector; using Teuchos::RCP; - Tpetra::global_size_t num_rows_per_proc = 5; const RCP > rowmap = tif_utest::create_tpetra_map(num_rows_per_proc); - + // Only run on > 1 core + if(rowMap->getComm()->getSize() == 1) return; + RCP > A = tif_utest::create_test_matrix(rowmap); // This needs to be one less than the number of matvecs we test From fa71b5e96a518f6f9ef79d8ed53f71293f196f99 Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Tue, 26 Nov 2019 15:55:10 -0700 Subject: [PATCH 43/50] Ifpack2: spelling --- .../test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp index ff13831c90cb..3cfcf3f58198 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp @@ -580,7 +580,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2OverlappingRowMatrix, reducedMatvec, Sc const RCP > rowmap = tif_utest::create_tpetra_map(num_rows_per_proc); // Only run on > 1 core - if(rowMap->getComm()->getSize() == 1) return; + if(rowmap->getComm()->getSize() == 1) return; RCP > A = tif_utest::create_test_matrix(rowmap); From cf1bcb682ff5394a83c144b6f9cb0f0fb9386c37 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 26 Nov 2019 16:02:32 -0700 Subject: 
[PATCH 44/50] MueLu: ignoring OperatorComplexity in ParameterListInterpreter for kokkos, see issue #6361 After the kokkos refactor of aggregation, non-deterministic aggregates are formed. This means that checking operator complexity against a gold file is wrong. This commit implements logic to ignore OperatorComplexity for kokkos runs of ParameterListInterpreter. --- packages/muelu/test/interface/ParameterListInterpreter.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/muelu/test/interface/ParameterListInterpreter.cpp b/packages/muelu/test/interface/ParameterListInterpreter.cpp index 05429778768a..e6de48a0772b 100644 --- a/packages/muelu/test/interface/ParameterListInterpreter.cpp +++ b/packages/muelu/test/interface/ParameterListInterpreter.cpp @@ -403,7 +403,10 @@ int main_(Teuchos::CommandLineProcessor &clp, Xpetra::UnderlyingLib& lib, int ar replacementString = "Global matrix dimensions: , Global nnz: "; run_sed("'s/" + stringToReplace + "/" + replacementString + "/'", baseFile); - // Catch smoother complexity output from MueLu + // Catch operator/smoother complexity output from MueLu + stringToReplace = "Operator complexity = " + floatRegex; + replacementString = "Operator complexity = "; + run_sed("'s/" + stringToReplace + "/" + replacementString + "/'", baseFile); stringToReplace = "Smoother complexity = " + floatRegex; replacementString = "Smoother complexity = "; run_sed("'s/" + stringToReplace + "/" + replacementString + "/'", baseFile); From 0b0266da7259fb8c918f466db47f9f35333a8e3f Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 16:32:32 -0700 Subject: [PATCH 45/50] TSQR::Combine*::factor_pair now takes MatView instead of a pointer --- .../tpetra/tsqr/src/TbbTsqr_FactorTask.hpp | 4 +- .../tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp | 23 +++--- packages/tpetra/tsqr/src/Tsqr_Combine.hpp | 18 +---- .../tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp | 18 ++--- .../tsqr/src/Tsqr_CombineBenchmarker.hpp | 15 +--- 
.../tpetra/tsqr/src/Tsqr_CombineDefault.hpp | 39 ++++++--- .../tpetra/tsqr/src/Tsqr_CombineNative.hpp | 80 +++++++++---------- packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 3 +- .../tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp | 41 +++++----- packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp | 6 +- .../tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp | 4 +- 11 files changed, 111 insertions(+), 140 deletions(-) diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp index 19b4372ccc2f..e03757db9e18 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp @@ -205,9 +205,7 @@ namespace TSQR { const LocalOrdinal ncols = A_top.extent(1); std::vector& tau = par_output_[P_bot]; std::vector work (ncols); - combine_.factor_pair (ncols, A_top.data(), A_top.stride(1), - A_bot.data(), A_bot.stride(1), - tau.data(), work.data()); + combine_.factor_pair (A_top, A_bot, tau.data(), work.data()); } void diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp index 5c1d584628d5..27aef81f0328 100644 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp +++ b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp @@ -309,26 +309,23 @@ namespace TSQR { const size_t P_bot, mat_view& A_top, mat_view& A_bot, - std::vector< std::vector< Scalar > >& par_outputs, + std::vector>& par_outputs, const bool contiguous_cache_blocks) const { - if (P_top == P_bot) - { - throw std::logic_error("factor_pair: should never get here!"); - return; // to pacify the compiler - } + if (P_top == P_bot) { + throw std::logic_error("factor_pair: should never get here!"); + } // We only read and write the upper ncols x ncols triangle of // each block. 
const LocalOrdinal ncols = A_top.extent(1); - if (A_bot.extent(1) != ncols) + if (A_bot.extent(1) != ncols) { throw std::logic_error("A_bot.extent(1) != A_top.extent(1)"); + } + std::vector& tau = par_outputs[P_bot]; + std::vector work (ncols); - std::vector< Scalar >& tau = par_outputs[P_bot]; - std::vector< Scalar > work (ncols); - - TSQR::Combine< LocalOrdinal, Scalar > combine_; - combine_.factor_pair (ncols, A_top.data(), A_top.stride(1), - A_bot.data(), A_bot.stride(1), &tau[0], &work[0]); + TSQR::Combine combine_; + combine_.factor_pair (A_top, A_bot, tau.data(), work.data()); } template< class LocalOrdinal, class Scalar > diff --git a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp index e169541b1581..88d87cd11cfc 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp @@ -269,25 +269,13 @@ namespace TSQR { /// /// Store the resulting R factor in R_top, and the resulting /// Householder reflectors implicitly in R_bot and tau. - /// - /// \param n [in] Number of rows and columns of each of R_top and R_bot - /// \param R_top [inout] n by n upper triangular matrix - /// \param ldr_top [in] Leading dimension of R_top - /// \param R_bot [inout] n by n upper triangular matrix - /// \param ldr_bot [in] Leading dimension of R_bot - /// \param tau [out] Scaling factors for Householder reflectors - /// \param work [out] Workspace array (of length >= n) - /// void - factor_pair (const Ordinal n, - Scalar R_top[], - const Ordinal ldr_top, - Scalar R_bot[], - const Ordinal ldr_bot, + factor_pair (const MatView& R_top, + const MatView& R_bot, Scalar tau[], Scalar work[]) const { - impl_.factor_pair (n, R_top, ldr_top, R_bot, ldr_bot, tau, work); + impl_.factor_pair (R_top, R_bot, tau, work); } /// \brief Apply the result of \c factor_pair(). 
diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp index af18ad5cee10..e77802f173ec 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp @@ -261,20 +261,18 @@ namespace TSQR { // Number of trials for factor_pair() and apply_pair(). std::pair result; result = c.calibratePair (numCols, accuracyFactor); - if (debug) - { - cerr << "- Pair number of trials: " << result.first << endl; - cerr << "- Pair calibration time: " << result.second << endl; - } + if (debug) { + cerr << "- Pair number of trials: " << result.first << endl; + cerr << "- Pair calibration time: " << result.second << endl; + } pairNumTrials = result.first; // Number of trials for factor_inner() and apply_inner(). result = c.calibrateCacheBlock (numRows, numCols, accuracyFactor); - if (debug) - { - cerr << "- Cache block number of trials: " << result.first << endl; - cerr << "- Cache block calibration time: " << result.second << endl; - } + if (debug) { + cerr << "- Cache block number of trials: " << result.first << endl; + cerr << "- Cache block calibration time: " << result.second << endl; + } cacheBlockNumTrials = result.first; // Store the updated PRNG seed in the benchmark parameters. diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp index 5f081688b3e4..28e367464308 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp @@ -702,9 +702,7 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. 
const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_pair (numCols, R1.data(), R1.stride(1), - R2.data(), R2.stride(1), - tau.data(), work.data()); + combiner.factor_pair (R1.view(), R2.view(), tau.data(), work.data()); combiner.apply_pair (ApplyType("N"), numCols, numCols, R2.data(), R2.stride(1), tau.data(), &Q(0, 0), Q.stride(1), @@ -734,8 +732,7 @@ namespace TSQR { numTrials *= 2; // First value of numTrials is 4. timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_pair (numCols, R1.data(), R1.stride(1), - R2.data(), R2.stride(1), + combiner.factor_pair (R1.view(), R2.view(), tau.data(), work.data()); combiner.apply_pair (ApplyType("N"), numCols, numCols, R2.data(), R2.stride(1), tau.data(), @@ -808,9 +805,7 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_pair (numCols, R1.data(), R1.stride(1), - R2.data(), R2.stride(1), - tau.data(), work.data()); + combiner.factor_pair (R1.view(), R2.view(), tau.data(), work.data()); combiner.apply_pair (ApplyType("N"), numCols, numCols, R2.data(), R2.stride(1), tau.data(), &Q(0, 0), Q.stride(1), @@ -823,9 +818,7 @@ namespace TSQR { timer_type timer ("Combine pair"); timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_pair (numCols, R1.data(), R1.stride(1), - R2.data(), R2.stride(1), - tau.data(), work.data()); + combiner.factor_pair (R1.view(), R2.view(), tau.data(), work.data()); combiner.apply_pair (ApplyType("N"), numCols, numCols, R2.data(), R2.stride(1), tau.data(), &Q(0, 0), Q.stride(1), diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp index 8ebd5c7849a7..1d6a21384bd1 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp @@ -209,35 +209,48 
@@ namespace TSQR { } void - factor_pair (const Ordinal n, - Scalar R_top[], - const Ordinal ldr_top, - Scalar R_bot[], - const Ordinal ldr_bot, + factor_pair (const MatView& R_top, + const MatView& R_bot, Scalar tau[], Scalar work[]) { - const Ordinal numRows = Ordinal(2) * n; + const Ordinal numRows = Ordinal(2) * R_top.extent (1); + const Ordinal numCols = R_top.extent (1); - A_buf_.reshape (numRows, n); + A_buf_.reshape (numRows, numCols); deep_copy (A_buf_, Scalar {}); + MatView A_buf_top (numCols, numCols, + &A_buf_(0, 0), + A_buf_.stride(1)); + MatView A_buf_bot (numCols, numCols, + &A_buf_(numCols, 0), + A_buf_.stride(1)); // Copy the inputs into the compute buffer. Only touch the // upper triangles of R_top and R_bot, since they each may be // views of some cache block (where the strict lower triangle // contains things we don't want to include in the // factorization). - copy_upper_triangle (n, n, &A_buf_(0, 0), A_buf_.stride(1), R_top, ldr_top); - copy_upper_triangle (n, n, &A_buf_(n, 0), A_buf_.stride(1), R_bot, ldr_bot); + copy_upper_triangle (numCols, numCols, + A_buf_top.data(), A_buf_top.stride(1), + R_top.data(), R_top.stride(1)); + copy_upper_triangle (numCols, numCols, + A_buf_bot.data(), A_buf_bot.stride(1), + R_bot.data(), R_bot.stride(1)); - const int lwork = n; - lapack_.compute_QR (numRows, n, A_buf_.data(), A_buf_.stride(1), + const int lwork = static_cast (numCols); + lapack_.compute_QR (numRows, numCols, + A_buf_.data(), A_buf_.stride(1), tau, work, lwork); // Copy back the results. Only read the upper triangles of the // two n by n row blocks of A_buf_ (this means we don't have to // zero out the strict lower triangles), and only touch the // upper triangles of R_top and R_bot. 
- copy_upper_triangle (n, n, R_top, ldr_top, &A_buf_(0, 0), A_buf_.stride(1)); - copy_upper_triangle (n, n, R_bot, ldr_bot, &A_buf_(n, 0), A_buf_.stride(1)); + copy_upper_triangle (numCols, numCols, + R_top.data(), R_top.stride(1), + A_buf_top.data(), A_buf_top.stride(1)); + copy_upper_triangle (numCols, numCols, + R_bot.data(), R_bot.stride(1), + A_buf_bot.data(), A_buf_bot.stride(1)); } void diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp index f2e8982cfb6f..8c705733b373 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp @@ -151,11 +151,8 @@ namespace TSQR { Scalar work[]) const; void - factor_pair (const Ordinal n, - Scalar R_top[], - const Ordinal ldr_top, - Scalar R_bot[], - const Ordinal ldr_bot, + factor_pair (const MatView& R_top, + const MatView& R_bot, Scalar tau[], Scalar work[]) const; @@ -318,6 +315,16 @@ namespace TSQR { A, lda, tau, C, ldc, work); } + void + factor_inner (const Ordinal m, + const Ordinal n, + Scalar R[], + const Ordinal ldr, + Scalar A[], + const Ordinal lda, + Scalar tau[], + Scalar work[]) const; + void apply_inner (const ApplyType& applyType, const Ordinal m, @@ -331,23 +338,9 @@ namespace TSQR { Scalar C_bot[], const Ordinal ldc_bot, Scalar work[]) const; - void - factor_inner (const Ordinal m, - const Ordinal n, - Scalar R[], - const Ordinal ldr, - Scalar A[], - const Ordinal lda, - Scalar tau[], - Scalar work[]) const; - - void - factor_pair (const Ordinal n, - Scalar R_top[], - const Ordinal ldr_top, - Scalar R_bot[], - const Ordinal ldr_bot, + factor_pair (const MatView& R_top, + const MatView& R_bot, Scalar tau[], Scalar work[]) const; @@ -455,15 +448,12 @@ namespace TSQR { } void - factor_pair (const Ordinal n, - Scalar R_top[], - const Ordinal ldr_top, - Scalar R_bot[], - const Ordinal ldr_bot, + factor_pair (const MatView& R_top, + const MatView& R_bot, Scalar tau[], Scalar work[]) const { - 
return default_.factor_pair (n, R_top, ldr_top, R_bot, ldr_bot, tau, work); + return default_.factor_pair (R_top, R_bot, tau, work); } void @@ -761,12 +751,9 @@ namespace TSQR { template< class Ordinal, class Scalar > void - CombineNative< Ordinal, Scalar, false >:: - factor_pair (const Ordinal n, - Scalar R_top[], - const Ordinal ldr_top, - Scalar R_bot[], - const Ordinal ldr_bot, + CombineNative:: + factor_pair (const MatView& R_top, + const MatView& R_bot, Scalar tau[], Scalar work[]) const { @@ -774,27 +761,32 @@ namespace TSQR { using Kokkos::subview; using range_type = std::pair; - Kokkos::View R_top_full (R_top, ldr_top, n); - Kokkos::View R_bot_full (R_bot, ldr_bot, n); - Kokkos::View tau_view (tau, n); - Kokkos::View work_view (work, n); - - if (ldr_top == n) { - if (ldr_bot == n) { + const Ordinal numCols = R_top.extent (1); + Kokkos::View R_top_full + (R_top.data(), R_top.stride (1), numCols); + Kokkos::View R_bot_full + (R_bot.data(), R_bot.stride (1), R_bot.extent (1)); + Kokkos::View tau_view + (tau, numCols); + Kokkos::View work_view + (work, numCols); + + if (R_top.stride(1) == numCols) { + if (R_bot.stride(1) == numCols) { this->factor_pair (R_top_full, R_bot_full, tau_view, work_view); } else { - auto R_bot_view = subview (R_bot_full, range_type (0, n), ALL ()); + auto R_bot_view = subview (R_bot_full, range_type (0, numCols), ALL ()); this->factor_pair (R_top_full, R_bot_view, tau_view, work_view); } } else { - auto R_top_view = subview (R_top_full, range_type (0, n), ALL ()); - if (ldr_bot == n) { + auto R_top_view = subview (R_top_full, range_type (0, numCols), ALL ()); + if (R_bot.stride(1) == numCols) { this->factor_pair (R_top_view, R_bot_full, tau_view, work_view); } else { - auto R_bot_view = subview (R_bot_full, range_type (0, n), ALL ()); + auto R_bot_view = subview (R_bot_full, range_type (0, numCols), ALL ()); this->factor_pair (R_top_view, R_bot_view, tau_view, work_view); } } diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp 
b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index d6fe01c3ce26..050907c320e3 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -352,8 +352,7 @@ namespace TSQR { << " by " << numCols << endl << endl; } Combine combiner; - combiner.factor_pair (numCols, R1.data(), R1.stride(1), - R2.data(), R2.stride(1), + combiner.factor_pair (R1.view(), R2.view(), tau_R1R2.data(), work.data()); combiner.apply_pair (ApplyType("N"), numCols, numCols, R2.data(), R2.stride(1), tau_R1R2.data(), diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp index b1990fda4567..67ecc2b31e06 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp @@ -85,7 +85,11 @@ namespace TSQR { const int P_bot = std::max (P_mine, P_other); const LocalOrdinal nelts = ncols * ncols; const LocalOrdinal ldr = ncols; + MatView R_mine_view + (ncols, ncols, R_mine.data (), ldr); vector< Scalar > R_other (nelts); + MatView R_other_view + (ncols, ncols, R_other.data (), ldr); vector< Scalar > tau (ncols); // Send and receive R factor. @@ -94,23 +98,17 @@ namespace TSQR { Combine combine; if (P_mine == P_top) { - combine.factor_pair (ncols, R_mine.data(), ldr, - R_other.data(), ldr, + combine.factor_pair (R_mine_view, R_other_view, tau.data(), work.data()); Q_factors.push_back (R_other); tau_arrays.push_back (tau); } else if (P_mine == P_bot) { - combine.factor_pair (ncols, R_other.data(), ldr, - R_mine.data(), ldr, + combine.factor_pair (R_other_view, R_mine_view, tau.data(), work.data()); Q_factors.push_back (R_mine); - // Make sure that the "bottom" processor gets the current R // factor, which is returned in R_mine. 
- using view_type = MatView; - view_type R_mine_view (ncols, ncols, R_mine.data(), ldr); - view_type R_other_view (ncols, ncols, R_other.data(), ldr); deep_copy (R_mine_view, R_other_view); tau_arrays.push_back (tau); } @@ -163,24 +161,21 @@ namespace TSQR { // If there aren't an even number of processors in the // original interval, then the last processor in the lower // interval has to skip this round. - if (b_even || my_rank < P_mid - 1) - { - const int my_offset = my_rank - P_first; - const int P_other = P_mid + my_offset; - if (P_other < P_mid || P_other > P_last) - throw std::logic_error ("P_other not in [P_mid,P_last] range"); - - factor_pair (ncols, R_mine, my_rank, P_other, tag, - messenger, Q_factors, tau_arrays, work); + if (b_even || my_rank < P_mid - 1) { + const int my_offset = my_rank - P_first; + const int P_other = P_mid + my_offset; + if (P_other < P_mid || P_other > P_last) { + throw std::logic_error ("P_other not in [P_mid,P_last] range"); } - + factor_pair (ncols, R_mine, my_rank, P_other, tag, + messenger, Q_factors, tau_arrays, work); + } // If I'm skipping this round, get the "current" R factor // from P_mid. - if (! b_even && my_rank == P_mid - 1) - { - const int theTag = 142; // magic constant - messenger->recv (&R_mine[0], ncols*ncols, P_mid, theTag); - } + if (! 
b_even && my_rank == P_mid - 1) { + const int theTag = 142; // magic constant + messenger->recv (&R_mine[0], ncols*ncols, P_mid, theTag); + } } else // Interval [P_mid, P_last] { diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp index cf421d2bdfce..10035b80c6df 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp @@ -388,13 +388,13 @@ namespace TSQR { matrix_type R_other (numCols, numCols); recv_R (R_other, P_mid); - std::vector< scalar_type > tau (numCols); + std::vector tau (numCols); // Don't shrink the workspace array; doing so may // require expensive reallocation every time we send / // receive data. resizeWork (numCols); - combine_.factor_pair (numCols, R_mine.data(), R_mine.stride(1), - R_other.data(), R_other.stride(1), + + combine_.factor_pair (R_mine, R_other.view (), tau.data(), work_.data()); QFactors.push_back (R_other); tauArrays.push_back (tau); diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp index b45383a7c654..c2ac4b18e3af 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp @@ -1552,9 +1552,7 @@ namespace TSQR { // The statement below only works if R_top and R_bot have a // nonzero (and the same) number of columns, but we have already // checked that above. 
- combine_.factor_pair (R_top.extent(1), R_top.data(), R_top.stride(1), - R_bot.data(), R_bot.stride(1), tau.data(), - work_.data()); + combine_.factor_pair (R_top, R_bot, tau.data(), work_.data()); return tau; } From bc0ac10724d38b28554991aa59d0a69489e03fe5 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 16:52:51 -0700 Subject: [PATCH 46/50] TSQR::Combine: Remove unneeded factor_first overload --- packages/tpetra/tsqr/src/Tsqr_Combine.hpp | 10 ------ .../tsqr/src/Tsqr_CombineBenchmarker.hpp | 8 ++--- .../tpetra/tsqr/src/Tsqr_CombineNative.hpp | 32 +------------------ 3 files changed, 5 insertions(+), 45 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp index 88d87cd11cfc..3564efeda15a 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp @@ -136,16 +136,6 @@ namespace TSQR { return impl_.factor_first (A, tau, work); } - void - factor_first (Matrix& A, - Scalar tau[], - Scalar work[]) const - { - MatView A_view - (A.extent (0), A.extent (1), A.data (), A.stride (1)); - return factor_first (A_view, tau, work); - } - /// \brief Apply the result of \c factor_first(). /// /// Apply the Q factor, as computed by factor_first() and stored diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp index 28e367464308..841c62533a4b 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp @@ -314,7 +314,7 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. 
const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_first (A, tau.data(), work.data()); + combiner.factor_first (A.view(), tau.data(), work.data()); combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, A.data(), A.stride(1), tau.data(), Q.data(), Q.stride(1), work.data()); @@ -342,7 +342,7 @@ namespace TSQR { numTrials *= 2; // First value of numTrials is 4. timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_first (A, tau.data(), work.data()); + combiner.factor_first (A.view(), tau.data(), work.data()); combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, A.data(), A.stride(1), tau.data(), Q.data(), Q.stride(1), work.data()); @@ -410,7 +410,7 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_first (A, tau.data(), work.data()); + combiner.factor_first (A.view(), tau.data(), work.data()); combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, A.data(), A.stride(1), tau.data(), Q.data(), Q.stride(1), work.data()); @@ -421,7 +421,7 @@ namespace TSQR { timer_type timer ("Combine first"); timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_first (A, tau.data(), work.data()); + combiner.factor_first (A.view(), tau.data(), work.data()); combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, A.data(), A.stride(1), tau.data(), Q.data(), Q.stride(1), work.data()); diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp index 8c705733b373..db40c9451b3c 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp @@ -50,7 +50,7 @@ #include "KokkosBlas2_gemv.hpp" #include "Kokkos_ArithTraits.hpp" #include "Tsqr_Impl_Lapack.hpp" -#include "Tsqr_Matrix.hpp" +#include 
"Tsqr_MatView.hpp" namespace TSQR { @@ -100,16 +100,6 @@ namespace TSQR { return default_.factor_first (A, tau, work); } - void - factor_first (Matrix& A, - Scalar tau[], - Scalar work[]) const - { - MatView A_view - (A.extent (0), A.extent (1), A.data (), A.stride (1)); - return factor_first (A_view, tau, work); - } - void apply_first (const ApplyType& applyType, const Ordinal nrows, @@ -289,16 +279,6 @@ namespace TSQR { return default_.factor_first (A, tau, work); } - void - factor_first (Matrix& A, - Scalar tau[], - Scalar work[]) const - { - MatView A_view - (A.extent (0), A.extent (1), A.data (), A.stride (1)); - return factor_first (A_view, tau, work); - } - void apply_first (const ApplyType& applyType, const Ordinal nrows, @@ -388,16 +368,6 @@ namespace TSQR { return default_.factor_first (A, tau, work); } - void - factor_first (Matrix& A, - Scalar tau[], - Scalar work[]) const - { - MatView A_view - (A.extent (0), A.extent (1), A.data (), A.stride (1)); - return factor_first (A_view, tau, work); - } - void apply_first (const ApplyType& applyType, const Ordinal nrows, From 2731d32ddaba11ff162b2303b24fa3105da614f8 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 17:04:05 -0700 Subject: [PATCH 47/50] TSQR::Combine*::factor_inner now takes MatView instead of a pointer --- packages/tpetra/tsqr/src/Tsqr_Combine.hpp | 15 ++---- .../tsqr/src/Tsqr_CombineBenchmarker.hpp | 18 +++----- .../tpetra/tsqr/src/Tsqr_CombineDefault.hpp | 26 ++++++++--- .../tpetra/tsqr/src/Tsqr_CombineNative.hpp | 46 ++++++------------- packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 9 +--- .../tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp | 5 +- .../tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp | 7 +-- 7 files changed, 49 insertions(+), 77 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp index 3564efeda15a..70fc1a6e8301 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp @@ 
-228,13 +228,8 @@ namespace TSQR { /// A_k]$) entirely in $A_k$ (specifically, in all of $A_k$, not just /// below the diagonal). /// - /// \param m [in] Number of rows in the "bottom" block to factor. - /// The number of rows in the top block doesn't matter, given the - /// assumptions above, as long as $m_{k-1} \geq n$. - /// \param n [in] Number of columns (same in both blocks) /// \param R [inout] "Top" upper triangular n by n block $R_{k-1}$. /// Overwritten with the new R factor $R_k$ of $[R_{k-1}; A_k]$. - /// \param ldr [in] Leading dimension of R /// \param A [inout] "Bottom" dense m by n block $A_k$. Overwritten /// with the Householder reflectors representing the Q factor of /// $[R_{k-1}; A_k]$. @@ -243,16 +238,12 @@ namespace TSQR { /// \param work [out] Workspace (length >= n; don't need lwork or /// workspace query) void - factor_inner (const Ordinal m, - const Ordinal n, - Scalar R[], - const Ordinal ldr, - Scalar A[], - const Ordinal lda, + factor_inner (const MatView& R, + const MatView& A, Scalar tau[], Scalar work[]) const { - impl_.factor_inner (m, n, R, ldr, A, lda, tau, work); + impl_.factor_inner (R, A, tau, work); } /// \brief Factor the pair of square upper triangular matrices [R_top; R_bot]. diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp index 841c62533a4b..4c93f136724f 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp @@ -499,8 +499,8 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. 
const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_inner (numRows, numCols, R.data(), R.stride(1), - A.data(), A.stride(1), tau.data(), work.data()); + combiner.factor_inner (R.view(), A.view(), + tau.data(), work.data()); combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, A.data(), A.stride(1), tau.data(), &Q(0, 0), Q.stride(1), @@ -530,8 +530,8 @@ namespace TSQR { numTrials *= 2; // First value of numTrials is 4. timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_inner (numRows, numCols, R.data(), R.stride(1), - A.data(), A.stride(1), tau.data(), work.data()); + combiner.factor_inner (R.view(), A.view(), + tau.data(), work.data()); combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, A.data(), A.stride(1), tau.data(), &Q(0, 0), Q.stride(1), @@ -607,10 +607,8 @@ namespace TSQR { // A few warmup runs just to avoid timing anomalies. const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_inner (numRows, numCols, - R.data(), R.stride(1), - A.data(), A.stride(1), tau.data(), - work.data()); + combiner.factor_inner (R.view(), A.view(), + tau.data(), work.data()); combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, A.data(), A.stride(1), tau.data(), @@ -624,9 +622,7 @@ namespace TSQR { timer_type timer ("Combine cache block"); timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_inner (numRows, numCols, - R.data(), R.stride(1), - A.data(), A.stride(1), + combiner.factor_inner (R.view(), A.view(), tau.data(), work.data()); combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp index 1d6a21384bd1..b1d6c8e3a995 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp @@ -170,14 
+170,27 @@ namespace TSQR { } void - factor_inner (const Ordinal m, - const Ordinal n, - Scalar R[], - const Ordinal ldr, - Scalar A[], - const Ordinal lda, + factor_inner (const MatView& R, + const MatView& A, Scalar tau[], Scalar work[]) + { + const Ordinal m = A.extent(0); + const Ordinal n = A.extent(1); + factor_inner_impl (m, n, R.data(), R.stride(1), + A.data(), A.stride(1), tau, work); + } + + private: + void + factor_inner_impl (const Ordinal m, + const Ordinal n, + Scalar R[], + const Ordinal ldr, + Scalar A[], + const Ordinal lda, + Scalar tau[], + Scalar work[]) { const Ordinal numRows = m + n; @@ -208,6 +221,7 @@ namespace TSQR { deep_copy (A_view, A_buf_bot); } + public: void factor_pair (const MatView& R_top, const MatView& R_bot, diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp index db40c9451b3c..df93159b2f0c 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp @@ -131,12 +131,8 @@ namespace TSQR { Scalar work[]) const; void - factor_inner (const Ordinal m, - const Ordinal n, - Scalar R[], - const Ordinal ldr, - Scalar A[], - const Ordinal lda, + factor_inner (const MatView& R, + const MatView& A, Scalar tau[], Scalar work[]) const; @@ -296,12 +292,8 @@ namespace TSQR { } void - factor_inner (const Ordinal m, - const Ordinal n, - Scalar R[], - const Ordinal ldr, - Scalar A[], - const Ordinal lda, + factor_inner (const MatView& R, + const MatView& A, Scalar tau[], Scalar work[]) const; @@ -405,16 +397,12 @@ namespace TSQR { } void - factor_inner (const Ordinal m, - const Ordinal n, - Scalar R[], - const Ordinal ldr, - Scalar A[], - const Ordinal lda, + factor_inner (const MatView& R, + const MatView& A, Scalar tau[], Scalar work[]) const { - return default_.factor_inner (m, n, R, ldr, A, lda, tau, work); + return default_.factor_inner (R, A, tau, work); } void @@ -549,12 +537,8 @@ namespace TSQR { template< class Ordinal, class 
Scalar > void CombineNative< Ordinal, Scalar, false >:: - factor_inner (const Ordinal m, - const Ordinal n, - Scalar R[], - const Ordinal ldr, - Scalar A[], - const Ordinal lda, + factor_inner (const MatView& R, + const MatView& A, Scalar tau[], Scalar work[]) const { @@ -566,12 +550,12 @@ namespace TSQR { Kokkos::View; using range_type = std::pair; - mat_type A_full (A, lda, n); - mat_type A_view = subview (A_full, range_type (0, m), ALL ()); - mat_type R_full (R, ldr, n); - mat_type R_view = subview (R_full, range_type (0, n), ALL ()); - nonconst_vec_type tau_view (tau, n); - nonconst_vec_type work_view (work, n); + mat_type A_full (A.data(), A.stride(1), A.extent(1)); + mat_type A_view = subview (A_full, range_type (0, A.extent(0)), ALL ()); + mat_type R_full (R.data(), R.stride(1), R.extent(1)); + mat_type R_view = subview (R_full, range_type (0, R.extent(1)), ALL ()); + nonconst_vec_type tau_view (tau, R.extent(1)); + nonconst_vec_type work_view (work, R.extent(1)); this->factor_inner (R_view, A_view, tau_view, work_view); } diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index 050907c320e3..497df0df52e8 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -386,9 +386,7 @@ namespace TSQR { << "qr( [R3; A] ), with R3 " << numCols << " by " << numCols << " and A " << numRows << " by " << numCols << endl << endl; } - combiner.factor_inner (numRows, numCols, - R3.data(), R3.stride(1), - A.data(), A.stride(1), + combiner.factor_inner (R3.view(), A.view(), tau_R3A.data(), work.data()); combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, A.data(), A.stride(1), tau_R3A.data(), @@ -535,10 +533,7 @@ namespace TSQR { // View of numCols by numCols upper triangle of A1. 
mat_view_type R1 (numCols, numCols, A1.data(), A1.stride(1)); // qr( [R1; A2] ) - combiner.factor_inner (numRows, numCols, - R1.data(), R1.stride(1), - A2.data(), A2.stride(1), - tau2.data(), work.data()); + combiner.factor_inner (R1, A2, tau2.data(), work.data()); // Extract (a deep copy of) the R factor. matrix_type R (R1); // Zero out everything below the diagonal of R. diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp index c2ac4b18e3af..551ccd16fc32 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp @@ -188,10 +188,7 @@ namespace TSQR { // We should only call this if A_top.extent(1) > 0 and therefore // tau.size() > 0 and work.size() > 0, but we've already // checked for that, so we don't have to check again. - combine.factor_inner (A_cur.extent(0), A_top.extent(1), - A_top.data(), A_top.stride(1), - A_cur.data(), A_cur.stride(1), - tau.data(), work.data()); + combine.factor_inner (A_top, A_cur, tau.data(), work.data()); return tau; } diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp index 11adccd6a1d8..6cc01f2df4e7 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp @@ -208,12 +208,7 @@ namespace TSQR { std::vector& tau, std::vector& work) const { - const LocalOrdinal nrows_local = A_cur.extent(0); - const LocalOrdinal ncols = A_cur.extent(1); - - combine.factor_inner (nrows_local, ncols, R.data(), R.stride(1), - A_cur.data(), A_cur.stride(1), tau.data(), - work.data()); + combine.factor_inner (R, A_cur, tau.data(), work.data()); } public: From 3b4e6162f00042b51e5a6635a225b15d8f6b669e Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 17:24:12 -0700 Subject: [PATCH 48/50] TSQR::Combine*::apply_inner now takes MatView instead of a pointer I also added a conversion constructor to MatView, so I 
can assign a MatView to a MatView. --- packages/tpetra/tsqr/src/Tsqr_Combine.hpp | 14 ++----- .../tsqr/src/Tsqr_CombineBenchmarker.hpp | 24 +++++------ .../tpetra/tsqr/src/Tsqr_CombineDefault.hpp | 18 ++++---- .../tpetra/tsqr/src/Tsqr_CombineNative.hpp | 42 ++++++------------- packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp | 5 +-- .../tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp | 5 +-- packages/tpetra/tsqr/src/Tsqr_MatView.hpp | 10 +++++ .../tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp | 6 +-- 8 files changed, 54 insertions(+), 70 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp index 70fc1a6e8301..7b1f15f0f8ae 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp @@ -142,18 +142,12 @@ namespace TSQR { /// implicitly in A and tau, to the matrix C. void apply_first (const ApplyType& applyType, - const Ordinal nrows, - const Ordinal ncols_C, - const Ordinal ncols_A, - const Scalar A[], - const Ordinal lda, + const MatView& A, const Scalar tau[], - Scalar C[], - const Ordinal ldc, - Scalar work[]) const + const MatView& C, + Scalar work[]) { - return impl_.apply_first (applyType, nrows, ncols_C, ncols_A, - A, lda, tau, C, ldc, work); + return impl_.apply_first (applyType, A, tau, C, work); } /// Apply the result of \c factor_inner(). 
diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp index 4c93f136724f..54d5f199b0ad 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp @@ -315,9 +315,9 @@ namespace TSQR { const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { combiner.factor_first (A.view(), tau.data(), work.data()); - combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.stride(1), tau.data(), - Q.data(), Q.stride(1), work.data()); + combiner.apply_first (ApplyType("N"), + A.view(), tau.data(), + Q.view(), work.data()); } // How much time numTrials runs must take in order for @@ -343,9 +343,9 @@ namespace TSQR { timer.start(); for (int trial = 0; trial < numTrials; ++trial) { combiner.factor_first (A.view(), tau.data(), work.data()); - combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.stride(1), tau.data(), - Q.data(), Q.stride(1), work.data()); + combiner.apply_first (ApplyType("N"), + A.view(), tau.data(), + Q.view(), work.data()); } theTime = timer.stop(); } while (theTime < minAcceptableTime && numTrials < maxNumTrials); @@ -411,9 +411,9 @@ namespace TSQR { const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { combiner.factor_first (A.view(), tau.data(), work.data()); - combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.stride(1), tau.data(), - Q.data(), Q.stride(1), work.data()); + combiner.apply_first (ApplyType("N"), + A.view(), tau.data(), + Q.view(), work.data()); } // // The actual timing runs. 
@@ -422,9 +422,9 @@ namespace TSQR { timer.start(); for (int trial = 0; trial < numTrials; ++trial) { combiner.factor_first (A.view(), tau.data(), work.data()); - combiner.apply_first (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.stride(1), tau.data(), - Q.data(), Q.stride(1), work.data()); + combiner.apply_first (ApplyType("N"), + A.view(), tau.data(), + Q.view(), work.data()); } return timer.stop(); } diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp index b1d6c8e3a995..f5e5ed7c9ce7 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp @@ -106,16 +106,17 @@ namespace TSQR { void apply_first (const ApplyType& applyType, - const Ordinal nrows, - const Ordinal ncols_C, - const Ordinal ncols_A, - const Scalar A[], - const Ordinal lda, + const MatView& A, const Scalar tau[], - Scalar C[], - const Ordinal ldc, + const MatView& C, Scalar work[]) { + const Ordinal nrows = A.extent(0); + const Ordinal ncols_C = C.extent(1); + const Ordinal ncols_A = A.extent(1); + const Ordinal lda = A.stride(1); + const Ordinal ldc = C.stride(1); + // LAPACK has the nice feature that it only reads the first // letter of input strings that specify things like which side // to which to apply the operator, or whether to apply the @@ -124,7 +125,8 @@ namespace TSQR { const std::string trans = applyType.toString (); const int lwork = ncols_C; lapack_.apply_Q_factor ('L', trans[0], nrows, ncols_C, ncols_A, - A, lda, tau, C, ldc, work, lwork); + A.data(), lda, tau, C.data(), ldc, + work, lwork); } void diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp index df93159b2f0c..8e44d0fe8b75 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp @@ -102,18 +102,12 @@ namespace TSQR { void apply_first (const ApplyType& applyType, - const Ordinal 
nrows, - const Ordinal ncols_C, - const Ordinal ncols_A, - const Scalar A[], - const Ordinal lda, + const MatView& A, const Scalar tau[], - Scalar C[], - const Ordinal ldc, - Scalar work[]) const + const MatView& C, + Scalar work[]) { - return default_.apply_first (applyType, nrows, ncols_C, ncols_A, - A, lda, tau, C, ldc, work); + return default_.apply_first (applyType, A, tau, C, work); } void @@ -277,18 +271,12 @@ namespace TSQR { void apply_first (const ApplyType& applyType, - const Ordinal nrows, - const Ordinal ncols_C, - const Ordinal ncols_A, - const Scalar A[], - const Ordinal lda, + const MatView& A, const Scalar tau[], - Scalar C[], - const Ordinal ldc, - Scalar work[]) const + const MatView& C, + Scalar work[]) { - return default_.apply_first (applyType, nrows, ncols_C, ncols_A, - A, lda, tau, C, ldc, work); + return default_.apply_first (applyType, A, tau, C, work); } void @@ -362,18 +350,12 @@ namespace TSQR { void apply_first (const ApplyType& applyType, - const Ordinal nrows, - const Ordinal ncols_C, - const Ordinal ncols_A, - const Scalar A[], - const Ordinal lda, + const MatView& A, const Scalar tau[], - Scalar C[], - const Ordinal ldc, - Scalar work[]) const + const MatView& C, + Scalar work[]) { - return default_.apply_first (applyType, nrows, ncols_C, ncols_A, - A, lda, tau, C, ldc, work); + return default_.apply_first (applyType, A, tau, C, work); } void diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index 497df0df52e8..341b22ae9d32 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -555,9 +555,8 @@ namespace TSQR { Q1.data(), Q1.stride(1), Q2.data(), Q2.stride(1), work.data()); combiner.apply_first (ApplyType::NoTranspose, - numRows, numCols, numCols, - A1.data(), A.stride(1), tau1.data(), - Q1.data(), Q1.stride(1), work.data()); + A1, tau1.data(), + Q1, work.data()); if (debug) { cerr << "Results of first test problem:" 
<< endl; cerr << "-- Test matrix A:" << endl; diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp index 551ccd16fc32..71b823b19558 100644 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp @@ -390,9 +390,8 @@ namespace TSQR { // If we get this far, it's fair to assume that we have // checked whether tau and work have nonzero lengths. - combine.apply_first (applyType, C_top.extent(0), C_top.extent(1), - Q_top.extent(1), Q_top.data(), Q_top.stride(1), - tau.data(), C_top.data(), C_top.stride(1), work.data()); + combine.apply_first (applyType, Q_top, tau.data(), + C_top, work.data()); } void diff --git a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp index b9314d462d60..2b3b8ddecd5d 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp @@ -175,6 +175,16 @@ namespace TSQR { MatView (MatView&& view) = default; MatView& operator= (MatView&& view) = default; + // Participates in overload resolution only if the type of + // rhs.data() is assignable to A_. + template + MatView (const MatView& rhs) : + nrows_ (rhs.extent(0)), + ncols_ (rhs.extent(1)), + lda_ (rhs.stride(1)), + A_ (rhs.data()) + {} + constexpr ordinal_type extent(const int r) const noexcept { return r == 0 ? nrows_ : (r == 1 ? 
ncols_ : ordinal_type(0)); } diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp index 6cc01f2df4e7..0390be6c05f5 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp @@ -175,10 +175,8 @@ namespace TSQR { mat_view_type& C_first, std::vector& work) const { - const LocalOrdinal nrowsLocal = Q_first.extent(0); - combine.apply_first (applyType, nrowsLocal, C_first.extent(1), - Q_first.extent(1), Q_first.data(), Q_first.stride(1), - tau.data(), C_first.data(), C_first.stride(1), work.data()); + combine.apply_first (applyType, Q_first, tau.data(), + C_first, work.data()); } void From a80c8b5ca07c59bc9f07b78fb537e4cc64831308 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 17:28:15 -0700 Subject: [PATCH 49/50] TSQR::Matrix: Simplify nonmember functions --- packages/tpetra/tsqr/src/Tsqr_Matrix.hpp | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp index 43421a50b89f..2bb78584016e 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp @@ -331,9 +331,7 @@ namespace TSQR { void deep_copy (Matrix& tgt, const SourceScalar& src) { - MatView tgt_view (tgt.extent(0), tgt.extent(1), - tgt.data(), tgt.stride(1)); - deep_copy (tgt_view, src); + deep_copy (tgt.view(), src); } template& tgt, const SourceMat& src) { - using mat_view_type = MatView; - mat_view_type tgt_view (tgt.extent(0), tgt.extent(1), - tgt.data(), tgt.stride(1)); - deep_copy (tgt_view, src); + deep_copy (tgt.view(), src); } - // Matrix is a container, so the version of data() that returns a - // nonconst pointer must be nonconst. 
template std::pair, MatView> partition_2x1 (Matrix& A, const typename Matrix::ordinal_type nrows_top, const bool b_contiguous_blocks = false) { - MatView A_view (A.extent(0), A.extent(1), - A.data(), A.stride(1)); - return partition_2x1 (A_view, nrows_top, b_contiguous_blocks); + return partition_2x1 (A.view(), nrows_top, b_contiguous_blocks); } - // Matrix is a container, so the version of data() that returns a - // nonconst pointer must be nonconst. template std::pair, MatView> partition_2x1 (const Matrix& A, const typename Matrix::ordinal_type nrows_top, const bool b_contiguous_blocks = false) { - MatView A_view (A.extent(0), A.extent(1), - A.data(), A.stride(1)); - return partition_2x1 (A_view, nrows_top, b_contiguous_blocks); + return partition_2x1 (A.view(), nrows_top, b_contiguous_blocks); } } // namespace TSQR From c5429325c022283ef5cbb43a088e6fafb1162fb0 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 26 Nov 2019 18:22:03 -0700 Subject: [PATCH 50/50] TSQR: Fix minor build error with CUDA --- packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp b/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp index f18b66f897fe..e97cdb891b8c 100644 --- a/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_TestSetup.hpp @@ -106,7 +106,7 @@ namespace TSQR { // All MPI processes participate in the distribution of the // test matrix. TSQR::Random::randomGlobalMatrix (&generator, A_local, - singular_values.data (), + singular_values, ordinalComm, scalarComm); } }