diff --git a/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp b/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp
new file mode 100644
index 000000000000..5f4d08aca595
--- /dev/null
+++ b/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp
@@ -0,0 +1,335 @@
+/*@HEADER
+// ***********************************************************************
+//
+//       Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
+//                 Copyright (2009) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ***********************************************************************
+//@HEADER
+*/
+
+#ifndef IFPACK2_GAUSS_SEIDEL_HPP
+#define IFPACK2_GAUSS_SEIDEL_HPP
+
+namespace Ifpack2
+{
+namespace Details
+{
+  template<typename Scalar, typename LO, typename GO, typename NT>
+  struct GaussSeidel
+  {
+    using crs_matrix_type = Tpetra::CrsMatrix<Scalar, LO, GO, NT>;
+    using bcrs_matrix_type = Tpetra::BlockCrsMatrix<Scalar, LO, GO, NT>;
+    using row_matrix_type = Tpetra::RowMatrix<Scalar, LO, GO, NT>;
+    using local_matrix_type = typename crs_matrix_type::local_matrix_type;
+    using vector_type = Tpetra::Vector<Scalar, LO, GO, NT>;
+    using multivector_type = Tpetra::MultiVector<Scalar, LO, GO, NT>;
+    using block_multivector_type = Tpetra::BlockMultiVector<Scalar, LO, GO, NT>;
+    using mem_space_t = typename local_matrix_type::memory_space;
+    using rowmap_t = typename local_matrix_type::row_map_type::HostMirror;
+    using entries_t = typename local_matrix_type::index_type::HostMirror;
+    using values_t = typename local_matrix_type::values_type::HostMirror;
+    using Offset = typename rowmap_t::non_const_value_type;
+    using IST = typename crs_matrix_type::impl_scalar_type;
+    using KAT = Kokkos::ArithTraits<IST>;
+    //Type of view representing inverse diagonal blocks, and its HostMirror.
+    using InverseBlocks = Kokkos::View<IST***, typename NT::device_type>;
+    using InverseBlocksHost = typename InverseBlocks::HostMirror;
+
+    //Setup for CrsMatrix
+    GaussSeidel(const crs_matrix_type& A, Teuchos::RCP<vector_type>& inverseDiagVec_, Teuchos::ArrayRCP<LO>& applyRows_, Scalar omega_)
+    {
+      numRows = A.getNodeNumRows();
+      inverseDiagVec = inverseDiagVec_;
+      inverseDiagVec->sync_host();
+      applyRows = applyRows_;
+      blockSize = 1;
+      omega = omega_;
+      auto Alocal = A.getLocalMatrix();
+      Arowmap = Kokkos::create_mirror_view(Alocal.graph.row_map);
+      Aentries = Kokkos::create_mirror_view(Alocal.graph.entries);
+      Avalues = Kokkos::create_mirror_view(Alocal.values);
+      Kokkos::deep_copy(Arowmap, Alocal.graph.row_map);
+      Kokkos::deep_copy(Aentries, Alocal.graph.entries);
+      Kokkos::deep_copy(Avalues, Alocal.values);
+    }
+
+    GaussSeidel(const row_matrix_type& A, Teuchos::RCP<vector_type>& inverseDiagVec_, Teuchos::ArrayRCP<LO>& applyRows_, Scalar omega_)
+    {
+      numRows = A.getNodeNumRows();
+      inverseDiagVec = inverseDiagVec_;
+      inverseDiagVec->sync_host();
+      applyRows = applyRows_;
+      blockSize = 1;
+      omega = omega_;
+      //Here, need to make a deep CRS copy to avoid slow access via getLocalRowCopy
+      Arowmap = rowmap_t(Kokkos::ViewAllocateWithoutInitializing("Arowmap"), numRows + 1);
+      Aentries = entries_t(Kokkos::ViewAllocateWithoutInitializing("Aentries"), A.getNodeNumEntries());
+      Avalues = values_t(Kokkos::ViewAllocateWithoutInitializing("Avalues"), A.getNodeNumEntries());
+      size_t maxDegree = A.getNodeMaxNumRowEntries();
+      Teuchos::Array<Scalar> rowValues(maxDegree);
+      Teuchos::Array<LO> rowEntries(maxDegree);
+      size_t accum = 0;
+      for(LO i = 0; i <= numRows; i++)
+      {
+        Arowmap(i) = accum;
+        if(i == numRows)
+          break;
+        size_t degree;
+        A.getLocalRowCopy(i, rowEntries(), rowValues(), degree);
+        accum += degree;
+        size_t rowBegin = Arowmap(i);
+        for(size_t j = 0; j < degree; j++)
+        {
+          Aentries(rowBegin + j) = rowEntries[j];
+          Avalues(rowBegin + j) = rowValues[j];
+        }
+      }
+    }
+
+    GaussSeidel(const bcrs_matrix_type& A, const InverseBlocks& inverseBlockDiag_, Teuchos::ArrayRCP<LO>& applyRows_, Scalar omega_)
+    {
+      numRows = A.getNodeNumRows();
+      //note: next 2 lines are no-ops if inverseBlockDiag_ is already host-accessible
+      inverseBlockDiag = Kokkos::create_mirror_view(inverseBlockDiag_);
+      Kokkos::deep_copy(inverseBlockDiag, inverseBlockDiag_);
+      applyRows = applyRows_;
+      omega = omega_;
+      auto AlocalGraph = A.getCrsGraph().getLocalGraph();
+      //A.sync_host(); //note: this only syncs values, not graph
+      Avalues = A.getValuesHost();
+      Arowmap = Kokkos::create_mirror_view(AlocalGraph.row_map);
+      Aentries = Kokkos::create_mirror_view(AlocalGraph.entries);
+      Kokkos::deep_copy(Arowmap, AlocalGraph.row_map);
+      Kokkos::deep_copy(Aentries, AlocalGraph.entries);
+      blockSize = A.getBlockSize();
+    }
+
+    template<bool useApplyRows, bool multipleRHS, bool omegaNotOne>
+    void applyImpl(multivector_type& x, const multivector_type& b, Tpetra::ESweepDirection direction)
+    {
+      //note: direction is either Forward or Backward (Symmetric is handled in apply())
+      LO numApplyRows = useApplyRows ? (LO) applyRows.size() : numRows;
+      //note: inverseDiagVec always has only one column
+      auto inverseDiag = inverseDiagVec->get1dView();
+      bool forward = direction == Tpetra::Forward;
+      if(multipleRHS)
+      {
+        LO numVecs = x.getNumVectors();
+        Teuchos::Array<IST> accum(numVecs);
+        auto xlcl = x.get2dViewNonConst();
+        auto blcl = b.get2dView();
+        for(LO i = 0; i < numApplyRows; i++)
+        {
+          LO row;
+          if(forward)
+            row = useApplyRows ? applyRows[i] : i;
+          else
+            row = useApplyRows ?
applyRows[numApplyRows - 1 - i] : numApplyRows - 1 - i; + for(LO k = 0; k < numVecs; k++) + { + accum[k] = blcl[k][row]; + } + Offset rowBegin = Arowmap(row); + Offset rowEnd = Arowmap(row + 1); + for(Offset j = rowBegin; j < rowEnd; j++) + { + LO col = Aentries(j); + IST val = Avalues(j); + for(LO k = 0; k < numVecs; k++) + { + accum[k] -= val * IST(xlcl[k][col]); + } + } + //Update x + IST dinv = inverseDiag[row]; + for(LO k = 0; k < numVecs; k++) + { + if(omegaNotOne) + xlcl[k][row] += Scalar(omega * dinv * accum[k]); + else + xlcl[k][row] += Scalar(dinv * accum[k]); + } + } + } + else + { + auto xlcl = Kokkos::subview(x.getLocalViewHost(), Kokkos::ALL(), 0); + auto blcl = Kokkos::subview(b.getLocalViewHost(), Kokkos::ALL(), 0); + auto dlcl = Kokkos::subview(inverseDiagVec->getLocalViewHost(), Kokkos::ALL(), 0); + for(LO i = 0; i < numApplyRows; i++) + { + LO row; + if(forward) + row = useApplyRows ? applyRows[i] : i; + else + row = useApplyRows ? applyRows[numApplyRows - 1 - i] : numApplyRows - 1 - i; + IST sum = blcl(row); + Offset rowBegin = Arowmap(row); + Offset rowEnd = Arowmap(row + 1); + for(Offset j = rowBegin; j < rowEnd; j++) + { + sum -= Avalues(j) * xlcl(Aentries(j)); + } + //Update x + IST dinv = dlcl(row); + if(omegaNotOne) + xlcl(row) += omega * dinv * sum; + else + xlcl(row) += dinv * sum; + } + } + } + + void applyBlock(block_multivector_type& x, const block_multivector_type& b, Tpetra::ESweepDirection direction) + { + if(direction == Tpetra::Symmetric) + { + applyBlock(x, b, Tpetra::Forward); + applyBlock(x, b, Tpetra::Backward); + return; + } + //Number of scalars in Avalues per block entry. + Offset bs2 = blockSize * blockSize; + LO numVecs = x.getNumVectors(); + Kokkos::View accum( + Kokkos::ViewAllocateWithoutInitializing("BlockGaussSeidel Accumulator"), blockSize, numVecs); + Kokkos::View dinv_accum( + Kokkos::ViewAllocateWithoutInitializing("BlockGaussSeidel A_ii^-1*Accumulator"), blockSize, numVecs); + bool useApplyRows = !applyRows.is_null(); + bool forward = direction == Tpetra::Forward; + LO numApplyRows = useApplyRows ? applyRows.size() : numRows; + for(LO i = 0; i < numApplyRows; i++) + { + LO row; + if(forward) + row = useApplyRows ? applyRows[i] : i; + else + row = useApplyRows ? 
applyRows[numApplyRows - 1 - i] : numApplyRows - 1 - i; + for(LO v = 0; v < numVecs; v++) + { + auto bRow = b.getLocalBlock (row, v); + for(LO k = 0; k < blockSize; k++) + { + accum(k, v) = bRow(k); + } + } + Offset rowBegin = Arowmap(row); + Offset rowEnd = Arowmap(row + 1); + for(Offset j = rowBegin; j < rowEnd; j++) + { + LO col = Aentries(j); + IST* blk = &Avalues(j * bs2); + for(LO v = 0; v < numVecs; v++) + { + auto xCol = x.getLocalBlock (col, v); + for(LO br = 0; br < blockSize; br++) + { + for(LO bc = 0; bc < blockSize; bc++) + { + IST Aval = blk[br * blockSize + bc]; + accum(br, v) -= Aval * xCol(bc); + } + } + } + } + //Update x: term is omega * Aii^-1 * accum, where omega is scalar, Aii^-1 is bs*bs, and accum is bs*nv + auto invBlock = Kokkos::subview(inverseBlockDiag, row, Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(dinv_accum, KAT::zero()); + for(LO v = 0; v < numVecs; v++) + { + for(LO br = 0; br < blockSize; br++) + { + for(LO bc = 0; bc < blockSize; bc++) + { + dinv_accum(br, v) += invBlock(br, bc) * accum(bc, v); + } + } + } + //Update x + for(LO v = 0; v < numVecs; v++) + { + auto xRow = x.getLocalBlock (row, v); + for(LO k = 0; k < blockSize; k++) + { + xRow(k) += omega * dinv_accum(k, v); + } + } + } + } + + void apply(multivector_type& x, const multivector_type& b, Tpetra::ESweepDirection direction) + { + if(direction == Tpetra::Symmetric) + { + apply(x, b, Tpetra::Forward); + apply(x, b, Tpetra::Backward); + return; + } + else if(applyRows.is_null()) + { + if(x.getNumVectors() > 1) + this->template applyImpl(x, b, direction); + else + { + //Optimize for the all-rows, single vector, omega = 1 case + if(omega == KAT::one()) + this->template applyImpl(x, b, direction); + else + this->template applyImpl(x, b, direction); + } + } + else + { + this->template applyImpl(x, b, direction); + } + } + + values_t Avalues; //length = Aentries.extent(0) * blockSize * blockSize + rowmap_t Arowmap; + entries_t Aentries; + LO numRows; + IST omega; + //If set up with BlockCrsMatrix, the block size. Otherwise 1. + LO blockSize; + //If using a non-block matrix, the inverse diagonal. + Teuchos::RCP inverseDiagVec; + //If using a block matrix, the inverses of all diagonal blocks. + InverseBlocksHost inverseBlockDiag; + //If null, apply over all rows in natural order. Otherwise, apply for each row listed in order. + Teuchos::ArrayRCP applyRows; + }; +} +} + +#endif diff --git a/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp b/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp index 73c1d48c3613..5d6805b64610 100644 --- a/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp @@ -50,7 +50,7 @@ #include "Tpetra_BlockCrsMatrix.hpp" #include #include - +#include "Ifpack2_Details_GaussSeidel.hpp" #ifndef DOXYGEN_SHOULD_SKIP_THIS namespace Ifpack2 { @@ -261,6 +261,9 @@ class Relaxation : //! The Node type used by the input MatrixType. typedef typename MatrixType::node_type node_type; + //! The Kokkos device type used by the input MatrixType. + typedef typename MatrixType::node_type::device_type device_type; + //! The type of the magnitude (absolute value) of a matrix entry. typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; @@ -583,17 +586,26 @@ class Relaxation : typedef Teuchos::ScalarTraits STS; typedef Teuchos::ScalarTraits STM; + typedef typename Kokkos::ArithTraits::val_type impl_scalar_type; + /// \brief Tpetra::CrsMatrix specialization used by this class. 
/// /// We use this for dynamic casts to dispatch to the most efficient /// implementation of various relaxation kernels. typedef Tpetra::CrsMatrix crs_matrix_type; + typedef Tpetra::CrsGraph crs_graph_type; + typedef Tpetra::MultiVector multivector_type; typedef Tpetra::BlockCrsMatrix block_crs_matrix_type; typedef Tpetra::BlockMultiVector block_multivector_type; + typedef Tpetra::Map map_type; + typedef Tpetra::Import import_type; + //@} //! \name Implementation of multithreaded Gauss-Seidel. @@ -611,6 +623,13 @@ class Relaxation : MyExecSpace, TemporaryWorkSpace,PersistentWorkSpace > mt_kernel_handle_type; Teuchos::RCP mtKernelHandle_; + //@} + //! \name Implementation of serial Gauss-Seidel. + //@{ + + using SerialGaussSeidel = Ifpack2::Details::GaussSeidel; + Teuchos::RCP serialGaussSeidel_; + //@} //! \name Unimplemented methods that you are syntactically forbidden to call. //@{ @@ -646,38 +665,36 @@ class Relaxation : const Tpetra::MultiVector& X, Tpetra::MultiVector& Y) const; - //! Apply Gauss-Seidel to X, returning the result in Y. - void ApplyInverseGS( - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; - - //! Apply multi-threaded Gauss-Seidel to X, returning the result in Y. + //! Apply multi-threaded Gauss-Seidel for solving AX = B. void ApplyInverseMTGS_CrsMatrix( - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; - + const Tpetra::MultiVector& B, + Tpetra::MultiVector& X, + Tpetra::ESweepDirection direction) const; - //! Apply Gauss-Seidel for a Tpetra::RowMatrix specialization. - void ApplyInverseGS_RowMatrix( + //! Apply Gauss-Seidel to X, returning the result in Y. Calls one of the Crs/Row/BlockCrs specializations below. + void ApplyInverseSerialGS( const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; + Tpetra::MultiVector& Y, + Tpetra::ESweepDirection direction) const; //! Apply Gauss-Seidel for a Tpetra::CrsMatrix specialization. - void - ApplyInverseGS_CrsMatrix (const crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; + void ApplyInverseSerialGS_CrsMatrix (const crs_matrix_type& A, + const Tpetra::MultiVector& B, + Tpetra::MultiVector& X, + Tpetra::ESweepDirection direction) const; - //! Apply Gauss-Seidel for a Tpetra::BlockCrsMatrix specialization. - void - ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y); + //! Apply Gauss-Seidel for a Tpetra::RowMatrix specialization. + void ApplyInverseSerialGS_RowMatrix( + const Tpetra::MultiVector& X, + Tpetra::MultiVector& Y, + Tpetra::ESweepDirection direction) const; - //! Apply symmetric Gauss-Seidel to X, returning the result in Y. - void ApplyInverseSGS( - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; + //! Apply Gauss-Seidel for a Tpetra::BlockCrsMatrix specialization. + void ApplyInverseSerialGS_BlockCrsMatrix( + const block_crs_matrix_type& A, + const Tpetra::MultiVector& X, + Tpetra::MultiVector& Y, + Tpetra::ESweepDirection direction); //! Apply symmetric multi-threaded Gauss-Seidel to X, returning the result in Y. void ApplyInverseMTSGS_CrsMatrix( @@ -689,23 +706,6 @@ class Relaxation : Tpetra::MultiVector& X, const Tpetra::ESweepDirection direction) const; - //! Apply symmetric Gauss-Seidel for a Tpetra::RowMatrix specialization. - void ApplyInverseSGS_RowMatrix( - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; - - //! Apply symmetric Gauss-Seidel for a Tpetra::CrsMatrix specialization. 
- void - ApplyInverseSGS_CrsMatrix (const crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; - - //! Apply symmetric Gauss-Seidel for a Tpetra::BlockCrsMatrix specialization. - void - ApplyInverseSGS_BlockCrsMatrix (const block_crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y); - void computeBlockCrs (); //! A service routine for updating the cached MultiVector @@ -728,9 +728,9 @@ class Relaxation : Teuchos::RCP A_; //! Time object to track timing (setup). //! Importer for parallel Gauss-Seidel and symmetric Gauss-Seidel. - Teuchos::RCP > Importer_; + Teuchos::RCP Importer_; //! Importer for block multivector versions of GS and SGS - Teuchos::RCP > pointImporter_; + Teuchos::RCP pointImporter_; //! Contains the diagonal elements of \c A_. Teuchos::RCP > Diagonal_; //! MultiVector for caching purposes (so apply doesn't need to allocate one on each call) diff --git a/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp b/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp index afb0253ee507..c06e17dea6d2 100644 --- a/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp @@ -199,10 +199,10 @@ setMatrix(const Teuchos::RCP& A) Diagonal_ = Teuchos::null; // ??? what if this comes from the user??? isInitialized_ = false; IsComputed_ = false; - using device_type = typename node_type::device_type; diagOffsets_ = Kokkos::View(); savedDiagOffsets_ = false; hasBlockCrsMatrix_ = false; + serialGaussSeidel_ = Teuchos::null; if (! A.is_null ()) { IsParallel_ = (A->getRowMap ()->getComm ()->getSize () > 1); } @@ -576,18 +576,18 @@ apply (const Tpetra::MultiVector::initialize () hasBlockCrsMatrix_ = ! A_bcrs.is_null (); } + serialGaussSeidel_ = Teuchos::null; + if (PrecType_ == Details::MTGS || PrecType_ == Details::MTSGS || PrecType_ == Details::GS2 || PrecType_ == Details::SGS2) { const crs_matrix_type* crsMat = @@ -806,7 +808,6 @@ void Relaxation::computeBlockCrs () using Teuchos::rcp_dynamic_cast; using Teuchos::reduceAll; typedef local_ordinal_type LO; - typedef typename node_type::device_type device_type; const std::string timerName ("Ifpack2::Relaxation::computeBlockCrs"); Teuchos::RCP timer = Teuchos::TimeMonitor::lookupCounter (timerName); @@ -930,6 +931,7 @@ void Relaxation::computeBlockCrs () "Tpetra::BlockCrsMatrix, one or more diagonal block LU factorizations " "failed on one or more (MPI) processes."); #endif // HAVE_IFPACK2_DEBUG + serialGaussSeidel_ = rcp(new SerialGaussSeidel(*blockCrsA, blockDiag_, localSmoothingIndices_, DampingFactor_)); } // end TimeMonitor scope ComputeTime_ += (timer->wallTime() - startTime); @@ -956,7 +958,6 @@ void Relaxation::compute () using LO = local_ordinal_type; using vector_type = Tpetra::Vector; - using device_type = typename vector_type::device_type; using IST = typename vector_type::impl_scalar_type; using KAT = Kokkos::ArithTraits; @@ -993,6 +994,10 @@ void Relaxation::compute () { // Timing of compute starts here. Teuchos::TimeMonitor timeMon (*timer); + + const crs_matrix_type* crsMat = + dynamic_cast (A_.get()); + TEUCHOS_TEST_FOR_EXCEPTION (NumSweeps_ < 0, std::logic_error, methodName << ": NumSweeps_ = " << NumSweeps_ << " < 0. " @@ -1020,8 +1025,6 @@ void Relaxation::compute () // method (not inherited from RowMatrix's interface). It's // perfectly valid to do relaxation on a RowMatrix which is not // a CrsMatrix. - const crs_matrix_type* crsMat = - dynamic_cast (A_.getRawPtr ()); if (crsMat == nullptr || ! 
crsMat->isStaticGraph ()) { A_->getLocalDiagCopy (*Diagonal_); // slow path } @@ -1324,8 +1327,6 @@ void Relaxation::compute () //KokkosKernels GaussSeidel Initialization. - const crs_matrix_type* crsMat = - dynamic_cast (A_.get()); TEUCHOS_TEST_FOR_EXCEPTION (crsMat == nullptr, std::logic_error, methodName << ": " "Multithreaded Gauss-Seidel methods currently only work " @@ -1347,6 +1348,12 @@ void Relaxation::compute () diagView_1d, is_matrix_structurally_symmetric_); } + else if(PrecType_ == Ifpack2::Details::GS || PrecType_ == Ifpack2::Details::SGS) { + if(crsMat) + serialGaussSeidel_ = rcp(new SerialGaussSeidel(*crsMat, Diagonal_, localSmoothingIndices_, DampingFactor_)); + else + serialGaussSeidel_ = rcp(new SerialGaussSeidel(*A_, Diagonal_, localSmoothingIndices_, DampingFactor_)); + } } // end TimeMonitor scope ComputeTime_ += (timer->wallTime() - startTime); @@ -1556,8 +1563,9 @@ ApplyInverseJacobi_BlockCrsMatrix (const Tpetra::MultiVector void Relaxation:: -ApplyInverseGS (const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const +ApplyInverseSerialGS (const Tpetra::MultiVector& X, + Tpetra::MultiVector& Y, + Tpetra::ESweepDirection direction) const { using this_type = Relaxation; // The CrsMatrix version is faster, because it can access the sparse @@ -1574,13 +1582,13 @@ ApplyInverseGS (const Tpetra::MultiVector (A_.getRawPtr ()); if (blockCrsMat != nullptr) { - const_cast (*this).ApplyInverseGS_BlockCrsMatrix (*blockCrsMat, X, Y); + const_cast (*this).ApplyInverseSerialGS_BlockCrsMatrix (*blockCrsMat, X, Y, direction); } else if (crsMat != nullptr) { - ApplyInverseGS_CrsMatrix (*crsMat, X, Y); + ApplyInverseSerialGS_CrsMatrix (*crsMat, X, Y, direction); } else { - ApplyInverseGS_RowMatrix (X, Y); + ApplyInverseSerialGS_RowMatrix (X, Y, direction); } } @@ -1588,9 +1596,9 @@ ApplyInverseGS (const Tpetra::MultiVector void Relaxation:: -ApplyInverseGS_RowMatrix (const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const -{ +ApplyInverseSerialGS_RowMatrix (const Tpetra::MultiVector& X, + Tpetra::MultiVector& Y, + Tpetra::ESweepDirection direction) const { using Teuchos::Array; using Teuchos::ArrayRCP; using Teuchos::ArrayView; @@ -1607,20 +1615,7 @@ ApplyInverseGS_RowMatrix (const Tpetra::MultiVectorgetNodeMaxNumRowEntries (); - Array Indices (maxLength); - Array Values (maxLength); - - // Local smoothing stuff - const size_t numMyRows = A_->getNodeNumRows(); - const local_ordinal_type* rowInd = 0; - size_t numActive = numMyRows; - bool do_local = ! localSmoothingIndices_.is_null (); - if (do_local) { - rowInd = localSmoothingIndices_.getRawPtr (); - numActive = localSmoothingIndices_.size (); - } + size_t NumVectors = X.getNumVectors(); RCP Y2; if (IsParallel_) { @@ -1636,147 +1631,29 @@ ApplyInverseGS_RowMatrix (const Tpetra::MultiVector d_rcp = Diagonal_->get1dView (); - ArrayView d_ptr = d_rcp(); - - // Constant stride check - bool constant_stride = X.isConstantStride() && Y2->isConstantStride(); - - if (constant_stride) { - // extract 1D RCPs - size_t x_stride = X.getStride(); - size_t y2_stride = Y2->getStride(); - ArrayRCP y2_rcp = Y2->get1dViewNonConst(); - ArrayRCP x_rcp = X.get1dView(); - ArrayView y2_ptr = y2_rcp(); - ArrayView x_ptr = x_rcp(); - Array dtemp(NumVectors,STS::zero()); - - for (int j = 0; j < NumSweeps_; ++j) { - // data exchange is here, once per sweep - if (IsParallel_) { - if (Importer_.is_null ()) { - // FIXME (mfh 27 May 2019) This doesn't deep copy -- not - // clear if this is correct. Reevaluate at some point. 
- - *Y2 = Y; // just copy, since domain and column Maps are the same - } else { - Y2->doImport (Y, *Importer_, Tpetra::INSERT); - } - } - - if (! DoBackwardGS_) { // Forward sweep - for (size_t ii = 0; ii < numActive; ++ii) { - local_ordinal_type i = as (do_local ? rowInd[ii] : ii); - size_t NumEntries; - A_->getLocalRowCopy (i, Indices (), Values (), NumEntries); - dtemp.assign(NumVectors,STS::zero()); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - for (size_t m = 0; m < NumVectors; ++m) { - dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m]; - } - } - - for (size_t m = 0; m < NumVectors; ++m) { - y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]); - } - } - } - else { // Backward sweep - // ptrdiff_t is the same size as size_t, but is signed. Being - // signed is important so that i >= 0 is not trivially true. - for (ptrdiff_t ii = as (numActive) - 1; ii >= 0; --ii) { - local_ordinal_type i = as (do_local ? rowInd[ii] : ii); - size_t NumEntries; - A_->getLocalRowCopy (i, Indices (), Values (), NumEntries); - dtemp.assign (NumVectors, STS::zero ()); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - for (size_t m = 0; m < NumVectors; ++m) { - dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m]; - } - } + const_cast(X).sync_host(); + for (int j = 0; j < NumSweeps_; ++j) { + // data exchange is here, once per sweep + if (IsParallel_) { + if (Importer_.is_null ()) { + // FIXME (mfh 27 May 2019) This doesn't deep copy -- not + // clear if this is correct. Reevaluate at some point. - for (size_t m = 0; m < NumVectors; ++m) { - y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]); - } - } - } - // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map. - if (IsParallel_) { - Tpetra::deep_copy (Y, *Y2); + *Y2 = Y; // just copy, since domain and column Maps are the same + } else { + Y2->doImport (Y, *Importer_, Tpetra::INSERT); } } - } - else { - // extract 2D RCPS - ArrayRCP > y2_ptr = Y2->get2dViewNonConst (); - ArrayRCP > x_ptr = X.get2dView (); - - for (int j = 0; j < NumSweeps_; ++j) { - // data exchange is here, once per sweep - if (IsParallel_) { - if (Importer_.is_null ()) { - *Y2 = Y; // just copy, since domain and column Maps are the same - } else { - Y2->doImport (Y, *Importer_, Tpetra::INSERT); - } - } + Y2->sync_host(); + serialGaussSeidel_->apply(*Y2, X, direction); + Y2->modify_host(); - if (! DoBackwardGS_) { // Forward sweep - for (size_t ii = 0; ii < numActive; ++ii) { - local_ordinal_type i = as (do_local ? rowInd[ii] : ii); - size_t NumEntries; - A_->getLocalRowCopy (i, Indices (), Values (), NumEntries); - - for (size_t m = 0; m < NumVectors; ++m) { - scalar_type dtemp = STS::zero (); - ArrayView x_local = (x_ptr())[m](); - ArrayView y2_local = (y2_ptr())[m](); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - dtemp += Values[k] * y2_local[col]; - } - y2_local[i] += DampingFactor_ * d_ptr[i] * (x_local[i] - dtemp); - } - } - } - else { // Backward sweep - // ptrdiff_t is the same size as size_t, but is signed. Being - // signed is important so that i >= 0 is not trivially true. - for (ptrdiff_t ii = as (numActive) - 1; ii >= 0; --ii) { - local_ordinal_type i = as (do_local ? 
rowInd[ii] : ii); - - size_t NumEntries; - A_->getLocalRowCopy (i, Indices (), Values (), NumEntries); - - for (size_t m = 0; m < NumVectors; ++m) { - scalar_type dtemp = STS::zero (); - ArrayView x_local = (x_ptr())[m](); - ArrayView y2_local = (y2_ptr())[m](); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - dtemp += Values[k] * y2_local[col]; - } - y2_local[i] += DampingFactor_ * d_ptr[i] * (x_local[i] - dtemp); - } - } - } - - // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map. - if (IsParallel_) { - Tpetra::deep_copy (Y, *Y2); - } + // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map. + if (IsParallel_) { + Tpetra::deep_copy (Y, *Y2); } } - // See flop count discussion in implementation of ApplyInverseGS_CrsMatrix(). const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0; const double numVectors = as (X.getNumVectors ()); @@ -1790,21 +1667,173 @@ ApplyInverseGS_RowMatrix (const Tpetra::MultiVector void Relaxation:: -ApplyInverseGS_CrsMatrix (const crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const +ApplyInverseSerialGS_CrsMatrix(const crs_matrix_type& A, + const Tpetra::MultiVector& B, + Tpetra::MultiVector& X, + Tpetra::ESweepDirection direction) const { - using Teuchos::as; - const Tpetra::ESweepDirection direction = - DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward; - if (localSmoothingIndices_.is_null ()) { - A.gaussSeidelCopy (Y, X, *Diagonal_, DampingFactor_, direction, - NumSweeps_, ZeroStartingSolution_); + using Teuchos::null; + using Teuchos::RCP; + using Teuchos::rcp; + using Teuchos::rcpFromRef; + using Teuchos::rcp_const_cast; + typedef scalar_type Scalar; + const char prefix[] = "Ifpack2::Relaxation::SerialGS: "; + const scalar_type ZERO = Teuchos::ScalarTraits::zero (); + + TEUCHOS_TEST_FOR_EXCEPTION( + ! A.isFillComplete (), std::runtime_error, + prefix << "The matrix is not fill complete."); + TEUCHOS_TEST_FOR_EXCEPTION( + NumSweeps_ < 0, std::invalid_argument, + prefix << "The number of sweeps must be nonnegative, " + "but you provided numSweeps = " << NumSweeps_ << " < 0."); + + if (NumSweeps_ == 0) { + return; } - else { - A.reorderedGaussSeidelCopy (Y, X, *Diagonal_, localSmoothingIndices_ (), - DampingFactor_, direction, - NumSweeps_, ZeroStartingSolution_); + + RCP importer = A.getGraph ()->getImporter (); + + RCP domainMap = A.getDomainMap (); + RCP rangeMap = A.getRangeMap (); + RCP rowMap = A.getGraph ()->getRowMap (); + RCP colMap = A.getGraph ()->getColMap (); + +#ifdef HAVE_IFPACK2_DEBUG + { + // The relation 'isSameAs' is transitive. It's also a + // collective, so we don't have to do a "shared" test for + // exception (i.e., a global reduction on the test value). + TEUCHOS_TEST_FOR_EXCEPTION( + ! X.getMap ()->isSameAs (*domainMap), std::runtime_error, + "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input " + "multivector X be in the domain Map of the matrix."); + TEUCHOS_TEST_FOR_EXCEPTION( + ! B.getMap ()->isSameAs (*rangeMap), std::runtime_error, + "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input " + "B be in the range Map of the matrix."); + TEUCHOS_TEST_FOR_EXCEPTION( + ! Diagonal_->getMap ()->isSameAs (*rowMap), std::runtime_error, + "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input " + "D be in the row Map of the matrix."); + TEUCHOS_TEST_FOR_EXCEPTION( + ! 
rowMap->isSameAs (*rangeMap), std::runtime_error, + "Tpetra::CrsMatrix::gaussSeidelCopy requires that the row Map and the " + "range Map be the same (in the sense of Tpetra::Map::isSameAs)."); + TEUCHOS_TEST_FOR_EXCEPTION( + ! domainMap->isSameAs (*rangeMap), std::runtime_error, + "Tpetra::CrsMatrix::gaussSeidelCopy requires that the domain Map and " + "the range Map of the matrix be the same."); + } +#endif + + // Fetch a (possibly cached) temporary column Map multivector + // X_colMap, and a domain Map view X_domainMap of it. Both have + // constant stride by construction. We know that the domain Map + // must include the column Map, because our Gauss-Seidel kernel + // requires that the row Map, domain Map, and range Map are all + // the same, and that each process owns all of its own diagonal + // entries of the matrix. + + RCP X_colMap; + RCP X_domainMap; + bool copyBackOutput = false; + if (importer.is_null ()) { + X_colMap = Teuchos::rcpFromRef (X); + X_domainMap = Teuchos::rcpFromRef (X); + // Column Map and domain Map are the same, so there are no + // remote entries. Thus, if we are not setting the initial + // guess to zero, we don't have to worry about setting remote + // entries to zero, even though we are not doing an Import in + // this case. + if (ZeroStartingSolution_) { + X_colMap->putScalar (ZERO); + } + // No need to copy back to X at end. + } + else { // Column Map and domain Map are _not_ the same. + updateCachedMultiVector(colMap, X.getNumVectors()); + X_colMap = cachedMV_; + X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0); + +#ifdef HAVE_TPETRA_DEBUG + auto X_colMap_host_view = X_colMap->getLocalViewHost (); + auto X_domainMap_host_view = X_domainMap->getLocalViewHost (); + + if (X_colMap->getLocalLength () != 0 && X_domainMap->getLocalLength ()) { + TEUCHOS_TEST_FOR_EXCEPTION + (X_colMap_host_view.data () != X_domainMap_host_view.data (), + std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: Pointer to " + "start of column Map view of X is not equal to pointer to start of " + "(domain Map view of) X. This may mean that Tpetra::MultiVector::" + "offsetViewNonConst is broken. " + "Please report this bug to the Tpetra developers."); + } + + TEUCHOS_TEST_FOR_EXCEPTION( + X_colMap_host_view.extent (0) < X_domainMap_host_view.extent (0) || + X_colMap->getLocalLength () < X_domainMap->getLocalLength (), + std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: " + "X_colMap has fewer local rows than X_domainMap. " + "X_colMap_host_view.extent(0) = " << X_colMap_host_view.extent (0) + << ", X_domainMap_host_view.extent(0) = " + << X_domainMap_host_view.extent (0) + << ", X_colMap->getLocalLength() = " << X_colMap->getLocalLength () + << ", and X_domainMap->getLocalLength() = " + << X_domainMap->getLocalLength () + << ". This means that Tpetra::MultiVector::offsetViewNonConst " + "is broken. Please report this bug to the Tpetra developers."); + + TEUCHOS_TEST_FOR_EXCEPTION( + X_colMap->getNumVectors () != X_domainMap->getNumVectors (), + std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: " + "X_colMap has a different number of columns than X_domainMap. " + "X_colMap->getNumVectors() = " << X_colMap->getNumVectors () + << " != X_domainMap->getNumVectors() = " + << X_domainMap->getNumVectors () + << ". This means that Tpetra::MultiVector::offsetViewNonConst " + "is broken. Please report this bug to the Tpetra developers."); +#endif // HAVE_TPETRA_DEBUG + + if (ZeroStartingSolution_) { + // No need for an Import, since we're filling with zeros. 
+ X_colMap->putScalar (ZERO); + } else { + // We could just copy X into X_domainMap. However, that + // wastes a copy, because the Import also does a copy (plus + // communication). Since the typical use case for + // Gauss-Seidel is a small number of sweeps (2 is typical), we + // don't want to waste that copy. Thus, we do the Import + // here, and skip the first Import in the first sweep. + // Importing directly from X effects the copy into X_domainMap + // (which is a view of X_colMap). + X_colMap->doImport (X, *importer, Tpetra::INSERT); + } + copyBackOutput = true; // Don't forget to copy back at end. + } // if column and domain Maps are (not) the same + + const_cast(B).sync_host(); + for (int sweep = 0; sweep < NumSweeps_; ++sweep) { + if (! importer.is_null () && sweep > 0) { + // We already did the first Import for the zeroth sweep above, + // if it was necessary. + X_colMap->doImport (*X_domainMap, *importer, Tpetra::INSERT); + } + X_colMap->sync_host (); + // Do local Gauss-Seidel (forward, backward or symmetric) + serialGaussSeidel_->apply(*X_colMap, B, direction); + X_colMap->modify_host (); + } + + if (copyBackOutput) { + try { + deep_copy (X , *X_domainMap); // Copy result back into X. + } catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION( + true, std::runtime_error, prefix << "deep_copy(X, *X_domainMap) " + "threw an exception: " << e.what ()); + } } // For each column of output, for each sweep over the matrix: @@ -1819,9 +1848,9 @@ ApplyInverseGS_CrsMatrix (const crs_matrix_type& A, // Floating-point operations due to the damping factor, per matrix // row, per direction, per columm of output. const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0; - const double numVectors = as (X.getNumVectors ()); - const double numGlobalRows = as (A_->getGlobalNumRows ()); - const double numGlobalNonzeros = as (A_->getGlobalNumEntries ()); + const double numVectors = X.getNumVectors (); + const double numGlobalRows = A_->getGlobalNumRows (); + const double numGlobalNonzeros = A_->getGlobalNumEntries (); ApplyFlops_ += NumSweeps_ * numVectors * (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops); } @@ -1829,22 +1858,15 @@ ApplyInverseGS_CrsMatrix (const crs_matrix_type& A, template void Relaxation:: -ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, +ApplyInverseSerialGS_BlockCrsMatrix (const block_crs_matrix_type& A, const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) + Tpetra::MultiVector& Y, + Tpetra::ESweepDirection direction) { using Tpetra::INSERT; using Teuchos::RCP; using Teuchos::rcp; using Teuchos::rcpFromRef; - using BMV = Tpetra::BlockMultiVector; - using MV = Tpetra::MultiVector; - using map_type = Tpetra::Map; - using import_type = Tpetra::Import; //FIXME: (tcf) 8/21/2014 -- may be problematic for multiple right hand sides // @@ -1853,11 +1875,11 @@ ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, // does not have constant stride. We should check for that case // here, in case it doesn't work in localGaussSeidel (which is // entirely possible). 
- BMV yBlock(Y, *(A.getGraph ()->getDomainMap()), A.getBlockSize()); - const BMV xBlock(X, *(A.getColMap ()), A.getBlockSize()); + block_multivector_type yBlock(Y, *(A.getGraph ()->getDomainMap()), A.getBlockSize()); + const block_multivector_type xBlock(X, *(A.getColMap ()), A.getBlockSize()); bool performImport = false; - RCP yBlockCol; + RCP yBlockCol; if (Importer_.is_null()) { yBlockCol = rcpFromRef(yBlock); } @@ -1866,7 +1888,7 @@ ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, yBlockColumnPointMap_->getNumVectors() != yBlock.getNumVectors() || yBlockColumnPointMap_->getBlockSize() != yBlock.getBlockSize()) { yBlockColumnPointMap_ = - rcp(new BMV(*(A.getColMap()), A.getBlockSize(), + rcp(new block_multivector_type(*(A.getColMap()), A.getBlockSize(), static_cast(yBlock.getNumVectors()))); } yBlockCol = yBlockColumnPointMap_; @@ -1878,9 +1900,9 @@ ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, performImport = true; } - MV yBlock_mv; - MV yBlockCol_mv; - RCP yBlockColPointDomain; + multivector_type yBlock_mv; + multivector_type yBlockCol_mv; + RCP yBlockColPointDomain; if (performImport) { // create views (shallow copies) yBlock_mv = yBlock.getMultiVectorView(); yBlockCol_mv = yBlockCol->getMultiVectorView(); @@ -1892,18 +1914,17 @@ ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, yBlockCol->putScalar(STS::zero ()); } else if (performImport) { - yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, INSERT); + yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, Tpetra::INSERT); } - const Tpetra::ESweepDirection direction = - DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward; - + const_cast(xBlock).sync_host(); for (int sweep = 0; sweep < NumSweeps_; ++sweep) { if (performImport && sweep > 0) { - yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, INSERT); + yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, Tpetra::INSERT); } - A.localGaussSeidel(xBlock, *yBlockCol, blockDiag_, - DampingFactor_, direction); + yBlockCol->sync_host(); + serialGaussSeidel_->applyBlock(*yBlockCol, xBlock, direction); + yBlockCol->modify_host(); if (performImport) { Tpetra::deep_copy(Y, *yBlockColPointDomain); } @@ -1913,9 +1934,10 @@ ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, template void Relaxation:: -MTGaussSeidel (const Tpetra::MultiVector& B, - Tpetra::MultiVector& X, - const Tpetra::ESweepDirection direction) const +ApplyInverseMTGS_CrsMatrix( + const Tpetra::MultiVector& B, + Tpetra::MultiVector& X, + Tpetra::ESweepDirection direction) const { using Teuchos::null; using Teuchos::RCP; @@ -1925,9 +1947,6 @@ MTGaussSeidel (const Tpetra::MultiVector::zero (); @@ -1971,15 +1990,9 @@ MTGaussSeidel (const Tpetra::MultiVector MV; - typedef typename crs_matrix_type::import_type import_type; - typedef typename crs_matrix_type::export_type export_type; - typedef typename crs_matrix_type::map_type map_type; - RCP importer = crsMat->getGraph ()->getImporter (); - RCP exporter = crsMat->getGraph ()->getExporter (); TEUCHOS_TEST_FOR_EXCEPTION( - ! exporter.is_null (), std::runtime_error, + ! crsMat->getGraph ()->getExporter ().is_null (), std::runtime_error, "This method's implementation currently requires that the matrix's row, " "domain, and range Maps be the same. 
This cannot be the case, because " "the matrix has a nontrivial Export object."); @@ -2029,8 +2042,8 @@ MTGaussSeidel (const Tpetra::MultiVector X_colMap; - RCP X_domainMap; + RCP X_colMap; + RCP X_domainMap; bool copyBackOutput = false; if (importer.is_null ()) { if (X.isConstantStride ()) { @@ -2147,7 +2160,7 @@ MTGaussSeidel (const Tpetra::MultiVector B_in; + RCP B_in; if (B.isConstantStride ()) { B_in = rcpFromRef (B); } @@ -2155,8 +2168,7 @@ MTGaussSeidel (const Tpetra::MultiVector B_in_nonconst = crsMat->getRowMapMultiVector (B, true); - RCP B_in_nonconst = rcp (new MV (rowMap, B.getNumVectors())); + RCP B_in_nonconst = rcp (new multivector_type(rowMap, B.getNumVectors())); try { deep_copy (*B_in_nonconst, B); } catch (std::exception& e) { @@ -2166,7 +2178,7 @@ MTGaussSeidel (const Tpetra::MultiVector (B_in_nonconst); + B_in = rcp_const_cast (B_in_nonconst); /* TPETRA_EFFICIENCY_WARNING( @@ -2245,382 +2257,6 @@ MTGaussSeidel (const Tpetra::MultiVector -void -Relaxation:: -ApplyInverseMTSGS_CrsMatrix (const Tpetra::MultiVector& B, - Tpetra::MultiVector& X) const -{ - const Tpetra::ESweepDirection direction = Tpetra::Symmetric; - this->MTGaussSeidel (B, X, direction); -} - - -template -void Relaxation::ApplyInverseMTGS_CrsMatrix ( - const Tpetra::MultiVector& B, - Tpetra::MultiVector& X) const { - - const Tpetra::ESweepDirection direction = - DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward; - this->MTGaussSeidel (B, X, direction); -} - -template -void -Relaxation:: -ApplyInverseSGS (const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const -{ - using this_type = Relaxation; - // The CrsMatrix version is faster, because it can access the sparse - // matrix data directly, rather than by copying out each row's data - // in turn. Thus, we check whether the RowMatrix is really a - // CrsMatrix. - // - // FIXME (mfh 07 Jul 2013) See note on crs_matrix_type typedef - // declaration in Ifpack2_Relaxation_decl.hpp header file. The code - // will still be correct if the cast fails, but it will use an - // unoptimized kernel. - const block_crs_matrix_type* blockCrsMat = - dynamic_cast (A_.getRawPtr ()); - const crs_matrix_type* crsMat = - dynamic_cast (A_.getRawPtr ()); - if (blockCrsMat != nullptr) { - const_cast (*this).ApplyInverseSGS_BlockCrsMatrix(*blockCrsMat, X, Y); - } - else if (crsMat != nullptr) { - ApplyInverseSGS_CrsMatrix (*crsMat, X, Y); - } - else { - ApplyInverseSGS_RowMatrix (X, Y); - } -} - - -template -void -Relaxation:: -ApplyInverseSGS_RowMatrix (const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const -{ - using Teuchos::Array; - using Teuchos::ArrayRCP; - using Teuchos::ArrayView; - using Teuchos::as; - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::rcpFromRef; - using MV = Tpetra::MultiVector; - - // Tpetra's GS implementation for CrsMatrix handles zeroing out the - // starting multivector itself. The generic RowMatrix version here - // does not, so we have to zero out Y here. 
- if (ZeroStartingSolution_) { - Y.putScalar (STS::zero ()); - } - - const size_t NumVectors = X.getNumVectors (); - const size_t maxLength = A_->getNodeMaxNumRowEntries (); - Array Indices (maxLength); - Array Values (maxLength); - - // Local smoothing stuff - const size_t numMyRows = A_->getNodeNumRows(); - const local_ordinal_type * rowInd = 0; - size_t numActive = numMyRows; - bool do_local = !localSmoothingIndices_.is_null(); - if(do_local) { - rowInd = localSmoothingIndices_.getRawPtr(); - numActive = localSmoothingIndices_.size(); - } - - - RCP Y2; - if (IsParallel_) { - if (Importer_.is_null ()) { // domain and column Maps are the same. - updateCachedMultiVector (Y.getMap (), NumVectors); - } - else { - updateCachedMultiVector (Importer_->getTargetMap (), NumVectors); - } - Y2 = cachedMV_; - } - else { - Y2 = rcpFromRef (Y); - } - - // Diagonal - ArrayRCP d_rcp = Diagonal_->get1dView(); - ArrayView d_ptr = d_rcp(); - - // Constant stride check - bool constant_stride = X.isConstantStride() && Y2->isConstantStride(); - - if(constant_stride) { - // extract 1D RCPs - size_t x_stride = X.getStride(); - size_t y2_stride = Y2->getStride(); - ArrayRCP y2_rcp = Y2->get1dViewNonConst(); - ArrayRCP x_rcp = X.get1dView(); - ArrayView y2_ptr = y2_rcp(); - ArrayView x_ptr = x_rcp(); - Array dtemp(NumVectors,STS::zero()); - - for (int j = 0; j < NumSweeps_; j++) { - // data exchange is here, once per sweep - if (IsParallel_) { - if (Importer_.is_null ()) { - // just copy, since domain and column Maps are the same - Tpetra::deep_copy (*Y2, Y); - } else { - Y2->doImport (Y, *Importer_, Tpetra::INSERT); - } - } - for (size_t ii = 0; ii < numActive; ++ii) { - local_ordinal_type i = as(do_local ? rowInd[ii] : ii); - size_t NumEntries; - A_->getLocalRowCopy (i, Indices (), Values (), NumEntries); - dtemp.assign(NumVectors,STS::zero()); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - for (size_t m = 0; m < NumVectors; ++m) { - dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m]; - } - } - - for (size_t m = 0; m < NumVectors; ++m) { - y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]); - } - } - - // ptrdiff_t is the same size as size_t, but is signed. Being - // signed is important so that i >= 0 is not trivially true. - for (ptrdiff_t ii = as (numActive) - 1; ii >= 0; --ii) { - local_ordinal_type i = as(do_local ? rowInd[ii] : ii); - size_t NumEntries; - A_->getLocalRowCopy (i, Indices (), Values (), NumEntries); - dtemp.assign(NumVectors,STS::zero()); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - for (size_t m = 0; m < NumVectors; ++m) { - dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m]; - } - } - - for (size_t m = 0; m < NumVectors; ++m) { - y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]); - } - } - - // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map. - if (IsParallel_) { - Tpetra::deep_copy (Y, *Y2); - } - } - } - else { - // extract 2D RCPs - ArrayRCP > y2_ptr = Y2->get2dViewNonConst (); - ArrayRCP > x_ptr = X.get2dView (); - - for (int iter = 0; iter < NumSweeps_; ++iter) { - // only one data exchange per sweep - if (IsParallel_) { - if (Importer_.is_null ()) { - // just copy, since domain and column Maps are the same - Tpetra::deep_copy (*Y2, Y); - } else { - Y2->doImport (Y, *Importer_, Tpetra::INSERT); - } - } - - for (size_t ii = 0; ii < numActive; ++ii) { - local_ordinal_type i = as(do_local ? 
rowInd[ii] : ii); - const scalar_type diag = d_ptr[i]; - size_t NumEntries; - A_->getLocalRowCopy (as (i), Indices (), Values (), NumEntries); - - for (size_t m = 0; m < NumVectors; ++m) { - scalar_type dtemp = STS::zero (); - ArrayView x_local = (x_ptr())[m](); - ArrayView y2_local = (y2_ptr())[m](); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - dtemp += Values[k] * y2_local[col]; - } - y2_local[i] += DampingFactor_ * (x_local[i] - dtemp) * diag; - } - } - - // ptrdiff_t is the same size as size_t, but is signed. Being - // signed is important so that i >= 0 is not trivially true. - for (ptrdiff_t ii = as (numActive) - 1; ii >= 0; --ii) { - local_ordinal_type i = as(do_local ? rowInd[ii] : ii); - const scalar_type diag = d_ptr[i]; - size_t NumEntries; - A_->getLocalRowCopy (as (i), Indices (), Values (), NumEntries); - - for (size_t m = 0; m < NumVectors; ++m) { - scalar_type dtemp = STS::zero (); - ArrayView x_local = (x_ptr())[m](); - ArrayView y2_local = (y2_ptr())[m](); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - dtemp += Values[k] * y2_local[col]; - } - y2_local[i] += DampingFactor_ * (x_local[i] - dtemp) * diag; - } - } - - // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map. - if (IsParallel_) { - Tpetra::deep_copy (Y, *Y2); - } - } - } - - // See flop count discussion in implementation of ApplyInverseSGS_CrsMatrix(). - const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0; - const double numVectors = as (X.getNumVectors ()); - const double numGlobalRows = as (A_->getGlobalNumRows ()); - const double numGlobalNonzeros = as (A_->getGlobalNumEntries ()); - ApplyFlops_ += 2.0 * NumSweeps_ * numVectors * - (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops); -} - - -template -void -Relaxation:: -ApplyInverseSGS_CrsMatrix (const crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const -{ - using Teuchos::as; - const Tpetra::ESweepDirection direction = Tpetra::Symmetric; - if (localSmoothingIndices_.is_null ()) { - A.gaussSeidelCopy (Y, X, *Diagonal_, DampingFactor_, direction, - NumSweeps_, ZeroStartingSolution_); - } - else { - A.reorderedGaussSeidelCopy (Y, X, *Diagonal_, localSmoothingIndices_ (), - DampingFactor_, direction, - NumSweeps_, ZeroStartingSolution_); - } - - // For each column of output, for each sweep over the matrix: - // - // - One + and one * for each matrix entry - // - One / and one + for each row of the matrix - // - If the damping factor is not one: one * for each row of the - // matrix. (It's not fair to count this if the damping factor is - // one, since the implementation could skip it. Whether it does - // or not is the implementation's choice.) - // - // Each sweep of symmetric Gauss-Seidel / SOR counts as two sweeps, - // one forward and one backward. - - // Floating-point operations due to the damping factor, per matrix - // row, per direction, per columm of output. - const double dampingFlops = (DampingFactor_ == STS::one()) ? 
0.0 : 1.0; - const double numVectors = as (X.getNumVectors ()); - const double numGlobalRows = as (A_->getGlobalNumRows ()); - const double numGlobalNonzeros = as (A_->getGlobalNumEntries ()); - ApplyFlops_ += 2.0 * NumSweeps_ * numVectors * - (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops); -} - -template -void -Relaxation:: -ApplyInverseSGS_BlockCrsMatrix (const block_crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) -{ - using Tpetra::INSERT; - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::rcpFromRef; - using BMV = Tpetra::BlockMultiVector; - using MV = Tpetra::MultiVector; - using map_type = Tpetra::Map; - using import_type = Tpetra::Import; - - //FIXME: (tcf) 8/21/2014 -- may be problematic for multiple right hand sides - // - // NOTE (mfh 12 Sep 2014) I don't think it should be a problem for - // multiple right-hand sides, unless the input or output MultiVector - // does not have constant stride. We should check for that case - // here, in case it doesn't work in localGaussSeidel (which is - // entirely possible). - BMV yBlock (Y, * (A.getGraph ()->getDomainMap ()), A.getBlockSize ()); - const BMV xBlock (X, * (A.getColMap ()), A.getBlockSize ()); - - bool performImport = false; - RCP yBlockCol; - if (Importer_.is_null ()) { - yBlockCol = Teuchos::rcpFromRef (yBlock); - } - else { - if (yBlockColumnPointMap_.is_null () || - yBlockColumnPointMap_->getNumVectors () != yBlock.getNumVectors () || - yBlockColumnPointMap_->getBlockSize () != yBlock.getBlockSize ()) { - yBlockColumnPointMap_ = - rcp (new BMV (* (A.getColMap ()), A.getBlockSize (), - static_cast (yBlock.getNumVectors ()))); - } - yBlockCol = yBlockColumnPointMap_; - if (pointImporter_.is_null()) { - auto srcMap = rcp(new map_type(yBlock.getPointMap())); - auto tgtMap = rcp(new map_type(yBlockCol->getPointMap())); - pointImporter_ = rcp(new import_type(srcMap, tgtMap)); - } - performImport = true; - } - - MV yBlock_mv; - MV yBlockCol_mv; - RCP yBlockColPointDomain; - if (performImport) { // create views (shallow copies) - yBlock_mv = yBlock.getMultiVectorView(); - yBlockCol_mv = yBlockCol->getMultiVectorView(); - yBlockColPointDomain = - yBlockCol_mv.offsetView(A.getDomainMap(), 0); - } - - if (ZeroStartingSolution_) { - yBlockCol->putScalar(STS::zero ()); - } - else if (performImport) { - yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, INSERT); - } - - // FIXME (mfh 12 Sep 2014) Shouldn't this come from the user's parameter? - const Tpetra::ESweepDirection direction = Tpetra::Symmetric; - - for (int sweep = 0; sweep < NumSweeps_; ++sweep) { - if (performImport && sweep > 0) { - yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, INSERT); - } - A.localGaussSeidel(xBlock, *yBlockCol, blockDiag_, - DampingFactor_, direction); - if (performImport) { - Tpetra::deep_copy(yBlock_mv, *yBlockColPointDomain); - } - } } template @@ -2807,6 +2443,7 @@ describe (Teuchos::FancyOStream &out, } } + } // namespace Ifpack2 #define IFPACK2_RELAXATION_INSTANT(S,LO,GO,N) \ diff --git a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_decl.hpp b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_decl.hpp index fed4cda0de9c..cde3ca4f7cf6 100644 --- a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_decl.hpp @@ -351,58 +351,6 @@ class BlockCrsMatrix : const Scalar alpha = Teuchos::ScalarTraits::one (), const Scalar beta = Teuchos::ScalarTraits::zero ()); - /// \brief Version of gaussSeidel(), with fewer requirements on X. 
- /// - /// Not Implemented - void - gaussSeidelCopy (MultiVector &X, - const ::Tpetra::MultiVector &B, - const ::Tpetra::MultiVector &D, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps, - const bool zeroInitialGuess) const; - - /// \brief Version of reorderedGaussSeidel(), with fewer requirements on X. - /// - /// Not Implemented - void - reorderedGaussSeidelCopy (::Tpetra::MultiVector& X, - const ::Tpetra::MultiVector& B, - const ::Tpetra::MultiVector& D, - const Teuchos::ArrayView& rowIndices, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps, - const bool zeroInitialGuess) const; - - /// \brief Local Gauss-Seidel solve, given a factorized diagonal - /// - /// \param Residual [in] The "residual" (right-hand side) block - /// (multi)vector - /// \param Solution [in/out] On input: the initial guess / current - /// approximate solution. On output: the new approximate - /// solution. - /// \param D_inv [in] Block diagonal, the explicit inverse of this - /// matrix's block diagonal (possibly modified for algorithmic - /// reasons). - /// \param omega [in] (S)SOR relaxation coefficient - /// \param direction [in] Forward, Backward, or Symmetric. - /// - /// One may access block i in \c D_inv using the - /// following code: - /// \code - /// auto D_ii = Kokkos::subview(D_inv, i, Kokkos::ALL(), Kokkos::ALL()); - /// \endcode - /// The resulting block is b x b, where b = this->getBlockSize(). - void - localGaussSeidel (const BlockMultiVector& Residual, - BlockMultiVector& Solution, - const Kokkos::View& D_inv, - const Scalar& omega, - const ESweepDirection direction) const; - /// \brief Replace values at the given (mesh, i.e., block) column /// indices, in the given (mesh, i.e., block) row. /// diff --git a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp index 625d2501e18d..3c84d111978f 100644 --- a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp @@ -1058,158 +1058,6 @@ class GetLocalDiagCopy { graph_.getLocalDiagOffsets (offsets); } - - template - void - BlockCrsMatrix:: - localGaussSeidel (const BlockMultiVector& B, - BlockMultiVector& X, - const Kokkos::View& D_inv, - const Scalar& omega, - const ESweepDirection direction) const - { - using Kokkos::ALL; - const impl_scalar_type zero = - Kokkos::Details::ArithTraits::zero (); - const impl_scalar_type one = - Kokkos::Details::ArithTraits::one (); - const LO numLocalMeshRows = - static_cast (rowMeshMap_.getNodeNumElements ()); - const LO numVecs = static_cast (X.getNumVectors ()); - - // If using (new) Kokkos, replace localMem with thread-local - // memory. Note that for larger block sizes, this will affect the - // two-level parallelization. Look to Stokhos for best practice - // on making this fast for GPUs. - const LO blockSize = getBlockSize (); - Teuchos::Array localMem (blockSize); - Teuchos::Array localMat (blockSize*blockSize); - little_host_vec_type X_lcl (localMem.getRawPtr (), blockSize); - - // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned. 
- LO rowBegin = 0, rowEnd = 0, rowStride = 0; - if (direction == Forward) { - rowBegin = 1; - rowEnd = numLocalMeshRows+1; - rowStride = 1; - } - else if (direction == Backward) { - rowBegin = numLocalMeshRows; - rowEnd = 0; - rowStride = -1; - } - else if (direction == Symmetric) { - this->localGaussSeidel (B, X, D_inv, omega, Forward); - this->localGaussSeidel (B, X, D_inv, omega, Backward); - return; - } - - const Scalar one_minus_omega = Teuchos::ScalarTraits::one()-omega; - const Scalar minus_omega = -omega; - - Kokkos::fence(); - - if (numVecs == 1) { - for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) { - const LO actlRow = lclRow - 1; - - little_host_vec_type B_cur = B.getLocalBlock (actlRow, 0); - COPY (B_cur, X_lcl); - SCAL (static_cast (omega), X_lcl); - - const size_t meshBeg = ptrHost_[actlRow]; - const size_t meshEnd = ptrHost_[actlRow+1]; - for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) { - const LO meshCol = indHost_[absBlkOff]; - const_little_block_type A_cur = - getConstLocalBlockFromAbsOffset (absBlkOff); - little_host_vec_type X_cur = X.getLocalBlock (meshCol, 0); - - // X_lcl += alpha*A_cur*X_cur - const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega; - //X_lcl.matvecUpdate (alpha, A_cur, X_cur); - GEMV (static_cast (alpha), A_cur, X_cur, X_lcl); - } // for each entry in the current local row of the matrix - - // NOTE (mfh 20 Jan 2016) The two input Views here are - // unmanaged already, so we don't have to take unmanaged - // subviews first. - auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ()); - little_host_vec_type X_update = X.getLocalBlock (actlRow, 0); - FILL (X_update, zero); - GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update - } // for each local row of the matrix - } - else { - for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) { - for (LO j = 0; j < numVecs; ++j) { - LO actlRow = lclRow-1; - - little_host_vec_type B_cur = B.getLocalBlock (actlRow, j); - COPY (B_cur, X_lcl); - SCAL (static_cast (omega), X_lcl); - - const size_t meshBeg = ptrHost_[actlRow]; - const size_t meshEnd = ptrHost_[actlRow+1]; - for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) { - const LO meshCol = indHost_[absBlkOff]; - const_little_block_type A_cur = - getConstLocalBlockFromAbsOffset (absBlkOff); - little_host_vec_type X_cur = X.getLocalBlock (meshCol, j); - - // X_lcl += alpha*A_cur*X_cur - const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega; - GEMV (static_cast (alpha), A_cur, X_cur, X_lcl); - } // for each entry in the current local row of the matrx - - auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ()); - auto X_update = X.getLocalBlock (actlRow, j); - FILL (X_update, zero); - GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update - } // for each entry in the current local row of the matrix - } // for each local row of the matrix - } - } - - template - void - BlockCrsMatrix:: - gaussSeidelCopy (MultiVector &/* X */, - const MultiVector &/* B */, - const MultiVector &/* D */, - const Scalar& /* dampingFactor */, - const ESweepDirection /* direction */, - const int /* numSweeps */, - const bool /* zeroInitialGuess */) const - { - // FIXME (mfh 12 Aug 2014) This method has entirely the wrong - // interface for block Gauss-Seidel. 
- TEUCHOS_TEST_FOR_EXCEPTION( - true, std::logic_error, "Tpetra::BlockCrsMatrix::" - "gaussSeidelCopy: Not implemented."); - } - - template - void - BlockCrsMatrix:: - reorderedGaussSeidelCopy (MultiVector& /* X */, - const MultiVector& /* B */, - const MultiVector& /* D */, - const Teuchos::ArrayView& /* rowIndices */, - const Scalar& /* dampingFactor */, - const ESweepDirection /* direction */, - const int /* numSweeps */, - const bool /* zeroInitialGuess */) const - { - // FIXME (mfh 12 Aug 2014) This method has entirely the wrong - // interface for block Gauss-Seidel. - TEUCHOS_TEST_FOR_EXCEPTION( - true, std::logic_error, "Tpetra::BlockCrsMatrix::" - "reorderedGaussSeidelCopy: Not implemented."); - } - - template void BlockCrsMatrix:: diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp index 1cf25b82ff42..a68995ab6700 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp @@ -62,7 +62,6 @@ // DomainScalar and RangeScalar, so we have to include this header // file here, rather than in the _def header file, so that we can get // the interfaces to the corresponding local computational kernels. -#include "KokkosSparse_sor_sequential_impl.hpp" #include // std::shared_ptr @@ -2808,82 +2807,6 @@ namespace Tpetra { /// Tpetra::Backward. ("Symmetric" requires interprocess /// communication (before each sweep), which is not part of the /// local kernel.) - template - void - localGaussSeidel (const MultiVector &B, - MultiVector &X, - const MultiVector &D, - const RangeScalar& dampingFactor, - const ESweepDirection direction) const - { - typedef LocalOrdinal LO; - typedef GlobalOrdinal GO; - typedef Tpetra::MultiVector DMV; - typedef Tpetra::MultiVector MMV; - typedef typename Node::device_type::memory_space dev_mem_space; - typedef typename MMV::dual_view_type::t_host::device_type host_mem_space; - typedef typename Graph::local_graph_type k_local_graph_type; - typedef typename k_local_graph_type::size_type offset_type; - const char prefix[] = "Tpetra::CrsMatrix::localGaussSeidel: "; - - TEUCHOS_TEST_FOR_EXCEPTION - (! this->isFillComplete (), std::runtime_error, - prefix << "The matrix is not fill complete."); - const size_t lclNumRows = this->getNodeNumRows (); - const size_t numVecs = B.getNumVectors (); - TEUCHOS_TEST_FOR_EXCEPTION - (X.getNumVectors () != numVecs, std::invalid_argument, - prefix << "B.getNumVectors() = " << numVecs << " != " - "X.getNumVectors() = " << X.getNumVectors () << "."); - TEUCHOS_TEST_FOR_EXCEPTION - (B.getLocalLength () != lclNumRows, std::invalid_argument, - prefix << "B.getLocalLength() = " << B.getLocalLength () - << " != this->getNodeNumRows() = " << lclNumRows << "."); - - // mfh 28 Aug 2017: The current local Gauss-Seidel kernel only - // runs on host. (See comments below.) Thus, we need to access - // the host versions of these data. 
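      // With D holding the inverse diagonal entries (as the public gaussSeidel
      // interface documents), one forward pass of the sequential kernel
      // corresponds, row by row, to the standard SOR update
      //
      //   x_i <- (1 - omega)*x_i + omega * d_i * ( b_i - sum_{j != i} a_ij*x_j ),
      //
      // with omega = dampingFactor and d_i = 1/a_ii; omega = 1 gives Gauss-Seidel.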
- const_cast (B).sync_host (); - X.sync_host (); - X.modify_host (); - const_cast (D).sync_host (); - - auto B_lcl = B.template getLocalView (); - auto X_lcl = X.template getLocalView (); - auto D_lcl = D.template getLocalView (); - - offset_type B_stride[8], X_stride[8], D_stride[8]; - B_lcl.stride (B_stride); - X_lcl.stride (X_stride); - D_lcl.stride (D_stride); - - local_matrix_type lclMatrix = this->getLocalMatrix (); - k_local_graph_type lclGraph = lclMatrix.graph; - typename local_matrix_type::row_map_type ptr = lclGraph.row_map; - typename local_matrix_type::index_type ind = lclGraph.entries; - typename local_matrix_type::values_type val = lclMatrix.values; - const offset_type* const ptrRaw = ptr.data (); - const LO* const indRaw = ind.data (); - const impl_scalar_type* const valRaw = val.data (); - - const std::string dir ((direction == Forward) ? "F" : "B"); - // NOTE (mfh 28 Aug 2017) This assumes UVM. We can't get around - // that on GPUs without using a GPU-based sparse triangular - // solve to implement Gauss-Seidel. This exists in cuSPARSE, - // but we would need to implement a wrapper with a fall-back - // algorithm for unsupported Scalar and LO types. - KokkosSparse::Impl::Sequential::gaussSeidel (static_cast (lclNumRows), - static_cast (numVecs), - ptrRaw, indRaw, valRaw, - B_lcl.data (), B_stride[1], - X_lcl.data (), X_stride[1], - D_lcl.data (), - static_cast (dampingFactor), - dir.c_str ()); - const_cast (B).template sync (); - X.template sync (); - const_cast (D).template sync (); - } /// \brief Reordered Gauss-Seidel or SOR on \f$B = A X\f$. /// @@ -2911,91 +2834,6 @@ namespace Tpetra { /// Tpetra::Backward. ("Symmetric" requires interprocess /// communication (before each sweep), which is not part of the /// local kernel.) - template - void - reorderedLocalGaussSeidel (const MultiVector& B, - MultiVector& X, - const MultiVector& D, - const Teuchos::ArrayView& rowIndices, - const RangeScalar& dampingFactor, - const ESweepDirection direction) const - { - typedef LocalOrdinal LO; - typedef GlobalOrdinal GO; - typedef Tpetra::MultiVector DMV; - typedef Tpetra::MultiVector MMV; - typedef typename Node::device_type::memory_space dev_mem_space; - typedef typename MMV::dual_view_type::t_host::device_type host_mem_space; - typedef typename Graph::local_graph_type k_local_graph_type; - typedef typename k_local_graph_type::size_type offset_type; - const char prefix[] = "Tpetra::CrsMatrix::reorderedLocalGaussSeidel: "; - - TEUCHOS_TEST_FOR_EXCEPTION - (! this->isFillComplete (), std::runtime_error, - prefix << "The matrix is not fill complete."); - const size_t lclNumRows = this->getNodeNumRows (); - const size_t numVecs = B.getNumVectors (); - TEUCHOS_TEST_FOR_EXCEPTION - (X.getNumVectors () != numVecs, std::invalid_argument, - prefix << "B.getNumVectors() = " << numVecs << " != " - "X.getNumVectors() = " << X.getNumVectors () << "."); - TEUCHOS_TEST_FOR_EXCEPTION - (B.getLocalLength () != lclNumRows, std::invalid_argument, - prefix << "B.getLocalLength() = " << B.getLocalLength () - << " != this->getNodeNumRows() = " << lclNumRows << "."); - TEUCHOS_TEST_FOR_EXCEPTION - (static_cast (rowIndices.size ()) < lclNumRows, - std::invalid_argument, prefix << "rowIndices.size() = " - << rowIndices.size () << " < this->getNodeNumRows() = " - << lclNumRows << "."); - - // mfh 28 Aug 2017: The current local Gauss-Seidel kernel only - // runs on host. (See comments below.) Thus, we need to access - // the host versions of these data. 
- const_cast (B).sync_host (); - X.sync_host (); - X.modify_host (); - const_cast (D).sync_host (); - - auto B_lcl = B.template getLocalView (); - auto X_lcl = X.template getLocalView (); - auto D_lcl = D.template getLocalView (); - - offset_type B_stride[8], X_stride[8], D_stride[8]; - B_lcl.stride (B_stride); - X_lcl.stride (X_stride); - D_lcl.stride (D_stride); - - local_matrix_type lclMatrix = this->getLocalMatrix (); - typename Graph::local_graph_type lclGraph = lclMatrix.graph; - typename local_matrix_type::index_type ind = lclGraph.entries; - typename local_matrix_type::row_map_type ptr = lclGraph.row_map; - typename local_matrix_type::values_type val = lclMatrix.values; - const offset_type* const ptrRaw = ptr.data (); - const LO* const indRaw = ind.data (); - const impl_scalar_type* const valRaw = val.data (); - - const std::string dir = (direction == Forward) ? "F" : "B"; - // NOTE (mfh 28 Aug 2017) This assumes UVM. We can't get around - // that on GPUs without using a GPU-based sparse triangular - // solve to implement Gauss-Seidel, and also handling the - // permutations correctly. - KokkosSparse::Impl::Sequential::reorderedGaussSeidel (static_cast (lclNumRows), - static_cast (numVecs), - ptrRaw, indRaw, valRaw, - B_lcl.data (), - B_stride[1], - X_lcl.data (), - X_stride[1], - D_lcl.data (), - rowIndices.getRawPtr (), - static_cast (lclNumRows), - static_cast (dampingFactor), - dir.c_str ()); - const_cast (B).template sync (); - X.template sync (); - const_cast (D).template sync (); - } /// \brief Return another CrsMatrix with the same entries, but /// converted to a different Scalar type \c T. @@ -3048,229 +2886,6 @@ namespace Tpetra { //! @name Other "apply"-like methods //@{ - /// \brief "Hybrid" Jacobi + (Gauss-Seidel or SOR) on \f$B = A X\f$. - /// - /// "Hybrid" means Successive Over-Relaxation (SOR) or - /// Gauss-Seidel within an (MPI) process, but Jacobi between - /// processes. Gauss-Seidel is a special case of SOR, where the - /// damping factor is one. - /// - /// The Forward or Backward sweep directions have their usual SOR - /// meaning within the process. Interprocess communication occurs - /// once before the sweep, as it normally would in Jacobi. - /// - /// The Symmetric sweep option means two sweeps: first Forward, - /// then Backward. Interprocess communication occurs before each - /// sweep, as in Jacobi. Thus, Symmetric results in two - /// interprocess communication steps. - /// - /// \param B [in] Right-hand side(s). - /// \param X [in/out] On input: initial guess(es). On output: - /// result multivector(s). - /// \param D [in] Inverse of diagonal entries of the matrix A. - /// \param dampingFactor [in] SOR damping factor. A damping - /// factor of one results in Gauss-Seidel. - /// \param direction [in] Sweep direction: Forward, Backward, or - /// Symmetric. - /// \param numSweeps [in] Number of sweeps. We count each - /// Symmetric sweep (including both its Forward and its Backward - /// sweep) as one. - /// - /// \section Tpetra_KR_CrsMatrix_gaussSeidel_req Requirements - /// - /// This method has the following requirements: - /// - /// 1. X is in the domain Map of the matrix. - /// 2. The domain and row Maps of the matrix are the same. - /// 3. The column Map contains the domain Map, and both start at the same place. - /// 4. The row Map is uniquely owned. - /// 5. D is in the row Map of the matrix. - /// 6. X is actually a view of a column Map multivector. - /// 7. Neither B nor D alias X. 
- /// - /// #1 is just the usual requirement for operators: the input - /// multivector must always be in the domain Map. The - /// Gauss-Seidel kernel imposes additional requirements, since it - /// - /// - overwrites the input multivector with the output (which - /// implies #2), and - /// - uses the same local indices for the input and output - /// multivector (which implies #2 and #3). - /// - /// #3 is reasonable if the matrix constructed the column Map, - /// because the method that does this (CrsGraph::makeColMap) puts - /// the local GIDs (those in the domain Map) in front and the - /// remote GIDs (not in the domain Map) at the end of the column - /// Map. However, if you constructed the column Map yourself, you - /// are responsible for maintaining this invariant. #6 lets us do - /// the Import from the domain Map to the column Map in place. - /// - /// The Gauss-Seidel kernel also assumes that each process has the - /// entire value (not a partial value to sum) of all the diagonal - /// elements in the rows in its row Map. (We guarantee this anyway - /// though the separate D vector.) This is because each element of - /// the output multivector depends nonlinearly on the diagonal - /// elements. Shared ownership of off-diagonal elements would - /// produce different results. - void - gaussSeidel (const MultiVector &B, - MultiVector &X, - const MultiVector &D, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps) const; - - /// \brief Reordered "Hybrid" Jacobi + (Gauss-Seidel or SOR) on \f$B = A X\f$. - /// - /// "Hybrid" means Successive Over-Relaxation (SOR) or - /// Gauss-Seidel within an (MPI) process, but Jacobi between - /// processes. Gauss-Seidel is a special case of SOR, where the - /// damping factor is one. The ordering can be a partial one, in which case the Gauss-Seidel is only - /// executed on a local subset of unknowns. - /// - /// The Forward or Backward sweep directions have their usual SOR - /// meaning within the process. Interprocess communication occurs - /// once before the sweep, as it normally would in Jacobi. - /// - /// The Symmetric sweep option means two sweeps: first Forward, - /// then Backward. Interprocess communication occurs before each - /// sweep, as in Jacobi. Thus, Symmetric results in two - /// interprocess communication steps. - /// - /// \param B [in] Right-hand side(s). - /// \param X [in/out] On input: initial guess(es). On output: - /// result multivector(s). - /// \param D [in] Inverse of diagonal entries of the matrix A. - /// \param rowIndices [in] Ordered list of indices on which to execute GS. - /// \param dampingFactor [in] SOR damping factor. A damping - /// factor of one results in Gauss-Seidel. - /// \param direction [in] Sweep direction: Forward, Backward, or - /// Symmetric. - /// \param numSweeps [in] Number of sweeps. We count each - /// Symmetric sweep (including both its Forward and its Backward - /// sweep) as one. - /// - /// \section Tpetra_KR_CrsMatrix_reorderedGaussSeidel_req Requirements - /// - /// This method has the following requirements: - /// - /// 1. X is in the domain Map of the matrix. - /// 2. The domain and row Maps of the matrix are the same. - /// 3. The column Map contains the domain Map, and both start at the same place. - /// 4. The row Map is uniquely owned. - /// 5. D is in the row Map of the matrix. - /// 6. X is actually a view of a column Map multivector. - /// 7. Neither B nor D alias X. 
- /// - /// #1 is just the usual requirement for operators: the input - /// multivector must always be in the domain Map. The - /// Gauss-Seidel kernel imposes additional requirements, since it - /// - /// - overwrites the input multivector with the output (which - /// implies #2), and - /// - uses the same local indices for the input and output - /// multivector (which implies #2 and #3). - /// - /// #3 is reasonable if the matrix constructed the column Map, - /// because the method that does this (CrsGraph::makeColMap) puts - /// the local GIDs (those in the domain Map) in front and the - /// remote GIDs (not in the domain Map) at the end of the column - /// Map. However, if you constructed the column Map yourself, you - /// are responsible for maintaining this invariant. #6 lets us do - /// the Import from the domain Map to the column Map in place. - /// - /// The Gauss-Seidel kernel also assumes that each process has the - /// entire value (not a partial value to sum) of all the diagonal - /// elements in the rows in its row Map. (We guarantee this anyway - /// though the separate D vector.) This is because each element of - /// the output multivector depends nonlinearly on the diagonal - /// elements. Shared ownership of off-diagonal elements would - /// produce different results. - void - reorderedGaussSeidel (const MultiVector& B, - MultiVector& X, - const MultiVector& D, - const Teuchos::ArrayView& rowIndices, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps) const; - - /// \brief Version of gaussSeidel(), with fewer requirements on X. - /// - /// This method is just like gaussSeidel(), except that X need - /// only be in the domain Map. This method does not require that - /// X be a domain Map view of a column Map multivector. As a - /// result, this method must copy X into a domain Map multivector - /// before operating on it. - /// - /// \param X [in/out] On input: initial guess(es). On output: - /// result multivector(s). - /// \param B [in] Right-hand side(s), in the range Map. - /// \param D [in] Inverse of diagonal entries of the matrix, - /// in the row Map. - /// \param dampingFactor [in] SOR damping factor. A damping - /// factor of one results in Gauss-Seidel. - /// \param direction [in] Sweep direction: Forward, Backward, or - /// Symmetric. - /// \param numSweeps [in] Number of sweeps. We count each - /// Symmetric sweep (including both its Forward and its - /// Backward sweep) as one. - /// \param zeroInitialGuess [in] If true, this method will fill X - /// with zeros initially. If false, this method will assume - /// that X contains a possibly nonzero initial guess on input. - /// Note that a nonzero initial guess may impose an additional - /// nontrivial communication cost (an additional Import). - /// - /// \pre Domain, range, and row Maps of the sparse matrix are all the same. - /// \pre No other argument aliases X. - void - gaussSeidelCopy (MultiVector &X, - const MultiVector &B, - const MultiVector &D, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps, - const bool zeroInitialGuess) const; - - /// \brief Version of reorderedGaussSeidel(), with fewer requirements on X. - /// - /// This method is just like reorderedGaussSeidel(), except that X need - /// only be in the domain Map. This method does not require that - /// X be a domain Map view of a column Map multivector. As a - /// result, this method must copy X into a domain Map multivector - /// before operating on it. 
- /// - /// \param X [in/out] On input: initial guess(es). On output: - /// result multivector(s). - /// \param B [in] Right-hand side(s), in the range Map. - /// \param D [in] Inverse of diagonal entries of the matrix, - /// in the row Map. - /// \param rowIndices [in] Ordered list of indices on which to execute GS. - /// \param dampingFactor [in] SOR damping factor. A damping - /// factor of one results in Gauss-Seidel. - /// \param direction [in] Sweep direction: Forward, Backward, or - /// Symmetric. - /// \param numSweeps [in] Number of sweeps. We count each - /// Symmetric sweep (including both its Forward and its - /// Backward sweep) as one. - /// \param zeroInitialGuess [in] If true, this method will fill X - /// with zeros initially. If false, this method will assume - /// that X contains a possibly nonzero initial guess on input. - /// Note that a nonzero initial guess may impose an additional - /// nontrivial communication cost (an additional Import). - /// - /// \pre Domain, range, and row Maps of the sparse matrix are all the same. - /// \pre No other argument aliases X. - void - reorderedGaussSeidelCopy (MultiVector& X, - const MultiVector& B, - const MultiVector& D, - const Teuchos::ArrayView& rowIndices, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps, - const bool zeroInitialGuess) const; - /// \brief Implementation of RowMatrix::add: return alpha*A + beta*this. /// /// This override of the default implementation ensures that, when diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index d46a492f181a..cd995c00e8d4 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -5494,609 +5494,6 @@ namespace Tpetra { } } - template - void - CrsMatrix:: - gaussSeidel (const MultiVector& B, - MultiVector& X, - const MultiVector& D, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps) const - { - reorderedGaussSeidel (B, X, D, Teuchos::null, dampingFactor, direction, numSweeps); - } - - template - void - CrsMatrix:: - reorderedGaussSeidel (const MultiVector& B, - MultiVector& X, - const MultiVector& D, - const Teuchos::ArrayView& rowIndices, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps) const - { - using Teuchos::null; - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::rcp_const_cast; - using Teuchos::rcpFromRef; - typedef Scalar ST; - - TEUCHOS_TEST_FOR_EXCEPTION( - isFillComplete() == false, std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel: cannot call this method until " - "fillComplete() has been called."); - TEUCHOS_TEST_FOR_EXCEPTION( - numSweeps < 0, - std::invalid_argument, - "Tpetra::CrsMatrix::gaussSeidel: The number of sweeps must be , " - "nonnegative but you provided numSweeps = " << numSweeps << " < 0."); - - // Translate from global to local sweep direction. - // While doing this, validate the input. - ESweepDirection localDirection; - if (direction == Forward) { - localDirection = Forward; - } - else if (direction == Backward) { - localDirection = Backward; - } - else if (direction == Symmetric) { - // We'll control local sweep direction manually. 
- localDirection = Forward; - } - else { - TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, - "Tpetra::CrsMatrix::gaussSeidel: The 'direction' enum does not have " - "any of its valid values: Forward, Backward, or Symmetric."); - } - - if (numSweeps == 0) { - return; // Nothing to do. - } - - // We don't need the Export object because this method assumes - // that the row, domain, and range Maps are the same. We do need - // the Import object, if there is one, though. - RCP importer = this->getGraph()->getImporter(); - RCP exporter = this->getGraph()->getExporter(); - TEUCHOS_TEST_FOR_EXCEPTION( - ! exporter.is_null (), std::runtime_error, - "Tpetra's gaussSeidel implementation requires that the row, domain, " - "and range Maps be the same. This cannot be the case, because the " - "matrix has a nontrivial Export object."); - - RCP domainMap = this->getDomainMap (); - RCP rangeMap = this->getRangeMap (); - RCP rowMap = this->getGraph ()->getRowMap (); - RCP colMap = this->getGraph ()->getColMap (); - -#ifdef HAVE_TEUCHOS_DEBUG - { - // The relation 'isSameAs' is transitive. It's also a - // collective, so we don't have to do a "shared" test for - // exception (i.e., a global reduction on the test value). - TEUCHOS_TEST_FOR_EXCEPTION( - ! X.getMap ()->isSameAs (*domainMap), - std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel requires that the input " - "multivector X be in the domain Map of the matrix."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! B.getMap ()->isSameAs (*rangeMap), - std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel requires that the input " - "B be in the range Map of the matrix."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! D.getMap ()->isSameAs (*rowMap), - std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel requires that the input " - "D be in the row Map of the matrix."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! rowMap->isSameAs (*rangeMap), - std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel requires that the row Map and the " - "range Map be the same (in the sense of Tpetra::Map::isSameAs)."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! domainMap->isSameAs (*rangeMap), - std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel requires that the domain Map and " - "the range Map of the matrix be the same."); - } -#else - // Forestall any compiler warnings for unused variables. - (void) rangeMap; - (void) rowMap; -#endif // HAVE_TEUCHOS_DEBUG - - // If B is not constant stride, copy it into a constant stride - // multivector. We'l handle the right-hand side B first and deal - // with X right before the sweeps, to improve locality of the - // first sweep. (If the problem is small enough, then that will - // hopefully keep more of the entries of X in cache. This - // optimizes for the typical case of a small number of sweeps.) - RCP B_in; - if (B.isConstantStride()) { - B_in = rcpFromRef (B); - } - else { - // The range Map and row Map are the same in this case, so we - // can use the (possibly cached) row Map multivector to store a - // constant stride copy of B. We don't have to copy back, since - // Gauss-Seidel won't modify B. - RCP B_in_nonconst = getRowMapMultiVector (B, true); - deep_copy (*B_in_nonconst, B); // Copy from B into B_in(_nonconst). - B_in = rcp_const_cast (B_in_nonconst); - - TPETRA_EFFICIENCY_WARNING( - ! B.isConstantStride (), - std::runtime_error, - "gaussSeidel: The current implementation of the Gauss-Seidel kernel " - "requires that X and B both have constant stride. Since B does not " - "have constant stride, we had to make a copy. 
This is a limitation of " - "the current implementation and not your fault, but we still report it " - "as an efficiency warning for your information."); - } - - // If X is not constant stride, copy it into a constant stride - // multivector. Also, make the column Map multivector X_colMap, - // and its domain Map view X_domainMap. (X actually must be a - // domain Map view of a column Map multivector; exploit this, if X - // has constant stride.) - - RCP X_domainMap; - RCP X_colMap; - bool copiedInput = false; - - if (importer.is_null ()) { // Domain and column Maps are the same. - if (X.isConstantStride ()) { - X_domainMap = rcpFromRef (X); - X_colMap = X_domainMap; - copiedInput = false; - } - else { - // Get a temporary column Map multivector, make a domain Map - // view of it, and copy X into the domain Map view. We have - // to copy here because we won't be doing Import operations. - X_colMap = getColumnMapMultiVector (X, true); - X_domainMap = X_colMap; // Domain and column Maps are the same. - deep_copy (*X_domainMap, X); // Copy X into the domain Map view. - copiedInput = true; - TPETRA_EFFICIENCY_WARNING( - ! X.isConstantStride (), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel: The current implementation of the " - "Gauss-Seidel kernel requires that X and B both have constant " - "stride. Since X does not have constant stride, we had to make a " - "copy. This is a limitation of the current implementation and not " - "your fault, but we still report it as an efficiency warning for " - "your information."); - } - } - else { // We will be doing Import operations in the sweeps. - if (X.isConstantStride ()) { - X_domainMap = rcpFromRef (X); - // This kernel assumes that X is a domain Map view of a column - // Map multivector. We will only check if this is valid if - // the CMake configure Teuchos_ENABLE_DEBUG is ON. - X_colMap = X_domainMap->offsetViewNonConst (colMap, 0); - - // FIXME (mfh 19 Mar 2013) Do we need to fill the remote - // entries of X_colMap with zeros? Do we need to fill all of - // X_domainMap initially with zeros? Ifpack - // (Ifpack_PointRelaxation.cpp, line 906) creates an entirely - // new MultiVector each time. - - // Do the first Import for the first sweep. This simplifies - // the logic in the sweeps. - X_colMap->doImport (X, *importer, INSERT); - copiedInput = false; - } - else { - // Get a temporary column Map multivector X_colMap, and make a - // domain Map view X_domainMap of it. Instead of copying, we - // do an Import from X into X_domainMap. This saves us a - // copy, since the Import has to copy the data anyway. - X_colMap = getColumnMapMultiVector (X, true); - X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0); - X_colMap->doImport (X, *importer, INSERT); - copiedInput = true; - TPETRA_EFFICIENCY_WARNING( - ! X.isConstantStride (), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel: The current implementation of the " - "Gauss-Seidel kernel requires that X and B both have constant stride. " - "Since X does not have constant stride, we had to make a copy. " - "This is a limitation of the current implementation and not your fault, " - "but we still report it as an efficiency warning for your information."); - } - } - - for (int sweep = 0; sweep < numSweeps; ++sweep) { - if (! importer.is_null () && sweep > 0) { - // We already did the first Import for the zeroth sweep. - X_colMap->doImport (*X_domainMap, *importer, INSERT); - } - - // Do local Gauss-Seidel. 
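      // Each iteration of this loop is one "hybrid" sweep: the Import above
      // refreshes the off-process (column Map) entries of X in Jacobi fashion,
      // and the calls below run the purely local SOR kernel in the requested
      // direction.  For Symmetric, the Forward and Backward passes run back to
      // back without another Import in between.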
- if (direction != Symmetric) { - if (rowIndices.is_null ()) { - this->template localGaussSeidel (*B_in, *X_colMap, D, - dampingFactor, - localDirection); - } - else { - this->template reorderedLocalGaussSeidel (*B_in, *X_colMap, - D, rowIndices, - dampingFactor, - localDirection); - } - } - else { // direction == Symmetric - const bool doImportBetweenDirections = false; - if (rowIndices.is_null ()) { - this->template localGaussSeidel (*B_in, *X_colMap, D, - dampingFactor, - Forward); - // mfh 18 Mar 2013: Aztec's implementation of "symmetric - // Gauss-Seidel" does _not_ do an Import between the forward - // and backward sweeps. This makes sense, because Aztec - // considers "symmetric Gauss-Seidel" a subdomain solver. - if (doImportBetweenDirections) { - // Communicate again before the Backward sweep. - if (! importer.is_null ()) { - X_colMap->doImport (*X_domainMap, *importer, INSERT); - } - } - this->template localGaussSeidel (*B_in, *X_colMap, D, - dampingFactor, - Backward); - } - else { - this->template reorderedLocalGaussSeidel (*B_in, *X_colMap, - D, rowIndices, - dampingFactor, - Forward); - if (doImportBetweenDirections) { - // Communicate again before the Backward sweep. - if (! importer.is_null ()) { - X_colMap->doImport (*X_domainMap, *importer, INSERT); - } - } - this->template reorderedLocalGaussSeidel (*B_in, *X_colMap, - D, rowIndices, - dampingFactor, - Backward); - } - } - } - - if (copiedInput) { - deep_copy (X, *X_domainMap); // Copy back from X_domainMap to X. - } - } - - template - void - CrsMatrix:: - gaussSeidelCopy (MultiVector& X, - const MultiVector& B, - const MultiVector& D, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps, - const bool zeroInitialGuess) const - { - reorderedGaussSeidelCopy (X, B, D, Teuchos::null, dampingFactor, direction, - numSweeps, zeroInitialGuess); - } - - template - void - CrsMatrix:: - reorderedGaussSeidelCopy (MultiVector& X, - const MultiVector& B, - const MultiVector& D, - const Teuchos::ArrayView& rowIndices, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps, - const bool zeroInitialGuess) const - { - using Teuchos::null; - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::rcpFromRef; - using Teuchos::rcp_const_cast; - typedef Scalar ST; - const char prefix[] = "Tpetra::CrsMatrix::(reordered)gaussSeidelCopy: "; - const Scalar ZERO = Teuchos::ScalarTraits::zero (); - - TEUCHOS_TEST_FOR_EXCEPTION( - ! isFillComplete (), std::runtime_error, - prefix << "The matrix is not fill complete."); - TEUCHOS_TEST_FOR_EXCEPTION( - numSweeps < 0, std::invalid_argument, - prefix << "The number of sweeps must be nonnegative, " - "but you provided numSweeps = " << numSweeps << " < 0."); - - // Translate from global to local sweep direction. - // While doing this, validate the input. - ESweepDirection localDirection; - if (direction == Forward) { - localDirection = Forward; - } - else if (direction == Backward) { - localDirection = Backward; - } - else if (direction == Symmetric) { - // We'll control local sweep direction manually. - localDirection = Forward; - } - else { - TEUCHOS_TEST_FOR_EXCEPTION( - true, std::invalid_argument, - prefix << "The 'direction' enum does not have any of its valid " - "values: Forward, Backward, or Symmetric."); - } - - if (numSweeps == 0) { - return; - } - - RCP importer = this->getGraph ()->getImporter (); - RCP exporter = this->getGraph ()->getExporter (); - TEUCHOS_TEST_FOR_EXCEPTION( - ! 
exporter.is_null (), std::runtime_error, - "This method's implementation currently requires that the matrix's row, " - "domain, and range Maps be the same. This cannot be the case, because " - "the matrix has a nontrivial Export object."); - - RCP domainMap = this->getDomainMap (); - RCP rangeMap = this->getRangeMap (); - RCP rowMap = this->getGraph ()->getRowMap (); - RCP colMap = this->getGraph ()->getColMap (); - -#ifdef HAVE_TEUCHOS_DEBUG - { - // The relation 'isSameAs' is transitive. It's also a - // collective, so we don't have to do a "shared" test for - // exception (i.e., a global reduction on the test value). - TEUCHOS_TEST_FOR_EXCEPTION( - ! X.getMap ()->isSameAs (*domainMap), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input " - "multivector X be in the domain Map of the matrix."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! B.getMap ()->isSameAs (*rangeMap), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input " - "B be in the range Map of the matrix."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! D.getMap ()->isSameAs (*rowMap), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input " - "D be in the row Map of the matrix."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! rowMap->isSameAs (*rangeMap), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidelCopy requires that the row Map and the " - "range Map be the same (in the sense of Tpetra::Map::isSameAs)."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! domainMap->isSameAs (*rangeMap), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidelCopy requires that the domain Map and " - "the range Map of the matrix be the same."); - } -#else - // Forestall any compiler warnings for unused variables. - (void) rangeMap; - (void) rowMap; -#endif // HAVE_TEUCHOS_DEBUG - - // Fetch a (possibly cached) temporary column Map multivector - // X_colMap, and a domain Map view X_domainMap of it. Both have - // constant stride by construction. We know that the domain Map - // must include the column Map, because our Gauss-Seidel kernel - // requires that the row Map, domain Map, and range Map are all - // the same, and that each process owns all of its own diagonal - // entries of the matrix. - - RCP X_colMap; - RCP X_domainMap; - bool copyBackOutput = false; - if (importer.is_null ()) { - if (X.isConstantStride ()) { - X_colMap = rcpFromRef (X); - X_domainMap = rcpFromRef (X); - // Column Map and domain Map are the same, so there are no - // remote entries. Thus, if we are not setting the initial - // guess to zero, we don't have to worry about setting remote - // entries to zero, even though we are not doing an Import in - // this case. - if (zeroInitialGuess) { - X_colMap->putScalar (ZERO); - } - // No need to copy back to X at end. - } - else { // We must copy X into a constant stride multivector. - // Just use the cached column Map multivector for that. - // force=true means fill with zeros, so no need to fill - // remote entries (not in domain Map) with zeros. - X_colMap = getColumnMapMultiVector (X, true); - // X_domainMap is always a domain Map view of the column Map - // multivector. In this case, the domain and column Maps are - // the same, so X_domainMap _is_ X_colMap. - X_domainMap = X_colMap; - if (! 
zeroInitialGuess) { // Don't copy if zero initial guess - try { - deep_copy (*X_domainMap , X); // Copy X into constant stride MV - } catch (std::exception& e) { - std::ostringstream os; - os << "Tpetra::CrsMatrix::reorderedGaussSeidelCopy: " - "deep_copy(*X_domainMap, X) threw an exception: " - << e.what () << "."; - TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, e.what ()); - } - } - copyBackOutput = true; // Don't forget to copy back at end. - TPETRA_EFFICIENCY_WARNING( - ! X.isConstantStride (), - std::runtime_error, - "gaussSeidelCopy: The current implementation of the Gauss-Seidel " - "kernel requires that X and B both have constant stride. Since X " - "does not have constant stride, we had to make a copy. This is a " - "limitation of the current implementation and not your fault, but we " - "still report it as an efficiency warning for your information."); - } - } - else { // Column Map and domain Map are _not_ the same. - X_colMap = getColumnMapMultiVector (X); - X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0); - -#ifdef HAVE_TPETRA_DEBUG - auto X_colMap_host_view = X_colMap->getLocalViewHost (); - auto X_domainMap_host_view = X_domainMap->getLocalViewHost (); - - if (X_colMap->getLocalLength () != 0 && X_domainMap->getLocalLength ()) { - TEUCHOS_TEST_FOR_EXCEPTION - (X_colMap_host_view.data () != X_domainMap_host_view.data (), - std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: Pointer to " - "start of column Map view of X is not equal to pointer to start of " - "(domain Map view of) X. This may mean that Tpetra::MultiVector::" - "offsetViewNonConst is broken. " - "Please report this bug to the Tpetra developers."); - } - - TEUCHOS_TEST_FOR_EXCEPTION( - X_colMap_host_view.extent (0) < X_domainMap_host_view.extent (0) || - X_colMap->getLocalLength () < X_domainMap->getLocalLength (), - std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: " - "X_colMap has fewer local rows than X_domainMap. " - "X_colMap_host_view.extent(0) = " << X_colMap_host_view.extent (0) - << ", X_domainMap_host_view.extent(0) = " - << X_domainMap_host_view.extent (0) - << ", X_colMap->getLocalLength() = " << X_colMap->getLocalLength () - << ", and X_domainMap->getLocalLength() = " - << X_domainMap->getLocalLength () - << ". This means that Tpetra::MultiVector::offsetViewNonConst " - "is broken. Please report this bug to the Tpetra developers."); - - TEUCHOS_TEST_FOR_EXCEPTION( - X_colMap->getNumVectors () != X_domainMap->getNumVectors (), - std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: " - "X_colMap has a different number of columns than X_domainMap. " - "X_colMap->getNumVectors() = " << X_colMap->getNumVectors () - << " != X_domainMap->getNumVectors() = " - << X_domainMap->getNumVectors () - << ". This means that Tpetra::MultiVector::offsetViewNonConst " - "is broken. Please report this bug to the Tpetra developers."); -#endif // HAVE_TPETRA_DEBUG - - if (zeroInitialGuess) { - // No need for an Import, since we're filling with zeros. - X_colMap->putScalar (ZERO); - } else { - // We could just copy X into X_domainMap. However, that - // wastes a copy, because the Import also does a copy (plus - // communication). Since the typical use case for - // Gauss-Seidel is a small number of sweeps (2 is typical), we - // don't want to waste that copy. Thus, we do the Import - // here, and skip the first Import in the first sweep. - // Importing directly from X effects the copy into X_domainMap - // (which is a view of X_colMap). 
- X_colMap->doImport (X, *importer, INSERT); - } - copyBackOutput = true; // Don't forget to copy back at end. - } // if column and domain Maps are (not) the same - - // The Gauss-Seidel / SOR kernel expects multivectors of constant - // stride. X_colMap is by construction, but B might not be. If - // it's not, we have to make a copy. - RCP B_in; - if (B.isConstantStride ()) { - B_in = rcpFromRef (B); - } - else { - // Range Map and row Map are the same in this case, so we can - // use the cached row Map multivector to store a constant stride - // copy of B. - RCP B_in_nonconst = getRowMapMultiVector (B, true); - try { - deep_copy (*B_in_nonconst, B); - } catch (std::exception& e) { - std::ostringstream os; - os << "Tpetra::CrsMatrix::reorderedGaussSeidelCopy: " - "deep_copy(*B_in_nonconst, B) threw an exception: " - << e.what () << "."; - TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, e.what ()); - } - B_in = rcp_const_cast (B_in_nonconst); - - TPETRA_EFFICIENCY_WARNING( - ! B.isConstantStride (), - std::runtime_error, - "gaussSeidelCopy: The current implementation requires that B have " - "constant stride. Since B does not have constant stride, we had to " - "copy it into a separate constant-stride multivector. This is a " - "limitation of the current implementation and not your fault, but we " - "still report it as an efficiency warning for your information."); - } - - for (int sweep = 0; sweep < numSweeps; ++sweep) { - if (! importer.is_null () && sweep > 0) { - // We already did the first Import for the zeroth sweep above, - // if it was necessary. - X_colMap->doImport (*X_domainMap, *importer, INSERT); - } - - // Do local Gauss-Seidel. - if (direction != Symmetric) { - if (rowIndices.is_null ()) { - this->template localGaussSeidel (*B_in, *X_colMap, D, - dampingFactor, - localDirection); - } - else { - this->template reorderedLocalGaussSeidel (*B_in, *X_colMap, - D, rowIndices, - dampingFactor, - localDirection); - } - } - else { // direction == Symmetric - if (rowIndices.is_null ()) { - this->template localGaussSeidel (*B_in, *X_colMap, D, - dampingFactor, - Forward); - // mfh 18 Mar 2013: Aztec's implementation of "symmetric - // Gauss-Seidel" does _not_ do an Import between the forward - // and backward sweeps. This makes symmetric Gauss-Seidel a - // symmetric preconditioner if the matrix A is symmetric. We - // imitate Aztec's behavior here. - this->template localGaussSeidel (*B_in, *X_colMap, D, - dampingFactor, - Backward); - } - else { - this->template reorderedLocalGaussSeidel (*B_in, *X_colMap, - D, rowIndices, - dampingFactor, - Forward); - this->template reorderedLocalGaussSeidel (*B_in, *X_colMap, - D, rowIndices, - dampingFactor, - Backward); - - } - } - } - - if (copyBackOutput) { - try { - deep_copy (X , *X_domainMap); // Copy result back into X. - } catch (std::exception& e) { - TEUCHOS_TEST_FOR_EXCEPTION( - true, std::runtime_error, prefix << "deep_copy(X, *X_domainMap) " - "threw an exception: " << e.what ()); - } - } - } template template diff --git a/packages/tpetra/core/test/Block/BlockCrsMatrix.cpp b/packages/tpetra/core/test/Block/BlockCrsMatrix.cpp index 8260ad2cb414..aca76f9cb091 100644 --- a/packages/tpetra/core/test/Block/BlockCrsMatrix.cpp +++ b/packages/tpetra/core/test/Block/BlockCrsMatrix.cpp @@ -1768,450 +1768,6 @@ namespace { TEST_EQUALITY_CONST( gblSuccess, 1 ); } - // - // Test BlockCrsMatrix's localGaussSeidel with a block diagonal matrix. 
- // - TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( BlockCrsMatrix, localGSDiagonalMatrix, Scalar, LO, GO, Node ) - { - using Kokkos::ALL; - typedef Scalar ST; - typedef Tpetra::BlockVector BV; - typedef Tpetra::BlockCrsMatrix BCM; - typedef Tpetra::CrsGraph graph_type; - typedef Tpetra::Map map_type; - typedef typename graph_type::device_type device_type; - typedef typename BCM::impl_scalar_type IST; - typedef Teuchos::ScalarTraits STS; - - const ST two = STS::one () + STS::one (); - const ST three = two + STS::one (); - const auto tol = 10.0 * STS::eps (); - - out << "Testing Tpetra::BlockCrsMatrix::localGaussSeidel " - "with a matrix whose graph is diagonal" << endl; - Teuchos::OSTab tab0 (out); - - RCP > comm = getDefaultComm (); - const GST INVALID = Teuchos::OrdinalTraits::invalid (); - - out << "Creating mesh row Map" << endl; - - const size_t numLocalMeshPoints = 12; - const GO indexBase = 1; - // Use a block size that is not a power of 2, to test correctness - // in case the matrix pads blocks for SIMD-ization. - const LO blockSize = 3; - - // mfh 20 May 2014: Tpetra::CrsGraph still needs the row Map as an - // RCP. Later interface changes will let us pass in the Map by - // const reference and assume view semantics. - RCP meshRowMapPtr = - rcp (new map_type (INVALID, numLocalMeshPoints, indexBase, comm)); - const map_type& meshRowMap = *meshRowMapPtr; - - out << "Creating mesh graph" << endl; - - const size_t maxNumEntPerRow = 1; - graph_type graph (meshRowMapPtr, maxNumEntPerRow, Tpetra::StaticProfile); - - // Fill the graph with only diagonal entries - Teuchos::Array gblColInds (maxNumEntPerRow); - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - const GO gblRowInd = meshRowMap.getGlobalElement (lclRowInd); - gblColInds[0] = gblRowInd; - graph.insertGlobalIndices (gblRowInd, gblColInds ()); - } - graph.fillComplete (); - - // Get the graph's column Map (the "mesh column Map"). - TEST_ASSERT( ! graph.getColMap ().is_null () ); - map_type meshColMap = * (graph.getColMap ()); - - out << "Creating BlockCrsMatrix" << endl; - - // Construct the BlockCrsMatrix. - BCM blockMat; - TEST_NOTHROW( blockMat = BCM (graph, blockSize) ); - BV residual; - TEST_NOTHROW( residual = BV(meshRowMap, blockSize)); - BV solution; - TEST_NOTHROW( solution = BV(meshRowMap, blockSize)); - - Teuchos::Array basematrix (blockSize*blockSize, STS::zero ()); - basematrix[0] = two; - basematrix[2] = three; - basematrix[3] = three; - basematrix[4] = two; - basematrix[7] = three; - basematrix[8] = two; - - Teuchos::Array baseResidual(blockSize, STS::zero ()); - baseResidual[0] = STS::one (); - baseResidual[1] = three; - baseResidual[2] = -two; - - // FIXME (mfh 25 Aug 2014) This will likely only work with Scalar - // = float or double. On the other hand, the author of these - // tests understood that and only instantiated them for - // Scalar=double (see the instantiations list below). - Teuchos::Array exactSolution(blockSize, STS::zero()); - exactSolution[0] = 43.0/35.0; - exactSolution[1] = -12.0/35.0; - exactSolution[2] = -17.0/35.0; - - // NOTE (mfh 26 May 2016) We may start modifying the matrix on - // host now, because we haven't yet done anything to it on device. 
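    // Where exactSolution comes from (sketch, reading the 3x3 base block row-major):
    //   [ 2 0 3 ]
    //   [ 3 2 0 ]  with base residual [ 1, 3, -2 ]
    //   [ 0 3 2 ]
    // has the solution [ 43/35, -12/35, -17/35 ].  The loop below scales both
    // the block and the residual by an extra factor of two for each successive
    // row, which leaves that solution unchanged, and because the graph is
    // diagonal, a single sweep with omega = 1 and the exact inverse diagonal
    // block solves every block row exactly.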
- - Teuchos::Array lclColInds(1); - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - for (LO k = 0; k < blockSize*blockSize; ++k) { - basematrix[k] *= two; - } - for (LO k = 0; k < blockSize; ++k) { - baseResidual[k] *= two; - } - lclColInds[0] = lclRowInd; - blockMat.replaceLocalValues (lclRowInd, lclColInds.getRawPtr (), - basematrix.getRawPtr (), 1); - residual.replaceLocalValues (lclRowInd, baseResidual.getRawPtr ()); - solution.replaceLocalValues (lclRowInd, baseResidual.getRawPtr ()); - } - - Kokkos::View diagonalOffsets ("offsets", numLocalMeshPoints); - graph.getLocalDiagOffsets (diagonalOffsets); - - // Sync the matrix to device, since getLocalDiagCopy runs there. - blockMat.template sync (); - - typedef Kokkos::View block_diag_type; - block_diag_type blockDiag ("blockDiag", numLocalMeshPoints, - blockSize, blockSize); - blockMat.getLocalDiagCopy (blockDiag, diagonalOffsets); - - Kokkos::View pivots ("pivots", numLocalMeshPoints, blockSize); - // That's how we found this test: the pivots array was filled with ones. - Kokkos::deep_copy (pivots, 1); - out << "pivots size = " << pivots.extent(0) << endl; - - for (LO lclMeshRow = 0; lclMeshRow < static_cast (numLocalMeshPoints); ++lclMeshRow) { - auto diagBlock = Kokkos::subview (blockDiag, lclMeshRow, ALL (), ALL ()); - - // Make sure that the diagonal block is correct. - Scalar* blkVals = NULL; - const LO* blkColInds = NULL; - LO blkNumEnt = 0; - blockMat.getLocalRowView (lclMeshRow, blkColInds, blkVals, blkNumEnt); - - TEST_EQUALITY( blkNumEnt, static_cast (1) ); - if (blkNumEnt == 1) { - typename BCM::const_little_block_type diagBlock2 ((typename BCM::const_little_block_type::value_type*) blkVals, blockSize, blockSize); - for (LO j = 0; j < blockSize; ++j) { - for (LO i = 0; i < blockSize; ++i) { - TEST_EQUALITY( diagBlock(i,j), diagBlock2(i,j) ); - } - } - } - - auto ipiv = Kokkos::subview (pivots, lclMeshRow, ALL ()); - int info = 0; - Tpetra::GETF2 (diagBlock, ipiv, info); - TEST_EQUALITY( info, 0 ); - - // GETRI needs workspace. Use host space for now. - Teuchos::Array workVec (blockSize); - Kokkos::View - work (workVec.getRawPtr (), blockSize); - Tpetra::GETRI (diagBlock, ipiv, work, info); - TEST_EQUALITY( info, 0 ); - } - - blockMat.localGaussSeidel (residual, solution, blockDiag, - STS::one(), Tpetra::Forward); - - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - typename BV::little_host_vec_type xlcl = solution.getLocalBlock (lclRowInd); - ST* x = reinterpret_cast (xlcl.data ()); - out << "row = " << lclRowInd << endl; - for (LO k = 0; k < blockSize; ++k) { - TEST_FLOATING_EQUALITY( x[k], exactSolution[k], tol ); - x[k] = -STS::one (); - } - } - - blockMat.localGaussSeidel (residual, solution, blockDiag, - STS::one (), Tpetra::Backward); - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - typename BV::little_host_vec_type xlcl = solution.getLocalBlock (lclRowInd); - ST* x = reinterpret_cast (xlcl.data ()); - for (LO k = 0; k < blockSize; ++k) { - TEST_FLOATING_EQUALITY( x[k], exactSolution[k], tol ); - } - } - - // Final output - int lclSuccess = success; - int gblSuccess = 0; - reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); - TEST_EQUALITY_CONST( gblSuccess, 1 ); - } - - // - // Test BlockCrsMatrix's localGaussSeidel with a triangular matrix (???) 
- // - TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( BlockCrsMatrix, localGSTriangularMatrices, ST, LO, GO, Node ) - { - using Kokkos::ALL; - typedef Tpetra::BlockVector BV; - typedef Tpetra::BlockCrsMatrix BCM; - typedef typename BCM::impl_scalar_type IST; - typedef Tpetra::CrsGraph graph_type; - typedef typename graph_type::device_type device_type; - typedef Tpetra::Map map_type; - typedef Teuchos::ScalarTraits STS; - - const ST two = STS::one () + STS::one (); - const ST three = STS::one () + STS::one () + STS::one (); - const auto tol = 10.0 * STS::eps (); - - out << "Testing Tpetra::BlockCrsMatrix::localGaussSeidel " - "with a triangular matrix ( ??? )" << endl; - Teuchos::OSTab tab0 (out); - - RCP > comm = getDefaultComm (); - const GST INVALID = Teuchos::OrdinalTraits::invalid (); - - out << "Creating mesh row Map" << endl; - - const size_t numLocalMeshPoints = 3; - const GO indexBase = 1; - // Use a block size that is not a power of 2, to test correctness - // in case the matrix pads blocks for SIMD-ization. - const LO blockSize = 2; - - // mfh 20 May 2014: Tpetra::CrsGraph still needs the row Map as an - // RCP. Later interface changes will let us pass in the Map by - // const reference and assume view semantics. - RCP meshRowMapPtr = - rcp (new map_type (INVALID, numLocalMeshPoints, indexBase, comm)); - const map_type& meshRowMap = *meshRowMapPtr; - - out << "Creating mesh graph" << endl; - - const size_t maxNumEntPerRow = 3; - graph_type graph (meshRowMapPtr, maxNumEntPerRow, Tpetra::StaticProfile); - - Teuchos::Array gblColInds (maxNumEntPerRow); - - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - - const GO gblRowInd = meshRowMap.getGlobalElement (lclRowInd); - - for (size_t k = 0; k < maxNumEntPerRow; ++k) { - const LO lclColInd = meshRowMap.getMinLocalIndex () + k; - const GO gblColInd = meshRowMap.getGlobalElement (lclColInd); - gblColInds[k] = gblColInd; - } - graph.insertGlobalIndices (gblRowInd, gblColInds ()); - } - - graph.fillComplete (); - - // Get the graph's column Map (the "mesh column Map"). - TEST_ASSERT( ! graph.getColMap ().is_null () ); - map_type meshColMap = * (graph.getColMap ()); - - out << "Creating BlockCrsMatrix" << endl; - - // Construct the BlockCrsMatrix. - BCM blockMat; - TEST_NOTHROW( blockMat = BCM (graph, blockSize) ); - BV residual; - TEST_NOTHROW( residual = BV (meshRowMap, blockSize)); - BV solution; - TEST_NOTHROW( solution = BV (meshRowMap, blockSize)); - - Teuchos::Array basematrix (maxNumEntPerRow * maxNumEntPerRow, STS::zero ()); - basematrix[0] = two; - basematrix[3] = three; - basematrix[4] = two; - basematrix[6] = STS::one (); - basematrix[7] = three; - basematrix[8] = two; - - Teuchos::Array baseResidual (maxNumEntPerRow, STS::zero ()); - baseResidual[0] = STS::one (); - baseResidual[1] = three; - baseResidual[2] = -two; - - // FIXME (mfh 25 Aug 2014) This will likely only work with Scalar - // = float or double. On the other hand, the author of these - // tests understood that and only instantiated them for - // Scalar=double (see the instantiations list below). 
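    // The exact solution assigned below is forward substitution on the
    // lower-triangular scalar system
    //   [ 2 0 0 ] [x0]   [ 1 ]
    //   [ 3 2 0 ] [x1] = [ 3 ]   =>   x = [ 1/2, 3/4, -19/8 ].
    //   [ 1 3 2 ] [x2]   [-2 ]
    // Each 2x2 block assembled below is the corresponding scalar times the
    // identity, so both components of a block row carry the same value, and one
    // Forward sweep (which amounts to forward substitution here) reproduces x
    // exactly.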
- Teuchos::Array exactSolution (maxNumEntPerRow, STS::zero ()); - exactSolution[0] = 0.5; - exactSolution[1] = 0.75; - exactSolution[2] = -19.0/8.0; - - Teuchos::Array assembleMatrix (blockSize * blockSize, STS::zero ()); - Teuchos::Array assembleResidual (blockSize, STS::zero ()); - - Teuchos::Array lclColInds (1); - - // lower triangular matrix - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - const LO rowOffset = lclRowInd - meshRowMap.getMinLocalIndex (); - for (LO k = 0; k < blockSize; ++k) { - assembleResidual[k] = baseResidual[rowOffset]; - } - residual.replaceLocalValues (lclRowInd, &assembleResidual[0]); - - for (LO k = 0; k < blockSize; ++k) { - assembleResidual[k] = STS::zero (); - } - solution.replaceLocalValues (lclRowInd, &assembleResidual[0]); - - for (size_t i = 0; i < maxNumEntPerRow; ++i) { - for (LO j = 0; j < blockSize*blockSize; ++j) { - assembleMatrix[j] = STS::zero (); - } - const size_t indexBaseMatrix = (maxNumEntPerRow)*rowOffset+i; - for (LO j = 0; j < blockSize; ++j) { - assembleMatrix[(blockSize+1)*j] = basematrix[indexBaseMatrix]; - } - - lclColInds[0] = meshRowMap.getMinLocalIndex () + i; - blockMat.replaceLocalValues(lclRowInd, lclColInds.getRawPtr(), &assembleMatrix[0], 1); - } - } - - Kokkos::View diagonalOffsets ("offsets", numLocalMeshPoints); - graph.getLocalDiagOffsets(diagonalOffsets); - - typedef Kokkos::View block_diag_type; - block_diag_type blockDiag ("blockDiag", numLocalMeshPoints, - blockSize, blockSize); - Kokkos::fence (); - blockMat.getLocalDiagCopy (blockDiag, diagonalOffsets); - Kokkos::fence (); - - using Kokkos::view_alloc; - using Kokkos::WithoutInitializing; - Kokkos::View pivots (view_alloc ("pivots", WithoutInitializing), - numLocalMeshPoints, blockSize); - // That's how we found this test: the pivots array was filled with ones. - Kokkos::deep_copy (pivots, 1); - - Kokkos::fence (); - - for (LO lclMeshRow = 0; lclMeshRow < static_cast (numLocalMeshPoints); ++lclMeshRow) { - auto diagBlock = Kokkos::subview (blockDiag, lclMeshRow, ALL (), ALL ()); - auto ipiv = Kokkos::subview (pivots, lclMeshRow, ALL ()); - int info = 0; - Tpetra::GETF2 (diagBlock, ipiv, info); - TEST_EQUALITY( info, 0 ); - - // GETRI needs workspace. Use host space for now. - Teuchos::Array workVec (blockSize); - Kokkos::View - work (workVec.getRawPtr (), blockSize); - Tpetra::GETRI (diagBlock, ipiv, work, info); - TEST_EQUALITY( info, 0 ); - } - - blockMat.localGaussSeidel (residual, solution, blockDiag, - STS::one (), Tpetra::Forward); - - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex(); ++lclRowInd) { - const LO rowOffset = lclRowInd - meshRowMap.getMinLocalIndex (); - typename BV::little_host_vec_type xlcl = solution.getLocalBlock (lclRowInd); - ST* x = reinterpret_cast (xlcl.data ()); - for (LO k = 0; k < blockSize; ++k) { - TEST_FLOATING_EQUALITY( x[k], exactSolution[rowOffset], tol ); - x[k] = -STS::one (); - } - } - - // upper triangular matrix - // - // FIXME (mfh 25 Aug 2014) This will likely only work with Scalar - // = float or double. On the other hand, the author of these - // tests understood that and only instantiated them for - // Scalar=double (see the instantiations list below). 
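    // The re-assembly below uses the transposed coefficient layout, i.e. the
    // upper-triangular scalar system
    //   [ 2 3 1 ] [x0]   [ 1 ]
    //   [ 0 2 3 ] [x1] = [ 3 ]   =>   x = [ -7/2, 3, -1 ]
    //   [ 0 0 2 ] [x2]   [-2 ]
    // by back substitution (the residual keeps its earlier values).  A Symmetric
    // sweep ends with a Backward pass, which for an upper-triangular matrix is
    // exact back substitution regardless of the intermediate Forward result,
    // hence the same exact-solution check.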
- exactSolution[0] = -3.5; - exactSolution[1] = 3.0; - exactSolution[2] = -1.0; - - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - - const LO rowOffset = lclRowInd - meshRowMap.getMinLocalIndex (); - for (size_t i = 0; i < maxNumEntPerRow; ++i) { - for (LO k = 0; k < blockSize; ++k) { - assembleResidual[k] = STS::zero (); - } - solution.replaceLocalValues(lclRowInd, &assembleResidual[0]); - - for (LO j = 0; j < blockSize*blockSize; ++j) { - assembleMatrix[j] = STS::zero (); - } - const size_t indexBaseMatrix = maxNumEntPerRow * i + rowOffset; - for (LO j = 0; j < blockSize; ++j) { - assembleMatrix[(blockSize+1)*j] = basematrix[indexBaseMatrix]; - } - - lclColInds[0] = meshRowMap.getMinLocalIndex () + i; - blockMat.replaceLocalValues (lclRowInd, lclColInds.getRawPtr (), - &assembleMatrix[0], 1); - } - } - - blockMat.getLocalDiagCopy (blockDiag, diagonalOffsets); - Kokkos::fence (); - - for (LO lclMeshRow = 0; lclMeshRow < static_cast (numLocalMeshPoints); ++lclMeshRow) { - auto diagBlock = Kokkos::subview (blockDiag, lclMeshRow, ALL (), ALL ()); - auto ipiv = Kokkos::subview (pivots, lclMeshRow, ALL ()); - int info = 0; - Tpetra::GETF2 (diagBlock, ipiv, info); - TEST_EQUALITY( info, 0 ); - - // GETRI needs workspace. Use host space for now. - Teuchos::Array workVec (blockSize); - Kokkos::View - work (workVec.getRawPtr (), blockSize); - Tpetra::GETRI (diagBlock, ipiv, work, info); - TEST_EQUALITY( info, 0 ); - } - - blockMat.localGaussSeidel (residual, solution, blockDiag, - STS::one (), Tpetra::Symmetric); - Kokkos::fence (); - - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex(); ++lclRowInd) { - const LO rowOffset = lclRowInd - meshRowMap.getMinLocalIndex (); - typename BV::little_host_vec_type xlcl = solution.getLocalBlock (lclRowInd); - ST* x = reinterpret_cast (xlcl.data ()); - for (LO k = 0; k < blockSize; ++k) { - TEST_FLOATING_EQUALITY( x[k], exactSolution[rowOffset], tol ); - x[k] = -STS::one (); - } - } - - // Final output - int lclSuccess = success; - int gblSuccess = 0; - reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); - TEST_EQUALITY_CONST( gblSuccess, 1 ); - } - // // Test conversion from CrsMatrix to BlockCrsMatrix. This test is valid only on // 1, 2, or 4 processes. (See note below.) 
@@ -2487,8 +2043,6 @@ namespace { TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( BlockCrsMatrix, SetAllToScalar, SCALAR, LO, GO, NODE ) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( BlockCrsMatrix, ImportCopy, SCALAR, LO, GO, NODE ) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( BlockCrsMatrix, ExportDiffRowMaps, SCALAR, LO, GO, NODE ) \ - TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( BlockCrsMatrix, localGSDiagonalMatrix, SCALAR, LO, GO, NODE ) \ - TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( BlockCrsMatrix, localGSTriangularMatrices, SCALAR, LO, GO, NODE ) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( BlockCrsMatrix, point2block, SCALAR, LO, GO, NODE ) TPETRA_ETI_MANGLING_TYPEDEFS() diff --git a/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt b/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt index 5b24627929e8..100631006e67 100644 --- a/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt +++ b/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt @@ -198,17 +198,6 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( STANDARD_PASS_OUTPUT ) -TRIBITS_ADD_EXECUTABLE_AND_TEST( - CrsMatrix_gaussSeidel - SOURCES - CrsMatrix_gaussSeidel - ${TEUCHOS_STD_UNIT_TEST_MAIN} - ARGS - COMM serial mpi - NUM_MPI_PROCS 1-4 - STANDARD_PASS_OUTPUT - ) - # This test only makes sense for exactly 2 MPI processes. TRIBITS_ADD_EXECUTABLE_AND_TEST( CrsMatrix_Bug5978 diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_gaussSeidel.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_gaussSeidel.cpp deleted file mode 100644 index 6e870fd29f4f..000000000000 --- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_gaussSeidel.cpp +++ /dev/null @@ -1,969 +0,0 @@ -/* -// @HEADER -// *********************************************************************** -// -// Tpetra: Templated Linear Algebra Services Package -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// -// ************************************************************************ -// @HEADER -*/ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace { - template - typename Teuchos::ScalarTraits::magnitudeType - norm2 (const MV& x) - { - typedef typename MV::scalar_type scalar_type; - typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; - - Teuchos::Array normTemp (1); - x.norm2 (normTemp); - return normTemp[0]; - } - - // mfh 15 Oct 2014: Unfortunately, Teuchos::ScalarTraits - // doesn't define eps() if Scalar is int, unsigned int, etc. - // isOrdinal is true for integer types and false for floating-point - // types, so we can select on that. - template::isOrdinal> - struct ComputeEps { - static typename Teuchos::ScalarTraits::magnitudeType eps (); - }; - - // Partial specialization for non-floating-point (i.e., integer) types. - template - struct ComputeEps { - static typename Teuchos::ScalarTraits::magnitudeType eps () { - return static_cast (0); - } - }; - - // Partial specialization for floating-point types. - template - struct ComputeEps { - static typename Teuchos::ScalarTraits::magnitudeType eps () { - typedef Teuchos::ScalarTraits STS; - return STS::eps (); - } - }; - -// -// Tests for Tpetra::CrsMatrix::gaussSeidel(). -// -TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, gaussSeidelSerial, LocalOrdinalType, GlobalOrdinalType, ScalarType, NodeType ) -{ - using Tpetra::createContigMapWithNode; - using Tpetra::createMultiVector; - using Tpetra::global_size_t; - using Tpetra::Map; - using Teuchos::Array; - using Teuchos::ArrayRCP; - using Teuchos::ArrayView; - using Teuchos::as; - using Teuchos::av_const_cast; - using Teuchos::broadcast; - using Teuchos::Comm; - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::rcp_const_cast; - using Teuchos::OrdinalTraits; - using Teuchos::outArg; - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::reduceAll; - using Teuchos::REDUCE_SUM; - using Teuchos::REDUCE_MIN; - using Teuchos::ScalarTraits; - using Teuchos::tuple; - using Teuchos::TypeNameTraits; - using std::cerr; - using std::endl; - - typedef ScalarType scalar_type; - typedef LocalOrdinalType local_ordinal_type; - typedef GlobalOrdinalType global_ordinal_type; - typedef NodeType node_type; - - // Typedefs derived from the above canonical typedefs. - typedef ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; - typedef ScalarTraits STM; - typedef Map map_type; - - // Abbreviation typedefs. - typedef scalar_type ST; - typedef local_ordinal_type LO; - typedef global_ordinal_type GO; - typedef node_type NT; - - // CrsMatrix specialization to use in this test. - typedef Tpetra::CrsMatrix crs_matrix_type; - - // CrsGraph specialization corresponding to crs_matrix_type (the - // CrsMatrix specialization). - typedef Tpetra::CrsGraph crs_graph_type; - - // MultiVector specialization corresponding to crs_matrix_type. - typedef Tpetra::MultiVector multivector_type; - // Vector specialization corresponding to crs_matrix_type. - typedef Tpetra::Vector vector_type; - - // This controls whether to print to std::cerr instead of out. - // Printing to std::cerr ensures that output appears before Kokkos - // gets the chance to raise errors. - constexpr bool debug = true; - - //////////////////////////////////////////////////////////////////// - // HERE BEGINS THE TEST. 
- //////////////////////////////////////////////////////////////////// - - const global_size_t INVALID = OrdinalTraits::invalid(); - - // Get the default communicator. - RCP > comm = Tpetra::getDefaultComm (); - const int numProcs = comm->getSize (); - const int myRank = comm->getRank (); - - if (debug) { - if (myRank == 0) { - cerr << "Test Tpetra's Gauss-Seidel with " << numProcs << " process" - << (numProcs != 1 ? "es" : "") << endl; - } - } - else { - out << "Test Tpetra's Gauss-Seidel with " << numProcs << " process" - << (numProcs != 1 ? "es" : "") << endl; - } - -#if 0 - // This test doesn't make much sense if there is only one MPI - // process. We let it pass trivially in that case. - if (numProcs == 1) { - out << "Number of processes in world is one; test passes trivially." << endl; - return; - } -#endif // 0 - - // Let's build ourselves the graph of a 2-D Laplacian (Dirichlet - // boundary conditions) on a square numGlobalPoints x - // numGlobalPoints mesh. Each process gets a numLocalPoints x - // numGlobalPoints slab. - - const LO numLocalPoints = 9; - const GO numGlobalPoints = numProcs * numLocalPoints; - - // Number of rows in the matrix owned by each process. - const LO numLocalRows = numLocalPoints * numGlobalPoints; - - //CrT: 4Feb14: the void trick does not seem to work, I get warnings - // Number of (global) rows and columns in the matrix. - //const GO numGlobalRows = numGlobalPoints * numGlobalPoints; - //const GO numGlobalCols = numGlobalRows; - // Prevent compile warning for unused variable. - // (It's not really "variable" if it's const, but oh well.) - //(void) numGlobalCols; - - if (debug) { - if (myRank == 0) { - cerr << "Creating contiguous row Map" << endl; - } - } - else { - out << "Creating contiguous row Map" << endl; - } - - // Create a contiguous row Map, with numLocalRows rows per process. - RCP rowMap = - createContigMapWithNode (INVALID, numLocalRows, comm); - - // The Gauss-Seidel kernel requires that the row, domain, and range - // Maps all be the same. - RCP domainMap = rowMap; - RCP rangeMap = rowMap; - - // Min and max row index of this process. - const GO globalMinRow = rowMap->getMinGlobalIndex (); - const GO globalMaxRow = rowMap->getMaxGlobalIndex (); - const GO globalMinAllRow = rowMap->getMinAllGlobalIndex (); - const GO globalMaxAllRow = rowMap->getMaxAllGlobalIndex (); - - if (debug) { - if (myRank == 0) { - cerr << "Creating graph" << endl; - } - } - else { - out << "Creating graph" << endl; - } - - // Create a numGlobalRows by numGlobalCols graph and set its - // structure. Every process sets its diagonal entries (which it - // owns), and entries 1 and numGlobalPoints away in each direction - // (if valid). - RCP graph; - { - // We have a good upper bound for the number of entries per row, so use static profile. 
- RCP nonconstGraph (new crs_graph_type (rowMap, 5, Tpetra::StaticProfile)); - - for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) { - Teuchos::Array indices; - if (globalRow >= globalMinAllRow + numGlobalPoints) { - indices.push_back (globalRow - numGlobalPoints); - } - if (globalRow >= globalMinAllRow + 1) { - indices.push_back (globalRow - 1); - } - indices.push_back (globalRow); - if (globalRow + 1 <= globalMaxAllRow) { - indices.push_back (globalRow + 1); - } - if (globalRow + numGlobalPoints <= globalMaxAllRow) { - indices.push_back (globalRow + numGlobalPoints); - } - nonconstGraph->insertGlobalIndices (globalRow, indices ()); - } - - nonconstGraph->fillComplete (domainMap, rangeMap); - graph = rcp_const_cast (nonconstGraph); - } - - if (debug) { - if (myRank == 0) { - cerr << "Creating matrix" << endl; - } - } - else { - out << "Creating matrix" << endl; - } - - // Create the matrix, using the above graph. - RCP matrix (new crs_matrix_type (graph)); - { - for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) { - Teuchos::Array indices; - Teuchos::Array values; - if (globalRow >= globalMinAllRow - numGlobalPoints) { - indices.push_back (globalRow - numGlobalPoints); - values.push_back (-STS::one ()); - } - if (globalRow >= globalMinAllRow + 1) { - indices.push_back (globalRow - 1); - values.push_back (-STS::one ()); - } - indices.push_back (globalRow); - values.push_back (as (4)); - if (globalRow + 1 <= globalMaxAllRow) { - indices.push_back (globalRow + 1); - values.push_back (-STS::one ()); - } - if (globalRow + numGlobalPoints <= globalMaxAllRow) { - indices.push_back (globalRow + numGlobalPoints); - values.push_back (-STS::one ()); - } - matrix->replaceGlobalValues (globalRow, indices (), values ()); - } - - if (debug) { - if (myRank == 0) { - cerr << "Calling fillComplete on the matrix" << endl; - } - } - else { - out << "Calling fillComplete on the matrix" << endl; - } - matrix->fillComplete (domainMap, rangeMap); - } - - const bool dumpMatrix = false; - if (dumpMatrix) { - const std::string filename ("A.mtx"); - const std::string scalarType = Teuchos::TypeNameTraits::name (); - const std::string matName = "A-" + scalarType; - typedef Tpetra::MatrixMarket::Writer writer_type; - writer_type::writeSparseFile (filename, matrix, matName, - "Gauss-Seidel test matrix"); - } - - if (debug) { - if (myRank == 0) { - cerr << "Extracting inverse diagonal" << endl; - } - } - else { - out << "Extracting inverse diagonal" << endl; - } - RCP D = rcp (new vector_type (rowMap)); - matrix->getLocalDiagCopy (*D); // Get the diagonal entries. - { - // Check whether any diagonal entries are zero, and invert the - // entries in place. It's faster to do the latter with a Kokkos - // kernel, but this is good enough as a test. 
- typedef typename ArrayRCP::size_type size_type; - size_type zeroDiagEltIndex = -1; - ArrayRCP D_data = D->getDataNonConst (); - for (size_type k = 0; k < D_data.size (); ++k) { - if (D_data[k] == STS::zero ()) { - zeroDiagEltIndex = k; - } else { - D_data[k] = STS::one() / D_data[k]; - } - } - TEST_EQUALITY_CONST(zeroDiagEltIndex, static_cast (-1)); - // TEUCHOS_TEST_FOR_EXCEPTION( - // zeroDiagEltIndex != -1, - // std::logic_error, - // "On Process " << comm->getRank () << ", diagonal element " - // << zeroDiagEltIndex << ", possibly among others, is zero."); - } - - if (debug) { - if (myRank == 0) { - cerr << "Making vectors" << endl; - } - } - else { - out << "Making vectors" << endl; - } - - // Make (multi)vectors for the initial guess (which will also be the - // solution vector), the right-hand side, and the residual. We're - // only testing a single right-hand side for now. Make all the - // vectors first in the column Map, with the actual vector given to - // Gauss-Seidel in the domain / range Map. - - RCP colMap = graph->getColMap (); - RCP X_colMap = createMultiVector (colMap, 1); - RCP X_exact_colMap = createMultiVector (colMap, 1); - RCP B_colMap = createMultiVector (colMap, 1); - RCP R_colMap = createMultiVector (colMap, 1); - RCP X = X_colMap->offsetViewNonConst (domainMap, 0); - RCP X_exact = X_exact_colMap->offsetViewNonConst (domainMap, 0); - RCP B = B_colMap->offsetViewNonConst (rangeMap, 0); - RCP R = R_colMap->offsetViewNonConst (rangeMap, 0); - - if (debug) { - std::ostringstream os; - os << "Proc " << myRank << ": Set (random) exact solution X_exact" - << endl; - cerr << os.str (); - } - else { - out << "Set (random) exact solution X_exact" << endl; - } - X_exact->randomize (); - - if (debug) { - std::ostringstream os; - os << "Proc " << myRank << ": Compute right-hand side B" << endl; - cerr << os.str (); - } - else { - out << "Compute right-hand side B" << endl; - } - matrix->apply (*X_exact, *B); - - const int maxNumIters = 10; - Array residNorms (maxNumIters+1); - - if (debug) { - std::ostringstream os; - os << "Proc " << myRank << ": Compute initial residual" << endl; - cerr << os.str (); - } - else { - out << "Compute initial residual" << endl; - } - - // Compute initial residual R = B - A * X and ||R||_2. - Tpetra::Details::residual(*matrix,*X,*B,*R); - residNorms[0] = norm2 (*R); - - if (debug) { - std::ostringstream os; - os << "Proc " << myRank << ": Compute norms of X, D, and B" << endl; - cerr << os.str (); - } - else { - out << "Compute norms of X, D, and B" << endl; - } - - // Compute norms of X, D, and B. - // The norms of D and B must not change. - const magnitude_type X_norm_orig = norm2 (*X); - const magnitude_type D_norm_orig = norm2 (*D); - const magnitude_type B_norm_orig = norm2 (*B); - if (myRank == 0) { - std::ostream& os = debug ? cerr : out; - os << "Before iterating:" << endl - << "- ||R||_2 = " << residNorms[0] << endl - << "- ||X||_2 = " << X_norm_orig << endl - << "- ||B||_2 = " << B_norm_orig << endl - << "- ||D||_2 = " << D_norm_orig << endl; - } - - // Monitor the norms of (X,) D, and B. If the norms of D or B - // change, that means Gauss-Seidel is broken (or we mixed up the - // order of its arguments). 
- magnitude_type X_norm = X_norm_orig; - magnitude_type D_norm = D_norm_orig; - magnitude_type B_norm = B_norm_orig; - - if (debug) { - std::ostringstream os; - os << "Proc " << myRank << ": Test CrsMatrix::gaussSeidel" << endl; - cerr << os.str (); - } - else { - out << "Test CrsMatrix::gaussSeidel" << endl; - } - - int localSuccess = 1; - int globalSuccess = 1; - int smallestFailingRank = 0; - for (int iter = 0; iter < maxNumIters; ++iter) { - std::string exMsg; - try { - matrix->gaussSeidel (*B, *X, *D, STS::one(), Tpetra::Symmetric, 1); - } - catch (std::exception& e) { - exMsg = e.what (); - localSuccess = 0; - } - reduceAll (*comm, REDUCE_SUM, localSuccess, outArg (globalSuccess)); - if (globalSuccess < numProcs) { - // Compute min rank that failed. For procs that didn't fail, - // set their "rank" to numProcs. - const int inRank = (localSuccess == 1) ? numProcs : myRank; - reduceAll (*comm, REDUCE_MIN, inRank, outArg (smallestFailingRank)); - // The min-failing-rank process broadcasts the error message's - // length and the message itself to all the other processes. - int msgLen = as (exMsg.size ()); - broadcast (*comm, smallestFailingRank, outArg (msgLen)); - exMsg.reserve (msgLen); - char* const exMsgRaw = const_cast (exMsg.c_str ()); - broadcast (*comm, smallestFailingRank, msgLen, exMsgRaw); - TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, exMsg); - } - - // Compute the new residual R = B - A * X. This is not part of - // Gauss-Seidel itself, but we use it to measure convergence. - Tpetra::Details::residual(*matrix,*X,*B,*R); - residNorms[iter+1] = norm2 (*R); - - X_norm = norm2 (*X); - D_norm = norm2 (*D); - B_norm = norm2 (*B); - if (myRank == 0) { - out << "After iteration " << iter+1 << " of " << maxNumIters << ":" << endl - << "- ||R||_2 = " << residNorms[iter+1] << endl - << "- ||X||_2 = " << X_norm << endl - << "- ||B||_2 = " << B_norm << endl - << "- ||D||_2 = " << D_norm << endl; - } - } - - // The test passes if - // - // 1. The norms of B and D did not change. - // 2. The residual norm decreased. - // - // It would be better if we had a specific decrease rate in mind, - // but we'll leave that for later. - // - // FIXME (mfh 01 Jan 2013) This test assumes that norms are computed - // deterministically. This is not necessarily correct, even when - // running in MPI-only (no hybrid parallelism) mode. Thus, we need - // some kind of tolerance for these tests. For the prefactor, - // square root of N (the usual heuristic) was not enough; we had to - // use N instead. - const magnitude_type testTolPrefactor = - static_cast (B->getGlobalLength ()); - const magnitude_type testTol = - testTolPrefactor * ComputeEps::eps (); - - const magnitude_type B_norm_diff = STM::magnitude (B_norm - B_norm_orig); - TEUCHOS_TEST_FOR_EXCEPTION( - B_norm_diff > testTol, std::logic_error, - "Gauss-Seidel changed the norm of B! |B_norm_orig - B_norm| = " - << B_norm_diff << " > testTol = " << testTol << ". That means either the " - "Gauss-Seidel implementation is broken, or we mixed up the order of its " - "arguments. Original ||B||_2 = " << B_norm_orig << "; new ||B||_2 = " - << B_norm << "."); - - const magnitude_type D_norm_diff = STM::magnitude (D_norm - D_norm_orig); - TEUCHOS_TEST_FOR_EXCEPTION( - D_norm_diff > testTol, std::logic_error, - "Gauss-Seidel changed the norm of D (the vector of diagonal entries of the " - "matrix)! |D_norm_orig - D_norm| = " << D_norm_diff << " > testTol = " - << testTol << ". 
That means either the Gauss-Seidel implementation is " - "broken, or we mixed up the order of its arguments. Original ||D||_2 = " - << D_norm_orig << "; new ||D||_2 = " << D_norm << "."); - - TEUCHOS_TEST_FOR_EXCEPTION( - maxNumIters > 0 && residNorms[maxNumIters] > residNorms[0], - std::logic_error, - "Gauss-Seidel failed to reduce the residual norm after " << maxNumIters - << " iterations! Original ||R||_2 = " << residNorms[0] << "; final " - "||R||_2 = " << residNorms[maxNumIters] << "."); - - { - const int lclSuccess = success ? 1 : 0; - int gblSuccess = 0; // output argument - reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); - TEST_EQUALITY_CONST( gblSuccess, 1 ); - if (gblSuccess != 1) { - std::ostream& out2 = debug ? cerr : out; - out2 << "Test failed on at least one process!" << endl; - } - } -} - - -// -// Tests for Tpetra::CrsMatrix::gaussSeidel(). -// -TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, reorderedGaussSeidelSerial, LocalOrdinalType, GlobalOrdinalType, ScalarType, NodeType ) -{ - using Tpetra::createContigMapWithNode; - using Tpetra::createMultiVector; - using Tpetra::global_size_t; - using Tpetra::Map; - using Teuchos::Array; - using Teuchos::ArrayRCP; - using Teuchos::ArrayView; - using Teuchos::as; - using Teuchos::av_const_cast; - using Teuchos::broadcast; - using Teuchos::Comm; - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::rcp_const_cast; - using Teuchos::OrdinalTraits; - using Teuchos::outArg; - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::reduceAll; - using Teuchos::REDUCE_SUM; - using Teuchos::REDUCE_MIN; - using Teuchos::ScalarTraits; - using Teuchos::tuple; - using Teuchos::TypeNameTraits; - using std::cerr; - using std::endl; - - typedef ScalarType scalar_type; - typedef LocalOrdinalType local_ordinal_type; - typedef GlobalOrdinalType global_ordinal_type; - typedef NodeType node_type; - - // Typedefs derived from the above canonical typedefs. - typedef ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; - typedef ScalarTraits STM; - typedef Map map_type; - - // Abbreviation typedefs. - typedef scalar_type ST; - typedef local_ordinal_type LO; - typedef global_ordinal_type GO; - typedef node_type NT; - - // CrsMatrix specialization to use in this test. - typedef Tpetra::CrsMatrix crs_matrix_type; - - // CrsGraph specialization corresponding to crs_matrix_type (the - // CrsMatrix specialization). - typedef Tpetra::CrsGraph crs_graph_type; - - // MultiVector specialization corresponding to crs_matrix_type. - typedef Tpetra::MultiVector multivector_type; - // Vector specialization corresponding to crs_matrix_type. - typedef Tpetra::Vector vector_type; - - - //////////////////////////////////////////////////////////////////// - // HERE BEGINS THE TEST. - //////////////////////////////////////////////////////////////////// - - const global_size_t INVALID = OrdinalTraits::invalid(); - - // Get the default communicator. - RCP > comm = Tpetra::getDefaultComm (); - const int numProcs = comm->getSize (); - const int myRank = comm->getRank (); - - if (myRank == 0) { - out << "Test with " << numProcs << " process" << (numProcs != 1 ? "es" : "") << endl; - } - -#if 0 - // This test doesn't make much sense if there is only one MPI - // process. We let it pass trivially in that case. - if (numProcs == 1) { - out << "Number of processes in world is one; test passes trivially." 
<< endl; - return; - } -#endif // 0 - - // Let's build ourselves the graph of a 2-D Laplacian (Dirichlet - // boundary conditions) on a square numGlobalPoints x - // numGlobalPoints mesh. Each process gets a numLocalPoints x - // numGlobalPoints slab. - - const LO numLocalPoints = 9; - const GO numGlobalPoints = numProcs * numLocalPoints; - - // Number of rows in the matrix owned by each process. - const LO numLocalRows = numLocalPoints * numGlobalPoints; - - //CrT: 4Feb14: the void trick does not seem to work, I get warnings - // Number of (global) rows and columns in the matrix. - //const GO numGlobalRows = numGlobalPoints * numGlobalPoints; - //const GO numGlobalCols = numGlobalRows; - // Prevent compile warning for unused variable. - // (It's not really "variable" if it's const, but oh well.) - //(void) numGlobalCols; - - if (myRank == 0) { - out << "Creating contiguous row Map" << endl; - } - - // Create a contiguous row Map, with numLocalRows rows per process. - RCP rowMap = createContigMapWithNode (INVALID, numLocalRows, comm); - - // The Gauss-Seidel kernel requires that the row, domain, and range - // Maps all be the same. - RCP domainMap = rowMap; - RCP rangeMap = rowMap; - - // Min and max row index of this process. - const GO globalMinRow = rowMap->getMinGlobalIndex (); - const GO globalMaxRow = rowMap->getMaxGlobalIndex (); - const GO globalMinAllRow = rowMap->getMinAllGlobalIndex (); - const GO globalMaxAllRow = rowMap->getMaxAllGlobalIndex (); - - if (myRank == 0) { - out << "Creating graph" << endl; - } - - // Create a numGlobalRows by numGlobalCols graph and set its - // structure. Every process sets its diagonal entries (which it - // owns), and entries 1 and numGlobalPoints away in each direction - // (if valid). - RCP graph; - { - // We have a good upper bound for the number of entries per row, so use static profile. - RCP nonconstGraph (new crs_graph_type (rowMap, 5, Tpetra::StaticProfile)); - - for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) { - Teuchos::Array indices; - if (globalRow >= globalMinAllRow + numGlobalPoints) { - indices.push_back (globalRow - numGlobalPoints); - } - if (globalRow >= globalMinAllRow + 1) { - indices.push_back (globalRow - 1); - } - indices.push_back (globalRow); - if (globalRow + 1 <= globalMaxAllRow) { - indices.push_back (globalRow + 1); - } - if (globalRow + numGlobalPoints <= globalMaxAllRow) { - indices.push_back (globalRow + numGlobalPoints); - } - nonconstGraph->insertGlobalIndices (globalRow, indices ()); - } - - nonconstGraph->fillComplete (domainMap, rangeMap); - graph = rcp_const_cast (nonconstGraph); - } - - if (myRank == 0) { - out << "Creating matrix" << endl; - } - - // Create the matrix, using the above graph. 
- RCP matrix (new crs_matrix_type (graph)); - { - for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) { - Teuchos::Array indices; - Teuchos::Array values; - if (globalRow >= globalMinAllRow - numGlobalPoints) { - indices.push_back (globalRow - numGlobalPoints); - values.push_back (-STS::one ()); - } - if (globalRow >= globalMinAllRow + 1) { - indices.push_back (globalRow - 1); - values.push_back (-STS::one ()); - } - indices.push_back (globalRow); - values.push_back (as (4)); - if (globalRow + 1 <= globalMaxAllRow) { - indices.push_back (globalRow + 1); - values.push_back (-STS::one ()); - } - if (globalRow + numGlobalPoints <= globalMaxAllRow) { - indices.push_back (globalRow + numGlobalPoints); - values.push_back (-STS::one ()); - } - matrix->replaceGlobalValues (globalRow, indices (), values ()); - } - - if (myRank == 0) { - out << "Calling fillComplete on the matrix" << endl; - } - matrix->fillComplete (domainMap, rangeMap); - } - - const bool dumpMatrix = false; - if (dumpMatrix) { - const std::string filename ("A.mtx"); - const std::string scalarType = Teuchos::TypeNameTraits::name (); - const std::string matName = "A-" + scalarType; - typedef Tpetra::MatrixMarket::Writer writer_type; - writer_type::writeSparseFile (filename, matrix, matName, "Gauss-Seidel test matrix"); - } - - if (myRank == 0) { - out << "Extracting inverse diagonal" << endl; - } - RCP D = rcp (new vector_type (rowMap)); - matrix->getLocalDiagCopy (*D); // Get the diagonal entries. - { - // Check whether any diagonal entries are zero, and invert the - // entries in place. It's faster to do the latter with a Kokkos - // kernel, but this is good enough as a test. - typedef typename ArrayRCP::size_type size_type; - size_type zeroDiagEltIndex = -1; - ArrayRCP D_data = D->getDataNonConst (); - for (size_type k = 0; k < D_data.size (); ++k) { - if (D_data[k] == STS::zero ()) { - zeroDiagEltIndex = k; - } else { - D_data[k] = STS::one() / D_data[k]; - } - } - TEST_EQUALITY_CONST(zeroDiagEltIndex, static_cast (-1)); - // TEUCHOS_TEST_FOR_EXCEPTION( - // zeroDiagEltIndex != -1, - // std::logic_error, - // "On Process " << comm->getRank () << ", diagonal element " - // << zeroDiagEltIndex << ", possibly among others, is zero."); - } - - if (myRank == 0) { - out << "Making vectors" << endl; - } - - // Make (multi)vectors for the initial guess (which will also be the - // solution vector), the right-hand side, and the residual. We're - // only testing a single right-hand side for now. Make all the - // vectors first in the column Map, with the actual vector given to - // Gauss-Seidel in the domain / range Map. - - RCP colMap = graph->getColMap (); - RCP X_colMap = createMultiVector (colMap, 1); - RCP X_exact_colMap = createMultiVector (colMap, 1); - RCP B_colMap = createMultiVector (colMap, 1); - RCP R_colMap = createMultiVector (colMap, 1); - RCP X = X_colMap->offsetViewNonConst (domainMap, 0); - RCP X_exact = X_exact_colMap->offsetViewNonConst (domainMap, 0); - RCP B = B_colMap->offsetViewNonConst (rangeMap, 0); - RCP R = R_colMap->offsetViewNonConst (rangeMap, 0); - - // Set the exact solution and right-hand side. - X_exact->randomize (); - matrix->apply (*X_exact, *B); - - const int maxNumIters = 10; - Array residNorms (maxNumIters+1); - - - // Compute initial residual R = B - A * X and ||R||_2. - Tpetra::Details::residual(*matrix,*X,*B,*R); - residNorms[0] = norm2 (*R); - - // Compute norms of X, D, and B. - // The norms of D and B must not change. 
- const magnitude_type X_norm_orig = norm2 (*X); - const magnitude_type D_norm_orig = norm2 (*D); - const magnitude_type B_norm_orig = norm2 (*B); - if (myRank == 0) { - out << "Before iterating:" << endl - << "- ||R||_2 = " << residNorms[0] << endl - << "- ||X||_2 = " << X_norm_orig << endl - << "- ||B||_2 = " << B_norm_orig << endl - << "- ||D||_2 = " << D_norm_orig << endl; - } - - // Ordering vector - Teuchos::Array rowIndices(matrix->getNodeNumRows()); - for( LO i=0; (size_t)i < matrix->getNodeNumRows(); i++) - rowIndices[i]=i; - - - // Monitor the norms of (X,) D, and B. If the norms of D or B - // change, that means Gauss-Seidel is broken (or we mixed up the - // order of its arguments). - magnitude_type X_norm = X_norm_orig; - magnitude_type D_norm = D_norm_orig; - magnitude_type B_norm = B_norm_orig; - - int localSuccess = 1; - int globalSuccess = 1; - int smallestFailingRank = 0; - for (int iter = 0; iter < maxNumIters; ++iter) { - std::string exMsg; - try { - matrix->reorderedGaussSeidel (*B, *X, *D, rowIndices,STS::one(), Tpetra::Symmetric, 1); - } - catch (std::exception& e) { - exMsg = e.what (); - localSuccess = 0; - } - reduceAll (*comm, REDUCE_SUM, localSuccess, outArg (globalSuccess)); - if (globalSuccess < numProcs) { - // Compute min rank that failed. For procs that didn't fail, - // set their "rank" to numProcs. - const int inRank = (localSuccess == 1) ? numProcs : myRank; - reduceAll (*comm, REDUCE_MIN, inRank, outArg (smallestFailingRank)); - // The min-failing-rank process broadcasts the error message's - // length and the message itself to all the other processes. - int msgLen = as (exMsg.size ()); - broadcast (*comm, smallestFailingRank, outArg (msgLen)); - exMsg.reserve (msgLen); - char* const exMsgRaw = const_cast (exMsg.c_str ()); - broadcast (*comm, smallestFailingRank, msgLen, exMsgRaw); - TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, exMsg); - } - - // Compute the new residual R = B - A * X. This is not part of - // Gauss-Seidel itself, but we use it to measure convergence. - Tpetra::Details::residual(*matrix,*X,*B,*R); - residNorms[iter+1] = norm2 (*R); - - X_norm = norm2 (*X); - D_norm = norm2 (*D); - B_norm = norm2 (*B); - if (myRank == 0) { - out << "After iteration " << iter+1 << " of " << maxNumIters << ":" << endl - << "- ||R||_2 = " << residNorms[iter+1] << endl - << "- ||X||_2 = " << X_norm << endl - << "- ||B||_2 = " << B_norm << endl - << "- ||D||_2 = " << D_norm << endl; - } - } - - // The test passes if - // - // 1. The norms of B and D did not change. - // 2. The residual norm decreased. - // - // It would be better if we had a specific decrease rate in mind, - // but we'll leave that for later. - // - // FIXME (mfh 01 Jan 2013) This test assumes that norms are computed - // deterministically. This is not necessarily correct, even when - // running in MPI-only (no hybrid parallelism) mode. Thus, we - // really need some kind of tolerance for these tests. For the - // prefactor, square root of N (the usual heuristic) was not enough; - // we had to use N instead. - const magnitude_type testTolPrefactor = - static_cast (B->getGlobalLength ()); - const magnitude_type testTol = - testTolPrefactor * ComputeEps::eps (); - - const magnitude_type B_norm_diff = STM::magnitude (B_norm - B_norm_orig); - TEUCHOS_TEST_FOR_EXCEPTION( - B_norm_diff > testTol, std::logic_error, - "Gauss-Seidel changed the norm of B! |B_norm_orig - B_norm| = " - << B_norm_diff << " > testTol = " << testTol << ". 
That means either the " - "Gauss-Seidel implementation is broken, or we mixed up the order of its " - "arguments. Original ||B||_2 = " << B_norm_orig << "; new ||B||_2 = " - << B_norm << "."); - - const magnitude_type D_norm_diff = STM::magnitude (D_norm - D_norm_orig); - TEUCHOS_TEST_FOR_EXCEPTION( - D_norm_diff > testTol, std::logic_error, - "Gauss-Seidel changed the norm of D (the vector of diagonal entries of the " - "matrix)! |D_norm_orig - D_norm| = " << D_norm_diff << " > testTol = " - << testTol << ". That means either the Gauss-Seidel implementation is " - "broken, or we mixed up the order of its arguments. Original ||D||_2 = " - << D_norm_orig << "; new ||D||_2 = " << D_norm << "."); - - TEUCHOS_TEST_FOR_EXCEPTION( - maxNumIters > 0 && residNorms[maxNumIters] > residNorms[0], - std::logic_error, - "Gauss-Seidel failed to reduce the residual norm after " << maxNumIters - << " iterations! Original ||R||_2 = " << residNorms[0] << "; final " - "||R||_2 = " << residNorms[maxNumIters] << "."); -} - - -////////////////////////////////////////////////////////////////////// -// INSTANTIATE THE TEMPLATED UNIT TESTS -////////////////////////////////////////////////////////////////////// - -typedef Tpetra::Details::DefaultTypes::node_type default_node_type; -#define UNIT_TEST_GROUP( SCALAR, LO, GO ) \ - TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( CrsMatrix, gaussSeidelSerial, LO, GO, SCALAR, default_node_type ) \ - TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( CrsMatrix, reorderedGaussSeidelSerial, LO, GO, SCALAR, default_node_type ) - -TPETRA_ETI_MANGLING_TYPEDEFS() - -TPETRA_INSTANTIATE_SLG( UNIT_TEST_GROUP ) - -} // namespace (anonymous) - -
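The deleted gaussSeidelSerial and reorderedGaussSeidelSerial tests assert two properties: the sweeps must leave B and the inverse-diagonal vector D unchanged (to within a tolerance of N * eps, with N the global vector length), and ten symmetric sweeps must not increase ||B - A X||_2. Below is a minimal standalone sketch of that check on a 1-D Laplacian; it is plain C++ with hypothetical names (Csr, gaussSeidelSweep, residualNorm), not the Tpetra gaussSeidel implementation or the deleted test code.

#include <cmath>
#include <cstddef>
#include <vector>

struct Csr {  // scalar CSR matrix
  std::vector<std::size_t> rowptr;
  std::vector<int> colind;
  std::vector<double> vals;
};

// One damped Gauss-Seidel sweep: x_i += omega * dinv_i * (b_i - (A x)_i),
// in forward or backward row order. Only x is modified.
static void gaussSeidelSweep(const Csr& A, const std::vector<double>& dinv,
                             const std::vector<double>& b, std::vector<double>& x,
                             double omega, bool forward) {
  const std::ptrdiff_t n = static_cast<std::ptrdiff_t>(x.size());
  for (std::ptrdiff_t s = 0; s < n; ++s) {
    const std::ptrdiff_t i = forward ? s : n - 1 - s;
    double r = b[i];
    for (std::size_t jj = A.rowptr[i]; jj < A.rowptr[i + 1]; ++jj)
      r -= A.vals[jj] * x[A.colind[jj]];
    x[i] += omega * dinv[i] * r;  // dinv holds 1/a_ii, like the inverted D in the test
  }
}

static double residualNorm(const Csr& A, const std::vector<double>& b,
                           const std::vector<double>& x) {
  double nrm2 = 0.0;
  for (std::size_t i = 0; i + 1 < A.rowptr.size(); ++i) {
    double r = b[i];
    for (std::size_t jj = A.rowptr[i]; jj < A.rowptr[i + 1]; ++jj)
      r -= A.vals[jj] * x[A.colind[jj]];
    nrm2 += r * r;
  }
  return std::sqrt(nrm2);
}

int main() {
  // 1-D Laplacian [-1 2 -1] on 5 points, analogous in spirit to the 2-D
  // Laplacian the deleted tests assemble.
  const int n = 5;
  Csr A;
  A.rowptr.push_back(0);
  for (int i = 0; i < n; ++i) {
    if (i > 0)     { A.colind.push_back(i - 1); A.vals.push_back(-1.0); }
    A.colind.push_back(i); A.vals.push_back(2.0);
    if (i + 1 < n) { A.colind.push_back(i + 1); A.vals.push_back(-1.0); }
    A.rowptr.push_back(A.colind.size());
  }
  std::vector<double> dinv(n, 0.5), b(n, 1.0), x(n, 0.0);
  const double r0 = residualNorm(A, b, x);
  for (int iter = 0; iter < 10; ++iter) {  // symmetric sweep: forward then backward
    gaussSeidelSweep(A, dinv, b, x, 1.0, true);
    gaussSeidelSweep(A, dinv, b, x, 1.0, false);
  }
  return residualNorm(A, b, x) <= r0 ? 0 : 1;  // the invariant the tests assert
}

Here a "symmetric" sweep is a forward pass followed by a backward pass, mirroring the Tpetra::Symmetric direction the tests request, and the diagonal is passed already inverted, matching how the tests invert D_data in place before calling gaussSeidel. Because b and dinv are taken by const reference, the "norm of B and D unchanged" property holds by construction in this sketch.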