diff --git a/TPLsList.cmake b/TPLsList.cmake index 1bdb278b2dce..76ab8382b9f4 100644 --- a/TPLsList.cmake +++ b/TPLsList.cmake @@ -58,6 +58,8 @@ TRIBITS_REPOSITORY_DEFINE_TPLS( yaml-cpp "cmake/TPLs/" EX Peano "cmake/TPLs/" EX CUDA "${${PROJECT_NAME}_TRIBITS_DIR}/core/std_tpls/" PT + CUBLAS "cmake/TPLs/" PT + CUSOLVER "cmake/TPLs/" PT CUSPARSE "cmake/TPLs/" PT Thrust "cmake/TPLs/" ST Cusp "cmake/TPLs/" ST diff --git a/cmake/TPLs/FindTPLCUBLAS.cmake b/cmake/TPLs/FindTPLCUBLAS.cmake new file mode 100644 index 000000000000..8ce61e78e661 --- /dev/null +++ b/cmake/TPLs/FindTPLCUBLAS.cmake @@ -0,0 +1,70 @@ +# @HEADER +# ************************************************************************ +# +# Trilinos: An Object-Oriented Solver Framework +# Copyright (2001) Sandia Corporation +# +# +# Copyright (2001) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000, there is a non-exclusive license for use of this +# work by or on behalf of the U.S. Government. Export of this program +# may require a license from the United States Government. +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# NOTICE: The United States Government is granted for itself and others +# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide +# license in this data to reproduce, prepare derivative works, and +# perform publicly and display publicly. Beginning five (5) years from +# July 25, 2001, the United States Government is granted for itself and +# others acting on its behalf a paid-up, nonexclusive, irrevocable +# worldwide license in this data to reproduce, prepare derivative works, +# distribute copies to the public, perform publicly and display +# publicly, and to permit others to do so. +# +# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT +# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES +# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR +# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY +# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS +# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. 
+# +# ************************************************************************ +# @HEADER + +IF (NOT TPL_ENABLE_CUDA) + MESSAGE(FATAL_ERROR "\nCUBLAS: This TPL requires CUDA") +ELSE() + find_library(CUDA_cublas_LIBRARY + cublas + HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib ${CUDA_TOOLKIT_ROOT_DIR}/lib64 # lib64 is not searched automatically on every platform + ) + IF(NOT CUDA_cublas_LIBRARY) # false for unset, empty, and *-NOTFOUND values + MESSAGE(FATAL_ERROR "\nCUBLAS: could not find cublas library.") + ENDIF() + GLOBAL_SET(TPL_CUBLAS_LIBRARY_DIRS) + GLOBAL_SET(TPL_CUBLAS_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) + GLOBAL_SET(TPL_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY}) +ENDIF() + diff --git a/cmake/TPLs/FindTPLCUSOLVER.cmake b/cmake/TPLs/FindTPLCUSOLVER.cmake new file mode 100644 index 000000000000..7725cc028cfc --- /dev/null +++ b/cmake/TPLs/FindTPLCUSOLVER.cmake @@ -0,0 +1,70 @@ +# @HEADER +# ************************************************************************ +# +# Trilinos: An Object-Oriented Solver Framework +# Copyright (2001) Sandia Corporation +# +# +# Copyright (2001) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000, there is a non-exclusive license for use of this +# work by or on behalf of the U.S. Government. Export of this program +# may require a license from the United States Government. +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# NOTICE: The United States Government is granted for itself and others +# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide +# license in this data to reproduce, prepare derivative works, and +# perform publicly and display publicly. Beginning five (5) years from +# July 25, 2001, the United States Government is granted for itself and +# others acting on its behalf a paid-up, nonexclusive, irrevocable +# worldwide license in this data to reproduce, prepare derivative works, +# distribute copies to the public, perform publicly and display +# publicly, and to permit others to do so. +# +# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT +# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES +# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR +# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY +# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS +# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. 
+# +# ************************************************************************ +# @HEADER + +IF (NOT TPL_ENABLE_CUDA) + MESSAGE(FATAL_ERROR "\nCUSOLVER: This TPL requires CUDA") +ELSE() + find_library(CUDA_cusolver_LIBRARY + cusolver + HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib ${CUDA_TOOLKIT_ROOT_DIR}/lib64 # lib64 is not searched automatically on every platform + ) + IF(NOT CUDA_cusolver_LIBRARY) # false for unset, empty, and *-NOTFOUND values + MESSAGE(FATAL_ERROR "\nCUSOLVER: could not find cusolver library.") + ENDIF() + GLOBAL_SET(TPL_CUSOLVER_LIBRARY_DIRS) + GLOBAL_SET(TPL_CUSOLVER_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) + GLOBAL_SET(TPL_CUSOLVER_LIBRARIES ${CUDA_cusolver_LIBRARY}) +ENDIF() + diff --git a/packages/stokhos/src/sacado/kokkos/pce/tpetra/Tpetra_TsqrAdaptor_UQ_PCE.hpp b/packages/stokhos/src/sacado/kokkos/pce/tpetra/Tpetra_TsqrAdaptor_UQ_PCE.hpp index e3c06fe85626..cbada90ed6d5 100644 --- a/packages/stokhos/src/sacado/kokkos/pce/tpetra/Tpetra_TsqrAdaptor_UQ_PCE.hpp +++ b/packages/stokhos/src/sacado/kokkos/pce/tpetra/Tpetra_TsqrAdaptor_UQ_PCE.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ // @HEADER @@ -48,18 +46,18 @@ #include "Stokhos_Sacado_Kokkos_UQ_PCE.hpp" -# include // create intranode TSQR object -# include // full (internode + intranode) TSQR -# include // internode TSQR +# include "Tsqr_NodeTsqrFactory.hpp" // create intranode TSQR object +# include "Tsqr.hpp" // full (internode + intranode) TSQR +# include "Tsqr_DistTsqr.hpp" // internode TSQR // Subclass of TSQR::MessengerBase, implemented using Teuchos // communicator template helper functions -# include -# include -# include +# include "Tsqr_TeuchosMessenger.hpp" +# include "Tpetra_MultiVector.hpp" +# include "Teuchos_ParameterListAcceptorDefaultBase.hpp" # include // Base TsqrAdator template we will specialize -# include +# include "Tpetra_TsqrAdaptor.hpp" namespace Tpetra { @@ -81,16 +79,16 @@ namespace Tpetra { typedef typename mp_scalar_type::scalar_type scalar_type; typedef typename mp_scalar_type::ordinal_type mp_ordinal_type; typedef typename MV::local_ordinal_type ordinal_type; - typedef typename MV::node_type node_type; typedef Teuchos::SerialDenseMatrix dense_matrix_type; typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; private: - //typedef TSQR::MatView matview_type; - typedef TSQR::NodeTsqrFactory node_tsqr_factory_type; - typedef typename node_tsqr_factory_type::node_tsqr_type node_tsqr_type; - typedef TSQR::DistTsqr dist_tsqr_type; - typedef TSQR::Tsqr tsqr_type; + using node_tsqr_factory_type = + TSQR::NodeTsqrFactory; + using node_tsqr_type = TSQR::NodeTsqr; + using dist_tsqr_type = TSQR::DistTsqr; + using tsqr_type = TSQR::Tsqr; public: /// \brief Constructor (that accepts a parameter list). @@ -100,7 +98,7 @@ namespace Tpetra { /// implementation. For details, call \c getValidParameters() /// and examine the documentation embedded therein. 
TsqrAdaptor (const Teuchos::RCP& plist) : - nodeTsqr_ (new node_tsqr_type), + nodeTsqr_ (node_tsqr_factory_type::getNodeTsqr ()), distTsqr_ (new dist_tsqr_type), tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)), ready_ (false) @@ -110,7 +108,7 @@ namespace Tpetra { //! Constructor (that uses default parameters). TsqrAdaptor () : - nodeTsqr_ (new node_tsqr_type), + nodeTsqr_ (node_tsqr_factory_type::getNodeTsqr ()), // plain factory call, same as the plist constructor (no new-expression) distTsqr_ (new dist_tsqr_type), tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)), ready_ (false) @@ -289,20 +287,10 @@ namespace Tpetra { { if (! ready_) { prepareDistTsqr (mv); - prepareNodeTsqr (mv); ready_ = true; } } - /// \brief Finish intraprocess TSQR initialization. - /// - /// \note It's OK to call this method more than once; it is idempotent. - void - prepareNodeTsqr (const MV& mv) - { - node_tsqr_factory_type::prepareNodeTsqr (nodeTsqr_); - } - /// \brief Finish interprocess TSQR initialization. /// /// \param mv [in] A valid Tpetra::MultiVector instance whose diff --git a/packages/stokhos/src/sacado/kokkos/vector/tpetra/Tpetra_TsqrAdaptor_MP_Vector.hpp b/packages/stokhos/src/sacado/kokkos/vector/tpetra/Tpetra_TsqrAdaptor_MP_Vector.hpp index feab87b8d530..8409389c33fc 100644 --- a/packages/stokhos/src/sacado/kokkos/vector/tpetra/Tpetra_TsqrAdaptor_MP_Vector.hpp +++ b/packages/stokhos/src/sacado/kokkos/vector/tpetra/Tpetra_TsqrAdaptor_MP_Vector.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ // @HEADER @@ -48,18 +46,18 @@ #include "Stokhos_Sacado_Kokkos_MP_Vector.hpp" -# include // create intranode TSQR object -# include // full (internode + intranode) TSQR -# include // internode TSQR +# include "Tsqr_NodeTsqrFactory.hpp" // create intranode TSQR object +# include "Tsqr.hpp" // full (internode + intranode) TSQR +# include "Tsqr_DistTsqr.hpp" // internode TSQR // Subclass of TSQR::MessengerBase, implemented using Teuchos // communicator template helper functions -# include -# include -# include +# include "Tsqr_TeuchosMessenger.hpp" +# include "Tpetra_MultiVector.hpp" +# include "Teuchos_ParameterListAcceptorDefaultBase.hpp" # include // Base TsqrAdator template we will specialize -# include +# include "Tpetra_TsqrAdaptor.hpp" namespace Tpetra { @@ -81,16 +79,16 @@ namespace Tpetra { typedef typename mp_scalar_type::scalar_type scalar_type; typedef typename mp_scalar_type::ordinal_type mp_ordinal_type; typedef typename MV::local_ordinal_type ordinal_type; - typedef typename MV::node_type node_type; typedef Teuchos::SerialDenseMatrix dense_matrix_type; typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; private: - //typedef TSQR::MatView matview_type; - typedef TSQR::NodeTsqrFactory node_tsqr_factory_type; - typedef typename node_tsqr_factory_type::node_tsqr_type node_tsqr_type; - typedef TSQR::DistTsqr dist_tsqr_type; - typedef TSQR::Tsqr tsqr_type; + using node_tsqr_factory_type = + TSQR::NodeTsqrFactory; + using node_tsqr_type = TSQR::NodeTsqr; + using dist_tsqr_type = TSQR::DistTsqr; + using tsqr_type = TSQR::Tsqr; public: /// \brief Constructor (that accepts a parameter list). @@ -100,7 +98,7 @@ namespace Tpetra { /// implementation. For details, call \c getValidParameters() /// and examine the documentation embedded therein. 
TsqrAdaptor (const Teuchos::RCP& plist) : - nodeTsqr_ (new node_tsqr_type), + nodeTsqr_ (node_tsqr_factory_type::getNodeTsqr ()), distTsqr_ (new dist_tsqr_type), tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)), ready_ (false) @@ -110,7 +108,7 @@ namespace Tpetra { //! Constructor (that uses default parameters). TsqrAdaptor () : - nodeTsqr_ (new node_tsqr_type), + nodeTsqr_ (node_tsqr_factory_type::getNodeTsqr ()), distTsqr_ (new dist_tsqr_type), tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)), ready_ (false) @@ -289,20 +287,10 @@ namespace Tpetra { { if (! ready_) { prepareDistTsqr (mv); - prepareNodeTsqr (mv); ready_ = true; } } - /// \brief Finish intraprocess TSQR initialization. - /// - /// \note It's OK to call this method more than once; it is idempotent. - void - prepareNodeTsqr (const MV& mv) - { - node_tsqr_factory_type::prepareNodeTsqr (nodeTsqr_); - } - /// \brief Finish interprocess TSQR initialization. /// /// \param mv [in] A valid Tpetra::MultiVector instance whose diff --git a/packages/stratimikos/adapters/belos/src/Thyra_TsqrAdaptor.hpp b/packages/stratimikos/adapters/belos/src/Thyra_TsqrAdaptor.hpp index 7c0d0344905e..22f31ac012b7 100644 --- a/packages/stratimikos/adapters/belos/src/Thyra_TsqrAdaptor.hpp +++ b/packages/stratimikos/adapters/belos/src/Thyra_TsqrAdaptor.hpp @@ -265,12 +265,6 @@ namespace Thyra { #endif // HAVE_MPI } - /// \brief Finish intraprocess TSQR initialization. - /// - /// \note It's OK to call this method more than once; it is idempotent. - void - prepareNodeTsqr (const MV& /* X */) {} - /// \brief Finish interprocess TSQR initialization. /// /// Input X is a valid Thyra::MultiVectorBase instance whose @@ -306,10 +300,7 @@ namespace Thyra { /// All multivector objects used with this adapter must have the /// same communicator and Kokkos Node instance (if applicable). 
void - prepareTsqr (const MV& /* X */) - { - throw std::logic_error ("Thyra adaptor for TSQR not implemented"); - } + prepareTsqr (const MV& /* X */) {} }; } // namespace Tpetra diff --git a/packages/tpetra/core/src/Epetra_TsqrAdaptor.hpp b/packages/tpetra/core/src/Epetra_TsqrAdaptor.hpp index 1cc8cce50e5e..f195e912a40b 100644 --- a/packages/tpetra/core/src/Epetra_TsqrAdaptor.hpp +++ b/packages/tpetra/core/src/Epetra_TsqrAdaptor.hpp @@ -40,7 +40,6 @@ #ifndef EPETRA_TSQRADAPTOR_HPP #define EPETRA_TSQRADAPTOR_HPP -/// /// \file Epetra_TsqrAdaptor.hpp /// \brief Epetra_MultiVector to TSQR adaptor /// @@ -52,25 +51,22 @@ /// Trilinos to get the correct list of libraries against which to /// link, but we make this easy temporary fix now so they have time to /// fix their build systems later. -/// -#include +#include "Tpetra_ConfigDefs.hpp" #if defined(HAVE_TPETRA_EPETRA) && defined(HAVE_TPETRA_TSQR) -#include // Include minimal Kokkos Node types -#include // create intranode TSQR object -#include // full (internode + intranode) TSQR -#include // internode TSQR -#include +#include "Tsqr_NodeTsqrFactory.hpp" // create intranode TSQR object +#include "Tsqr.hpp" // full (internode + intranode) TSQR +#include "Tsqr_DistTsqr.hpp" // internode TSQR +#include "Epetra_Comm.h" // Subclass of TSQR::MessengerBase, implemented using Teuchos // communicator template helper functions -#include -#include -#include +#include "Epetra_TsqrMessenger.hpp" +#include "Epetra_MultiVector.h" +#include "Teuchos_ParameterListAcceptorDefaultBase.hpp" #include - namespace Epetra { /// \class TsqrAdaptor @@ -117,11 +113,14 @@ namespace Epetra { /// both are int. typedef int ordinal_type; - /// \typedef node_type + /// \typedef device_type /// - /// TSQR depends on a Kokkos Node type. We just use the default - /// Node type here. - typedef Tpetra::Details::DefaultTypes::node_type node_type; + /// TSQR depends on a Kokkos::Device type. For Epetra, use a + /// host-only type. 
Typical types are Kokkos::Serial or + /// Kokkos::OpenMP, depending on build settings. + using device_type = + Kokkos::Device; /// \typedef dense_matrix_type /// @@ -131,23 +130,25 @@ namespace Epetra { /// \note TSQR lives in the Kokkos package, which requires the /// Teuchos package, so it's acceptable for us to require /// Teuchos components. - typedef Teuchos::SerialDenseMatrix dense_matrix_type; + using dense_matrix_type = + Teuchos::SerialDenseMatrix; /// \typedef magnitude_type /// /// Epetra_MultiVector's "Scalar" type is real. TSQR supports /// complex arithmetic as well, in which magnitude_type would /// differ from scalar_type. - typedef double magnitude_type; + using magnitude_type = double; private: - typedef TSQR::MatView matview_type; - typedef TSQR::NodeTsqrFactory node_tsqr_factory_type; + using matview_type = TSQR::MatView; + using node_tsqr_factory_type = + TSQR::NodeTsqrFactory; // Don't need a "typename" here, because there are no template // parameters involved in the type definition. - typedef node_tsqr_factory_type::node_tsqr_type node_tsqr_type; - typedef TSQR::DistTsqr dist_tsqr_type; - typedef TSQR::Tsqr tsqr_type; + using node_tsqr_type = TSQR::NodeTsqr; + using dist_tsqr_type = TSQR::DistTsqr; + using tsqr_type = TSQR::Tsqr; public: /// \brief Constructor (that accepts a parameter list). @@ -157,7 +158,7 @@ namespace Epetra { /// implementation. For details, call \c getValidParameters() /// and examine the documentation embedded therein. TsqrAdaptor (const Teuchos::RCP& plist) : - nodeTsqr_ (new node_tsqr_type), + nodeTsqr_ (node_tsqr_factory_type::getNodeTsqr ()), distTsqr_ (new dist_tsqr_type), tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)), ready_ (false) @@ -167,7 +168,7 @@ namespace Epetra { //! Constructor (that uses default parameters). 
TsqrAdaptor () : - nodeTsqr_ (new node_tsqr_type), + nodeTsqr_ (node_tsqr_factory_type::getNodeTsqr ()), distTsqr_ (new dist_tsqr_type), tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)), ready_ (false) @@ -339,20 +340,10 @@ namespace Epetra { { if (! ready_) { prepareDistTsqr (mv); - prepareNodeTsqr (mv); ready_ = true; } } - /// \brief Finish intraprocess TSQR initialization. - /// - /// \note It's OK to call this method more than once; it is idempotent. - void - prepareNodeTsqr (const MV& /* mv */) - { - node_tsqr_factory_type::prepareNodeTsqr (nodeTsqr_); - } - /// \brief Finish interprocess TSQR initialization. /// /// \param mv [in] A multivector, from which to extract the diff --git a/packages/tpetra/core/src/Tpetra_Details_DefaultTypes.hpp b/packages/tpetra/core/src/Tpetra_Details_DefaultTypes.hpp index 017206501756..91721b8706ee 100644 --- a/packages/tpetra/core/src/Tpetra_Details_DefaultTypes.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_DefaultTypes.hpp @@ -34,8 +34,6 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// // ************************************************************************ // @HEADER diff --git a/packages/tpetra/core/src/Tpetra_TsqrAdaptor.hpp b/packages/tpetra/core/src/Tpetra_TsqrAdaptor.hpp index b48c9dafeb50..e926f53f1694 100644 --- a/packages/tpetra/core/src/Tpetra_TsqrAdaptor.hpp +++ b/packages/tpetra/core/src/Tpetra_TsqrAdaptor.hpp @@ -85,55 +85,89 @@ namespace Tpetra { public: using scalar_type = typename MV::scalar_type; using ordinal_type = typename MV::local_ordinal_type; - using dense_matrix_type = Teuchos::SerialDenseMatrix; - using magnitude_type = typename Teuchos::ScalarTraits::magnitudeType; + using dense_matrix_type = + Teuchos::SerialDenseMatrix; + using magnitude_type = + typename Teuchos::ScalarTraits::magnitudeType; private: using node_tsqr_factory_type = - TSQR::NodeTsqrFactory; - using node_tsqr_type = typename node_tsqr_factory_type::node_tsqr_type; + TSQR::NodeTsqrFactory; + using node_tsqr_type = TSQR::NodeTsqr; using dist_tsqr_type = TSQR::DistTsqr; - using tsqr_type = TSQR::Tsqr; + using tsqr_type = TSQR::Tsqr; + /// Sync X to device or host (per tsqr_->wants_device_memory()), mark it modified there, and wrap its local data in a TSQR::MatView. + TSQR::MatView + get_mat_view(MV& X) + { + TEUCHOS_ASSERT( ! tsqr_.is_null() ); + // FIXME (mfh 18 Oct 2010, 22 Dec 2019) Check Teuchos::Comm + // object in X to make sure it is the same communicator as the + // one we are using in our dist_tsqr_type implementation. + + const ordinal_type lclNumRows(X.getLocalLength()); + const ordinal_type numCols(X.getNumVectors()); + scalar_type* X_ptr = nullptr; + // LAPACK and BLAS functions require "LDA" >= 1, even if the + // corresponding matrix dimension is zero. 
+ ordinal_type X_stride = 1; + if(tsqr_->wants_device_memory()) { + X.sync_device(); + X.modify_device(); + auto X_view = X.getLocalViewDevice(); + X_ptr = reinterpret_cast(X_view.data()); + X_stride = static_cast(X_view.stride(1)); + if(X_stride == 0) { + X_stride = ordinal_type(1); // see note above + } + } + else { + X.sync_host(); + X.modify_host(); + auto X_view = X.getLocalViewHost(); + X_ptr = reinterpret_cast(X_view.data()); + X_stride = static_cast(X_view.stride(1)); + if(X_stride == 0) { + X_stride = ordinal_type(1); // see note above + } + } + using mat_view_type = TSQR::MatView; + return mat_view_type(lclNumRows, numCols, X_ptr, X_stride); + } public: - /// \brief Constructor (that accepts a parameter list). + /// \brief Constructor that accepts a Teuchos::ParameterList. /// /// \param plist [in/out] List of parameters for configuring TSQR. /// The specific parameter keys that are read depend on the TSQR - /// implementation. For details, call \c getValidParameters() - /// and examine the documentation embedded therein. - TsqrAdaptor (const Teuchos::RCP& plist) : - nodeTsqr_ (new node_tsqr_type), - distTsqr_ (new dist_tsqr_type), - tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)), - ready_ (false) + /// implementation. For details, call getValidParameters() and + /// examine the documentation embedded therein. + TsqrAdaptor(const Teuchos::RCP& plist) : + nodeTsqr_(node_tsqr_factory_type::getNodeTsqr()), + distTsqr_(new dist_tsqr_type), + tsqr_(new tsqr_type(nodeTsqr_, distTsqr_)) { - setParameterList (plist); + setParameterList(plist); } - //! Constructor (that uses default parameters). - TsqrAdaptor () : - nodeTsqr_ (new node_tsqr_type), - distTsqr_ (new dist_tsqr_type), - tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)), - ready_ (false) + //! Constructor (that uses default parameters). 
+ TsqrAdaptor() : + nodeTsqr_(node_tsqr_factory_type::getNodeTsqr()), + distTsqr_(new dist_tsqr_type), + tsqr_(new tsqr_type(nodeTsqr_, distTsqr_)) { - setParameterList (Teuchos::null); + setParameterList(Teuchos::null); } //! Get all valid parameters (with default values) that TSQR understands. Teuchos::RCP - getValidParameters () const + getValidParameters() const { - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::ParameterList; - using Teuchos::parameterList; - - if (defaultParams_.is_null()) { - RCP params = parameterList ("TSQR implementation"); - params->set ("NodeTsqr", *(nodeTsqr_->getValidParameters ())); - params->set ("DistTsqr", *(distTsqr_->getValidParameters ())); + if(defaultParams_.is_null()) { + auto params = Teuchos::parameterList("TSQR implementation"); + params->set("NodeTsqr", *(nodeTsqr_->getValidParameters())); + params->set("DistTsqr", *(distTsqr_->getValidParameters())); defaultParams_ = params; } return defaultParams_; @@ -165,19 +199,15 @@ namespace Tpetra { /// long as it is not too large or too small. The default value /// should be fine. void - setParameterList (const Teuchos::RCP& plist) + setParameterList(const Teuchos::RCP& plist) { - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::RCP; + auto params = plist.is_null() ? + Teuchos::parameterList(*getValidParameters()) : plist; using Teuchos::sublist; + nodeTsqr_->setParameterList(sublist(params, "NodeTsqr")); + distTsqr_->setParameterList(sublist(params, "DistTsqr")); - RCP params = plist.is_null() ? - parameterList (*getValidParameters ()) : plist; - nodeTsqr_->setParameterList (sublist (params, "NodeTsqr")); - distTsqr_->setParameterList (sublist (params, "DistTsqr")); - - this->setMyParamList (params); + this->setMyParamList(params); } /// \brief Compute QR factorization [Q,R] = qr(A,0). @@ -202,39 +232,30 @@ namespace Tpetra { /// instance's constructor. Otherwise, the result of this /// method is undefined. 
void - factorExplicit (MV& A, - MV& Q, - dense_matrix_type& R, - const bool forceNonnegativeDiagonal=false) + factorExplicit(MV& A, + MV& Q, + dense_matrix_type& R, + const bool forceNonnegativeDiagonal=false) { TEUCHOS_TEST_FOR_EXCEPTION - (! A.isConstantStride (), std::invalid_argument, "TsqrAdaptor::" + (! A.isConstantStride(), std::invalid_argument, "TsqrAdaptor::" "factorExplicit: Input MultiVector A must have constant stride."); TEUCHOS_TEST_FOR_EXCEPTION - (! Q.isConstantStride (), std::invalid_argument, "TsqrAdaptor::" + (! Q.isConstantStride(), std::invalid_argument, "TsqrAdaptor::" "factorExplicit: Input MultiVector Q must have constant stride."); - prepareTsqr (Q); // Finish initializing TSQR. + prepareTsqr(Q); // Finish initializing TSQR. + TEUCHOS_ASSERT( ! tsqr_.is_null() ); - // FIXME (mfh 16 Jan 2016) Currently, TSQR is a host-only - // implementation. - A.sync_host (); - A.modify_host (); - Q.sync_host (); - Q.modify_host (); - auto A_view = A.getLocalViewHost (); - auto Q_view = Q.getLocalViewHost (); - scalar_type* const A_ptr = - reinterpret_cast (A_view.data ()); - scalar_type* const Q_ptr = - reinterpret_cast (Q_view.data ()); - const bool contiguousCacheBlocks = false; - tsqr_->factorExplicitRaw (A_view.extent (0), - A_view.extent (1), - A_ptr, A.getStride (), - Q_ptr, Q.getStride (), - R.values (), R.stride (), - contiguousCacheBlocks, - forceNonnegativeDiagonal); + auto A_view = get_mat_view(A); + auto Q_view = get_mat_view(Q); + constexpr bool contiguousCacheBlocks = false; + tsqr_->factorExplicitRaw(A_view.extent(0), + A_view.extent(1), + A_view.data(), A_view.stride(1), + Q_view.data(), Q_view.stride(1), + R.values(), R.stride(), + contiguousCacheBlocks, + forceNonnegativeDiagonal); } /// \brief Rank-revealing decomposition @@ -268,29 +289,22 @@ namespace Tpetra { /// /// \return Rank \f$r\f$ of R: \f$ 0 \leq r \leq N\f$. 
int - revealRank (MV& Q, - dense_matrix_type& R, - const magnitude_type& tol) + revealRank(MV& Q, + dense_matrix_type& R, + const magnitude_type& tol) { TEUCHOS_TEST_FOR_EXCEPTION - (! Q.isConstantStride (), std::invalid_argument, "TsqrAdaptor::" + (! Q.isConstantStride(), std::invalid_argument, "TsqrAdaptor::" "revealRank: Input MultiVector Q must have constant stride."); - prepareTsqr (Q); // Finish initializing TSQR. - // FIXME (mfh 18 Oct 2010) Check Teuchos::Comm object in Q - // to make sure it is the same communicator as the one we are - // using in our dist_tsqr_type implementation. + prepareTsqr(Q); // Finish initializing TSQR. - Q.sync_host (); - Q.modify_host (); - auto Q_view = Q.getLocalViewHost (); - scalar_type* const Q_ptr = - reinterpret_cast (Q_view.data ()); - const bool contiguousCacheBlocks = false; - return tsqr_->revealRankRaw (Q_view.extent (0), - Q_view.extent (1), - Q_ptr, Q.getStride (), - R.values (), R.stride (), - tol, contiguousCacheBlocks); + auto Q_view = get_mat_view(Q); + constexpr bool contiguousCacheBlocks = false; + return tsqr_->revealRankRaw(Q_view.extent(0), + Q_view.extent(1), + Q_view.data(), Q_view.stride(1), + R.values(), R.stride(), + tol, contiguousCacheBlocks); } private: @@ -307,7 +321,7 @@ namespace Tpetra { mutable Teuchos::RCP defaultParams_; //! Whether TSQR has been fully initialized. - bool ready_; + bool ready_ = false; /// \brief Finish TSQR initialization. /// @@ -330,24 +344,14 @@ namespace Tpetra { /// multivector objects used with this Adaptor instance must /// have the same map, communicator, and Kokkos Node instance. void - prepareTsqr (const MV& mv) + prepareTsqr(const MV& mv) { - if (! ready_) { - prepareDistTsqr (mv); - prepareNodeTsqr (mv); + if(! ready_) { + prepareDistTsqr(mv); ready_ = true; } } - /// \brief Finish intraprocess TSQR initialization. - /// - /// \note It's OK to call this method more than once; it is idempotent. 
- void - prepareNodeTsqr (const MV& mv) - { - node_tsqr_factory_type::prepareNodeTsqr (nodeTsqr_); - } - /// \brief Finish interprocess TSQR initialization. /// /// \param mv [in] A valid Tpetra::MultiVector instance whose @@ -355,17 +359,17 @@ namespace Tpetra { /// /// \note It's OK to call this method more than once; it is idempotent. void - prepareDistTsqr (const MV& mv) + prepareDistTsqr(const MV& mv) { using Teuchos::RCP; using Teuchos::rcp_implicit_cast; - typedef TSQR::TeuchosMessenger mess_type; - typedef TSQR::MessengerBase base_mess_type; + using mess_type = TSQR::TeuchosMessenger; + using base_mess_type = TSQR::MessengerBase; - RCP > comm = mv.getMap()->getComm(); - RCP mess (new mess_type (comm)); - RCP messBase = rcp_implicit_cast (mess); - distTsqr_->init (messBase); + auto comm = mv.getMap()->getComm(); + RCP mess(new mess_type(comm)); + auto messBase = rcp_implicit_cast(mess); + distTsqr_->init(messBase); } }; @@ -374,4 +378,3 @@ namespace Tpetra { #endif // HAVE_TPETRA_TSQR #endif // TPETRA_TSQRADAPTOR_HPP - diff --git a/packages/tpetra/tsqr/CMakeLists.txt b/packages/tpetra/tsqr/CMakeLists.txt index 4bf9f40aa773..71b30bf3916d 100644 --- a/packages/tpetra/tsqr/CMakeLists.txt +++ b/packages/tpetra/tsqr/CMakeLists.txt @@ -8,23 +8,38 @@ TRIBITS_SUBPACKAGE(TSQR) # Enabled by default (unless disabled explicitly at the command line) # if Teuchos is built with complex arithmetic support. TRIBITS_ADD_OPTION_AND_DEFINE( - KokkosTSQR_ENABLE_Complex - HAVE_KOKKOSTSQR_COMPLEX + ${PACKAGE_NAME}_ENABLE_Complex + HAVE_TPETRATSQR_COMPLEX "Enable complex arithmetic (std::complex) support for TSQR. This is currently ON if Teuchos_ENABLE_COMPLEX is ON. The default behavior may change as we migrate TSQR to depend on new Kokkos. New Kokkos does not currently support complex arithmetic, but this will change." "${Teuchos_ENABLE_COMPLEX}" ) -# Whether to build TbbTsqr and related classes. 
-# -# Enabled by default (unless disabled explicitly at the command line) -# if Trilinos is built with the TBB (Intel's Threading Building -# Blocks) TPL (third-party library) enabled. +ASSERT_DEFINED(TPL_ENABLE_CUBLAS) +TRIBITS_ADD_OPTION_AND_DEFINE( + ${PACKAGE_NAME}_ENABLE_CUBLAS + HAVE_TPETRATSQR_CUBLAS + "Enable TSQR's support for the CUBLAS TPL." + "${TPL_ENABLE_CUBLAS}" + ) +ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUBLAS) + +ASSERT_DEFINED(TPL_ENABLE_CUSOLVER) TRIBITS_ADD_OPTION_AND_DEFINE( - KokkosTSQR_ENABLE_TBB - HAVE_KOKKOSTSQR_TBB - "Enable Intel Threading Building Blocks (TBB) intranode parallelization of TSQR. This option is enabled by default if you are building Trilinos with TBB enabled as a 'third-party library' (TPL), so you should not have to enable this option manually. TSQR will work without this, but enabling it gives another parallelization option for TSQR." - "${TPL_ENABLE_TBB}" + ${PACKAGE_NAME}_ENABLE_CUSOLVER + HAVE_TPETRATSQR_CUSOLVER + "Enable TSQR's support for the CUSOLVER TPL." + "${TPL_ENABLE_CUSOLVER}" ) +ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUSOLVER) + +IF(${PACKAGE_NAME}_ENABLE_CUSOLVER AND (NOT ${PACKAGE_NAME}_ENABLE_CUBLAS)) + MESSAGE(FATAL_ERROR "*** We found the CUSOLVER TPL, but not the + CUBLAS TPL. One should not exist without the other.") +ENDIF() +IF((NOT ${PACKAGE_NAME}_ENABLE_CUSOLVER) AND ${PACKAGE_NAME}_ENABLE_CUBLAS) + MESSAGE(FATAL_ERROR "*** We found the CUBLAS TPL, but not the + CUSOLVER TPL. One should not exist without the other.") +ENDIF() # KokkosTSQR_config.h gets created in the src/ subdirectory. 
ADD_SUBDIRECTORY(src) diff --git a/packages/tpetra/tsqr/cmake/Dependencies.cmake b/packages/tpetra/tsqr/cmake/Dependencies.cmake index beb08e5ca843..94476683e84d 100644 --- a/packages/tpetra/tsqr/cmake/Dependencies.cmake +++ b/packages/tpetra/tsqr/cmake/Dependencies.cmake @@ -3,6 +3,6 @@ SET(LIB_OPTIONAL_DEP_PACKAGES) SET(TEST_REQUIRED_DEP_PACKAGES) SET(TEST_OPTIONAL_DEP_PACKAGES) SET(LIB_REQUIRED_DEP_TPLS) -SET(LIB_OPTIONAL_DEP_TPLS TBB) +SET(LIB_OPTIONAL_DEP_TPLS CUBLAS CUSOLVER) SET(TEST_REQUIRED_DEP_TPLS) SET(TEST_OPTIONAL_DEP_TPLS) diff --git a/packages/tpetra/tsqr/cmake/TpetraTSQR_config.h.in b/packages/tpetra/tsqr/cmake/TpetraTSQR_config.h.in index 6f5fb98dbc92..0bb958d792c6 100644 --- a/packages/tpetra/tsqr/cmake/TpetraTSQR_config.h.in +++ b/packages/tpetra/tsqr/cmake/TpetraTSQR_config.h.in @@ -2,9 +2,16 @@ #define TPETRATSQR_CONFIG_H /* Define if building TSQR with std::complex support */ -#cmakedefine HAVE_KOKKOSTSQR_COMPLEX +#cmakedefine HAVE_TPETRATSQR_COMPLEX +#ifdef HAVE_TPETRATSQR_COMPLEX + /* For backwards compatibility */ +# define HAVE_KOKKOSTSQR_COMPLEX HAVE_TPETRATSQR_COMPLEX +#endif -/* Define if the TBB (Intel Threading Building Blocks) TPL is available */ -#cmakedefine HAVE_KOKKOSTSQR_TBB +/* Define if TSQR supports the CUBLAS TPL */ +#cmakedefine HAVE_TPETRATSQR_CUBLAS + +/* Define if TSQR supports the CUSOLVER TPL */ +#cmakedefine HAVE_TPETRATSQR_CUSOLVER #endif // TPETRATSQR_CONFIG_H diff --git a/packages/tpetra/tsqr/src/CMakeLists.txt b/packages/tpetra/tsqr/src/CMakeLists.txt index 91cca32b7ec1..9e243aaaf1e5 100644 --- a/packages/tpetra/tsqr/src/CMakeLists.txt +++ b/packages/tpetra/tsqr/src/CMakeLists.txt @@ -12,16 +12,8 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) # files to install. 
APPEND_SET(HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h) -# If TBB (Intel's Threading Building Blocks) is enabled, add headers -# and sources for TBB-enabled shared-memory parallel TSQR to the -# lists of this subpackage's headers resp. sources. -IF (${PACKAGE_NAME}_ENABLE_TBB) - APPEND_GLOB(HEADERS ${DIR}/TbbTsqr*.hpp) - APPEND_GLOB(SOURCES ${DIR}/TbbTsqr*.cpp) -ENDIF () - -# Add all other headers and sources (those not related to TBB) to the -# lists of this subpackage's headers resp. sources. +# Add headers and sources to the lists of this subpackage's headers +# resp. sources. APPEND_GLOB(HEADERS ${DIR}/Tsqr*.hpp) APPEND_GLOB(HEADERS ${DIR}/KokkosTSQR*.hpp) APPEND_GLOB(SOURCES ${DIR}/Tsqr*.cpp) @@ -37,5 +29,5 @@ TRIBITS_ADD_LIBRARY( # / from this directory, or to / from the 'impl' subdirectory. That ensures # that running "make" will also rerun CMake in order to regenerate Makefiles. # -# Here is another such change, and here is another. Another! +# Behold: another such change, and another. # diff --git a/packages/tpetra/tsqr/src/TbbTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr.hpp deleted file mode 100644 index 996d76e94eec..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr.hpp +++ /dev/null @@ -1,504 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -/// \file TbbTsqr.hpp -/// \brief Intranode TSQR, parallelized with Intel TBB. -/// -#ifndef __TSQR_TbbTsqr_hpp -#define __TSQR_TbbTsqr_hpp - -#include "TbbTsqr_TbbParallelTsqr.hpp" -#include "Tsqr_TimeStats.hpp" -#include "Teuchos_ParameterList.hpp" -#include "Teuchos_ParameterListExceptions.hpp" -#include "Teuchos_Time.hpp" -#include -#include -#include // std::pair -#include - -namespace TSQR { - namespace TBB { - /// \class TbbTsqr - /// \brief Intranode TSQR, parallelized with Intel TBB - /// - /// TSQR factorization for a dense, tall and skinny matrix stored - /// on a single node. Parallelized using Intel's Threading - /// Building Blocks. 
- /// - /// \note TSQR only needs to know about the local ordinal type - /// (LocalOrdinal), not about the global ordinal type. - /// TimerType may be any class with the same interface as - /// TrivialTimer; it times the divide-and-conquer base cases - /// (the operations on each CPU core within the thread-parallel - /// implementation). - template< class LocalOrdinal, class Scalar, class TimerType = Teuchos::Time > - class TbbTsqr : public Teuchos::Describable { - private: - /// \brief Implementation of TBB TSQR. - /// - /// If you don't have TBB available, you can test this class by - /// substituting in a TbbRecursiveTsqr - /// object. That is a nonparallel implementation that emulates - /// the control flow of TbbParallelTsqr. If you do this, you - /// should also change the FactorOutput public typedef. - /// - /// \note This is NOT a use of the pImpl idiom, because the - /// point of the pImpl idiom is to avoid including the - /// implementation details of the header file of the - /// implementation class. Here, the implementation class is - /// templated, so we have to include the implementation class' - /// implementation details. - TbbParallelTsqr impl_; - - // Collected running statistcs on various computations - mutable TimeStats factorStats_; - mutable TimeStats applyStats_; - mutable TimeStats explicitQStats_; - mutable TimeStats cacheBlockStats_; - mutable TimeStats unCacheBlockStats_; - - // Timers for various computations - mutable TimerType factorTimer_; - mutable TimerType applyTimer_; - mutable TimerType explicitQTimer_; - mutable TimerType cacheBlockTimer_; - mutable TimerType unCacheBlockTimer_; - - public: - typedef Scalar scalar_type; - typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; - typedef LocalOrdinal ordinal_type; - - /// \typedef FactorOutput - /// \brief Type of partial output of TBB TSQR. 
- /// - /// If you don't have TBB available, you can test this class by - /// substituting in "typename TbbRecursiveTsqr::FactorOutput" for the typedef's definition. If you - /// do this, you should also change the type of \c impl_ above. - typedef typename TbbParallelTsqr::FactorOutput FactorOutput; - - /// \brief Constructor. - /// - /// \param numCores [in] Maximum number of processing cores to use - /// when factoring the matrix. Fewer cores may be used if the - /// matrix is not big enough to justify their use. - /// - /// \param cacheSizeHint [in] Cache block size hint (in bytes) - /// to use in the sequential part of TSQR. If zero or not - /// specified, a reasonable default is used. If each CPU core - /// has a private cache, that cache's size (minus a little - /// wiggle room) would be the appropriate value for this - /// parameter. Set to zero for the implementation to choose a - /// reasonable default. - TbbTsqr (const size_t numCores, - const size_t cacheSizeHint = 0) : - impl_ (numCores, cacheSizeHint), - factorTimer_ ("TbbTsqr::factor"), - applyTimer_ ("TbbTsqr::apply"), - explicitQTimer_ ("TbbTsqr::explicit_Q"), - cacheBlockTimer_ ("TbbTsqr::cache_block"), - unCacheBlockTimer_ ("TbbTsqr::un_cache_block") - {} - - /// \brief Constructor (that takes a parameter list). - /// - /// \param plist [in/out] On input: list of TbbTsqr parameters. - /// On output: missing parameters are filled in with default - /// values. - /// - /// For a list of accepted parameters and thei documentation, - /// see the parameter list returned by \c getValidParameters(). - TbbTsqr (const Teuchos::RCP& plist) : - impl_ (plist), - factorTimer_ ("TbbTsqr::factor"), - applyTimer_ ("TbbTsqr::apply"), - explicitQTimer_ ("TbbTsqr::explicit_Q"), - cacheBlockTimer_ ("TbbTsqr::cache_block"), - unCacheBlockTimer_ ("TbbTsqr::un_cache_block") - {} - - /// \brief Constructor (that uses default parameters). - /// - /// \param plist [in/out] On input: list of TbbTsqr parameters. 
- /// On output: missing parameters are filled in with default - /// values. - /// - /// For a list of accepted parameters and thei documentation, - /// see the parameter list returned by \c getValidParameters(). - TbbTsqr () : - impl_ (Teuchos::null), - factorTimer_ ("TbbTsqr::factor"), - applyTimer_ ("TbbTsqr::apply"), - explicitQTimer_ ("TbbTsqr::explicit_Q"), - cacheBlockTimer_ ("TbbTsqr::cache_block"), - unCacheBlockTimer_ ("TbbTsqr::un_cache_block") - {} - - Teuchos::RCP - getValidParameters () const - { - return impl_.getValidParameters (); - } - - void - setParameterList (const Teuchos::RCP& plist) - { - impl_.setParameterList (plist); - } - - /// \brief Number of tasks that TSQR will use to solve the problem. - /// - /// This is the number of subproblems into which to divide the - /// main problem, in order to solve it in parallel. - size_t ntasks() const { return impl_.ntasks(); } - - //! Cache size hint (in bytes) used for the factorization. - size_t cache_size_hint() const { return impl_.cache_size_hint(); } - - /// Whether or not this QR factorization produces an R factor - /// with all nonnegative diagonal entries. - static bool QR_produces_R_factor_with_nonnegative_diagonal() { - typedef TbbParallelTsqr< LocalOrdinal, Scalar, TimerType > impl_type; - return impl_type::QR_produces_R_factor_with_nonnegative_diagonal(); - } - - //! Whether this object is ready to perform computations. - bool ready() const { - return true; - } - - /// \brief One-line description of this object. - /// - /// This implements Teuchos::Describable::description(). For now, - /// SequentialTsqr uses the default implementation of - /// Teuchos::Describable::describe(). - std::string description () const { - using std::endl; - - // SequentialTsqr also implements Describable, so if you - // decide to implement describe(), you could call - // SequentialTsqr's describe() and get a nice hierarchy of - // descriptions. 
- std::ostringstream os; - os << "Intranode Tall Skinny QR (TSQR): " - << "Intel Threading Building Blocks (TBB) implementation" - << ", max " << ntasks() << "-way parallelism" - << ", cache size hint of " << cache_size_hint() << " bytes."; - return os.str(); - } - - void - cache_block (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A_out[], - const Scalar A_in[], - const LocalOrdinal lda_in) const - { - cacheBlockTimer_.start(true); - impl_.cache_block (nrows, ncols, A_out, A_in, lda_in); - cacheBlockStats_.update (cacheBlockTimer_.stop()); - } - - void - un_cache_block (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A_out[], - const LocalOrdinal lda_out, - const Scalar A_in[]) const - { - unCacheBlockTimer_.start(true); - impl_.un_cache_block (nrows, ncols, A_out, lda_out, A_in); - unCacheBlockStats_.update (unCacheBlockTimer_.stop()); - } - - void - fill_with_zeros (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar C[], - const LocalOrdinal ldc, - const bool contiguous_cache_blocks) const - { - impl_.fill_with_zeros (nrows, ncols, C, ldc, contiguous_cache_blocks); - } - - template< class MatrixViewType > - MatrixViewType - top_block (const MatrixViewType& C, - const bool contiguous_cache_blocks) const - { - return impl_.top_block (C, contiguous_cache_blocks); - } - - /// \brief Compute QR factorization of the dense matrix A - /// - /// Compute the QR factorization of the dense matrix A. - /// - /// \param nrows [in] Number of rows of A. - /// Precondition: nrows >= ncols. - /// - /// \param ncols [in] Number of columns of A. - /// Precondition: nrows >= ncols. - /// - /// \param A [in,out] On input, the matrix to factor, stored as a - /// general dense matrix in column-major order. On output, - /// overwritten with an implicit representation of the Q factor. - /// - /// \param lda [in] Leading dimension of A. - /// Precondition: lda >= nrows. 
- /// - /// \param R [out] The final R factor of the QR factorization of - /// the matrix A. An ncols by ncols upper triangular matrix - /// stored in column-major order, with leading dimension ldr. - /// - /// \param ldr [in] Leading dimension of the matrix R. - /// - /// \param b_contiguous_cache_blocks [in] Whether cache blocks are - /// stored contiguously in the input matrix A and the output - /// matrix Q (of explicit_Q()). If not and you want them to be, - /// you should use the cache_block() method to copy them into - /// that format. You may use the un_cache_block() method to - /// copy them out of that format into the usual column-oriented - /// format. - /// - /// \return FactorOutput struct, which together with the data in A - /// form an implicit representation of the Q factor. They - /// should be passed into the apply() and explicit_Q() functions - /// as the "factor_output" parameter. - FactorOutput - factor (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A[], - const LocalOrdinal lda, - Scalar R[], - const LocalOrdinal ldr, - const bool contiguous_cache_blocks) const - { - factorTimer_.start(true); - return impl_.factor (nrows, ncols, A, lda, R, ldr, contiguous_cache_blocks); - factorStats_.update (factorTimer_.stop()); - } - - /// \brief Apply Q factor to the global dense matrix C - /// - /// Apply the Q factor (computed by factor() and represented - /// implicitly) to the dense matrix C. - /// - /// \param apply_type [in] Whether to compute Q*C, Q^T * C, or - /// Q^H * C. - /// - /// \param nrows [in] Number of rows of the matrix C and the - /// matrix Q. Precondition: nrows >= ncols_Q, ncols_C. - /// - /// \param ncols_Q [in] Number of columns of Q - /// - /// \param Q [in] Same as the "A" output of factor() - /// - /// \param ldq [in] Same as the "lda" input of factor() - /// - /// \param factor_output [in] Return value of factor() - /// - /// \param ncols_C [in] Number of columns in C. 
- /// Precondition: nrows_local >= ncols_C. - /// - /// \param C [in,out] On input, the matrix C, stored as a general - /// dense matrix in column-major order. On output, overwritten - /// with op(Q)*C, where op(Q) = Q or Q^T. - /// - /// \param ldc [in] Leading dimension of C. - /// Precondition: ldc_local >= nrows_local. - /// Not applicable if C is cache-blocked in place. - /// - /// \param contiguous_cache_blocks [in] Whether or not cache - /// blocks of Q and C are stored contiguously (default: - /// false). - void - apply (const ApplyType& apply_type, - const LocalOrdinal nrows, - const LocalOrdinal ncols_Q, - const Scalar Q[], - const LocalOrdinal ldq, - const FactorOutput& factor_output, - const LocalOrdinal ncols_C, - Scalar C[], - const LocalOrdinal ldc, - const bool contiguous_cache_blocks) const - { - applyTimer_.start(true); - impl_.apply (apply_type, nrows, ncols_Q, Q, ldq, factor_output, - ncols_C, C, ldc, contiguous_cache_blocks); - applyStats_.update (applyTimer_.stop()); - } - - /// \brief Compute the explicit Q factor from factor() - /// - /// Compute the explicit version of the Q factor computed by - /// factor() and represented implicitly (via Q_in and - /// factor_output). - /// - /// \param nrows [in] Number of rows of the matrix Q_in. Also, - /// the number of rows of the output matrix Q_out. - /// Precondition: nrows >= ncols_Q_in. - /// - /// \param ncols_Q_in [in] Number of columns in the original matrix - /// A, whose explicit Q factor we are computing. - /// Precondition: nrows >= ncols_Q_in. - /// - /// \param Q_local_in [in] Same as A output of factor(). - /// - /// \param ldq_local_in [in] Same as lda input of factor() - /// - /// \param ncols_Q_out [in] Number of columns of the explicit Q - /// factor to compute. - /// - /// \param Q_out [out] The explicit representation of the Q factor. - /// - /// \param ldq_out [in] Leading dimension of Q_out. - /// - /// \param factor_output [in] Return value of factor(). 
- void - explicit_Q (const LocalOrdinal nrows, - const LocalOrdinal ncols_Q_in, - const Scalar Q_in[], - const LocalOrdinal ldq_in, - const FactorOutput& factor_output, - const LocalOrdinal ncols_Q_out, - Scalar Q_out[], - const LocalOrdinal ldq_out, - const bool contiguous_cache_blocks) const - { - explicitQTimer_.start(true); - impl_.explicit_Q (nrows, ncols_Q_in, Q_in, ldq_in, factor_output, - ncols_Q_out, Q_out, ldq_out, contiguous_cache_blocks); - explicitQStats_.update (explicitQTimer_.stop()); - } - - /// \brief Compute Q*B - /// - /// Compute matrix-matrix product Q*B, where Q is nrows by ncols - /// and B is ncols by ncols. Respect cache blocks of Q. - void - Q_times_B (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar Q[], - const LocalOrdinal ldq, - const Scalar B[], - const LocalOrdinal ldb, - const bool contiguous_cache_blocks) const - { - impl_.Q_times_B (nrows, ncols, Q, ldq, B, ldb, contiguous_cache_blocks); - } - - /// Compute SVD \f$R = U \Sigma V^*\f$, not in place. Use the - /// resulting singular values to compute the numerical rank of R, - /// with respect to the relative tolerance tol. If R is full - /// rank, return without modifying R. If R is not full rank, - /// overwrite R with \f$\Sigma \cdot V^*\f$. - /// - /// \return Numerical rank of R: 0 <= rank <= ncols. - LocalOrdinal - reveal_R_rank (const LocalOrdinal ncols, - Scalar R[], - const LocalOrdinal ldr, - Scalar U[], - const LocalOrdinal ldu, - const magnitude_type tol) const - { - return impl_.reveal_R_rank (ncols, R, ldr, U, ldu, tol); - } - - /// \brief Rank-revealing decomposition - /// - /// Using the R factor from factor() and the explicit Q factor - /// from explicit_Q(), compute the SVD of R (\f$R = U \Sigma - /// V^*\f$). R. If R is full rank (with respect to the given - /// relative tolerance tol), don't change Q or R. Otherwise, - /// compute \f$Q := Q \cdot U\f$ and \f$R := \Sigma V^*\f$ in - /// place (the latter may be no longer upper triangular). 
- /// - /// \return Rank \f$r\f$ of R: \f$ 0 \leq r \leq ncols\f$. - /// - LocalOrdinal - reveal_rank (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar Q[], - const LocalOrdinal ldq, - Scalar R[], - const LocalOrdinal ldr, - const magnitude_type tol, - const bool contiguous_cache_blocks) const - { - return impl_.reveal_rank (nrows, ncols, Q, ldq, R, ldr, tol, - contiguous_cache_blocks); - } - - double - min_seq_factor_timing () const { return impl_.min_seq_factor_timing(); } - double - max_seq_factor_timing () const { return impl_.max_seq_factor_timing(); } - double - min_seq_apply_timing () const { return impl_.min_seq_apply_timing(); } - double - max_seq_apply_timing () const { return impl_.max_seq_apply_timing(); } - - void getStats (std::vector< TimeStats >& stats) { - const int numStats = 5; - stats.resize (numStats); - stats[0] = factorStats_; - stats[1] = applyStats_; - stats[2] = explicitQStats_; - stats[3] = cacheBlockStats_; - stats[4] = unCacheBlockStats_; - } - - void getStatsLabels (std::vector< std::string >& labels) { - const int numStats = 5; - labels.resize (numStats); - labels[0] = factorTimer_.name(); - labels[1] = applyTimer_.name(); - labels[2] = explicitQTimer_.name(); - labels[3] = cacheBlockTimer_.name(); - labels[4] = unCacheBlockTimer_.name(); - } - }; // class TbbTsqr - } // namespace TBB -} // namespace TSQR - -#endif // __TSQR_TbbTsqr_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp deleted file mode 100644 index 0caff734b512..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp +++ /dev/null @@ -1,228 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_TBB_ApplyTask_hpp -#define __TSQR_TBB_ApplyTask_hpp - -#include -#include "TbbTsqr_Partitioner.hpp" -#include "Tsqr_SequentialTsqr.hpp" - -namespace TSQR { - namespace TBB { - - /// \class ApplyTask - /// \brief TBB task for recursive TSQR "apply Q factor" phase. 
- /// - template< class LocalOrdinal, class Scalar, class TimerType > - class ApplyTask : public tbb::task { - public: - typedef MatView mat_view_type; - typedef MatView const_mat_view_type; - typedef std::pair split_t; - typedef std::pair const_split_t; - typedef std::pair top_blocks_t; - typedef std::vector array_top_blocks_t; - - /// \typedef SeqOutput - /// Result of SequentialTsqr for each thread. - typedef typename SequentialTsqr::FactorOutput SeqOutput; - /// \typedef ParOutput - /// - /// Array of ncores "local tau arrays" from parallel TSQR. - /// (Local Q factors are stored in place.) - typedef std::vector > ParOutput; - /// \typedef FactorOutput - /// Result of SequentialTsqr for the data on each thread, - /// and the result of combining the threads' data. - typedef typename std::pair, ParOutput> FactorOutput; - - /// \brief Constructor. - /// - /// \note The timing references are only modified by one thread - /// at a time; recursive calls use distinct references and - /// combine the results. 
- ApplyTask (const size_t P_first__, - const size_t P_last__, - const_mat_view_type Q, - mat_view_type C, - array_top_blocks_t& top_blocks, - const FactorOutput& factor_output, - const SequentialTsqr& seq, - double& my_seq_timing, - double& min_seq_timing, - double& max_seq_timing, - const bool contiguous_cache_blocks) : - P_first_ (P_first__), - P_last_ (P_last__), - Q_ (Q), - C_ (C), - top_blocks_ (top_blocks), - factor_output_ (factor_output), - seq_ (seq), - apply_type_ (ApplyType::NoTranspose), // FIXME: modify to support Q^T and Q^H - my_seq_timing_ (my_seq_timing), - min_seq_timing_ (min_seq_timing), - max_seq_timing_ (max_seq_timing), - contiguous_cache_blocks_ (contiguous_cache_blocks) - {} - - tbb::task* execute () - { - if (P_first_ > P_last_ || Q_.empty() || C_.empty()) - return NULL; - else if (P_first_ == P_last_) - { - execute_base_case (); - return NULL; - } - else - { - // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] - const size_t P_mid = (P_first_ + P_last_) / 2; - const_split_t Q_split = - partitioner_.split (Q_, P_first_, P_mid, P_last_, - contiguous_cache_blocks_); - split_t C_split = - partitioner_.split (C_, P_first_, P_mid, P_last_, - contiguous_cache_blocks_); - - // The partitioner may decide that the current blocks Q_ - // and C_ have too few rows to be worth splitting. In - // that case, Q_split.second and C_split.second (the - // bottom block) will be empty. We can deal with this by - // treating it as the base case. 
- if (Q_split.second.empty() || Q_split.second.extent(0) == 0) - { - execute_base_case (); - return NULL; - } - - double top_timing; - double top_min_timing = 0.0; - double top_max_timing = 0.0; - double bot_timing; - double bot_min_timing = 0.0; - double bot_max_timing = 0.0; - - apply_pair (P_first_, P_mid+1); - ApplyTask& topTask = *new( allocate_child() ) - ApplyTask (P_first_, P_mid, Q_split.first, C_split.first, - top_blocks_, factor_output_, seq_, - top_timing, top_min_timing, top_max_timing, - contiguous_cache_blocks_); - ApplyTask& botTask = *new( allocate_child() ) - ApplyTask (P_mid+1, P_last_, Q_split.second, C_split.second, - top_blocks_, factor_output_, seq_, - bot_timing, bot_min_timing, bot_max_timing, - contiguous_cache_blocks_); - - set_ref_count (3); // 3 children (2 + 1 for the wait) - spawn (topTask); - spawn_and_wait_for_all (botTask); - - top_min_timing = (top_min_timing == 0.0) ? top_timing : top_min_timing; - top_max_timing = (top_max_timing == 0.0) ? top_timing : top_max_timing; - - bot_min_timing = (bot_min_timing == 0.0) ? bot_timing : bot_min_timing; - bot_max_timing = (bot_max_timing == 0.0) ? 
bot_timing : bot_max_timing; - - min_seq_timing_ = std::min (top_min_timing, bot_min_timing); - max_seq_timing_ = std::min (top_max_timing, bot_max_timing); - - return NULL; - } - } - - private: - size_t P_first_, P_last_; - const_mat_view_type Q_; - mat_view_type C_; - array_top_blocks_t& top_blocks_; - const FactorOutput& factor_output_; - SequentialTsqr seq_; - TSQR::ApplyType apply_type_; - TSQR::Combine combine_; - Partitioner partitioner_; - double& my_seq_timing_; - double& min_seq_timing_; - double& max_seq_timing_; - bool contiguous_cache_blocks_; - - void - execute_base_case () - { - TimerType timer(""); - timer.start(); - const std::vector& seq_outputs = factor_output_.first; - seq_.apply (apply_type_, Q_.extent(0), Q_.extent(1), - Q_.data(), Q_.stride(1), seq_outputs[P_first_], - C_.extent(1), C_.data(), C_.stride(1), - contiguous_cache_blocks_); - my_seq_timing_ = timer.stop(); - } - - void - apply_pair (const size_t P_top, - const size_t P_bot) - { - if (P_top == P_bot) - throw std::logic_error("apply_pair: should never get here!"); - - const_mat_view_type& Q_bot = top_blocks_[P_bot].first; - mat_view_type& C_top = top_blocks_[P_top].second; - mat_view_type& C_bot = top_blocks_[P_bot].second; - - const ParOutput& par_output = factor_output_.second; - const std::vector& tau = par_output[P_bot]; - std::vector work (C_top.extent(1)); - combine_.apply_pair (apply_type_, - C_top.extent(1), Q_bot.extent(1), - Q_bot.data(), Q_bot.stride(1), tau.data(), - C_top.data(), C_top.stride(1), - C_bot.data(), C_bot.stride(1), work.data()); - } - - }; - - } // namespace TBB -} // namespace TSQR - - -#endif // __TSQR_TBB_ApplyTask_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp deleted file mode 100644 index 8827a1ce4091..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp +++ /dev/null @@ -1,146 +0,0 @@ -//@HEADER -// 
************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_TBB_CacheBlockTask_hpp -#define __TSQR_TBB_CacheBlockTask_hpp - -#include -#include "TbbTsqr_Partitioner.hpp" -#include "Tsqr_SequentialTsqr.hpp" - -namespace TSQR { - namespace TBB { - /// \class CacheBlockTask - /// \brief TBB task for recursive TSQR cache blocking phase. - /// - /// "Cache blocking" here means copying the input matrix, which is - /// stored with noncontiguous cache blocks, to the output matrix, - /// which is stored with contiguous cache blocks. - template - class CacheBlockTask : public tbb::task { - public: - typedef MatView mat_view_type; - typedef MatView const_mat_view_type; - typedef std::pair split_t; - typedef std::pair const_split_t; - - CacheBlockTask (const size_t P_first__, - const size_t P_last__, - mat_view_type& A_out, - const_mat_view_type& A_in, - const SequentialTsqr& seq) : - P_first_ (P_first__), - P_last_ (P_last__), - A_out_ (A_out), - A_in_ (A_in), - seq_ (seq) - {} - - tbb::task* execute () - { - using tbb::task; - - if (P_first_ > P_last_ || A_out_.empty() || A_in_.empty()) - return nullptr; - else if (P_first_ == P_last_) - { - execute_base_case (); - return nullptr; - } - else - { - // Recurse on two intervals: [P_first, P_mid] and - // [P_mid+1, P_last]. - const size_t P_mid = (P_first_ + P_last_) / 2; - split_t out_split = - partitioner_.split (A_out_, P_first_, P_mid, P_last_, true); - const_split_t in_split = - partitioner_.split (A_in_, P_first_, P_mid, P_last_, false); - - // The partitioner may decide that the current blocks - // A_out_ and A_in_ have too few rows to be worth - // splitting. (It should split both A_out_ and A_in_ in - // the same way.) In that case, out_split.second and - // in_split.second (the bottom block) will be empty. We - // can deal with this by treating it as the base case. 
- if (out_split.second.empty() || out_split.second.extent(0) == 0) - { - execute_base_case (); - return nullptr; - } - - // "c": continuation task - tbb::empty_task& c = - *new( allocate_continuation() ) tbb::empty_task; - // Recurse on the split - CacheBlockTask& topTask = *new( c.allocate_child() ) - CacheBlockTask (P_first_, P_mid, out_split.first, - in_split.first, seq_); - CacheBlockTask& botTask = *new( c.allocate_child() ) - CacheBlockTask (P_mid+1, P_last_, out_split.second, - in_split.second, seq_); - // Set reference count of parent (in this case, the - // continuation task) to 2 (since 2 children -- no - // additional task since no waiting). - c.set_ref_count (2); - c.spawn (botTask); - return &topTask; // scheduler bypass optimization - } - } - - private: - size_t P_first_, P_last_; - mat_view_type A_out_; - const_mat_view_type A_in_; - SequentialTsqr seq_; - Partitioner partitioner_; - - void - execute_base_case () - { - seq_.cache_block (A_out_.extent(0), A_out_.extent(1), - A_out_.data(), A_in_.data(), A_in_.stride(1)); - } - }; - - } // namespace TBB -} // namespace TSQR - - -#endif // __TSQR_TBB_CacheBlockTask_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp deleted file mode 100644 index b0ce1e40f6c2..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp +++ /dev/null @@ -1,147 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. 
Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_TBB_ExplicitQTask_hpp -#define __TSQR_TBB_ExplicitQTask_hpp - -#include -#include "TbbTsqr_Partitioner.hpp" -#include "Tsqr_SequentialTsqr.hpp" - -namespace TSQR { - namespace TBB { - /// \class ExplicitQTask - /// \brief TBB task for recursive TSQR "compute explicit Q" phase. 
- template< class LocalOrdinal, class Scalar > - class ExplicitQTask : public tbb::task { - public: - typedef MatView mat_view_type; - typedef MatView const_mat_view_type; - - private: - typedef std::pair split_t; - typedef std::pair const_split_t; - - public: - ExplicitQTask (const size_t P_first__, - const size_t P_last__, - mat_view_type Q_out, - const SequentialTsqr& seq, - const bool contiguous_cache_blocks) : - P_first_ (P_first__), P_last_ (P_last__), Q_out_ (Q_out), - seq_ (seq), contiguous_cache_blocks_ (contiguous_cache_blocks) - {} - - tbb::task* execute () - { - if (P_first_ > P_last_ || Q_out_.empty ()) { - return NULL; - } - else if (P_first_ == P_last_) { - execute_base_case (); - return NULL; - } - else { - // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] - const size_t P_mid = (P_first_ + P_last_) / 2; - split_t Q_split = - partitioner_.split (Q_out_, P_first_, P_mid, P_last_, - contiguous_cache_blocks_); - // The partitioner may decide that the current block Q_out - // has too few rows to be worth splitting. In that case, - // Q_split.second (the bottom block) will be empty. We - // can deal with this by treating it as the base case. - if (Q_split.second.empty() || Q_split.second.extent(0) == 0) { - execute_base_case (); - return NULL; - } - - // "c": continuation task - tbb::empty_task& c = - *new( allocate_continuation() ) tbb::empty_task; - // Recurse on the split - ExplicitQTask& topTask = *new( c.allocate_child() ) - ExplicitQTask (P_first_, P_mid, Q_split.first, seq_, - contiguous_cache_blocks_); - ExplicitQTask& botTask = *new( c.allocate_child() ) - ExplicitQTask (P_mid+1, P_last_, Q_split.second, seq_, - contiguous_cache_blocks_); - // Set reference count of parent (in this case, the - // continuation task) to 2 (since 2 children -- no - // additional task since no waiting). 
- c.set_ref_count (2); - c.spawn (botTask); - return &topTask; // scheduler bypass optimization - } - } - - private: - size_t P_first_, P_last_; - mat_view_type Q_out_; - SequentialTsqr seq_; - Partitioner partitioner_; - bool contiguous_cache_blocks_; - - void - execute_base_case () - { - // Fill my partition with zeros. - seq_.fill_with_zeros (Q_out_.extent(0), Q_out_.extent(1), - Q_out_.data(), Q_out_.stride(1), - contiguous_cache_blocks_); - // If our partition is the first (topmost), fill it with - // the first Q_out.extent(1) columns of the identity matrix. - if (P_first_ == 0) { - // Fetch the topmost cache block of my partition. Its - // leading dimension should be set correctly by - // top_block(). - mat_view_type Q_out_top = - seq_.top_block (Q_out_, contiguous_cache_blocks_); - // Set the top block of Q_out to the first ncols - // columns of the identity matrix. - for (LocalOrdinal j = 0; j < Q_out_top.extent(1); ++j) { - Q_out_top(j,j) = Scalar(1); - } - } - } - }; - } // namespace TBB -} // namespace TSQR - -#endif // __TSQR_TBB_ExplicitQTask_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp deleted file mode 100644 index e03757db9e18..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp +++ /dev/null @@ -1,231 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_TBB_FactorTask_hpp -#define __TSQR_TBB_FactorTask_hpp - -#include -#include "TbbTsqr_Partitioner.hpp" -#include "Tsqr_SequentialTsqr.hpp" -#include "Teuchos_Assert.hpp" -#include - -namespace TSQR { - namespace TBB { - /// \class FactorTask - /// \brief TBB task for recursive TSQR factorization phase. - template - class FactorTask : public tbb::task { - public: - typedef MatView mat_view_type; - typedef MatView const_mat_view_type; - typedef std::pair split_t; - typedef std::pair const_split_t; - - /// \typedef SeqOutput - /// Result of SequentialTsqr for each thread. 
- typedef typename SequentialTsqr::FactorOutput SeqOutput; - /// \typedef ParOutput - /// - /// Array of ncores "local tau arrays" from parallel TSQR. - /// (Local Q factors are stored in place.) - typedef std::vector > ParOutput; - /// \typedef FactorOutput - /// Result of SequentialTsqr for the data on each thread, - /// and the result of combining the threads' data. - typedef typename std::pair, ParOutput> FactorOutput; - - /// \brief Constructor. - /// - /// \note The timing references are only modified by one thread - /// at a time; recursive calls use distinct references and - /// combine the results. - FactorTask (const size_t P_first__, - const size_t P_last__, - mat_view_type A, - mat_view_type* const A_top_ptr, - std::vector& seq_outputs, - ParOutput& par_output, - const SequentialTsqr& seq, - double& my_seq_timing, - double& min_seq_timing, - double& max_seq_timing, - const bool contiguous_cache_blocks) : - P_first_ (P_first__), - P_last_ (P_last__), - A_ (A), - A_top_ptr_ (A_top_ptr), - seq_outputs_ (seq_outputs), - par_output_ (par_output), - seq_ (seq), - contiguous_cache_blocks_ (contiguous_cache_blocks), - my_seq_timing_ (my_seq_timing), - min_seq_timing_ (min_seq_timing), - max_seq_timing_ (max_seq_timing) - {} - - tbb::task* execute () - { - if (P_first_ > P_last_ || A_.empty()) - return NULL; - else if (P_first_ == P_last_) - { - execute_base_case (); - return NULL; - } - else - { - // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] - const size_t P_mid = (P_first_ + P_last_) / 2; - split_t A_split = - partitioner_.split (A_, P_first_, P_mid, P_last_, - contiguous_cache_blocks_); - // The partitioner may decide that the current block A_ - // has too few rows to be worth splitting. In that case, - // A_split.second (the bottom block) will be empty. We - // can deal with this by treating it as the base case. 
- if (A_split.second.empty() || A_split.second.extent(0) == 0) - { - execute_base_case (); - return NULL; - } - - double top_timing; - double top_min_timing = 0.0; - double top_max_timing = 0.0; - double bot_timing; - double bot_min_timing = 0.0; - double bot_max_timing = 0.0; - - FactorTask& topTask = *new( allocate_child() ) - FactorTask (P_first_, P_mid, A_split.first, A_top_ptr_, - seq_outputs_, par_output_, seq_, - top_timing, top_min_timing, top_max_timing, - contiguous_cache_blocks_); - // After the task finishes, A_bot will be set to the topmost - // partition of A_split.second. This will let us combine - // the two subproblems (using factor_pair()) after their - // tasks complete. - mat_view_type A_bot; - FactorTask& botTask = *new( allocate_child() ) - FactorTask (P_mid+1, P_last_, A_split.second, &A_bot, - seq_outputs_, par_output_, seq_, - bot_timing, bot_min_timing, bot_max_timing, - contiguous_cache_blocks_); - set_ref_count (3); // 3 children (2 + 1 for the wait) - spawn (topTask); - spawn_and_wait_for_all (botTask); - - // Combine the two results - factor_pair (P_first_, P_mid+1, *A_top_ptr_, A_bot); - - top_min_timing = (top_min_timing == 0.0) ? top_timing : top_min_timing; - top_max_timing = (top_max_timing == 0.0) ? top_timing : top_max_timing; - - bot_min_timing = (bot_min_timing == 0.0) ? bot_timing : bot_min_timing; - bot_max_timing = (bot_max_timing == 0.0) ? 
bot_timing : bot_max_timing; - - min_seq_timing_ = std::min (top_min_timing, bot_min_timing); - max_seq_timing_ = std::min (top_max_timing, bot_max_timing); - - return NULL; - } - } - - private: - const size_t P_first_, P_last_; - mat_view_type A_; - mat_view_type* const A_top_ptr_; - std::vector& seq_outputs_; - ParOutput& par_output_; - SequentialTsqr seq_; - TSQR::Combine combine_; - Partitioner partitioner_; - const bool contiguous_cache_blocks_; - double& my_seq_timing_; - double& min_seq_timing_; - double& max_seq_timing_; - - void - factor_pair (const size_t P_top, - const size_t P_bot, - mat_view_type& A_top, // different than A_top_ - mat_view_type& A_bot) - { - const char thePrefix[] = "TSQR::TBB::Factor::factor_pair: "; - TEUCHOS_TEST_FOR_EXCEPTION - (P_top == P_bot, std::logic_error, thePrefix << "Should " - "never get here! P_top == P_bot (= " << P_top << "), that " - "is, the indices of the thread partitions are the same."); - // We only read and write the upper ncols x ncols triangle of - // each block. - TEUCHOS_TEST_FOR_EXCEPTION - (A_top.extent(1) != A_bot.extent(1), std::logic_error, - thePrefix << "The top cache block A_top is " - << A_top.extent(0) << " x " << A_top.extent(1) - << ", and the bottom cache block A_bot is " - << A_bot.extent(0) << " x " << A_bot.extent(1) - << "; this means we can't factor [A_top; A_bot]."); - const LocalOrdinal ncols = A_top.extent(1); - std::vector& tau = par_output_[P_bot]; - std::vector work (ncols); - combine_.factor_pair (A_top, A_bot, tau.data(), work.data()); - } - - void - execute_base_case () - { - TimerType timer(""); - timer.start(); - seq_outputs_[P_first_] = - seq_.factor (A_.extent(0), A_.extent(1), A_.data(), - A_.stride(1), contiguous_cache_blocks_); - // Assign the topmost cache block of the current partition to - // *A_top_ptr_. Every base case invocation does this, so that - // we can combine subproblems. 
The root task also does this, - // but for a different reason: so that we can extract the R - // factor, once we're done with the factorization. - *A_top_ptr_ = seq_.top_block (A_, contiguous_cache_blocks_); - my_seq_timing_ = timer.stop(); - } - }; - } // namespace TBB -} // namespace TSQR - -#endif // __TSQR_TBB_FactorTask_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp deleted file mode 100644 index 8bc0f42264a7..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp +++ /dev/null @@ -1,135 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_TBB_FillWithZerosTask_hpp -#define __TSQR_TBB_FillWithZerosTask_hpp - -#include -#include "TbbTsqr_Partitioner.hpp" -#include "Tsqr_SequentialTsqr.hpp" - -namespace TSQR { - namespace TBB { - /// \class FillWithZerosTask - /// \brief TBB task for recursive TSQR "fill with zeros" phase. - template - class FillWithZerosTask : public tbb::task { - public: - typedef MatView mat_view_type; - - private: - typedef std::pair split_type; - - public: - FillWithZerosTask (const size_t P_first, - const size_t P_last, - mat_view_type C, - const SequentialTsqr& seq, - const bool contiguous_cache_blocks = false) - : P_first_ (P_first), - P_last_ (P_last), - C_ (C), - seq_ (seq), - contiguous_cache_blocks_ (contiguous_cache_blocks) - {} - - tbb::task* execute () - { - if (P_first_ > P_last_ || C_.empty()) { - return nullptr; - } - else if (P_first_ == P_last_) { - execute_base_case (); - return nullptr; - } - else { - // Recurse on two intervals: [P_first, P_mid] and - // [P_mid+1, P_last]. - const size_t P_mid = (P_first_ + P_last_) / 2; - split_type C_split = - partitioner_.split (C_, P_first_, P_mid, P_last_, - contiguous_cache_blocks_); - // The partitioner may decide that the current block C_ - // has too few rows to be worth splitting. In that case, - // C_split.second (the bottom block) will be empty. 
We - // can deal with this by treating it as the base case. - if (C_split.second.empty() || C_split.second.extent(0) == 0) { - execute_base_case (); - return nullptr; - } - - // "c": continuation task - tbb::empty_task& c = - *new( allocate_continuation() ) tbb::empty_task; - // Recurse on the split - FillWithZerosTask& topTask = *new( c.allocate_child() ) - FillWithZerosTask (P_first_, P_mid, C_split.first, seq_, - contiguous_cache_blocks_); - FillWithZerosTask& botTask = *new( c.allocate_child() ) - FillWithZerosTask (P_mid+1, P_last_, C_split.second, seq_, - contiguous_cache_blocks_); - // Set reference count of parent (in this case, the - // continuation task) to 2 (since 2 children -- no - // additional task since no waiting). - c.set_ref_count (2); - c.spawn (botTask); - return &topTask; // scheduler bypass optimization - } - } - - private: - size_t P_first_, P_last_; - mat_view_type C_; - SequentialTsqr seq_; - Partitioner partitioner_; - bool contiguous_cache_blocks_; - - void - execute_base_case () - { - // Fill my partition with zeros. - seq_.fill_with_zeros (C_.extent(0), C_.extent(1), C_.data(), - C_.stride(1), contiguous_cache_blocks_); - } - }; - } // namespace TBB -} // namespace TSQR - - -#endif // __TSQR_TBB_FillWithZerosTask_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp b/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp deleted file mode 100644 index f37ab6a7a06c..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp +++ /dev/null @@ -1,137 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_TBB_Partitioner_hpp -#define __TSQR_TBB_Partitioner_hpp - -#include "Tsqr_MatView.hpp" - -#include // size_t -#include -#include -#include -#include - -namespace TSQR { - namespace TBB { - template - class Partitioner { - private: - bool - should_split (const Ordinal nrows, - const Ordinal ncols, - const size_t num_partitions) const - { - using std::invalid_argument; - using std::ostringstream; - - if (nrows < ncols) { - ostringstream os; - os << "Partitioner::should_split: nrows (= " << nrows - << ") < ncols (= " << ncols << ")"; - throw invalid_argument (os.str()); - } - else if (num_partitions == 0) { - ostringstream os; - os << "Partitioner::should_split: nrows (= " << nrows - << ") < ncols (= " << ncols << ")"; - throw invalid_argument (os.str()); - } - // FIXME (mfh 11 Jul 2010) Need more overflow checks here. - return static_cast(nrows) / num_partitions >= static_cast(ncols); - } - - public: - /// Partition into [P_first, P_mid] and [P_mid+1, P_last]. The - /// base case is reached when the second returned MatrixViewType - /// is empty. - template< class MatrixViewType > - std::pair< MatrixViewType, MatrixViewType > - split (const MatrixViewType& A, - const size_t P_first, - const size_t P_mid, - const size_t P_last, - const bool contiguous_cache_blocks) const - { - using ordinal_type = typename MatrixViewType::ordinal_type; - using pointer_type = typename MatrixViewType::pointer; - - const size_t num_partitions_top = P_mid - P_first + 1; - //const size_t num_partitions_bottom = P_last - P_mid; - const size_t num_partitions = P_last - P_first + 1; - const ordinal_type nrows = A.extent(0); - const ordinal_type ncols = A.extent(1); - - if (! 
should_split (nrows, ncols, num_partitions)) { - return std::make_pair (MatrixViewType(A), MatrixViewType()); - } - else { - const ordinal_type num_rows_partition = nrows / num_partitions; - const ordinal_type remainder = nrows % num_partitions; - - // Top partition gets the remainder rows. Doing the - // multiplication before the division might make it more - // likely to avoid truncating the fraction, but may cause - // overflow of ordinal_type. - const ordinal_type num_rows_top = - num_rows_partition * num_partitions_top + remainder; - const ordinal_type num_rows_bot = nrows - num_rows_top; - - // We don't call (const_)mat_view::split_top(), because that - // is for splitting off a single cache block. Each half - // of the split may contain more than one cache block. - if (contiguous_cache_blocks) { - pointer_type A_bot_ptr = A.data() + num_rows_top * ncols; - MatrixViewType A_top (num_rows_top, ncols, A.data(), num_rows_top); - MatrixViewType A_bot (num_rows_bot, ncols, A_bot_ptr, num_rows_bot); - return std::make_pair (A_top, A_bot); - } - else { - pointer_type A_bot_ptr = A.data() + num_rows_top; - MatrixViewType A_top (num_rows_top, ncols, A.data(), A.stride(1)); - MatrixViewType A_bot (num_rows_bot, ncols, A_bot_ptr, A.stride(1)); - return std::make_pair (A_top, A_bot); - } - } - } - }; // class Partitioner - } // namespace TBB -} // namespace TSQR - -#endif // __TSQR_TBB_Partitioner_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp deleted file mode 100644 index 7a3162b2f9a4..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp +++ /dev/null @@ -1,153 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. 
Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_TBB_RevealRankTask_hpp -#define __TSQR_TBB_RevealRankTask_hpp - -#include -#include "TbbTsqr_Partitioner.hpp" -#include "Tsqr_SequentialTsqr.hpp" - -namespace TSQR { - namespace TBB { - /// \class RevealRankTask - /// \brief TBB task for recursive TSQR "rank-revealing" phase. 
- /// - /// This part of the factorization doesn't actually reveal the - /// rank in parallel; we assume that this has already been done - /// and the columns of U form a basis for the column space of the - /// R factor (in the QR factorization of the original matrix). - /// All we need to do here is compute Q*U in parallel, respecting - /// the original partitioning and cache blocking scheme. - template - class RevealRankTask : public tbb::task { - public: - typedef MatView mat_view_type; - typedef MatView const_mat_view_type; - typedef std::pair split_type; - typedef SequentialTsqr seq_tsqr_type; - - RevealRankTask (const size_t P_first, - const size_t P_last, - const mat_view_type& Q, - const const_mat_view_type& U, - const seq_tsqr_type& seq, - const bool contiguous_cache_blocks) : - P_first_ (P_first), - P_last_ (P_last), - Q_ (Q), - U_ (U), - seq_ (seq), - contiguous_cache_blocks_ (contiguous_cache_blocks) - {} - - void - execute_base_case () - { - // Use SequentialTsqr to compute Q*U for this core's local - // part of Q. The method is called "Q_times_B" so that it - // doesn't suggest any orthogonality of the B input matrix, - // though in this case B is U and U is orthogonal - // (resp. unitary if Scalar is complex). - seq_.Q_times_B (Q_.extent(0), Q_.extent(1), - Q_.data(), Q_.stride(1), - U_.data(), U_.stride(1), - contiguous_cache_blocks_); - } - - tbb::task* execute () - { - using tbb::task; - - if (P_first_ > P_last_ || Q_.empty()) { - return nullptr; // shouldn't get here, but just in case... - } - else if (P_first_ == P_last_) { - execute_base_case (); - return nullptr; - } - else { - // Recurse on two intervals: [P_first, P_mid] and - // [P_mid+1, P_last] - const size_t P_mid = (P_first_ + P_last_) / 2; - split_type out_split = - partitioner_.split (Q_, P_first_, P_mid, P_last_, - contiguous_cache_blocks_); - // The partitioner may decide that the current block Q_ has - // too few rows to be worth splitting. 
In that case, - // out_split.second (the bottom block) will be empty. We - // can deal with this by treating it as the base case. - if (out_split.second.empty() || out_split.second.extent(0) == 0) { - execute_base_case (); - return nullptr; - } - - // "c": continuation task - tbb::empty_task& c = - *new( allocate_continuation() ) tbb::empty_task; - // Recurse on the split - RevealRankTask& topTask = *new( c.allocate_child() ) - RevealRankTask (P_first_, P_mid, out_split.first, U_, - seq_, contiguous_cache_blocks_); - RevealRankTask& botTask = *new( c.allocate_child() ) - RevealRankTask (P_mid+1, P_last_, out_split.second, U_, - seq_, contiguous_cache_blocks_); - // Set reference count of parent (in this case, the - // continuation task) to 2 (since 2 children -- no - // additional task since no waiting). - c.set_ref_count (2); - c.spawn (botTask); - return &topTask; // scheduler bypass optimization - } - } - - private: - size_t P_first_, P_last_; - mat_view_type Q_; - const_mat_view_type U_; - SequentialTsqr seq_; - Partitioner partitioner_; - bool contiguous_cache_blocks_; - }; - - } // namespace TBB -} // namespace TSQR - - -#endif // __TSQR_TBB_RevealRankTask_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbMgs.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbMgs.hpp deleted file mode 100644 index 53a473d2e5f7..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbMgs.hpp +++ /dev/null @@ -1,409 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. 
Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_TBB_TbbMgs_hpp -#define __TSQR_TBB_TbbMgs_hpp - -#include -#include -#include -#include -#include // std::pair - -#include "Tsqr_MessengerBase.hpp" -#include "Teuchos_ScalarTraits.hpp" -#include "Tsqr_Util.hpp" -#include "Teuchos_RCP.hpp" - -#include -#include -#include -#include - -namespace TSQR { - namespace TBB { - - // Forward declaration - template< class LocalOrdinal, class Scalar > - class TbbMgs { - public: - typedef Scalar scalar_type; - typedef LocalOrdinal ordinal_type; - typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; - typedef MessengerBase< Scalar > messenger_type; - typedef Teuchos::RCP< messenger_type > messenger_ptr; - - TbbMgs (const messenger_ptr& messenger) : - messenger_ (messenger) {} - - void - mgs (const LocalOrdinal nrows_local, - const LocalOrdinal ncols, - Scalar A_local[], - const LocalOrdinal lda_local, - Scalar R[], - const LocalOrdinal ldr); - - private: - messenger_ptr messenger_; - }; - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - - namespace details { - - /// Compute y'*x (where y' means conjugate transpose in the - /// complex case, and transpose in the real case). - template< class LocalOrdinal, class Scalar > - class TbbDot { - public: - void - operator() (const tbb::blocked_range< LocalOrdinal >& r) - { - typedef Teuchos::ScalarTraits STS; - - // The TBB book likes this copying of pointers into the local routine. - // It probably helps the compiler do optimizations. - const Scalar* const x = x_; - const Scalar* const y = y_; - Scalar local_result = result_; - - for (LocalOrdinal i = r.begin(); i != r.end(); ++i) { - local_result += x[i] * STS::conjugate (y[i]); - } - result_ = local_result; - } - /// Result of the reduction. 
- Scalar result() const { return result_; } - - /// Ordinary constructor - TbbDot (const Scalar* const x, const Scalar* const y) : - result_ (Scalar(0)), x_ (x), y_ (y) {} - - /// "Split constructor" for TBB reductions - TbbDot (TbbDot& d, tbb::split) : - result_ (Scalar(0)), x_ (d.x_), y_ (d.y_) - {} - /// "Join" operator for TBB reductions; it tells TBB how to - /// combine two subproblems. - void join (const TbbDot& d) { result_ += d.result(); } - - private: - // Default constructor doesn't make sense. - TbbDot (); - - Scalar result_; - const Scalar* const x_; - const Scalar* const y_; - }; - - template< class LocalOrdinal, class Scalar > - class TbbScale { - public: - TbbScale (Scalar* const x, const Scalar& denom) : - x_ (x), denom_ (denom) {} - - // TBB demands that this be a "const" operator, in order for - // the parallel_for expression to compile. Strictly speaking, - // it is const, because it does not change the address of the - // pointer x_ (only the values stored there). - void - operator() (const tbb::blocked_range< LocalOrdinal >& r) const - { - // TBB likes arrays to have their pointers copied like this in - // the operator() method. I suspect it has something to do - // with compiler optimizations. If C++ supported the - // "restrict" keyword, here would be a good place to add it... - Scalar* const x = x_; - const Scalar denom = denom_; - for (LocalOrdinal i = r.begin(); i != r.end(); ++i) - x[i] = x[i] / denom; - } - private: - Scalar* const x_; - const Scalar denom_; - }; - - template< class LocalOrdinal, class Scalar > - class TbbAxpy { - public: - TbbAxpy (const Scalar& alpha, const Scalar* const x, Scalar* const y) : - alpha_ (alpha), x_ (x), y_ (y) - {} - // TBB demands that this be a "const" operator, in order for - // the parallel_for expression to compile. Strictly speaking, - // it is const, because it does change the address of the - // pointer y_ (only the values stored there). 
- void - operator() (const tbb::blocked_range< LocalOrdinal >& r) const - { - const Scalar alpha = alpha_; - const Scalar* const x = x_; - Scalar* const y = y_; - for (LocalOrdinal i = r.begin(); i != r.end(); ++i) - y[i] = y[i] + alpha * x[i]; - } - private: - const Scalar alpha_; - const Scalar* const x_; - Scalar* const y_; - }; - - template< class LocalOrdinal, class Scalar > - class TbbNormSquared { - private: - typedef Teuchos::ScalarTraits STS; - - public: - typedef typename STS::magnitudeType magnitude_type; - - void - operator () (const tbb::blocked_range& r) - { - // Doing the right thing in the complex case requires taking - // an absolute value. We want to avoid this additional cost - // in the real case, which is why we check is_complex. - if (STS::isComplex) { - // The TBB book favors copying array pointers into the - // local routine. It probably helps the compiler do - // optimizations. - const Scalar* const x = x_; - for (LocalOrdinal i = r.begin(); i != r.end(); ++i) { - // One could implement this by computing - // - // result_ += STS::real (x[i] * STS::conjugate(x[i])); - // - // However, in terms of type theory, it's much more - // natural to start with a magnitude_type before - // doing the multiplication. 
- const magnitude_type xi = STS::magnitude (x[i]); - result_ += xi * xi; - } - } - else { - const Scalar* const x = x_; - for (LocalOrdinal i = r.begin(); i != r.end(); ++i) { - const Scalar xi = x[i]; - result_ += xi * xi; - } - } - } - - magnitude_type result () const { return result_; } - - TbbNormSquared (const Scalar* const x) : - result_ (magnitude_type(0)), x_ (x) {} - - TbbNormSquared (TbbNormSquared& d, tbb::split) : - result_ (magnitude_type(0)), x_ (d.x_) {} - - void join (const TbbNormSquared& d) { result_ += d.result (); } - - private: - // Default constructor doesn't make sense - TbbNormSquared (); - - magnitude_type result_; - const Scalar* const x_; - }; - - - template< class LocalOrdinal, class Scalar > - class TbbMgsOps { - private: - typedef tbb::blocked_range< LocalOrdinal > range_type; - typedef Teuchos::ScalarTraits STS; - - public: - typedef MessengerBase messenger_type; - typedef Teuchos::RCP messenger_ptr; - typedef typename STS::magnitudeType magnitude_type; - - TbbMgsOps (const messenger_ptr& messenger) : - messenger_ (messenger) {} - - void - axpy (const LocalOrdinal nrows_local, - const Scalar alpha, - const Scalar x_local[], - Scalar y_local[]) const - { - using tbb::auto_partitioner; - using tbb::parallel_for; - - TbbAxpy< LocalOrdinal, Scalar > axpyer (alpha, x_local, y_local); - parallel_for (range_type (0, nrows_local), axpyer, auto_partitioner ()); - } - - void - scale (const LocalOrdinal nrows_local, - Scalar x_local[], - const Scalar denom) const - { - using tbb::auto_partitioner; - using tbb::parallel_for; - - // "scaler" is spelled that way (and not as "scalar") on - // purpose. Think about it. - TbbScale scaler (x_local, denom); - parallel_for (range_type (0, nrows_local), scaler, auto_partitioner ()); - } - - /// $y^* \cdot x$: conjugate transpose when Scalar is complex, - /// else regular transpose. 
- Scalar - dot (const LocalOrdinal nrows_local, - const Scalar x_local[], - const Scalar y_local[]) - { - Scalar localResult (0); - if (true) - { - // FIXME (mfh 26 Aug 2010) I'm not sure why I did this - // (i.e., why I wrote "if (true)" here). Certainly the - // branch that is currently enabled should produce - // correct behavior. I suspect the nonenabled branch - // will not. - if (true) { - TbbDot dotter (x_local, y_local); - dotter (range_type (0, nrows_local)); - localResult = dotter.result (); - } - else { - using tbb::auto_partitioner; - using tbb::parallel_reduce; - - TbbDot dotter (x_local, y_local); - parallel_reduce (range_type (0, nrows_local), - dotter, auto_partitioner ()); - localResult = dotter.result (); - } - } - else { - for (LocalOrdinal i = 0; i != nrows_local; ++i) { - localResult += x_local[i] * STS::conjugate (y_local[i]); - } - } - - // FIXME (mfh 23 Apr 2010) Does MPI_SUM do the right thing for - // complex or otherwise general MPI data types? Perhaps an MPI_Op - // should belong in the MessengerBase... - return messenger_->globalSum (localResult); - } - - magnitude_type - norm2 (const LocalOrdinal nrows_local, - const Scalar x_local[]) - { - using tbb::auto_partitioner; - using tbb::parallel_reduce; - - TbbNormSquared< LocalOrdinal, Scalar > normer (x_local); - parallel_reduce (range_type (0, nrows_local), normer, - auto_partitioner ()); - const magnitude_type localResult = normer.result(); - // FIXME (mfh 12 Oct 2010) This involves an implicit - // typecast from Scalar to magnitude_type. - const magnitude_type globalResult = - messenger_->globalSum (localResult); - // Make sure that sqrt's argument is a magnitude_type. Of - // course global_result should be nonnegative real, but we - // want the compiler to pick up the correct sqrt function. 
- typedef Teuchos::ScalarTraits STM; - return STM::squareroot (globalResult); - } - - Scalar - project (const LocalOrdinal nrows_local, - const Scalar q_local[], - Scalar v_local[]) - { - const Scalar coeff = this->dot (nrows_local, v_local, q_local); - this->axpy (nrows_local, -coeff, q_local, v_local); - return coeff; - } - - private: - messenger_ptr messenger_; - }; - } // namespace details - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - - template - void - TbbMgs::mgs (const LocalOrdinal nrows_local, - const LocalOrdinal ncols, - Scalar A_local[], - const LocalOrdinal lda_local, - Scalar R[], - const LocalOrdinal ldr) - { - details::TbbMgsOps ops (messenger_); - - for (LocalOrdinal j = 0; j < ncols; ++j) { - Scalar* const v = &A_local[j*lda_local]; - for (LocalOrdinal i = 0; i < j; ++i) { - const Scalar* const q = &A_local[i*lda_local]; - R[i + j*ldr] = ops.project (nrows_local, q, v); - } - const magnitude_type denom = ops.norm2 (nrows_local, v); - - // FIXME (mfh 29 Apr 2010) - // - // NOTE IMPLICIT CAST. This should work for complex numbers. - // If it doesn't work for your Scalar data type, it means that - // you need a different data type for the diagonal elements of - // the R factor, than you need for the other elements. This - // is unlikely if we're comparing MGS against a Householder QR - // factorization; I don't really understand how the latter - // would work (not that it couldn't be given a sensible - // interpretation) in the case of Scalars that aren't plain - // old real or complex numbers. 
- R[j + j*ldr] = Scalar (denom); - ops.scale (nrows_local, v, denom); - } - } - } // namespace TBB -} // namespace TSQR - -#endif // __TSQR_TBB_TbbMgs_hpp - diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp deleted file mode 100644 index c86123c42d8b..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp +++ /dev/null @@ -1,690 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_TBB_TbbParallelTsqr_hpp -#define __TSQR_TBB_TbbParallelTsqr_hpp - -#include -#include -#include "TbbTsqr_FactorTask.hpp" -#include "TbbTsqr_ApplyTask.hpp" -#include "TbbTsqr_ExplicitQTask.hpp" -#include "TbbTsqr_RevealRankTask.hpp" -#include "TbbTsqr_CacheBlockTask.hpp" -#include "TbbTsqr_UnCacheBlockTask.hpp" -#include "TbbTsqr_FillWithZerosTask.hpp" -#include "Tsqr_ApplyType.hpp" -#include "Teuchos_ScalarTraits.hpp" -#include -#include - -namespace TSQR { - namespace TBB { - /// \class TbbParallelTsqr - /// \brief Parallel implementation of \c TbbTsqr. - /// \author Mark Hoemmen - /// - /// This class implements the functionality of \c TbbTsqr. - /// It is not meant to be seen by users of \c TbbTsqr. - /// - /// The third template parameter, TimerType, allows different - /// timer implementations. TbbParallelTsqr times each task's - /// invocations of \c SequentialTsqr::factor() and \c - /// SequentialTsqr::apply(). \c TrivialTimer is a "timer" that - /// does nothing, in case you don't want to invoke timers. 
- template - class TbbParallelTsqr { - private: - typedef MatView mat_view_type; - typedef MatView const_mat_view_type; - typedef std::pair split_t; - typedef std::pair const_split_t; - typedef std::pair top_blocks_t; - typedef std::vector array_top_blocks_t; - - template - MatrixViewType - top_block_helper (const size_t P_first, - const size_t P_last, - const MatrixViewType& C, - const bool contiguous_cache_blocks) const - { - if (P_first > P_last) - throw std::logic_error ("P_first > P_last"); - else if (P_first == P_last) - return seq_.top_block (C, contiguous_cache_blocks); - else - { - typedef std::pair split_type; - - // Divide [P_first, P_last] into two intervals: [P_first, - // P_mid] and [P_mid+1, P_last]. Recurse on the first - // interval [P_first, P_mid]. - const size_t P_mid = (P_first + P_last) / 2; - split_type C_split = partitioner_.split (C, P_first, P_mid, P_last, - contiguous_cache_blocks); - // The partitioner may decide that the current block C has - // too few rows to be worth splitting. In that case, - // C_split.first should be the same block as C, and - // C_split.second (the bottom block) will be empty. We - // deal with this in the same way as the base case - // (P_first == P_last) above. - if (C_split.second.empty() || C_split.second.extent(0) == 0) - return seq_.top_block (C_split.first, contiguous_cache_blocks); - else - return top_block_helper (P_first, P_mid, C_split.first, - contiguous_cache_blocks); - } - } - - public: - typedef Scalar scalar_type; - typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type; - typedef LocalOrdinal ordinal_type; - - /// Whether or not this QR factorization produces an R factor - /// with all nonnegative diagonal entries. - static bool QR_produces_R_factor_with_nonnegative_diagonal() { - typedef Combine combine_type; - return combine_type::QR_produces_R_factor_with_nonnegative_diagonal (); - } - - /// \typedef SeqOutput - /// \brief Results of SequentialTsqr for each core. 
- typedef typename SequentialTsqr::FactorOutput SeqOutput; - - /// \typedef ParOutput - /// \brief Array of numTasks_ "local tau arrays" from parallel TSQR. - /// - /// (Local Q factors are stored in place.) - typedef std::vector > ParOutput; - - /// \typedef FactorOutput - /// \brief Partial representation of the Q factor. - /// - /// The \c factor() method returns a pair: the results of - /// SequentialTsqr for data on each core, and the results of - /// combining the data on the cores. - typedef typename std::pair, ParOutput> FactorOutput; - - /// \brief Constructor. - /// - /// \param numTasks [in] Number of parallel tasks to use in the - /// factorization. This should be >= the number of cores with - /// which Intel TBB was initialized. - /// \param cacheSizeHint [in] Cache size hint in bytes. Zero - /// means that TSQR will pick a reasonable nonzero default. - TbbParallelTsqr (const size_t numTasks = 1, - const size_t cacheSizeHint = 0) : - seq_ (cacheSizeHint), - min_seq_factor_timing_ (std::numeric_limits::max()), - max_seq_factor_timing_ (std::numeric_limits::min()), - min_seq_apply_timing_ (std::numeric_limits::max()), - max_seq_apply_timing_ (std::numeric_limits::min()) - { - if (numTasks < 1) - numTasks_ = 1; // default is no parallelism - else - numTasks_ = numTasks; - } - - /// \brief Constructor (that takes a parameter list). - /// - /// \param plist [in/out] On input: list of parameters. On - /// output: missing parameters are filled in with default - /// values. - /// - /// For a list of accepted parameters and thei documentation, - /// see the parameter list returned by \c getValidParameters(). - TbbParallelTsqr (const Teuchos::RCP& plist) : - seq_ (plist), // SequentialTsqr has a plist-accepting constructor. - numTasks_ (1), // Set a safe default for now. 
- min_seq_factor_timing_ (std::numeric_limits::max()), - max_seq_factor_timing_ (std::numeric_limits::min()), - min_seq_apply_timing_ (std::numeric_limits::max()), - max_seq_apply_timing_ (std::numeric_limits::min()) - { - if (! plist.is_null()) { - const int defaultNumTasks = 1; // A reasonable safe default value. - int numTasks = plist->get ("Num Tasks", defaultNumTasks); - if (numTasks < 1) { // Default is no parallelism. - plist->set ("Num Tasks", defaultNumTasks); - } - numTasks_ = numTasks; - } - } - - Teuchos::RCP - getValidParameters () const - { - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::RCP; - - // TbbTsqr recursively divides the tall skinny matrix on the - // node into TBB tasks. Each task works on a block row. The - // TBB task scheduler ensures that oversubscribing TBB tasks - // won't oversubscribe cores, so it's OK if - // default_num_threads() is too many. For example, TBB might - // say default_num_threads() is the number of cores on the - // node, but the TBB task scheduler might have been - // initialized with the number of cores per NUMA region, for - // hybrid MPI + TBB parallelism. - const int numTasks = - tbb::task_scheduler_init::default_num_threads(); - const size_t cacheSizeHint = 0; - const size_t sizeOfScalar = sizeof(Scalar); - - RCP params = parameterList ("NodeTsqr"); - params->set ("Num Tasks", numTasks, - "Number of tasks to use in the intranode parallel part " - "TSQR. There is little/no performance penalty for mild " - "oversubscription, but a potential performance penalty " - "for undersubscription."); - params->set ("Cache Size Hint", cacheSizeHint, - "Cache size hint in bytes (as a size_t) to use for " - "intranode TSQR. If zero, TSQR will pick a reasonable " - "default. 
See the documentation of SequentialTsqr for " - "a discussion of how to tune this parameter."); - params->set ("Size of Scalar", sizeOfScalar); - - return params; - } - - void - setParameterList (const Teuchos::RCP& plist) - { - seq_.setParameterList (plist); - - if (! plist.is_null()) { - const int defaultNumCores = 1; // A reasonable safe default value. - int numTasks = plist->get ("Num Tasks", defaultNumCores); - if (numTasks < 1) { // Default is no parallelism. - plist->set ("Num Tasks", defaultNumCores); - } - numTasks_ = numTasks; - } - } - - /// \brief Number of tasks that TSQR will use to solve the problem. - /// - /// This is the number of subproblems into which to divide the - /// main problem, in order to solve it in parallel. - size_t ntasks() const { return numTasks_; } - - /// \brief Cache size hint (in bytes) used for the factorization. - /// - /// This may be different from the corresponding constructor - /// argument, because TSQR may revise unreasonable suggestions - /// into reasonable values. - size_t cache_size_hint() const { return seq_.cache_size_hint(); } - - //! Fastest time over all tasks of the last SequentialTsqr::factor() call. - double - min_seq_factor_timing () const { return min_seq_factor_timing_; } - //! Slowest time over all tasks of the last SequentialTsqr::factor() call. - double - max_seq_factor_timing () const { return max_seq_factor_timing_; } - //! Fastest time over all tasks of the last SequentialTsqr::apply() call. - double - min_seq_apply_timing () const { return min_seq_apply_timing_; } - //! Slowest time over all tasks of the last SequentialTsqr::apply() call. 
- double - max_seq_apply_timing () const { return max_seq_apply_timing_; } - - FactorOutput - factor (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A[], - const LocalOrdinal lda, - Scalar R[], - const LocalOrdinal ldr, - const bool contiguous_cache_blocks) const - { - using tbb::task; - - mat_view_type A_view (nrows, ncols, A, lda); - // A_top will be modified in place by exactly one task, to - // indicate the partition from which we may extract the R - // factor after finishing the factorization. - mat_view_type A_top; - - std::vector seq_output (ntasks()); - ParOutput par_output (ntasks(), std::vector(ncols)); - if (ntasks() < 1) - { - if (! A_view.empty()) - throw std::logic_error("Zero subproblems, but A not empty!"); - else // Return empty results - return std::make_pair (seq_output, par_output); - } - - double my_seq_timing = double(0); - double min_seq_timing = double(0); - double max_seq_timing = double(0); - try { - typedef FactorTask factor_task_t; - - // When the root task completes, A_top will be set to the - // topmost partition of A. We can then extract the R factor - // from A_top. - factor_task_t& root_task = *new( task::allocate_root() ) - factor_task_t(0, ntasks()-1, A_view, &A_top, seq_output, - par_output, seq_, my_seq_timing, min_seq_timing, - max_seq_timing, contiguous_cache_blocks); - task::spawn_root_and_wait (root_task); - } catch (tbb::captured_exception& ex) { - // TBB can't guarantee on all systems that an exception - // thrown in another thread will have its type correctly - // propagated to this thread. If it can't, then it captures - // the exception as a tbb:captured_exception, and propagates - // it to here. It may be able to propagate the exception, - // though, so be prepared for that. We deal with the latter - // case by allowing the exception to propagate. - std::ostringstream os; - os << "Intel TBB caught an exception, while computing the QR factor" - "ization of a matrix A. 
Unfortunately, its type information was " - "lost, because the exception was thrown in another thread. Its " - "\"what()\" function returns the following string: " << ex.what(); - throw std::runtime_error (os.str()); - } - - // Copy the R factor out of A_top into R. - seq_.extract_R (A_top.extent(0), A_top.extent(1), A_top.data(), - A_top.stride(1), R, ldr, contiguous_cache_blocks); - - // Save the timings for future reference - if (min_seq_timing < min_seq_factor_timing_) - min_seq_factor_timing_ = min_seq_timing; - if (max_seq_timing > max_seq_factor_timing_) - max_seq_factor_timing_ = max_seq_timing; - - return std::make_pair (seq_output, par_output); - } - - void - apply (const ApplyType& apply_type, - const LocalOrdinal nrows, - const LocalOrdinal ncols_Q, - const Scalar Q[], - const LocalOrdinal ldq, - const FactorOutput& factor_output, - const LocalOrdinal ncols_C, - Scalar C[], - const LocalOrdinal ldc, - const bool contiguous_cache_blocks) const - { - using tbb::task; - - if (apply_type.transposed()) - throw std::logic_error ("Applying Q^T and Q^H not implemented"); - - const_mat_view_type Q_view (nrows, ncols_Q, Q, ldq); - mat_view_type C_view (nrows, ncols_C, C, ldc); - if (! apply_type.transposed()) - { - array_top_blocks_t top_blocks (ntasks()); - build_partition_array (0, ntasks()-1, top_blocks, Q_view, - C_view, contiguous_cache_blocks); - double my_seq_timing = 0.0; - double min_seq_timing = 0.0; - double max_seq_timing = 0.0; - try { - typedef ApplyTask apply_task_t; - apply_task_t& root_task = - *new( task::allocate_root() ) - apply_task_t (0, ntasks()-1, Q_view, C_view, top_blocks, - factor_output, seq_, my_seq_timing, - min_seq_timing, max_seq_timing, - contiguous_cache_blocks); - task::spawn_root_and_wait (root_task); - } catch (tbb::captured_exception& ex) { - std::ostringstream os; - os << "Intel TBB caught an exception, while applying a Q factor " - "computed previously by factor() to the matrix C. 
Unfortunate" - "ly, its type information was lost, because the exception was " - "thrown in another thread. Its \"what()\" function returns th" - "e following string: " << ex.what(); - throw std::runtime_error (os.str()); - } - - // Save the timings for future reference - if (min_seq_timing < min_seq_apply_timing_) - min_seq_apply_timing_ = min_seq_timing; - if (max_seq_timing > max_seq_apply_timing_) - max_seq_apply_timing_ = max_seq_timing; - } - } - - - void - explicit_Q (const LocalOrdinal nrows, - const LocalOrdinal ncols_Q_in, - const Scalar Q_in[], - const LocalOrdinal ldq_in, - const FactorOutput& factor_output, - const LocalOrdinal ncols_Q_out, - Scalar Q_out[], - const LocalOrdinal ldq_out, - const bool contiguous_cache_blocks) const - { - using tbb::task; - - mat_view_type Q_out_view (nrows, ncols_Q_out, Q_out, ldq_out); - try { - typedef ExplicitQTask< LocalOrdinal, Scalar > explicit_Q_task_t; - explicit_Q_task_t& root_task = *new( task::allocate_root() ) - explicit_Q_task_t (0, ntasks()-1, Q_out_view, seq_, - contiguous_cache_blocks); - task::spawn_root_and_wait (root_task); - } catch (tbb::captured_exception& ex) { - std::ostringstream os; - os << "Intel TBB caught an exception, while preparing to compute" - " the explicit Q factor from a QR factorization computed previ" - "ously by factor(). Unfortunately, its type information was l" - "ost, because the exception was thrown in another thread. Its" - " \"what()\" function returns the following string: " - << ex.what(); - throw std::runtime_error (os.str()); - } - apply (ApplyType::NoTranspose, - nrows, ncols_Q_in, Q_in, ldq_in, factor_output, - ncols_Q_out, Q_out, ldq_out, - contiguous_cache_blocks); - } - - /// \brief Compute Q*B - /// - /// Compute matrix-matrix product Q*B, where Q is nrows by ncols - /// and B is ncols by ncols. Respect cache blocks of Q. 
- void - Q_times_B (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar Q[], - const LocalOrdinal ldq, - const Scalar B[], - const LocalOrdinal ldb, - const bool contiguous_cache_blocks) const - { - // Compute Q := Q*B in parallel. This works much like - // cache_block() (which see), in that each thread's instance - // does not need to communicate with the others. - try { - using tbb::task; - typedef RevealRankTask rrtask_type; - - mat_view_type Q_view (nrows, ncols, Q, ldq); - const_mat_view_type B_view (ncols, ncols, B, ldb); - - rrtask_type& root_task = *new( task::allocate_root() ) - rrtask_type (0, ntasks()-1, Q_view, B_view, seq_, - contiguous_cache_blocks); - task::spawn_root_and_wait (root_task); - } catch (tbb::captured_exception& ex) { - std::ostringstream os; - os << "Intel TBB caught an exception, while computing Q := Q*U. " - "Unfortunately, its type information was lost, because the " - "exception was thrown in another thread. Its \"what()\" function " - "returns the following string: " << ex.what(); - throw std::runtime_error (os.str()); - } - } - - - /// Compute SVD \f$R = U \Sigma V^*\f$, not in place. Use the - /// resulting singular values to compute the numerical rank of R, - /// with respect to the relative tolerance tol. If R is full - /// rank, return without modifying R. If R is not full rank, - /// overwrite R with \f$\Sigma \cdot V^*\f$. - /// - /// \return Numerical rank of R: 0 <= rank <= ncols. - LocalOrdinal - reveal_R_rank (const LocalOrdinal ncols, - Scalar R[], - const LocalOrdinal ldr, - Scalar U[], - const LocalOrdinal ldu, - const magnitude_type tol) const - { - return seq_.reveal_R_rank (ncols, R, ldr, U, ldu, tol); - } - - /// \brief Rank-revealing decomposition - /// - /// Using the R factor from factor() and the explicit Q factor - /// from explicit_Q(), compute the SVD of R (\f$R = U \Sigma - /// V^*\f$). R. If R is full rank (with respect to the given - /// relative tolerance tol), don't change Q or R. 
Otherwise, - /// compute \f$Q := Q \cdot U\f$ and \f$R := \Sigma V^*\f$ in - /// place (the latter may be no longer upper triangular). - /// - /// \return Rank \f$r\f$ of R: \f$ 0 \leq r \leq ncols\f$. - /// - LocalOrdinal - reveal_rank (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar Q[], - const LocalOrdinal ldq, - Scalar R[], - const LocalOrdinal ldr, - const magnitude_type tol, - const bool contiguous_cache_blocks = false) const - { - // Take the easy exit if available. - if (ncols == 0) - return 0; - - Matrix U (ncols, ncols, Scalar(0)); - const LocalOrdinal rank = - reveal_R_rank (ncols, R, ldr, U.data(), U.ldu(), tol); - - if (rank < ncols) { - // If R is not full rank: reveal_R_rank() already computed - // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and - // overwrote R with \f$\Sigma V^*\f$. Now, we compute \f$Q - // := Q \cdot U\f$, respecting cache blocks of Q. - Q_times_B (nrows, ncols, Q, ldq, U.data(), U.stride(1), - contiguous_cache_blocks); - } - return rank; - } - - void - cache_block (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A_out[], - const Scalar A_in[], - const LocalOrdinal lda_in) const - { - using tbb::task; - - const_mat_view_type A_in_view (nrows, ncols, A_in, lda_in); - // A_out won't have leading dimension lda_in, but that's OK, - // as long as all the routines are told that A_out is - // cache-blocked. - mat_view_type A_out_view (nrows, ncols, A_out, lda_in); - try { - typedef CacheBlockTask< LocalOrdinal, Scalar > cache_block_task_t; - cache_block_task_t& root_task = *new( task::allocate_root() ) - cache_block_task_t (0, ntasks()-1, A_out_view, A_in_view, seq_); - task::spawn_root_and_wait (root_task); - } catch (tbb::captured_exception& ex) { - std::ostringstream os; - os << "Intel TBB caught an exception, while cache-blocking a mat" - "rix. Unfortunately, its type information was lost, because t" - "he exception was thrown in another thread. 
Its \"what()\" fu" - "nction returns the following string: " << ex.what(); - throw std::runtime_error (os.str()); - } - } - - void - un_cache_block (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A_out[], - const LocalOrdinal lda_out, - const Scalar A_in[]) const - { - using tbb::task; - - // A_in doesn't have leading dimension lda_out, but that's OK, - // as long as all the routines are told that A_in is cache- - // blocked. - const_mat_view_type A_in_view (nrows, ncols, A_in, lda_out); - mat_view_type A_out_view (nrows, ncols, A_out, lda_out); - try { - typedef UnCacheBlockTask< LocalOrdinal, Scalar > un_cache_block_task_t; - un_cache_block_task_t& root_task = *new( task::allocate_root() ) - un_cache_block_task_t (0, ntasks()-1, A_out_view, A_in_view, seq_); - task::spawn_root_and_wait (root_task); - } catch (tbb::captured_exception& ex) { - std::ostringstream os; - os << "Intel TBB caught an exception, while un-cache-blocking a " - "matrix. Unfortunately, its type information was lost, becaus" - "e the exception was thrown in another thread. 
Its \"what()\"" - " function returns the following string: " << ex.what(); - throw std::runtime_error (os.str()); - } - } - - template< class MatrixViewType > - MatrixViewType - top_block (const MatrixViewType& C, - const bool contiguous_cache_blocks = false) const - { - return top_block_helper (0, ntasks()-1, C, contiguous_cache_blocks); - } - - void - fill_with_zeros (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar C[], - const LocalOrdinal ldc, - const bool contiguous_cache_blocks) const - { - using tbb::task; - mat_view_type C_view (nrows, ncols, C, ldc); - - try { - typedef FillWithZerosTask< LocalOrdinal, Scalar > fill_task_t; - fill_task_t& root_task = *new( task::allocate_root() ) - fill_task_t (0, ntasks()-1, C_view, seq_, contiguous_cache_blocks); - task::spawn_root_and_wait (root_task); - } catch (tbb::captured_exception& ex) { - std::ostringstream os; - os << "Intel TBB caught an exception, while un-cache-blocking a " - "matrix. Unfortunately, its type information was lost, becaus" - "e the exception was thrown in another thread. 
Its \"what()\"" - " function returns the following string: " << ex.what(); - throw std::runtime_error (os.str()); - } - } - - private: - size_t numTasks_; - TSQR::SequentialTsqr seq_; - TSQR::Combine combine_; - Partitioner partitioner_; - - mutable double min_seq_factor_timing_; - mutable double max_seq_factor_timing_; - mutable double min_seq_apply_timing_; - mutable double max_seq_apply_timing_; - - void - build_partition_array (const size_t P_first, - const size_t P_last, - array_top_blocks_t& top_blocks, - const_mat_view_type& Q, - mat_view_type& C, - const bool contiguous_cache_blocks = false) const - { - if (P_first > P_last) { - return; - } - else if (P_first == P_last) { - const_mat_view_type Q_top = seq_.top_block (Q, contiguous_cache_blocks); - mat_view_type C_top = seq_.top_block (C, contiguous_cache_blocks); - top_blocks[P_first] = - std::make_pair (const_mat_view_type (Q_top.extent(1), Q_top.extent(1), - Q_top.data(), Q_top.stride(1)), - mat_view_type (C_top.extent(1), C_top.extent(1), - C_top.data(), C_top.stride(1))); - } - else { - // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] - const size_t P_mid = (P_first + P_last) / 2; - const_split_t Q_split = - partitioner_.split (Q, P_first, P_mid, P_last, - contiguous_cache_blocks); - split_t C_split = - partitioner_.split (C, P_first, P_mid, P_last, - contiguous_cache_blocks); - // The partitioner may decide that the current blocks Q - // and C have too few rows to be worth splitting. (The - // partitioner should split both Q and C in the same way.) - // In that case, Q_split.first should be the same block as - // Q, and Q_split.second (the bottom block) will be empty. - // Ditto for C_split. We deal with this in the same way - // as the base case (P_first == P_last) above. 
- if (Q_split.second.empty() || Q_split.second.extent(0) == 0) { - const_mat_view_type Q_top = - seq_.top_block (Q, contiguous_cache_blocks); - mat_view_type C_top = seq_.top_block (C, contiguous_cache_blocks); - top_blocks[P_first] = - std::make_pair (const_mat_view_type (Q_top.extent(1), Q_top.extent(1), - Q_top.data(), Q_top.stride(1)), - mat_view_type (C_top.extent(1), C_top.extent(1), - C_top.data(), C_top.stride(1))); - } - else { - build_partition_array (P_first, P_mid, top_blocks, - Q_split.first, C_split.first, - contiguous_cache_blocks); - build_partition_array (P_mid+1, P_last, top_blocks, - Q_split.second, C_split.second, - contiguous_cache_blocks); - } - } - } - }; - } // namespace TBB -} // namespace TSQR - -#endif // __TSQR_TBB_TbbParallelTsqr_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp deleted file mode 100644 index e7f79fb0c15d..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp +++ /dev/null @@ -1,270 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_TbbRecursiveTsqr_hpp -#define __TSQR_TbbRecursiveTsqr_hpp - -#include "Tsqr_ApplyType.hpp" -#include "Tsqr_CacheBlocker.hpp" -#include "Tsqr_SequentialTsqr.hpp" -#include "TbbTsqr_Partitioner.hpp" -#include -#include -#include // std::pair -#include - -namespace TSQR { - namespace TBB { - /// \class TbbRecursiveTsqr - /// \brief Non-parallel "functioning stub" implementation of \c TbbTsqr. - template - class TbbRecursiveTsqr { - public: - /// \brief Constructor. - /// - /// \param num_cores [in] Maximum parallelism to use (i.e., - /// maximum number of partitions into which to divide the - /// matrix to factor). - /// - /// \param cache_size_hint [in] Approximate cache size in bytes - /// per CPU core. A hint, not a command. If zero, set to a - /// reasonable default. 
- TbbRecursiveTsqr (const size_t num_cores = 1, - const size_t cache_size_hint = 0); - - /// Number of cores to use to solve the problem (i.e., number of - /// subproblems into which to divide the main problem, to solve - /// it in parallel). - size_t ncores() const { return ncores_; } - - //! Cache size hint (in bytes) used for the factorization. - size_t cache_size_hint() const { return seq_.cache_size_hint(); } - - //! Results of SequentialTsqr for each core. - typedef typename SequentialTsqr::FactorOutput SeqOutput; - - /// \typedef ParOutput - /// \brief Array of ncores "local tau arrays" from parallel TSQR. - /// - /// Local Q factors are stored in place. - typedef std::vector > ParOutput; - - /// \typedef FactorOutput - /// \brief Return type of factor(). - /// - /// factor() returns a pair: the results of SequentialTsqr for - /// data on each core, and the results of combining the data on - /// the cores. - typedef typename std::pair, ParOutput> FactorOutput; - - /// Copy the nrows by ncols matrix A_in (with leading dimension - /// lda_in >= nrows) into A_out, such that cache blocks are - /// arranged contiguously in memory. - void - cache_block (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A_out[], - const Scalar A_in[], - const LocalOrdinal lda_in) const; - - /// Copy the nrows by ncols matrix A_in, whose cache blocks are - /// arranged contiguously in memory, into A_out (with leading - /// dimension lda_out >= nrows), which is in standard - /// column-major order. 
- void - un_cache_block (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A_out[], - const LocalOrdinal lda_out, - const Scalar A_in[]) const; - - /// Compute the QR factorization of the nrows by ncols matrix A - /// (with leading dimension lda >= nrows), returning a - /// representation of the Q factor (which includes data stored - /// in-place in A), and overwriting R (an ncols by ncols matrix - /// in column-major order with leading dimension ldr >= ncols) - /// with the R factor. - FactorOutput - factor (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A[], - const LocalOrdinal lda, - Scalar R[], - const LocalOrdinal ldr, - const bool contiguous_cache_blocks) const; - - /// Apply the Q factor computed by factor() (which see) to the - /// nrows by ncols_C matrix C, with leading dimension ldc >= - /// nrows. - void - apply (const std::string& op, - const LocalOrdinal nrows, - const LocalOrdinal ncols_C, - Scalar C[], - const LocalOrdinal ldc, - const LocalOrdinal ncols_Q, - const Scalar Q[], - const LocalOrdinal ldq, - const FactorOutput& factor_output, - const bool contiguous_cache_blocks) const; - - /// Compute the explicit representation of the Q factor computed - /// by factor(). 
- void - explicit_Q (const LocalOrdinal nrows, - const LocalOrdinal ncols_Q_in, - const Scalar Q_in[], - const LocalOrdinal ldq_in, - const LocalOrdinal ncols_Q_out, - Scalar Q_out[], - const LocalOrdinal ldq_out, - const FactorOutput& factor_output, - const bool contiguous_cache_blocks) const; - - private: - size_t ncores_; - TSQR::SequentialTsqr seq_; - Partitioner partitioner_; - - typedef MatView mat_view_type; - typedef MatView const_mat_view_type; - typedef std::pair const_split_t; - typedef std::pair split_t; - typedef std::pair top_blocks_t; - typedef std::vector array_top_blocks_t; - - void - explicit_Q_helper (const size_t P_first, - const size_t P_last, - mat_view_type& Q_out, - const bool contiguous_cache_blocks) const; - - /// \brief Return a nonconst view of the topmost block. - /// - /// This is helpful for combining the R factors and extracting - /// the final R factor result. - mat_view_type - factor_helper (const size_t P_first, - const size_t P_last, - const size_t depth, - mat_view_type A, - std::vector& seq_outputs, - ParOutput& par_outputs, - Scalar R[], - const LocalOrdinal ldr, - const bool contiguous_cache_blocks) const; - - bool - apply_helper_empty (const size_t P_first, - const size_t P_last, - const_mat_view_type &Q, - mat_view_type& C) const; - - /// \brief Build array of ncores() blocks, one for each partition. - /// - /// Each block is the topmost block in that partition. This is - /// useful for apply_helper. - void - build_partition_array (const size_t P_first, - const size_t P_last, - array_top_blocks_t& top_blocks, - const_mat_view_type& Q, - mat_view_type& C, - const bool contiguous_cache_blocks) const; - - /// Apply Q (not Q^T or Q^H, which is why we don't ask for "op") - /// to C. 
- void - apply_helper (const size_t P_first, - const size_t P_last, - const_mat_view_type Q, - mat_view_type C, - array_top_blocks_t& top_blocks, - const FactorOutput& factor_output, - const bool contiguous_cache_blocks) const; - - /// Apply Q^T or Q^H to C. - /// - /// \return Views of the topmost partitions of Q resp. C. - std::pair - apply_transpose_helper (const std::string& op, - const size_t P_first, - const size_t P_last, - const_mat_view_type Q, - mat_view_type C, - const FactorOutput& factor_output, - const bool contiguous_cache_blocks) const; - - void - factor_pair (const size_t P_top, - const size_t P_bot, - mat_view_type& A_top, - mat_view_type& A_bot, - std::vector< std::vector< Scalar > >& par_outputs, - const bool contiguous_cache_blocks) const; - - void - apply_pair (const std::string& trans, - const size_t P_top, - const size_t P_bot, - const_mat_view_type& Q_bot, - const std::vector< std::vector< Scalar > >& tau_arrays, - mat_view_type& C_top, - mat_view_type& C_bot, - const bool contiguous_cache_blocks) const; - - void - cache_block_helper (mat_view_type& A_out, - const_mat_view_type& A_in, - const size_t P_first, - const size_t P_last) const; - - void - un_cache_block_helper (mat_view_type& A_out, - const const_mat_view_type& A_in, - const size_t P_first, - const size_t P_last) const; - - }; // class TbbRecursiveTsqr - } // namespace TBB -} // namespace TSQR - -#include "TSQR/TBB/TbbRecursiveTsqr_Def.hpp" - -#endif // __TSQR_TbbRecursiveTsqr_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp deleted file mode 100644 index 27aef81f0328..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp +++ /dev/null @@ -1,538 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract 
DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_TBB_TbbRecursiveTsqr_Def_hpp -#define __TSQR_TBB_TbbRecursiveTsqr_Def_hpp - -#include "TbbTsqr_TbbRecursiveTsqr.hpp" -#include "Tsqr_Util.hpp" - -namespace TSQR { - namespace TBB { - - template< class LocalOrdinal, class Scalar > - void - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - explicit_Q_helper (const size_t P_first, - const size_t P_last, - mat_view& Q_out, - const bool contiguous_cache_blocks) const - { - if (P_first > P_last || Q_out.empty ()) { - return; - } - else if (P_first == P_last) { - CacheBlocker< LocalOrdinal, Scalar > - blocker (Q_out.extent(0), Q_out.extent(1), - seq_.cache_blocking_strategy()); - // Fill my partition with zeros. - blocker.fill_with_zeros (Q_out, contiguous_cache_blocks); - - // If our partition is the first (topmost), fill it with - // the first Q_out.extent(1) columns of the identity matrix. - if (P_first == 0) { - // Fetch the topmost cache block of my partition. Its - // leading dimension should be set correctly by - // top_block(). 
- mat_view Q_out_top = - blocker.top_block (Q_out, contiguous_cache_blocks); - - for (LocalOrdinal j = 0; j < Q_out_top.extent(1); ++j) - Q_out_top(j,j) = Scalar(1); - } - } - else { - // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] - const size_t P_mid = (P_first + P_last) / 2; - split_t Q_out_split = - partitioner_.split (Q_out, P_first, P_mid, P_last, - contiguous_cache_blocks); - explicit_Q_helper (P_first, P_mid, Q_out_split.first, - contiguous_cache_blocks); - explicit_Q_helper (P_mid+1, P_last, Q_out_split.second, - contiguous_cache_blocks); - } - } - - - template< class LocalOrdinal, class Scalar > - typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::mat_view - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - factor_helper (const size_t P_first, - const size_t P_last, - const size_t depth, - mat_view A, - std::vector::SeqOutput>& seq_outputs, - typename TbbRecursiveTsqr::ParOutput& par_outputs, - Scalar R[], - const LocalOrdinal ldr, - const bool contiguous_cache_blocks) const - { - mat_view A_top; - if (P_first > P_last || A.empty()) { - return A; - } - else if (P_first == P_last) { - std::pair results = - seq_.factor (A.extent(0), A.extent(1), A.data(), A.stride(1), - contiguous_cache_blocks); - seq_outputs[P_first] = results.first; - A_top = A; - } - else { - // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] - const size_t P_mid = (P_first + P_last) / 2; - split_t A_split = - partitioner_.split (A, P_first, P_mid, P_last, - contiguous_cache_blocks); - A_top = factor_helper (P_first, P_mid, depth+1, A_split.first, - seq_outputs, par_outputs, R, ldr, - contiguous_cache_blocks); - mat_view A_bot = - factor_helper (P_mid+1, P_last, depth+1, A_split.second, - seq_outputs, par_outputs, R, ldr, - contiguous_cache_blocks); - // Combine the two results - factor_pair (P_first, P_mid+1, A_top, A_bot, par_outputs, - contiguous_cache_blocks); - } - - // If we're completely done, extract the final R factor from - // the topmost 
partition. - if (depth == 0) { - seq_.extract_R (A_top.extent(0), A_top.extent(1), A_top.data(), - A_top.stride(1), R, ldr, contiguous_cache_blocks); - } - return A_top; - } - - - template< class LocalOrdinal, class Scalar > - bool - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - apply_helper_empty (const size_t P_first, - const size_t P_last, - const_mat_view& Q, - mat_view& C) const - { - if (Q.empty ()) { - if (! C.empty()) - throw std::logic_error("Q is empty but C is not!"); - else - return true; - } - else if (C.empty()) { - if (! Q.empty()) - throw std::logic_error("C is empty but Q is not!"); - else - return true; - } - else if (P_first > P_last) - return true; - else - return false; - } - - - template< class LocalOrdinal, class Scalar > - void - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - build_partition_array (const size_t P_first, - const size_t P_last, - typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::array_top_blocks_t& top_blocks, - const_mat_view& Q, - mat_view& C, - const bool contiguous_cache_blocks) const - { - if (P_first > P_last) - return; - else if (P_first == P_last) - { - CacheBlocker< LocalOrdinal, Scalar > blocker (Q.extent(0), Q.extent(1), seq_.cache_blocking_strategy()); - const_mat_view Q_top = blocker.top_block (Q, contiguous_cache_blocks); - mat_view C_top = blocker.top_block (C, contiguous_cache_blocks); - top_blocks[P_first] = - std::make_pair (const_mat_view (Q_top.extent(1), Q_top.extent(1), Q_top.data(), Q_top.stride(1)), - mat_view (C_top.extent(1), C_top.extent(1), C_top.data(), C_top.stride(1))); - } - else - { - // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] - const size_t P_mid = (P_first + P_last) / 2; - const_split_t Q_split = - partitioner_.split (Q, P_first, P_mid, P_last, - contiguous_cache_blocks); - split_t C_split = - partitioner_.split (C, P_first, P_mid, P_last, - contiguous_cache_blocks); - build_partition_array (P_first, P_mid, top_blocks, Q_split.first, - C_split.first, 
contiguous_cache_blocks); - build_partition_array (P_mid+1, P_last, top_blocks, Q_split.second, - C_split.second, contiguous_cache_blocks); - } - } - - - template< class LocalOrdinal, class Scalar > - void - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - apply_helper (const size_t P_first, - const size_t P_last, - const_mat_view Q, - mat_view C, - typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::array_top_blocks_t& top_blocks, - const FactorOutput& factor_output, - const bool contiguous_cache_blocks) const - { - typedef std::pair< const_mat_view, mat_view > apply_t; - - if (apply_helper_empty (P_first, P_last, Q, C)) - return; - else if (P_first == P_last) - { - const std::vector< SeqOutput >& seq_outputs = factor_output.first; - seq_.apply ("N", Q.extent(0), Q.extent(1), Q.data(), Q.stride(1), - seq_outputs[P_first], C.extent(1), C.data(), - C.stride(1), contiguous_cache_blocks); - } - else - { - // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] - const size_t P_mid = (P_first + P_last) / 2; - const_split_t Q_split = - partitioner_.split (Q, P_first, P_mid, P_last, - contiguous_cache_blocks); - split_t C_split = - partitioner_.split (C, P_first, P_mid, P_last, - contiguous_cache_blocks); - const ParOutput& par_output = factor_output.second; - - apply_pair ("N", P_first, P_mid+1, top_blocks[P_mid+1].first, - par_output, top_blocks[P_first].second, - top_blocks[P_mid+1].second, contiguous_cache_blocks); - apply_helper (P_first, P_mid, Q_split.first, C_split.first, - top_blocks, factor_output, contiguous_cache_blocks); - apply_helper (P_mid+1, P_last, Q_split.second, C_split.second, - top_blocks, factor_output, contiguous_cache_blocks); - } - } - - - template< class LocalOrdinal, class Scalar > - typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::top_blocks_t - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - apply_transpose_helper (const std::string& op, - const size_t P_first, - const size_t P_last, - const_mat_view Q, - mat_view C, - const typename 
TbbRecursiveTsqr::FactorOutput& factor_output, - const bool contiguous_cache_blocks) const - { - if (apply_helper_empty (P_first, P_last, Q, C)) { - return std::make_pair (Q, C); - } - else if (P_first == P_last) { - const std::vector& seq_outputs = factor_output.first; - seq_.apply (op, Q.extent(0), Q.extent(1), Q.data(), Q.stride(1), - seq_outputs[P_first], C.extent(1), C.data(), - C.stride(1), contiguous_cache_blocks); - return std::make_pair (Q, C); - } - else { - // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last] - const size_t P_mid = (P_first + P_last) / 2; - - const_split_t Q_split = - partitioner_.split (Q, P_first, P_mid, P_last, - contiguous_cache_blocks); - split_t C_split = - partitioner_.split (C, P_first, P_mid, P_last, - contiguous_cache_blocks); - const ParOutput& par_output = factor_output.second; - top_blocks_t Top = - apply_transpose_helper (op, P_first, P_mid, Q_split.first, - C_split.first, factor_output, - contiguous_cache_blocks); - top_blocks_t Bottom = - apply_transpose_helper (op, P_mid+1, P_last, Q_split.second, - C_split.second, factor_output, - contiguous_cache_blocks); - apply_pair (op, P_first, P_mid+1, Bottom.first, - par_output, Top.second, Bottom.second, - contiguous_cache_blocks); - return Top; - } - } - - - template< class LocalOrdinal, class Scalar > - void - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - factor_pair (const size_t P_top, - const size_t P_bot, - mat_view& A_top, - mat_view& A_bot, - std::vector>& par_outputs, - const bool contiguous_cache_blocks) const - { - if (P_top == P_bot) { - throw std::logic_error("factor_pair: should never get here!"); - } - // We only read and write the upper ncols x ncols triangle of - // each block. 
- const LocalOrdinal ncols = A_top.extent(1); - if (A_bot.extent(1) != ncols) { - throw std::logic_error("A_bot.extent(1) != A_top.extent(1)"); - } - std::vector& tau = par_outputs[P_bot]; - std::vector work (ncols); - - TSQR::Combine combine_; - combine_.factor_pair (A_top, A_bot, tau.data(), work.data()); - } - - template< class LocalOrdinal, class Scalar > - void - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - apply_pair (const std::string& trans, - const size_t P_top, - const size_t P_bot, - const_mat_view& Q_bot, - const std::vector >& tau_arrays, - mat_view& C_top, - mat_view& C_bot, - const bool contiguous_cache_blocks) const - { - if (P_top == P_bot) { - throw std::logic_error ("apply_pair: should never get here!"); - } - const std::vector& tau = tau_arrays[P_bot]; - std::vector work (C_top.extent(1)); - - TSQR::Combine combine_; - combine_.apply_pair (trans.c_str(), C_top.extent(1), Q_bot.extent(1), - Q_bot.data(), Q_bot.stride(1), &tau[0], - C_top.data(), C_top.stride(1), - C_bot.data(), C_bot.stride(1), &work[0]); - } - - template< class LocalOrdinal, class Scalar > - void - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - cache_block_helper (mat_view& A_out, - const_mat_view& A_in, - const size_t P_first, - const size_t P_last) const - { - if (P_first > P_last) - return; - else if (P_first == P_last) - seq_.cache_block (A_out.extent(0), A_out.extent(1), A_out.data(), - A_in.data(), A_in.stride(1)); - else - { - const size_t P_mid = (P_first + P_last) / 2; - const_split_t A_in_split = - partitioner_.split (A_in, P_first, P_mid, P_last, false); - split_t A_out_split = - partitioner_.split (A_out, P_first, P_mid, P_last, true); - cache_block_helper (A_out_split.first, A_in_split.first, - P_first, P_mid); - cache_block_helper (A_out_split.second, A_in_split.second, - P_mid+1, P_last); - } - } - - template< class LocalOrdinal, class Scalar > - void - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - un_cache_block_helper (mat_view& A_out, - const const_mat_view& A_in, 
- const size_t P_first, - const size_t P_last) const - { - if (P_first > P_last) { - return; - } - else if (P_first == P_last) { - seq_.un_cache_block (A_out.extent(0), A_out.extent(1), - A_out.data(), A_out.stride(1), - A_in.data()); - } - else { - const size_t P_mid = (P_first + P_last) / 2; - const const_split_t A_in_split = - partitioner_.split (A_in, P_first, P_mid, P_last, true); - split_t A_out_split = - partitioner_.split (A_out, P_first, P_mid, P_last, false); - - un_cache_block_helper (A_out_split.first, A_in_split.first, - P_first, P_mid); - un_cache_block_helper (A_out_split.second, A_in_split.second, - P_mid+1, P_last); - } - } - - template< class LocalOrdinal, class Scalar > - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - TbbRecursiveTsqr (const size_t num_cores, - const size_t cache_size_hint) - : seq_ (cache_size_hint), ncores_ (1) - { - if (num_cores < 1) - ncores_ = 1; // default is no parallelism - else - ncores_ = num_cores; - } - - template< class LocalOrdinal, class Scalar > - void - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - cache_block (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A_out[], - const Scalar A_in[], - const LocalOrdinal lda_in) const - { - const_mat_view A_in_view (nrows, ncols, A_in, lda_in); - // Leading dimension doesn't matter, since we're going to cache block it. - mat_view A_out_view (nrows, ncols, A_out, lda_in); - cache_block_helper (A_out_view, A_in_view, 0, ncores()-1); - } - - template< class LocalOrdinal, class Scalar > - void - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - un_cache_block (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A_out[], - const LocalOrdinal lda_out, - const Scalar A_in[]) const - { - // Leading dimension doesn't matter, since it's cache-blocked. 
- const_mat_view A_in_view (nrows, ncols, A_in, lda_out); - mat_view A_out_view (nrows, ncols, A_out, lda_out); - un_cache_block_helper (A_out_view, A_in_view, 0, ncores()-1); - } - - template< class LocalOrdinal, class Scalar > - typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::FactorOutput - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - factor (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A[], - const LocalOrdinal lda, - Scalar R[], - const LocalOrdinal ldr, - const bool contiguous_cache_blocks) const - { - mat_view A_view (nrows, ncols, A, lda); - std::vector< SeqOutput > seq_outputs (ncores()); - ParOutput par_outputs (ncores(), std::vector< Scalar >(ncols)); - (void) factor_helper (0, ncores()-1, 0, A_view, seq_outputs, - par_outputs, R, ldr, contiguous_cache_blocks); - return std::make_pair (seq_outputs, par_outputs); - } - - template< class LocalOrdinal, class Scalar > - void - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - apply (const std::string& op, - const LocalOrdinal nrows, - const LocalOrdinal ncols_C, - Scalar C[], - const LocalOrdinal ldc, - const LocalOrdinal ncols_Q, - const Scalar Q[], - const LocalOrdinal ldq, - const typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::FactorOutput& factor_output, - const bool contiguous_cache_blocks) const - { - const ApplyType apply_type (op); - if (apply_type == ApplyType::ConjugateTranspose && - Teuchos::ScalarTraits::isComplex) - throw std::logic_error("Applying Q^H for complex scalar types " - "not yet implemented"); - - const_mat_view Q_view (nrows, ncols_Q, Q, ldq); - mat_view C_view (nrows, ncols_C, C, ldc); - if (! 
apply_type.transposed ()) { - array_top_blocks_t top_blocks (ncores ()); - build_partition_array (0, ncores () - 1, top_blocks, Q_view, - C_view, contiguous_cache_blocks); - apply_helper (0, ncores () - 1, Q_view, C_view, top_blocks, - factor_output, contiguous_cache_blocks); - } - else { - apply_transpose_helper (op, 0, ncores () - 1, Q_view, C_view, - factor_output, contiguous_cache_blocks); - } - } - - - template< class LocalOrdinal, class Scalar > - void - TbbRecursiveTsqr< LocalOrdinal, Scalar >:: - explicit_Q (const LocalOrdinal nrows, - const LocalOrdinal ncols_Q_in, - const Scalar Q_in[], - const LocalOrdinal ldq_in, - const LocalOrdinal ncols_Q_out, - Scalar Q_out[], - const LocalOrdinal ldq_out, - const typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::FactorOutput& factor_output, - const bool contiguous_cache_blocks) const - { - if (ncols_Q_out != ncols_Q_in) - throw std::logic_error("FIXME Currently, explicit_Q() only works for ncols_Q_out == ncols_Q_in"); - - const_mat_view Q_in_view (nrows, ncols_Q_in, Q_in, ldq_in); - mat_view Q_out_view (nrows, ncols_Q_out, Q_out, ldq_out); - - explicit_Q_helper (0, ncores()-1, Q_out_view, contiguous_cache_blocks); - apply ("N", nrows, ncols_Q_out, Q_out, ldq_out, ncols_Q_in, - Q_in, ldq_in, factor_output, contiguous_cache_blocks); - } - - } // namespace TBB -} // namespace TSQR - - -#endif // __TSQR_TBB_TbbRecursiveTsqr_Def_hpp diff --git a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp deleted file mode 100644 index dc8068c2d9eb..000000000000 --- a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp +++ /dev/null @@ -1,145 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_TBB_UnCacheBlockTask_hpp -#define __TSQR_TBB_UnCacheBlockTask_hpp - -#include -#include "TbbTsqr_Partitioner.hpp" -#include "Tsqr_SequentialTsqr.hpp" - -namespace TSQR { - namespace TBB { - /// \class UnCacheBlockTask - /// \brief TBB task for recursive TSQR un-(cache blocking) phase. 
- /// - /// "Un-(cache blocking)" here means copying the input matrix, - /// which is stored with contiguous cache blocks, to the output - /// matrix, which is stored with noncontiguous cache blocks. - template - class UnCacheBlockTask : public tbb::task { - public: - typedef MatView mat_view_type; - typedef MatView const_mat_view_type; - typedef std::pair< mat_view_type, mat_view_type > split_t; - typedef std::pair< const_mat_view_type, const_mat_view_type > const_split_t; - - UnCacheBlockTask (const size_t P_first__, - const size_t P_last__, - mat_view_type& A_out, - const_mat_view_type& A_in, - const SequentialTsqr& seq) : - P_first_ (P_first__), - P_last_ (P_last__), - A_out_ (A_out), - A_in_ (A_in), - seq_ (seq) - {} - - tbb::task* execute () - { - using tbb::task; - - if (P_first_ > P_last_ || A_out_.empty() || A_in_.empty()) { - return nullptr; - } - else if (P_first_ == P_last_) { - execute_base_case (); - return nullptr; - } - else { - // Recurse on two intervals: [P_first, P_mid] and - // [P_mid+1, P_last]. - const size_t P_mid = (P_first_ + P_last_) / 2; - split_t out_split = - partitioner_.split (A_out_, P_first_, P_mid, P_last_, false); - const_split_t in_split = - partitioner_.split (A_in_, P_first_, P_mid, P_last_, true); - - // The partitioner may decide that the current blocks A_out_ - // and A_in_ have too few rows to be worth splitting. (It - // should split both A_out_ and A_in_ in the same way.) In - // that case, out_split.second and in_split.second (the - // bottom block) will be empty. We can deal with this by - // treating it as the base case. 
- if (out_split.second.empty() || out_split.second.extent(0) == 0) { - execute_base_case (); - return nullptr; - } - - // "c": continuation task - tbb::empty_task& c = - *new( allocate_continuation() ) tbb::empty_task; - // Recurse on the split - UnCacheBlockTask& topTask = *new( c.allocate_child() ) - UnCacheBlockTask (P_first_, P_mid, out_split.first, - in_split.first, seq_); - UnCacheBlockTask& botTask = *new( c.allocate_child() ) - UnCacheBlockTask (P_mid+1, P_last_, out_split.second, - in_split.second, seq_); - // Set reference count of parent (in this case, the - // continuation task) to 2 (since 2 children -- no - // additional task since no waiting). - c.set_ref_count (2); - c.spawn (botTask); - return &topTask; // scheduler bypass optimization - } - } - - private: - size_t P_first_, P_last_; - mat_view_type A_out_; - const_mat_view_type A_in_; - SequentialTsqr seq_; - Partitioner partitioner_; - - void - execute_base_case () - { - seq_.un_cache_block (A_out_.extent(0), A_out_.extent(1), - A_out_.data(), A_out_.stride(1), - A_in_.data()); - } - }; - - } // namespace TBB -} // namespace TSQR - - -#endif // __TSQR_TBB_UnCacheBlockTask_hpp diff --git a/packages/tpetra/tsqr/src/Tsqr.hpp b/packages/tpetra/tsqr/src/Tsqr.hpp index 31d1be6b9d01..293fba119542 100644 --- a/packages/tpetra/tsqr/src/Tsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr.hpp @@ -40,8 +40,8 @@ /// \file Tsqr.hpp /// \brief Parallel Tall Skinny QR (TSQR) implementation -#ifndef __TSQR_Tsqr_hpp -#define __TSQR_Tsqr_hpp +#ifndef TSQR_TSQR_HPP +#define TSQR_TSQR_HPP #include "Tsqr_ApplyType.hpp" #include "Tsqr_Matrix.hpp" @@ -90,8 +90,7 @@ namespace TSQR { /// distributed linear algebra libraries, such as Tpetra, the /// local and global ordinal types may be different. 
template> + class Scalar> class Tsqr { public: typedef MatView mat_view_type; @@ -103,16 +102,16 @@ namespace TSQR { typedef Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType magnitude_type; - typedef NodeTsqrType node_tsqr_type; - typedef DistTsqr dist_tsqr_type; + using node_tsqr_type = NodeTsqr; + using dist_tsqr_type = DistTsqr; typedef typename Teuchos::RCP node_tsqr_ptr; typedef typename Teuchos::RCP dist_tsqr_ptr; /// \typedef rank_type /// \brief "Rank" here means MPI rank, not linear algebra rank. typedef typename dist_tsqr_type::rank_type rank_type; - typedef typename node_tsqr_type::FactorOutput NodeOutput; - typedef typename dist_tsqr_type::FactorOutput DistOutput; + using NodeOutput = typename node_tsqr_type::factor_output_type; + using DistOutput = typename dist_tsqr_type::FactorOutput; /// \typedef FactorOutput /// \brief Return value of \c factor(). @@ -120,7 +119,8 @@ namespace TSQR { /// Part of the implicit representation of the Q factor returned /// by \c factor(). The other part of that representation is /// stored in the A matrix on output. - typedef std::pair FactorOutput; + using FactorOutput = + std::pair, DistOutput>; /// \brief Constructor /// @@ -133,14 +133,9 @@ namespace TSQR { const dist_tsqr_ptr& distTsqr) : nodeTsqr_ (nodeTsqr), distTsqr_ (distTsqr) - {} - - /// \brief Get the intranode part of TSQR. - /// - /// Sometimes we need this in order to do post-construction - /// initialization. - Teuchos::RCP getNodeTsqr () { - return nodeTsqr_; + { + TEUCHOS_ASSERT( ! nodeTsqr_.is_null () ); + TEUCHOS_ASSERT( ! distTsqr_.is_null () ); } /// \brief Cache size hint in bytes used by the intranode part of TSQR. @@ -166,6 +161,13 @@ namespace TSQR { distTsqr_->QR_produces_R_factor_with_nonnegative_diagonal(); } + /// \brief Whether the implementation wants device memory for + /// "large" arrays, like the input matrix, and the output Q + /// factor or C apply result. 
+ bool wants_device_memory () const { + return nodeTsqr_->wants_device_memory (); + } + /// \brief Compute QR factorization with explicit Q factor: "raw" /// arrays interface, for column-major data. /// @@ -227,84 +229,11 @@ namespace TSQR { const LocalOrdinal LDR, const bool forceNonnegativeDiagonal=false) { - const bool contiguousCacheBlocks = false; - - // Sanity checks for matrix dimensions. - if (numRows < numCols) { - std::ostringstream os; - os << "In Tsqr::factorExplicit: input matrix A has " << numRows - << " local rows, and " << numCols << " columns. The input " - "matrix must have at least as many rows on each processor as " - "there are columns."; - throw std::invalid_argument (os.str ()); - } - - // Check for quick exit, based on matrix dimensions. - if (numCols == 0) { - return; - } - - // Fill R initially with zeros. - { - Scalar* R_j = R; - for (LocalOrdinal j = 0; j < numCols; ++j) { - for (LocalOrdinal i = 0; i < numCols; ++i) { - R_j[i] = STS::zero (); - } - R_j += LDR; - } - } - // Compute the local QR factorization, in place in A, with the R - // factor written to R. - NodeOutput nodeResults = - nodeTsqr_->factor (numRows, numCols, A, LDA, R, LDR, - contiguousCacheBlocks); - // Prepare the output matrix Q by filling with zeros. - nodeTsqr_->fill_with_zeros (numRows, numCols, Q, LDQ, - contiguousCacheBlocks); - // Wrap the output matrix Q in a "view." - mat_view_type Q_rawView (numRows, numCols, Q, LDQ); - // Wrap the uppermost cache block of Q. We will need to extract - // its numCols x numCols uppermost block below. We can't just - // extract the numCols x numCols top block from all of Q, in - // case Q is arranged using contiguous cache blocks. - mat_view_type Q_top_block = - nodeTsqr_->top_block (Q_rawView, contiguousCacheBlocks); - if (Q_top_block.extent (0) < numCols) { - std::ostringstream os; - os << "The top block of Q has too few rows. 
This means that the " - << "the intranode TSQR implementation has a bug in its top_block" - << "() method. The top block should have at least " << numCols - << " rows, but instead has only " << Q_top_block.extent (1) - << " rows."; - throw std::logic_error (os.str ()); - } - // Use the numCols x numCols top block of Q and the local R - // factor (computed above) to compute the distributed-memory - // part of the QR factorization. - { - mat_view_type Q_top (numCols, numCols, Q_top_block.data(), - Q_top_block.stride(1)); - mat_view_type R_view (numCols, numCols, R, LDR); - distTsqr_->factorExplicit (R_view, Q_top, forceNonnegativeDiagonal); - } - // Apply the local part of the Q factor to the result of the - // distributed-memory QR factorization, to get the explicit Q - // factor. - nodeTsqr_->apply (ApplyType::NoTranspose, - numRows, numCols, A, LDA, - nodeResults, numCols, Q, LDQ, - contiguousCacheBlocks); - - // If necessary, and if the user asked, force the R factor to - // have a nonnegative diagonal. - if (forceNonnegativeDiagonal && - ! QR_produces_R_factor_with_nonnegative_diagonal ()) { - details::NonnegDiagForcer forcer; - mat_view_type Q_mine (numRows, numCols, Q, LDQ); - mat_view_type R_mine (numCols, numCols, R, LDR); - forcer.force (Q_mine, R_mine); - } + constexpr bool contiguousCacheBlocks = false; + this->factorExplicitRaw (numRows, numCols, + A, LDA, Q, LDQ, R, LDR, + contiguousCacheBlocks, + forceNonnegativeDiagonal); } void @@ -319,6 +248,8 @@ namespace TSQR { const bool contiguousCacheBlocks, const bool forceNonnegativeDiagonal = false) { + const char prefix[] = "TSQR::Tsqr::factorExplicitRaw: "; + // Sanity checks for matrix dimensions. if (numRows < numCols) { std::ostringstream os; @@ -335,23 +266,41 @@ namespace TSQR { } // Fill R initially with zeros. 
- { - Scalar* R_j = R; - for (LocalOrdinal j = 0; j < numCols; ++j) { - for (LocalOrdinal i = 0; i < numCols; ++i) { - R_j[i] = STS::zero (); - } - R_j += LDR; - } + mat_view_type R_view (numCols, numCols, R, LDR); + try { + deep_copy (R_view, Scalar {}); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << + "deep_copy(R_view, 0.0) threw: " << e.what ()); } + // Compute the local QR factorization, in place in A, with the R // factor written to R. - NodeOutput nodeResults = - nodeTsqr_->factor (numRows, numCols, A, LDA, R, LDR, - contiguousCacheBlocks); + Teuchos::RCP nodeResults; + try { + nodeResults = + nodeTsqr_->factor (numRows, numCols, A, LDA, R, LDR, + contiguousCacheBlocks); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << + "nodeTsqr_->factor(...) threw: " << e.what ()); + } + // Prepare the output matrix Q by filling with zeros. - nodeTsqr_->fill_with_zeros (numRows, numCols, Q, LDQ, - contiguousCacheBlocks); + try { + nodeTsqr_->fill_with_zeros (numRows, numCols, Q, LDQ, + contiguousCacheBlocks); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << + "nodeTsqr_->fill_with_zeros(...) threw: " << e.what ()); + } + // Wrap the output matrix Q in a "view." mat_view_type Q_rawView (numRows, numCols, Q, LDQ); // Wrap the uppermost cache block of Q. We will need to extract @@ -373,27 +322,79 @@ namespace TSQR { // factor (computed above) to compute the distributed-memory // part of the QR factorization. 
{ - mat_view_type Q_top (numCols, numCols, Q_top_block.data(), - Q_top_block.stride(1)); + mat_view_type Q_top (numCols, numCols, Q_top_block.data (), + Q_top_block.stride (1)); mat_view_type R_view (numCols, numCols, R, LDR); - distTsqr_->factorExplicit (R_view, Q_top, forceNonnegativeDiagonal); + + if (nodeTsqr_->wants_device_memory ()) { + // DistTsqr doesn't know what to do with device memory, so + // if Q_top is device memory, we need to work in a host copy + // and copy back to Q_top. Q_top is an output argument + // here, so we can just fill Q_top_copy with zeros. + matrix_type Q_top_copy (Q_top.extent (0), Q_top.extent (1), + Scalar {}); + try { + distTsqr_->factorExplicit (R_view, Q_top_copy.view (), + forceNonnegativeDiagonal); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << "distTsqr_->" + "factorExplicit (wants_device_memory()=true case) " + "threw: " << e.what ()); + } + try { + nodeTsqr_->copy_from_host (Q_top, Q_top_copy.view ()); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << "nodeTsqr_->" + "copy_from_host threw: " << e.what ()); + } + } + else { + try { + distTsqr_->factorExplicit (R_view, Q_top, + forceNonnegativeDiagonal); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << "distTsqr_->" + "factorExplicit (wants_device_memory()=false case) " + "threw: " << e.what ()); + } + } } // Apply the local part of the Q factor to the result of the // distributed-memory QR factorization, to get the explicit Q // factor. 
- nodeTsqr_->apply (ApplyType::NoTranspose, - numRows, numCols, A, LDA, - nodeResults, numCols, Q, LDQ, - contiguousCacheBlocks); + try { + nodeTsqr_->apply (ApplyType::NoTranspose, + numRows, numCols, A, LDA, + *nodeResults, numCols, Q, LDQ, + contiguousCacheBlocks); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << "nodeTsqr_->" + "apply threw: " << e.what ()); + } // If necessary, and if the user asked, force the R factor to // have a nonnegative diagonal. if (forceNonnegativeDiagonal && ! QR_produces_R_factor_with_nonnegative_diagonal ()) { - details::NonnegDiagForcer forcer; - mat_view_type Q_mine (numRows, numCols, Q, LDQ); - mat_view_type R_mine (numCols, numCols, R, LDR); - forcer.force (Q_mine, R_mine); + // We ignore contiguousCacheBlocks here, since we're only + // looking at the top block of Q. + try { + nodeTsqr_->force_nonnegative_diagonal (numRows, numCols, + Q, LDQ, R, LDR); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << "nodeTsqr_->" + "force_nonnegative_diagonal threw: " << e.what ()); + } } } @@ -451,12 +452,12 @@ namespace TSQR { { mat_view_type R_view (ncols, ncols, R, ldr); deep_copy (R_view, Scalar {}); - NodeOutput nodeResults = + auto nodeResults = nodeTsqr_->factor (nrows_local, ncols, A_local, lda_local, - R_view.data(), R_view.stride(1), - contiguousCacheBlocks); + R_view.data (), R_view.stride (1), + contiguousCacheBlocks); DistOutput distResults = distTsqr_->factor (R_view); - return std::make_pair (nodeResults, distResults); + return {nodeResults, distResults}; } /// \brief Apply Q factor to the global dense matrix C @@ -496,7 +497,6 @@ namespace TSQR { /// /// \param contiguousCacheBlocks [in] Whether or not the cache /// blocks of Q and C are stored contiguously. 
- /// void apply (const std::string& op, const LocalOrdinal nrows_local, @@ -533,49 +533,50 @@ namespace TSQR { mat_view_type C_top_view (ncols_C, ncols_C, C_view_top_block.data(), C_view_top_block.stride(1)); + // DistTsqr doesn't know what to do with device memory, so we + // need to copy the top block of C if applicable. The NodeTsqr + // implementation can decide if that's necessary. + // + // That "matrix_type C_top" is the temporary copy of C_top_view. + // C_top_view here is the "top block of C" that might live in + // device memory. + if (! transposed) { // C_top (small compact storage) gets a deep copy of the top // ncols_C by ncols_C block of C_local. - matrix_type C_top (C_top_view); - - // Compute in place on all processors' C_top blocks. - distTsqr_->apply (applyType, C_top.extent(1), ncols_Q, C_top.data(), - C_top.stride(1), factor_output.second); - - // Copy the result from C_top back into the top ncols_C by - // ncols_C block of C_local. - deep_copy (C_top_view, C_top); - - // Apply the local Q factor (in Q_local and - // factor_output.first) to C_local. + matrix_type C_top = nodeTsqr_->copy_to_host (C_top_view); + // Compute in place on all processes' C_top blocks. + distTsqr_->apply (applyType, C_top.extent (1), ncols_Q, + C_top.data (), C_top.stride (1), + factor_output.second); + // Copy result back to the top block of C_local. + nodeTsqr_->copy_from_host (C_top_view, C_top.view ()); + // Apply the local Q factor to C_local. nodeTsqr_->apply (applyType, nrows_local, ncols_Q, - Q_local, ldq_local, factor_output.first, + Q_local, ldq_local, *(factor_output.first), ncols_C, C_local, ldc_local, contiguousCacheBlocks); } else { - // Apply the (transpose of the) local Q factor (in Q_local - // and factor_output.first) to C_local. + // Apply the (transpose of the) local Q factor to C_local. 
nodeTsqr_->apply (applyType, nrows_local, ncols_Q, - Q_local, ldq_local, factor_output.first, + Q_local, ldq_local, *(factor_output.first), ncols_C, C_local, ldc_local, contiguousCacheBlocks); // C_top (small compact storage) gets a deep copy of the top // ncols_C by ncols_C block of C_local. - matrix_type C_top (C_top_view); + matrix_type C_top = nodeTsqr_->copy_to_host (C_top_view); // Compute in place on all processors' C_top blocks. distTsqr_->apply (applyType, ncols_C, ncols_Q, C_top.data(), C_top.stride(1), factor_output.second); - - // Copy the result from C_top back into the top ncols_C by - // ncols_C block of C_local. - deep_copy (C_top_view, C_top); + // Copy result back to the top block of C_local. + nodeTsqr_->copy_from_host (C_top_view, C_top.view ()); } } - /// \brief Compute the explicit Q factor from factor() + /// \brief Compute the explicit Q factor from result of factor(). /// /// Compute the explicit version of the Q factor computed by /// factor() and represented implicitly (via Q_local_in and @@ -633,11 +634,11 @@ namespace TSQR { mat_view_type Q_out_top = nodeTsqr_->top_block (Q_out_view, contiguousCacheBlocks); - // Fill (topmost cache block of) Q_out with the first - // ncols_Q_out columns of the identity matrix. - for (ordinal_type j = 0; j < ncols_Q_out; ++j) { - Q_out_top(j, j) = Scalar (1); - } + // Q_out_top is device memory, so we shouldn't write directly + // to it. Instead, let NodeTsqr fill it with the first + // ncols_Q_out columns of the identity matrix. Note that + // we've already filled Q_out with zeros above. + nodeTsqr_->set_diagonal_entries_to_one (Q_out_top); } apply ("N", nrows_local, ncols_Q_in, Q_local_in, ldq_local_in, factorOutput, @@ -754,23 +755,21 @@ namespace TSQR { if (ncols == 0) { return 0; } - // // FIXME (mfh 16 Jul 2010) We _should_ compute the SVD of R (as // the copy B) on Proc 0 only. This would ensure that all // processors get the same SVD and rank (esp. in a heterogeneous // computing environment). 
For now, we just do this computation // redundantly, and hope that all the returned rank values are // the same. - // - matrix_type U (ncols, ncols, STS::zero()); + matrix_type U (ncols, ncols, Scalar {}); const ordinal_type rank = - reveal_R_rank (ncols, R, ldr, U.data(), U.stride(1), tol); + reveal_R_rank (ncols, R, ldr, U.data (), U.stride (1), tol); if (rank < ncols) { // If R is not full rank: reveal_R_rank() already computed // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and // overwrote R with \f$\Sigma V^*\f$. Now, we compute \f$Q // := Q \cdot U\f$, respecting cache blocks of Q. - Q_times_B (nrows, ncols, Q, ldq, U.data(), U.stride(1), + Q_times_B (nrows, ncols, Q, ldq, U.data (), U.stride (1), contiguousCacheBlocks); } return rank; @@ -815,4 +814,4 @@ namespace TSQR { } // namespace TSQR -#endif // __TSQR_Tsqr_hpp +#endif // TSQR_TSQR_HPP diff --git a/packages/tpetra/tsqr/src/TsqrAdaptor.hpp b/packages/tpetra/tsqr/src/TsqrAdaptor.hpp index 89236be2068c..b7cf98c735e4 100644 --- a/packages/tpetra/tsqr/src/TsqrAdaptor.hpp +++ b/packages/tpetra/tsqr/src/TsqrAdaptor.hpp @@ -74,10 +74,7 @@ namespace TSQR { /// /// TsqrAdaptor uses the appropriate specialization of /// TsqrTypeAdaptor to figure out which variant of TSQR to use on - /// the given multivector type. For example, with - /// Tpetra::MultiVector, if NodeType is - /// KokkosClassic::DoNotUse::TBBNode, the TBB-parallel intranode - /// variant of TSQR will be used. The caller is responsible for + /// the given multivector type. The caller is responsible for /// constructing the intranode and internode TSQR objects. /// /// \tparam S Scalar type diff --git a/packages/tpetra/tsqr/src/TsqrFactory.hpp b/packages/tpetra/tsqr/src/TsqrFactory.hpp index ad4be2e7f831..7841207a06b9 100644 --- a/packages/tpetra/tsqr/src/TsqrFactory.hpp +++ b/packages/tpetra/tsqr/src/TsqrFactory.hpp @@ -45,10 +45,10 @@ /// /// \warning TSQR users should _not_ include this file directly. 
-#include "Tsqr_NodeTsqrFactory.hpp" -#include "Teuchos_Comm.hpp" -#include "Tsqr_MessengerBase.hpp" #include "Tsqr.hpp" +#include "Teuchos_Comm.hpp" +#include "Teuchos_ParameterList.hpp" +#include "Teuchos_RCP.hpp" namespace TSQR { namespace Trilinos { @@ -63,8 +63,8 @@ namespace TSQR { /// \tparam LO The (local) ordinal type used by TSQR. /// \tparam S The Scalar type used by TSQR; the type of the /// entries of the matrices to factor. - /// \tparam NodeTsqrType The type of the intranode part of TSQR. - /// \tparam DistTsqrType The type of the internode part of TSQR. + /// \tparam NodeTsqrType The type of the intraprocess part of TSQR. + /// \tparam DistTsqrType The type of the interprocess part of TSQR. /// /// \note Unless you need to change the interface between Trilinos /// and TSQR, you don't need to do anything with TsqrFactory or @@ -72,19 +72,19 @@ namespace TSQR { /// \c TsqrAdaptor. TsqrFactory and its subclasses don't have /// anything to do with any of the Trilinos multivector classes. /// - /// \note If you have implemented a new intranode TSQR + /// \note If you have implemented a new intraprocess TSQR /// factorization type (NodeTsqrType), you may need to /// create a subclass (not specialization) of TsqrFactory that - /// knows how to instantiate that intranode TSQR class. + /// knows how to instantiate that intraprocess TSQR class. /// Alternately, you could write NodeTsqrType so that the - /// provided default implementation of \c makeNodeTsqr() works. + /// provided default implementation of makeNodeTsqr works. /// - /// \note If you have implemented a new internode TSQR + /// \note If you have implemented a new interprocess TSQR /// factorization type (DistTsqrType), you may need to /// create a subclass (not specialization) of TsqrFactory that - /// knows how to instantiate that internode TSQR class. + /// knows how to instantiate that interprocess TSQR class. 
/// Alternately, you could write DistTsqrType so that the - /// provided default implementation of \c makeDistTsqr() works. + /// provided default implementation of makeDistTsqr works. /// /// \note If you want to change which TSQR implementation is /// invoked for a particular multivector (MV) class, you don't @@ -102,22 +102,22 @@ namespace TSQR { typedef DistTsqrType dist_tsqr_type; typedef MessengerBase scalar_messenger_type; - typedef Tsqr tsqr_type; + typedef Tsqr tsqr_type; /// \brief Instantiate and return the TSQR implementation. /// /// \param plist [in/out] Parameter list (keys depend on the - /// subclass; keys are accessed in the subclass' - /// makeNodeTsqr() method). On output: On output: Missing - /// parameters are filled in with default values. + /// subclass; keys are accessed in the subclass' makeNodeTsqr + /// method). On output: Missing parameters are + /// filled in with default values. /// /// \param nodeTsqr [out] On output, points to the - /// node_tsqr_type object that TSQR will use for the intranode - /// part of its computations. + /// node_tsqr_type object that TSQR will use for the + /// intraprocess part of its computations. /// /// \param distTsqr [out] On output, points to the - /// dist_tsqr_type object that TSQR will use for the internode - /// part of its computations. + /// dist_tsqr_type object that TSQR will use for the + /// interprocess part of its computations. /// /// \return The node_tsqr_type instance that implements TSQR. Teuchos::RCP @@ -133,62 +133,58 @@ namespace TSQR { return rcp (new tsqr_type (nodeTsqr, distTsqr)); } - void - prepareTsqr - - const Teuchos::RCP& messenger, - //! Virtual destructor for memory safety of derived classes. - virtual ~TsqrFactory () {}; + virtual ~TsqrFactory () = default; private: - /// \brief Instantiate and return the TSQR's intranode object. + /// \brief Instantiate and return TSQR's intraprocess object.
/// /// \param plist [in/out] Same as the epinonymous input of - /// \c makeTsqr(). + /// makeTsqr. /// /// \return The node_tsqr_type object that TSQR will use for the - /// intranode part of its computations. + /// intraprocess part of its computations. /// - /// \note For implementers: this and \c makeDistTsqr() are the - /// two methods to implement. makeTsqr()'s implementation is + /// \note For implementers: this and makeDistTsqr are the two + /// methods to implement. makeTsqr's implementation is /// "generic"; it does not depend on node_tsqr_type or - /// dist_tsqr_type. The implementation of makeNodeTsqr() - /// varies for different node_tsqr_type types. This pattern - /// is the compile-time polymorphism equivalent of the - /// "Non-Virtual Interface" (NVI) idiom, where the "virtual" - /// methods (here, the methods that vary for different - /// template parameters) are private, and the "nonvirtual" - /// methods (here, the methods that are the same for different - /// template parameters) are part of the public interface. + /// dist_tsqr_type. The implementation of makeNodeTsqr varies + /// for different node_tsqr_type types. This pattern is the + /// compile-time polymorphism equivalent of the "Non-Virtual + /// Interface" (NVI) idiom, where the "virtual" methods (here, + /// the methods that vary for different template parameters) + /// are private, and the "nonvirtual" methods (here, the + /// methods that are the same for different template + /// parameters) are part of the public interface. virtual Teuchos::RCP makeNodeTsqr (const Teuchos::RCP& plist) const { return Teuchos::rcp (new node_tsqr_type (plist)); } - /// \brief Instantiate and return TSQR's internode object. + /// \brief Instantiate and return TSQR's interprocess object. /// /// \param messenger [in] Object used by TSQR for communicating /// between MPI processes. /// /// \param plist [in/out] Same as the epinonymous input of - /// \c makeTsqr(). + /// makeTsqr.
/// /// \return The dist_tsqr_type object that TSQR will use for the - /// internode part of its computations. + /// interprocess part of its computations. /// - /// \note For implementers: this and \c makeNodeTsqr() are the - /// two interesting methods. makeTsqr()'s implementation is + /// \note For implementers: this and makeNodeTsqr are the two + /// interesting methods. makeTsqr's implementation is /// "generic"; it does not depend on node_tsqr_type or - /// dist_tsqr_type. The implementation of makeDistTsqr() + /// dist_tsqr_type. The implementation of makeDistTsqr /// varies for different dist_tsqr_type types. virtual Teuchos::RCP makeDistTsqr (const Teuchos::RCP& messenger, const Teuchos::RCP& plist) const { - (void) plist; - return Teuchos::rcp (new dist_tsqr_type (messenger)); + auto ret = Teuchos::rcp (new dist_tsqr_type (messenger)); + ret->setParameterList (plist); + return ret; } }; } // namespace Trilinos diff --git a/packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp b/packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp index 5e6dccdbb87a..5de0142c768d 100644 --- a/packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp +++ b/packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp @@ -114,8 +114,8 @@ namespace TSQR { /// \brief Type representing the whole TSQR method. /// /// Depends on \c node_tsqr_type and \c dist_tsqr_type. - typedef TSQR::Tsqr tsqr_type; - typedef Teuchos::RCP tsqr_ptr; + using tsqr_type = TSQR::Tsqr; + typedef Teuchos::RCP tsqr_ptr; /// \typedef factory_type /// diff --git a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp index 56c9ef51f076..b650fbf37050 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp @@ -158,14 +158,14 @@ namespace TSQR { /// matrix with which this CacheBlocker was initialized.
template< class MatrixViewType > MatrixViewType - split_top_block (MatrixViewType& A, const bool contiguous_cache_blocks) const + split_top_block (MatrixViewType& A, + const bool contiguous_cache_blocks) const { typedef typename MatrixViewType::ordinal_type ordinal_type; const ordinal_type nrows_top = strategy_.top_block_split_nrows (A.extent(0), extent(1), nrows_cache_block()); - // split_top() sets A to A_rest, and returns A_top. - return A.split_top (nrows_top, contiguous_cache_blocks); + return split_top (A, nrows_top, contiguous_cache_blocks); } /// \brief View of the topmost cache block of A. @@ -188,7 +188,7 @@ namespace TSQR { strategy_.top_block_split_nrows (A.extent(0), extent(1), nrows_cache_block()); MatrixViewType A_copy (A); - return A_copy.split_top (nrows_top, contiguous_cache_blocks); + return split_top (A_copy, nrows_top, contiguous_cache_blocks); } /// \brief Split A in place into [A_rest; A_bot]. @@ -207,7 +207,8 @@ namespace TSQR { /// template< class MatrixViewType > MatrixViewType - split_bottom_block (MatrixViewType& A, const bool contiguous_cache_blocks) const + split_bottom_block (MatrixViewType& A, + const bool contiguous_cache_blocks) const { typedef typename MatrixViewType::ordinal_type ordinal_type; // Ignore the number of columns in A, since we want to block all @@ -216,7 +217,7 @@ namespace TSQR { strategy_.bottom_block_split_nrows (A.extent(0), extent(1), nrows_cache_block()); // split_bottom() sets A to A_rest, and returns A_bot. - return A.split_bottom (nrows_bottom, contiguous_cache_blocks); + return split_bottom (A, nrows_bottom, contiguous_cache_blocks); } /// \brief Fill the matrix A with zeros, respecting cache blocks. @@ -241,7 +242,7 @@ namespace TSQR { // won't be the correct leading dimension of A, but it won't // matter: we only ever operate on A_cur here, and A_cur's // leading dimension is set correctly by split_top_block(). - while (! A.empty()) { + while (! 
empty (A)) { // This call modifies the matrix view A, but that's OK since // we passed the input view by copy, not by reference. MatrixViewType A_cur = split_top_block (A, contiguous_cache_blocks); @@ -280,10 +281,10 @@ namespace TSQR { // Note: if the cache blocks are stored contiguously, lda won't // be the correct leading dimension of A, but it won't matter: // we only ever operate on A_cur here, and A_cur's leading - // dimension is set correctly by A_rest.split_top(). + // dimension is set correctly by split_top_block. mat_view_type A_rest (num_rows, num_cols, A, lda); - while (! A_rest.empty()) { + while (! empty (A_rest)) { // This call modifies A_rest. mat_view_type A_cur = split_top_block (A_rest, contiguous_cache_blocks); deep_copy (A_cur, Scalar {}); @@ -322,8 +323,8 @@ namespace TSQR { // Leading dimension doesn't matter since A_out will be cache blocked. mat_view_type A_out_rest (num_rows, num_cols, A_out, lda_in); - while (! A_in_rest.empty()) { - if (A_out_rest.empty()) { + while (! empty (A_in_rest)) { + if (empty (A_out_rest)) { throw std::logic_error("A_out_rest is empty, but A_in_rest is not"); } // This call modifies A_in_rest. @@ -351,8 +352,8 @@ namespace TSQR { const_mat_view_type A_in_rest (num_rows, num_cols, A_in, lda_out); mat_view_type A_out_rest (num_rows, num_cols, A_out, lda_out); - while (! A_in_rest.empty()) { - if (A_out_rest.empty()) { + while (! empty (A_in_rest)) { + if (empty (A_out_rest)) { throw std::logic_error("A_out_rest is empty, but A_in_rest is not"); } // This call modifies A_in_rest. 
@@ -389,9 +390,9 @@ namespace TSQR { const ordinal_type num_cache_blocks = strategy_.num_cache_blocks (A.extent(0), A.extent(1), nrows_cache_block()); - if (cache_block_index >= num_cache_blocks) - return MatrixViewType (0, 0, NULL, 0); // empty - + if (cache_block_index >= num_cache_blocks) { + return MatrixViewType {}; // empty + } // result[0] = starting row index of the cache block // result[1] = number of rows in the cache block // result[2] = pointer offset (A.data() + result[2]) @@ -402,8 +403,7 @@ namespace TSQR { nrows_cache_block(), contiguous_cache_blocks); if (result[1] == 0) { - // For some reason, the cache block is empty. - return MatrixViewType (0, 0, nullptr, 0); + return MatrixViewType {}; } // We expect that ordinal_type is signed, so adding signed @@ -414,19 +414,6 @@ namespace TSQR { result[3]); } - /// \brief Equality operator. - /// - /// Two cache blockers are "equal" if they correspond to matrices - /// with the same dimensions (number of rows and number of - /// columns), and if their cache blocking strategies are equal. - bool - operator== (const CacheBlockingStrategy& rhs) const - { - return extent(0) == rhs.extent(0) && - extent(1) == rhs.extent(1) && - strategy_ == rhs.strategy_; - } - private: //! Number of rows in the matrix to block. Ordinal nrows_ = 0; diff --git a/packages/tpetra/tsqr/src/Tsqr_CacheBlockingStrategy.hpp b/packages/tpetra/tsqr/src/Tsqr_CacheBlockingStrategy.hpp index aa70035044ac..716c55467991 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CacheBlockingStrategy.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CacheBlockingStrategy.hpp @@ -156,18 +156,6 @@ namespace TSQR { /// most cases, however. size_t size_of_scalar () const { return size_of_scalar_; } - //! True if and only if the two strategies are the same. - bool operator== (const CacheBlockingStrategy& rhs) const { - return cache_size_hint() == rhs.cache_size_hint() && - size_of_scalar() == rhs.size_of_scalar(); - } - - //! 
True if and only if the two strategies are not the same. - bool operator!= (const CacheBlockingStrategy& rhs) const { - return cache_size_hint() != rhs.cache_size_hint() || - size_of_scalar() != rhs.size_of_scalar(); - } - /// \brief Pointer offset for the cache block with the given index. /// /// The pointer offset depends on whether cache blocks are stored @@ -221,14 +209,14 @@ namespace TSQR { const LocalOrdinal nrows_cache_block, const bool contiguous_cache_blocks) const { - if (contiguous_cache_blocks) - { - std::pair result = - cache_block (index, nrows, ncols, nrows_cache_block); - return result.second; // Number of rows in the cache block - } - else + if (contiguous_cache_blocks) { + std::pair result = + cache_block (index, nrows, ncols, nrows_cache_block); + return result.second; // Number of rows in the cache block + } + else { return lda; + } } /// \brief Start and size of cache block number \c index. @@ -257,39 +245,46 @@ namespace TSQR { LocalOrdinal my_row_start, my_nrows; my_row_start = index * nrows_cache_block; - if (quotient == 0) - { // There is only one cache block. - if (index == 0) - my_nrows = remainder; - else - my_nrows = 0; // Out-of-range block, therefore empty + if (quotient == 0) { // There is only one cache block. + if (index == 0) { + my_nrows = remainder; } - else if (remainder < ncols) - { // There are quotient cache blocks. - if (index < 0) - my_nrows = 0; // Out-of-range block, therefore empty - else if (index < quotient - 1) - my_nrows = nrows_cache_block; - else if (index == quotient - 1) - // The last cache block gets the leftover rows, so that no - // cache block has fewer than ncols rows. - my_nrows = nrows_cache_block + remainder; - else - my_nrows = 0; // Out-of-range block, therefore empty + else { + my_nrows = 0; // Out-of-range block, therefore empty } - else - { // There are quotient+1 cache blocks. 
- if (index < 0) - my_nrows = 0; // Out-of-range block, therefore empty - else if (index < quotient) - my_nrows = nrows_cache_block; - else if (index == quotient) - // The last cache block has the leftover rows, which are - // >= ncols and < nrows_cache_block. - my_nrows = remainder; - else - my_nrows = 0; // Out-of-range block, therefore empty + } + else if (remainder < ncols) { // There are quotient cache blocks. + if (index < 0) { + my_nrows = 0; // Out-of-range block, therefore empty + } + else if (index < quotient - 1) { + my_nrows = nrows_cache_block; + } + else if (index == quotient - 1) { + // The last cache block gets the leftover rows, so that no + // cache block has fewer than ncols rows. + my_nrows = nrows_cache_block + remainder; + } + else { + my_nrows = 0; // Out-of-range block, therefore empty } + } + else { // There are quotient+1 cache blocks. + if (index < 0) { + my_nrows = 0; // Out-of-range block, therefore empty + } + else if (index < quotient) { + my_nrows = nrows_cache_block; + } + else if (index == quotient) { + // The last cache block has the leftover rows, which are + // >= ncols and < nrows_cache_block. + my_nrows = remainder; + } + else { + my_nrows = 0; // Out-of-range block, therefore empty + } + } return std::make_pair (my_row_start, my_nrows); } @@ -316,7 +311,6 @@ namespace TSQR { /// \note This method has an \f$O(1)\f$ cost, so that /// parallelization by calling this method repeatedly for a /// sequence of cache block indices is not expensive. - /// std::vector cache_block_details (const LocalOrdinal index, const LocalOrdinal nrows, diff --git a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp index 7b1f15f0f8ae..5bdd5608ba22 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp @@ -38,18 +38,17 @@ //@HEADER /// \file Tsqr_Combine.hpp -/// \brief TSQR's six computational kernels. +/// \brief Interface to TSQR's six computational kernels. 
-#ifndef __TSQR_Combine_hpp -#define __TSQR_Combine_hpp +#ifndef TSQR_COMBINE_HPP +#define TSQR_COMBINE_HPP -#include "Teuchos_ScalarTraits.hpp" #include "Tsqr_ApplyType.hpp" -#include "Tsqr_CombineNative.hpp" +#include "Tsqr_MatView.hpp" namespace TSQR { /// \class Combine - /// \brief TSQR's six computational kernels + /// \brief Interface to TSQR's six computational kernels /// \author Mark Hoemmen /// /// This class provides the six computational primitives required by @@ -69,46 +68,52 @@ namespace TSQR { /// /// \tparam Ordinal Type of indices into matrices. /// \tparam Scalar Type of entries of matrices. - /// \tparam CombineImpl Type of a particular implementation of - /// Combine. Its public interface must contain this class' - /// interface. /// - /// All Combine methods are implemented using CombineImpl methods - /// with the same name. TSQR includes three implementations of the - /// CombineImpl interface: + /// TSQR includes two implementations of the Combine interface: /// ///
    ///
  • CombineDefault, which uses LAPACK and copies in and out of - /// scratch space that it owns,
  • + /// scratch space that it owns, and ///
  • CombineNative, a C++ in-place (no scratch space) generic - /// implementation), and
  • - ///
  • CombineFortran, a Fortran 9x in-place implementation for - /// LAPACK's four data types (S, D, C, and Z).
  • + /// implementation ///
- template< class Ordinal, - class Scalar, - class CombineImpl = CombineNative::isComplex> > + /// + /// There used to be a third implementation, CombineFortran, but it + /// relied on a Fortran 9x compiler and was thus not often tested, + /// so we removed it. + template class Combine { public: - /// \typedef scalar_type - /// \brief Type of matrix entries. - typedef Scalar scalar_type; - /// \typedef ordinal_type - /// \brief Type of (intranode) matrix indices. - typedef Ordinal ordinal_type; - /// \typedef combine_impl_type - /// \brief Type of the implementation of Combine. - typedef CombineImpl combine_impl_type; + //! Type of matrix entries. + using scalar_type = Scalar; + //! Type of (intraprocess) matrix indices. + using ordinal_type = Ordinal; + + virtual ~Combine () = default; - //! Constructor. - Combine () = default; + /// \brief Whether or not the QR factorizations computed by + /// methods of this class produce an R factor with all + /// nonnegative diagonal entries. + virtual bool + QR_produces_R_factor_with_nonnegative_diagonal () const = 0; - /// Whether or not the QR factorizations computed by methods of - /// this class produce an R factor with all nonnegative diagonal - /// entries. - static bool QR_produces_R_factor_with_nonnegative_diagonal() { - return combine_impl_type::QR_produces_R_factor_with_nonnegative_diagonal(); - } + /// \brief Best work array size. + /// + /// \param num_rows_Q [in] Number of rows in each block of the + /// matrix to factor. ("Block" means the part of the matrix + /// passed directly to factor_first or factor_inner.) + /// + /// \param num_cols_Q [in] Number of columns of the matrix to + /// factor (the input/output matrix of factor_first or + /// factor_inner). + /// + /// \param num_cols_C [in] Number of columns of the matrix output + /// of apply_first, apply_inner, or apply_pair (use the max of + /// all three). 
+ virtual ordinal_type + work_size (const ordinal_type num_rows_Q, + const ordinal_type num_cols_Q, + const ordinal_type num_cols_C) const = 0; /// \brief Factor the first cache block. /// @@ -118,84 +123,30 @@ namespace TSQR { /// (along with the length ncols tau array) with the implicitly /// stored Q factor. /// - /// \param nrows [in] Number of rows in A - /// \param ncols [in] Number of columns in A /// \param A [in/out] On input: the nrows by ncols matrix (in /// column-major order, with leading dimension lda) to factor. /// On output: upper triangle contains the R factor, and lower /// part contains the implicitly stored Q factor. - /// \param lda [in] Leading dimension of A /// \param tau [out] Array of length ncols; on output, the /// scaling factors for the Householder reflectors /// \param work [out] Workspace array of length ncols - void - factor_first (const MatView& A, + virtual void + factor_first (const MatView& A, Scalar tau[], - Scalar work[]) const - { - return impl_.factor_first (A, tau, work); - } + Scalar work[], + const ordinal_type lwork) = 0; - /// \brief Apply the result of \c factor_first(). + /// \brief Apply the result of factor_first() to C. /// /// Apply the Q factor, as computed by factor_first() and stored /// implicitly in A and tau, to the matrix C. - void + virtual void apply_first (const ApplyType& applyType, - const MatView& A, + const MatView& A, const Scalar tau[], - const MatView& C, - Scalar work[]) - { - return impl_.apply_first (applyType, A, tau, C, work); - } - - /// Apply the result of \c factor_inner(). - /// - /// Apply the Q factor stored in [R; A] to [C_top; C_bot]. The C - /// blocks are allowed, but not required, to have different leading - /// dimensions (ldc_top resp. ldc_bottom). R is upper triangular, so - /// we do not need it; the Householder reflectors representing the Q - /// factor are stored compactly in A (specifically, in all of A, not - /// just the lower triangle). 
- /// - /// In the "sequential under parallel" version of TSQR, this function - /// belongs to the sequential part (i.e., operating on cache blocks on - /// a single processor). - /// - /// \param apply_type [in] NoTranspose means apply Q, Transpose - /// means apply Q^T, and ConjugateTranspose means apply Q^H. - /// \param m [in] number of rows of A - /// \param ncols_C [in] number of columns of [C_top; C_bot] - /// \param ncols_Q [in] number of columns of [R; A] - /// \param A [in] m by ncols_Q matrix, in which the Householder - /// reflectors representing the Q factor are stored - /// \param lda [in] leading dimension of A - /// \param tau [in] array of length ncols_Q, storing the scaling - /// factors for the Householder reflectors representing Q - /// \param C_top [inout] ncols_Q by ncols_C matrix - /// \param ldc_top [in] leading dimension of C_top - /// \param C_bot [inout] m by ncols_C matrix - /// \param ldc_bot [in] leading dimension of C_bot - /// \param work [out] workspace array of length ncols_C - void - apply_inner (const ApplyType& apply_type, - const Ordinal m, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const Scalar A[], - const Ordinal lda, - const Scalar tau[], - Scalar C_top[], - const Ordinal ldc_top, - Scalar C_bot[], - const Ordinal ldc_bot, - Scalar work[]) const - { - impl_.apply_inner (apply_type, m, ncols_C, ncols_Q, - A, lda, tau, - C_top, ldc_top, C_bot, ldc_bot, work); - } + const MatView& C, + Scalar work[], + const ordinal_type lwork) = 0; /// \brief Factor [R; A] for square upper triangular R and cache block A. /// @@ -231,61 +182,82 @@ namespace TSQR { /// Corresponds to the TAU output of LAPACK's _GEQRF. 
/// \param work [out] Workspace (length >= n; don't need lwork or /// workspace query) - void - factor_inner (const MatView& R, - const MatView& A, + virtual void + factor_inner (const MatView& R, + const MatView& A, Scalar tau[], - Scalar work[]) const - { - impl_.factor_inner (R, A, tau, work); - } + Scalar work[], + const ordinal_type lwork) = 0; - /// \brief Factor the pair of square upper triangular matrices [R_top; R_bot]. + /// Apply the result of factor_inner(). + /// + /// Apply the Q factor stored in [R; A] to [C_top; C_bot], where + /// + ///
    + ///
  • A is m by ncols_Q,
  • + ///
  • R is ncols_Q by ncols_Q,
  • + ///
  • C_top is ncols_Q by ncols_C, and
  • + ///
  • C_bot is m by ncols_C.
  • + ///
+ /// + /// The C blocks are allowed, but not required, to have different + /// strides ("leading dimensions," in BLAS and LAPACK terms). R + /// is upper triangular, so we do not need an explicit version of + /// R here. The Householder reflectors representing the Q factor + /// are stored compactly in A (specifically, in all of A, not just + /// the lower triangle) and tau. + /// + /// \param apply_type [in] NoTranspose means apply Q, Transpose + /// means apply Q^T, and ConjugateTranspose means apply Q^H. + /// \param A [in] m by ncols_Q matrix, in which the Householder + /// reflectors representing the Q factor are stored + /// \param tau [in] array of length ncols_Q, storing the scaling + /// factors for the Householder reflectors representing Q + /// \param C_top [inout] ncols_Q by ncols_C matrix + /// \param C_bot [inout] m by ncols_C matrix + /// \param work [out] workspace array of length ncols_C + virtual void + apply_inner (const ApplyType& apply_type, + const MatView& A, + const Scalar tau[], + const MatView& C_top, + const MatView& C_bot, + Scalar work[], + const ordinal_type lwork) = 0; + + /// \brief Factor the pair of square upper triangular matrices + /// [R_top; R_bot]. /// /// Store the resulting R factor in R_top, and the resulting /// Householder reflectors implicitly in R_bot and tau. - void - factor_pair (const MatView& R_top, - const MatView& R_bot, + virtual void + factor_pair (const MatView& R_top, + const MatView& R_bot, Scalar tau[], - Scalar work[]) const - { - impl_.factor_pair (R_top, R_bot, tau, work); - } + Scalar work[], + const ordinal_type lwork) = 0; /// \brief Apply the result of \c factor_pair(). /// /// Apply Q factor (or Q^T or Q^H) of the 2*ncols_Q by ncols_Q /// matrix [R_top; R_bot] (stored in R_bot and tau) to the /// 2*ncols_Q by ncols_C matrix [C_top; C_bot]. The two blocks - /// C_top and C_bot may have different leading dimensions (ldc_top - /// resp. ldc_bot). 
+ /// C_top and C_bot need not be stored contiguously in memory, and + /// they may have different strides ("leading dimensions," in BLAS + /// and LAPACK terms). /// /// \param apply_type [in] NoTranspose means apply Q, Transpose /// means apply Q^T, and ConjugateTranspose means apply Q^H. - void + virtual void apply_pair (const ApplyType& apply_type, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const Scalar R_bot[], - const Ordinal ldr_bot, + const MatView& R_bot, const Scalar tau[], - Scalar C_top[], - const Ordinal ldc_top, - Scalar C_bot[], - const Ordinal ldc_bot, - Scalar work[]) const - { - impl_.apply_pair (apply_type, ncols_C, ncols_Q, - R_bot, ldr_bot, tau, - C_top, ldc_top, C_bot, ldc_bot, work); - } - - private: - //! The implementation of Combine. - combine_impl_type impl_; + const MatView& C_top, + const MatView& C_bot, + Scalar work[], + const ordinal_type lwork) = 0; }; } // namespace TSQR -#endif // __TSQR_Combine_hpp +#endif // TSQR_COMBINE_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp index e77802f173ec..0e7e16d42d92 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp @@ -44,9 +44,6 @@ #include "Tsqr_CombineBenchmarker.hpp" #include "Tsqr_CombineDefault.hpp" #include "Tsqr_CombineNative.hpp" -#ifdef HAVE_KOKKOSTSQR_FORTRAN -# include "Tsqr_CombineFortran.hpp" -#endif // HAVE_KOKKOSTSQR_FORTRAN #include #include @@ -320,52 +317,19 @@ namespace TSQR { params.additionalData); const double slowdown = nativeTimings[1] / defaultTimings[1]; const bool tooSlow = slowdown > params.allowance; - // FIXME (mfh 24 May 2011) Replace std::runtime_error with a - // more appropriately named exception. - TEUCHOS_TEST_FOR_EXCEPTION(params.strictPerfTests && tooSlow, - std::runtime_error, - "CombineNative is too slow! 
For cache block " - "benchmark with numRows=" << numRows << " and numCols=" - << numCols << ", CombineNative time (= " - << nativeTimings[1] << ") / CombineDefault time (= " - << defaultTimings[1] << ") = " << slowdown - << " > the allowed fraction " << params.allowance - << "."); + // FIXME (mfh 10 Dec 2019) Return an error code / bool, + // instead of throwing. + TEUCHOS_TEST_FOR_EXCEPTION + (params.strictPerfTests && tooSlow, std::runtime_error, + "CombineNative is too slow! For cache block benchmark " + "with numRows=" << numRows << " and numCols=" << numCols + << ", CombineNative time (= " << nativeTimings[1] << + ") / CombineDefault time (= " << defaultTimings[1] << + ") = " << slowdown << " > the allowed fraction " << + params.allowance << "."); } - -#ifdef HAVE_KOKKOSTSQR_FORTRAN - std::vector fortranTimings; - { - typedef CombineFortran combine_type; - std::string combineTypeName ("Fortran"); - fortranTimings = - benchmarkCombineType (out, params.seed, - dataTypeName, - combineTypeName, - numRows, - numCols, - cacheBlockNumTrials, - pairNumTrials, - params.averageTimings, - params.additionalData); - const double slowdown = fortranTimings[1] / defaultTimings[1]; - const bool tooSlow = slowdown > params.allowance; - // FIXME (mfh 24 May 2011) Replace std::runtime_error with a - // more appropriately named exception. - TEUCHOS_TEST_FOR_EXCEPTION(params.strictPerfTests && tooSlow, - std::runtime_error, - "CombineFortran is too slow! 
For cache block " - "benchmark with numRows=" << numRows << " and numCols=" - << numCols << ", CombineFortran time (= " - << fortranTimings[1] << ") / CombineDefault time (= " - << defaultTimings[1] << ") = " << slowdown - << " > the allowed fraction " << params.allowance - << "."); - } -#endif // HAVE_KOKKOSTSQR_FORTRAN } - template static void benchmarkAllCombineTypesAndScalars (std::ostream& out, @@ -393,7 +357,7 @@ namespace TSQR { } if (params.testComplex) { -#ifdef HAVE_KOKKOSTSQR_COMPLEX +#ifdef HAVE_TPETRATSQR_COMPLEX using std::complex; dataTypeName = "complex"; @@ -403,9 +367,9 @@ namespace TSQR { benchmarkAllCombineTypes, TimerType> (out, dataTypeName, params, timerResolution); -#else // Don't HAVE_KOKKOSTSQR_COMPLEX +#else // Don't HAVE_TPETRATSQR_COMPLEX throw std::logic_error("TSQR not built with complex arithmetic support"); -#endif // HAVE_KOKKOSTSQR_COMPLEX +#endif // HAVE_TPETRATSQR_COMPLEX } } diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp index 54d5f199b0ad..18ea69f0ad3e 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp @@ -37,8 +37,8 @@ // ************************************************************************ //@HEADER -#ifndef __Tsqr_CombineBenchmarker_hpp -#define __Tsqr_CombineBenchmarker_hpp +#ifndef TSQR_COMBINEBENCHMARKER_HPP +#define TSQR_COMBINEBENCHMARKER_HPP #include "Tsqr_ConfigDefs.hpp" #include "Tsqr_Random_NormalGenerator.hpp" @@ -60,6 +60,19 @@ namespace TSQR { namespace Test { + template + void + fill_with_identity_columns (const MatView& A) + { + deep_copy (A, Scalar {}); + const Ordinal numCols = A.extent (1); + // FIXME (mfh 08 Dec 2019) Eventually stop writing to Matrix or + // MatView entries on host, for eventual GPU-ization. 
+ for (Ordinal j = 0; j < numCols; ++j) { + A(j,j) = Scalar (1.0); + } + } + /// \fn computeTimerResolution /// \brief Compute resolution in seconds of the TimerType timer. /// @@ -74,15 +87,15 @@ namespace TSQR { double computeTimerResolution () { - typedef TimerType timer_type; + using timer_type = TimerType; timer_type timer ("Timer resolution"); - // Warmup run for the timer. - for (int warmup = 0; warmup < 5; ++warmup) - { - timer.start(); - (void) timer.stop(); - } + // Warmup run for the timer. Some timer implementations needed + // to be called at least once in order to get sensible results. + for (int warmup = 0; warmup < 5; ++warmup) { + timer.start (); + (void) timer.stop (); + } // Keep a count of the total number of times timer.stop() is // called (once per outer loop iteration). If bigger than @@ -177,21 +190,21 @@ namespace TSQR { template class CombineBenchmarker { public: - typedef Ordinal ordinal_type; - typedef Scalar scalar_type; - typedef CombineType combine_type; - typedef TimerType timer_type; + using ordinal_type = Ordinal; + using scalar_type = Scalar; + using combine_type = CombineType; + using timer_type = TimerType; private: - typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; - typedef Teuchos::ScalarTraits STM; - typedef TSQR::Random::NormalGenerator normgen_type; - typedef TSQR::Random::MatrixGenerator matgen_type; - typedef Matrix matrix_type; + using mag_type = + typename Teuchos::ScalarTraits::magnitudeType; + using normgen_type = + TSQR::Random::NormalGenerator; + using matgen_type = + TSQR::Random::MatrixGenerator; + using matrix_type = Matrix; public: - /// \brief Constructor with user-specified seed. /// /// \param timerRes [in] Resolution in seconds of the TimerType @@ -291,33 +304,34 @@ namespace TSQR { // Generate a random cache block A. 
matrix_type A (numRows, numCols); - std::vector sigmas (numCols); + std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); matGen.fill_random_svd (numRows, numCols, A.data(), A.stride(1), sigmas.data()); // A place to put the Q factor. matrix_type Q (numRows, numCols); - deep_copy (Q, Scalar {}); - for (Ordinal j = 0; j < numCols; ++j) { - Q(j,j) = STS::one(); - } + fill_with_identity_columns (Q.view ()); // TAU array (Householder reflector scaling factors). std::vector tau (numCols); - // Work space array for factorization and applying the Q factor. - std::vector work (numCols); // The Combine instance to benchmark. combine_type combiner; + // Work space array for factorization and applying the Q factor. + const Ordinal lwork = + combiner.work_size (numRows, numCols, numCols); + std::vector work (lwork); + // A few warmup runs just to avoid timing anomalies. const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_first (A.view(), tau.data(), work.data()); - combiner.apply_first (ApplyType("N"), - A.view(), tau.data(), - Q.view(), work.data()); + combiner.factor_first (A.view (), tau.data (), + work.data (), lwork); + combiner.apply_first (ApplyType ("N"), + A.view (), tau.data (), + Q.view (), work.data (), lwork); } // How much time numTrials runs must take in order for @@ -342,10 +356,11 @@ namespace TSQR { numTrials *= 2; // First value of numTrials is 4. 
timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_first (A.view(), tau.data(), work.data()); - combiner.apply_first (ApplyType("N"), - A.view(), tau.data(), - Q.view(), work.data()); + combiner.factor_first (A.view (), tau.data (), + work.data (), lwork); + combiner.apply_first (ApplyType ("N"), + A.view (), tau.data (), + Q.view (), work.data (), lwork); } theTime = timer.stop(); } while (theTime < minAcceptableTime && numTrials < maxNumTrials); @@ -388,32 +403,34 @@ namespace TSQR { // Generate a random cache block A. matrix_type A (numRows, numCols); - std::vector sigmas (numCols); + std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); matGen.fill_random_svd (numRows, numCols, A.data(), A.stride(1), sigmas.data()); // A place to put the Q factor. matrix_type Q (numRows, numCols); - deep_copy (Q, Scalar {}); - for (Ordinal j = 0; j < numCols; ++j) - Q(j,j) = STS::one(); + fill_with_identity_columns (Q.view ()); // TAU array (Householder reflector scaling factors). std::vector tau (numCols); - // Work space array for factorization and applying the Q factor. - std::vector work (numCols); // The Combine instance to benchmark. combine_type combiner; + // Work space array for factorization and applying the Q factor. + const Ordinal lwork = + combiner.work_size (numRows, numCols, numCols); + std::vector work (lwork); + // A few warmup runs just to avoid timing anomalies. const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_first (A.view(), tau.data(), work.data()); - combiner.apply_first (ApplyType("N"), - A.view(), tau.data(), - Q.view(), work.data()); + combiner.factor_first (A.view (), tau.data (), + work.data (), lwork); + combiner.apply_first (ApplyType ("N"), + A.view (), tau.data (), + Q.view (), work.data (), lwork); } // // The actual timing runs. 
@@ -421,10 +438,11 @@ namespace TSQR { timer_type timer ("Combine first"); timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_first (A.view(), tau.data(), work.data()); - combiner.apply_first (ApplyType("N"), - A.view(), tau.data(), - Q.view(), work.data()); + combiner.factor_first (A.view (), tau.data (), + work.data (), lwork); + combiner.apply_first (ApplyType ("N"), + A.view (), tau.data (), + Q.view (), work.data (), lwork); } return timer.stop(); } @@ -459,53 +477,56 @@ namespace TSQR { const Ordinal numCols, const double accuracyFactor) { - if (numRows == 0 || numCols == 0) + if (numRows == 0 || numCols == 0) { throw std::invalid_argument("Calibrating timings is impossible for " "a matrix with either zero rows or zero " "columns."); - else if (accuracyFactor < 0) + } + else if (accuracyFactor < 0) { throw std::invalid_argument("Accuracy factor for Combine numTrials " "calibration must be nonnegative."); + } // Random matrix generator. matgen_type matGen (normGenS_); // Generate a random R factor first. matrix_type R (numCols, numCols); - std::vector sigmas (numCols); + std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R.data(), - R.stride(1), sigmas.data()); + matGen.fill_random_R (numCols, R.data (), + R.stride (1), sigmas.data ()); // Now generate a random cache block. matrix_type A (numRows, numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_svd (numRows, numCols, A.data(), - A.stride(1), sigmas.data()); + matGen.fill_random_svd (numRows, numCols, A.data (), + A.stride (1), sigmas.data ()); // A place to put the Q factor. - matrix_type Q (numRows + numCols, numCols); - deep_copy (Q, Scalar {}); - for (Ordinal j = 0; j < numCols; ++j) - Q(j,j) = STS::one(); + matrix_type Q (numCols + numRows, numCols); + fill_with_identity_columns (Q.view ()); + auto Q_top_Q_bot = partition_2x1 (Q, numCols); // TAU array (Householder reflector scaling factors). 
std::vector tau (numCols); - // Work space array for factorization and applying the Q factor. - std::vector work (numCols); // The Combine instance to benchmark. combine_type combiner; + // Work space array for factorization and applying the Q factor. + const Ordinal lwork = + combiner.work_size (numRows, numCols, numCols); + std::vector work (lwork); + // A few warmup runs just to avoid timing anomalies. const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_inner (R.view(), A.view(), - tau.data(), work.data()); - combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.stride(1), tau.data(), - &Q(0, 0), Q.stride(1), - &Q(numCols, 0), Q.stride(1), - work.data()); + combiner.factor_inner (R.view (), A.view (), tau.data (), + work.data (), lwork); + combiner.apply_inner (ApplyType ("N"), A.view (), + tau.data (), Q_top_Q_bot.first, + Q_top_Q_bot.second, + work.data (), lwork); } // How much time numTrials runs must take in order for @@ -530,13 +551,12 @@ namespace TSQR { numTrials *= 2; // First value of numTrials is 4. 
timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_inner (R.view(), A.view(), - tau.data(), work.data()); - combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.stride(1), tau.data(), - &Q(0, 0), Q.stride(1), - &Q(numCols, 0), Q.stride(1), - work.data()); + combiner.factor_inner (R.view (), A.view (), tau.data (), + work.data (), lwork); + combiner.apply_inner (ApplyType ("N"), A.view (), + tau.data (), Q_top_Q_bot.first, + Q_top_Q_bot.second, work.data (), + lwork); } theTime = timer.stop(); } while (theTime < minAcceptableTime && numTrials < maxNumTrials); @@ -544,7 +564,6 @@ namespace TSQR { return std::make_pair (numTrials, theTime); } - /// \brief Benchmark TSQR::Combine on [R; A]; /// /// TSQR::Combine implementations use factor_inner() to factor a @@ -581,7 +600,7 @@ namespace TSQR { // Generate a random R factor first. matrix_type R (numCols, numCols); - std::vector sigmas (numCols); + std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); matGen.fill_random_R (numCols, R.data(), R.stride(1), sigmas.data()); @@ -591,47 +610,45 @@ namespace TSQR { matGen.fill_random_svd (numRows, numCols, A.data(), A.stride(1), sigmas.data()); // A place to put the Q factor. - matrix_type Q (numRows + numCols, numCols); - deep_copy (Q, Scalar {}); - for (Ordinal j = 0; j < numCols; ++j) - Q(j,j) = STS::one(); + matrix_type Q (numCols + numRows, numCols); + fill_with_identity_columns (Q.view ()); + auto Q_top_Q_bot = partition_2x1 (Q, numCols); // TAU array (Householder reflector scaling factors). std::vector tau (numCols); - // Work space array for factorization and applying the Q factor. - std::vector work (numCols); // The Combine instance to benchmark. combine_type combiner; + // Work space array for factorization and applying the Q factor. + const Ordinal lwork = + combiner.work_size (numRows, numCols, numCols); + std::vector work (lwork); + // A few warmup runs just to avoid timing anomalies. 
const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_inner (R.view(), A.view(), - tau.data(), work.data()); - combiner.apply_inner (ApplyType("N"), - numRows, numCols, numCols, - A.data(), A.stride(1), tau.data(), - &Q(0, 0), Q.stride(1), - &Q(numCols, 0), Q.stride(1), - work.data()); + combiner.factor_inner (R.view (), A.view (), tau.data (), + work.data (), lwork); + combiner.apply_inner (ApplyType ("N"), A.view (), + tau.data (), Q_top_Q_bot.first, + Q_top_Q_bot.second, + work.data (), lwork); } // // The actual timing runs. // timer_type timer ("Combine cache block"); - timer.start(); + timer.start (); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_inner (R.view(), A.view(), - tau.data(), work.data()); - combiner.apply_inner (ApplyType("N"), - numRows, numCols, numCols, - A.data(), A.stride(1), tau.data(), - &Q(0, 0), Q.stride(1), - &Q(numCols, 0), Q.stride(1), - work.data()); + combiner.factor_inner (R.view (), A.view (), tau.data (), + work.data (), lwork); + combiner.apply_inner (ApplyType ("N"), A.view (), + tau.data (), Q_top_Q_bot.first, + Q_top_Q_bot.second, + work.data (), lwork); } - return timer.stop(); + return timer.stop (); } /// \brief Estimate number of trials for TSQR::Combine on [R1; R2]. @@ -672,38 +689,43 @@ namespace TSQR { // Generate R1 first. matrix_type R1 (numCols, numCols); - std::vector sigmas (numCols); + std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); matGen.fill_random_R (numCols, R1.data(), R1.stride(1), sigmas.data()); // Now generate R2. matrix_type R2 (numCols, numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R2.data(), R2.stride(1), sigmas.data()); + matGen.fill_random_R (numCols, R2.data (), + R2.stride (1), sigmas.data ()); // A place to put the Q factor of [R1; R2]. 
matrix_type Q (2*numCols, numCols); - deep_copy (Q, Scalar {}); - for (Ordinal j = 0; j < numCols; ++j) - Q(j,j) = STS::one(); + fill_with_identity_columns (Q.view ()); + auto Q_top_Q_bot = partition_2x1 (Q.view (), numCols); + + auto R1_view = R1.view (); + auto R2_view = R2.view (); // TAU array (Householder reflector scaling factors). std::vector tau (numCols); - // Work space array for factorization and applying the Q factor. - std::vector work (numCols); // The Combine instance to benchmark. combine_type combiner; + // Work space array for factorization and applying the Q factor. + const Ordinal lwork = + combiner.work_size (2 * numCols, numCols, numCols); + std::vector work (lwork); + // A few warmup runs just to avoid timing anomalies. const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_pair (R1.view(), R2.view(), tau.data(), work.data()); - combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.data(), R2.stride(1), tau.data(), - &Q(0, 0), Q.stride(1), - &Q(numCols, 0), Q.stride(1), - work.data()); + combiner.factor_pair (R1_view, R2_view, tau.data (), + work.data (), lwork); + combiner.apply_pair (ApplyType ("N"), R2_view, tau.data (), + Q_top_Q_bot.first, Q_top_Q_bot.second, + work.data (), lwork); } // How much time numTrials runs must take in order for @@ -728,13 +750,12 @@ namespace TSQR { numTrials *= 2; // First value of numTrials is 4. 
timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_pair (R1.view(), R2.view(), - tau.data(), work.data()); - combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.data(), R2.stride(1), tau.data(), - &Q(0, 0), Q.stride(1), - &Q(numCols, 0), Q.stride(1), - work.data()); + combiner.factor_pair (R1_view, R2_view, tau.data (), + work.data (), lwork); + combiner.apply_pair (ApplyType ("N"), R2_view, + tau.data (), Q_top_Q_bot.first, + Q_top_Q_bot.second, + work.data (), lwork); } theTime = timer.stop(); } while (theTime < minAcceptableTime && numTrials < maxNumTrials); @@ -742,7 +763,6 @@ namespace TSQR { return std::make_pair (numTrials, theTime); } - /// \brief Benchmark TSQR::Combine on [R1; R2]. /// /// TSQR::Combine implementations use factor_pair() to factor a @@ -763,50 +783,57 @@ namespace TSQR { benchmarkPair (const Ordinal numCols, const int numTrials) { - if (numCols == 0) - throw std::invalid_argument("Benchmarking does not make sense for " - "a matrix with zero columns."); - TEUCHOS_TEST_FOR_EXCEPTION(numTrials < 1, std::invalid_argument, - "The number of trials must be positive, but " - "numTrials = " << numTrials << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (numCols == 0, std::invalid_argument, "Benchmarking does " + "not make sense for a matrix with zero columns."); + TEUCHOS_TEST_FOR_EXCEPTION + (numTrials < 1, std::invalid_argument, "The number of " + "trials must be positive, but numTrials = " << numTrials + << "."); // Random matrix generator. matgen_type matGen (normGenS_); // Generate R1 first. matrix_type R1 (numCols, numCols); - std::vector sigmas (numCols); + std::vector sigmas (numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R1.data(), R1.stride(1), sigmas.data()); + matGen.fill_random_R (numCols, R1.data (), R1.stride (1), + sigmas.data ()); // Now generate R2. 
matrix_type R2 (numCols, numCols); randomSingularValues (sigmas, numCols); - matGen.fill_random_R (numCols, R2.data(), R2.stride(1), sigmas.data()); + matGen.fill_random_R (numCols, R2.data (), R2.stride (1), + sigmas.data ()); // A place to put the Q factor of [R1; R2]. matrix_type Q (2*numCols, numCols); - deep_copy (Q, Scalar {}); - for (Ordinal j = 0; j < numCols; ++j) - Q(j,j) = STS::one(); + fill_with_identity_columns (Q.view ()); + auto Q_top_Q_bot = partition_2x1 (Q.view (), numCols); + + auto R1_view = R1.view (); + auto R2_view = R2.view (); // TAU array (Householder reflector scaling factors). std::vector tau (numCols); - // Work space array for factorization and applying the Q factor. - std::vector work (numCols); // The Combine instance to benchmark. combine_type combiner; + // Work space array for factorization and applying the Q factor. + const Ordinal lwork = + combiner.work_size (2 * numCols, numCols, numCols); + std::vector work (lwork); + // A few warmup runs just to avoid timing anomalies. const int numWarmupRuns = 3; for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - combiner.factor_pair (R1.view(), R2.view(), tau.data(), work.data()); - combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.data(), R2.stride(1), tau.data(), - &Q(0, 0), Q.stride(1), - &Q(numCols, 0), Q.stride(1), - work.data()); + combiner.factor_pair (R1_view, R2_view, tau.data (), + work.data (), lwork); + combiner.apply_pair (ApplyType ("N"), R2_view, tau.data (), + Q_top_Q_bot.first, Q_top_Q_bot.second, + work.data (), lwork); } // // The actual timing runs. 
@@ -814,23 +841,21 @@ namespace TSQR { timer_type timer ("Combine pair"); timer.start(); for (int trial = 0; trial < numTrials; ++trial) { - combiner.factor_pair (R1.view(), R2.view(), tau.data(), work.data()); - combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.data(), R2.stride(1), tau.data(), - &Q(0, 0), Q.stride(1), - &Q(numCols, 0), Q.stride(1), - work.data()); + combiner.factor_pair (R1_view, R2_view, tau.data (), + work.data (), lwork); + combiner.apply_pair (ApplyType ("N"), R2_view, tau.data (), + Q_top_Q_bot.first, Q_top_Q_bot.second, + work.data (), lwork); } return timer.stop(); } private: - //! Pseudorandom normal(0,1) generator for Scalar values. TSQR::Random::NormalGenerator normGenS_; - //! Pseudorandom normal(0,1) generator for magnitude_type values. - TSQR::Random::NormalGenerator normGenM_; + //! Pseudorandom normal(0,1) generator for mag_type values. + TSQR::Random::NormalGenerator normGenM_; //! Timer resolution (in seconds) for TimerType timers. double timerResolution_; @@ -842,33 +867,33 @@ namespace TSQR { /// \param numValues [in] Number of random singular values to /// generate. void - randomSingularValues (std::vector& sigmas, + randomSingularValues (std::vector& sigmas, const Ordinal numValues) { - // Cast to avoid compiler warnings for signed / unsigned - // comparisons. - typedef typename std::vector::size_type size_type; - if (sigmas.size() < static_cast (numValues)) - sigmas.resize (numValues); + using STM = Teuchos::ScalarTraits; + if (sigmas.size () < size_t (numValues)) { + sigmas.resize (numValues); + } // Relative amount by which to perturb each singular value. The // perturbation will be multiplied by a normal(0,1) pseudorandom // number drawn from magGen. 
- const magnitude_type perturbationFactor = magnitude_type(10) * STM::eps(); - const magnitude_type one = STM::one(); - for (Ordinal k = 0; k < numValues; ++k) - { - magnitude_type perturbation = perturbationFactor * normGenM_(); - // If (1 - perturbation) is a small or nonpositive number, - // subtract instead. - if (one - perturbation <= perturbationFactor) - perturbation = -perturbation; - sigmas[k] = one - perturbation; + const mag_type perturbationFactor = + mag_type (10.0) * STM::eps (); + const mag_type one (1.0); + for (Ordinal k = 0; k < numValues; ++k) { + mag_type perturbation = perturbationFactor * normGenM_ (); + // If (1 - perturbation) is a small or nonpositive number, + // subtract instead. + if (one - perturbation <= perturbationFactor) { + perturbation = -perturbation; } + sigmas[k] = one - perturbation; + } } }; } // namespace Test } // namespace TSQR -#endif // __Tsqr_CombineBenchmarker_hpp +#endif // TSQR_COMBINEBENCHMARKER_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp index f5e5ed7c9ce7..eb5ee23b5ff0 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp @@ -38,15 +38,16 @@ //@HEADER /// \file Tsqr_CombineDefault.hpp -/// \brief Default copy-in, copy-out implementation of \c TSQR::Combine. -/// -#ifndef __TSQR_CombineDefault_hpp -#define __TSQR_CombineDefault_hpp +/// \brief Default copy-in, copy-out implementation of TSQR::Combine. -#include "Teuchos_ScalarTraits.hpp" -#include "Tsqr_ApplyType.hpp" +#ifndef TSQR_COMBINEDEFAULT_HPP +#define TSQR_COMBINEDEFAULT_HPP + +#include "Tsqr_Combine.hpp" #include "Tsqr_Impl_Lapack.hpp" #include "Tsqr_Matrix.hpp" +#include "Teuchos_Assert.hpp" +#include "Teuchos_ScalarTraits.hpp" namespace TSQR { @@ -62,13 +63,14 @@ namespace TSQR { /// that should be zero because of the input's structure (e.g., /// upper triangular). 
template - class CombineDefault { + class CombineDefault : public Combine { public: - typedef Ordinal ordinal_type; - typedef Scalar scalar_type; - typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type; - typedef MatView const_mat_view_type; - typedef MatView mat_view_type; + using ordinal_type = Ordinal; + using scalar_type = Scalar; + using const_mat_view_type = MatView; + using mat_view_type = MatView; + + ~CombineDefault () override = default; /// \brief Does the R factor have a nonnegative diagonal? /// @@ -78,44 +80,74 @@ namespace TSQR { /// entries. This Boolean tells you whether CombineDefault /// promises to compute an R factor whose diagonal entries are all /// nonnegative. - static bool QR_produces_R_factor_with_nonnegative_diagonal() + bool + QR_produces_R_factor_with_nonnegative_diagonal () const override { - return false; // lapack_type::QR_produces_R_factor_with_nonnegative_diagonal(); + // FIXME (mfh 19 Dec 2019) This _should_ depend on Impl::Lapack. + return false; + } + + ordinal_type + work_size (const ordinal_type num_rows_Q, + const ordinal_type num_cols_Q, + const ordinal_type num_cols_C) const override + { + using STS = Teuchos::ScalarTraits; + + const int ncols = num_cols_Q < num_cols_C ? 
+ num_cols_C : num_cols_Q; + const int nrows = num_rows_Q + ncols; + const int lda = nrows; + + const int lwork1 = + lapack_.compute_QR_lwork (nrows, ncols, nullptr, lda); + TEUCHOS_ASSERT( lwork1 >= num_cols_Q ); + + const int ldc = nrows; + const int lwork2 = + lapack_.apply_Q_factor_lwork ('L', 'N', + nrows, num_cols_C, num_cols_Q, + nullptr, lda, nullptr, + nullptr, ldc); + TEUCHOS_ASSERT( lwork2 >= 0 ); + return std::max (lwork1, lwork2); } void - factor_first (const MatView& A, + factor_first (const MatView& A, Scalar tau[], - Scalar work[]) + Scalar work[], + const ordinal_type lwork) override { - const int lwork = A.extent (1); lapack_.compute_QR (A.extent (0), A.extent (1), A.data (), A.stride (1), tau, work, lwork); } void - factor_first (Matrix& A, + factor_first (Matrix& A, Scalar tau[], - Scalar work[]) + Scalar work[], + const ordinal_type lwork) { - MatView A_view + MatView A_view (A.extent (0), A.extent (1), A.data (), A.stride (1)); - factor_first (A_view, tau, work); + this->factor_first (A_view, tau, work, lwork); } void apply_first (const ApplyType& applyType, - const MatView& A, + const MatView& A, const Scalar tau[], - const MatView& C, - Scalar work[]) + const MatView& C, + Scalar work[], + const ordinal_type lwork) override { - const Ordinal nrows = A.extent(0); - const Ordinal ncols_C = C.extent(1); - const Ordinal ncols_A = A.extent(1); - const Ordinal lda = A.stride(1); - const Ordinal ldc = C.stride(1); + const ordinal_type nrows = A.extent(0); + const ordinal_type ncols_C = C.extent(1); + const ordinal_type ncols_A = A.extent(1); + const ordinal_type lda = A.stride(1); + const ordinal_type ldc = C.stride(1); // LAPACK has the nice feature that it only reads the first // letter of input strings that specify things like which side @@ -123,78 +155,76 @@ namespace TSQR { // transpose. That means we can make the strings more verbose, // as in "Left" here for the SIDE parameter. 
const std::string trans = applyType.toString (); - const int lwork = ncols_C; lapack_.apply_Q_factor ('L', trans[0], nrows, ncols_C, ncols_A, A.data(), lda, tau, C.data(), ldc, - work, lwork); + work, static_cast (lwork)); + } + + void + factor_inner (const MatView& R, + const MatView& A, + Scalar tau[], + Scalar work[], + const ordinal_type lwork) override + { + const ordinal_type m = A.extent (0); + const ordinal_type n = A.extent (1); + const ordinal_type lda = A.stride (1); + factor_inner_impl (m, n, R.data (), R.stride (1), + A.data (), lda, tau, work, lwork); } void apply_inner (const ApplyType& apply_type, - const Ordinal m, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const Scalar A[], - const Ordinal lda, + const MatView& A, const Scalar tau[], - Scalar C_top[], - const Ordinal ldc_top, - Scalar C_bot[], - const Ordinal ldc_bot, - Scalar work[]) + const MatView& C_top, + const MatView& C_bot, + Scalar work[], + const ordinal_type lwork) override { - const Ordinal numRows = m + ncols_Q; + const ordinal_type m = A.extent (0); + TEUCHOS_ASSERT( m == ordinal_type (C_bot.extent (0)) ); + const ordinal_type ncols_Q = A.extent (1); + const ordinal_type ncols_C = C_top.extent (1); + TEUCHOS_ASSERT( ncols_C == ordinal_type (C_bot.extent (1)) ); + const ordinal_type numRows = ncols_Q + m; A_buf_.reshape (numRows, ncols_Q); deep_copy (A_buf_, Scalar {}); - const_mat_view_type A_bot (m, ncols_Q, A, lda); - mat_view_type A_buf_bot (m, ncols_Q, &A_buf_(ncols_Q, 0), A_buf_.stride(1)); - deep_copy (A_buf_bot, A_bot); + auto A_buf_top_bot = partition_2x1 (A_buf_.view (), ncols_Q); + deep_copy (A_buf_top_bot.second, A); C_buf_.reshape (numRows, ncols_C); deep_copy (C_buf_, Scalar {}); - mat_view_type C_buf_top (ncols_Q, ncols_C, &C_buf_(0, 0), C_buf_.stride(1)); - mat_view_type C_buf_bot (m, ncols_C, &C_buf_(ncols_Q, 0), C_buf_.stride(1)); - mat_view_type C_top_view (ncols_Q, ncols_C, C_top, ldc_top); - mat_view_type C_bot_view (m, ncols_C, C_bot, ldc_bot); - 
deep_copy (C_buf_top, C_top_view); - deep_copy (C_buf_bot, C_bot_view); + auto C_buf_top_bot = partition_2x1 (C_buf_.view (), ncols_Q); + deep_copy (C_buf_top_bot.first, C_top); + deep_copy (C_buf_top_bot.second, C_bot); const std::string trans = apply_type.toString (); - const int lwork = ncols_C; - lapack_.apply_Q_factor ('L', trans[0], numRows, ncols_C, ncols_Q, - A_buf_.data(), A_buf_.stride(1), tau, - C_buf_.data(), C_buf_.stride(1), + lapack_.apply_Q_factor ('L', trans[0], + numRows, ncols_C, ncols_Q, + A_buf_.data (), A_buf_.stride (1), tau, + C_buf_.data (), C_buf_.stride (1), work, lwork); // Copy back the results. - deep_copy (C_top_view, C_buf_top); - deep_copy (C_bot_view, C_buf_bot); - } - - void - factor_inner (const MatView& R, - const MatView& A, - Scalar tau[], - Scalar work[]) - { - const Ordinal m = A.extent(0); - const Ordinal n = A.extent(1); - factor_inner_impl (m, n, R.data(), R.stride(1), - A.data(), A.stride(1), tau, work); + deep_copy (C_top, C_buf_top_bot.first); + deep_copy (C_bot, C_buf_top_bot.second); } private: void - factor_inner_impl (const Ordinal m, - const Ordinal n, + factor_inner_impl (const ordinal_type m, + const ordinal_type n, Scalar R[], - const Ordinal ldr, + const ordinal_type ldr, Scalar A[], - const Ordinal lda, + const ordinal_type lda, Scalar tau[], - Scalar work[]) + Scalar work[], + const ordinal_type lwork) { - const Ordinal numRows = m + n; + const ordinal_type numRows = m + n; A_buf_.reshape (numRows, n); deep_copy (A_buf_, Scalar {}); @@ -202,58 +232,46 @@ namespace TSQR { // we only want to include the upper triangle in the // factorization. Thus, only copy the upper triangle of R into // the appropriate place in the buffer. 
- MatView R_view (n, n, R, ldr); - MatView A_buf_top (n, n, A_buf_.data(), + MatView R_view (n, n, R, ldr); + MatView A_buf_top (n, n, A_buf_.data(), A_buf_.stride(1)); deep_copy (A_buf_top, R_view); - MatView A_view (m, n, A, lda); - MatView A_buf_bot (m, n, &A_buf_(n, 0), + MatView A_view (m, n, A, lda); + MatView A_buf_bot (m, n, &A_buf_(n, 0), A_buf_.stride(1)); deep_copy (A_buf_bot, A_view); - - const int lwork = n; - lapack_.compute_QR (numRows, n, A_buf_.data(), A_buf_.stride(1), - tau, work, lwork); + lapack_.compute_QR (numRows, n, A_buf_.data (), + A_buf_.stride (1), tau, work, lwork); // Copy back the results. R might be a view of the upper // triangle of a cache block, so only copy into the upper // triangle of R. - copy_upper_triangle (n, n, R, ldr, A_buf_top.data(), - A_buf_top.stride(1)); + copy_upper_triangle (R_view, A_buf_top); deep_copy (A_view, A_buf_bot); } public: void - factor_pair (const MatView& R_top, - const MatView& R_bot, + factor_pair (const MatView& R_top, + const MatView& R_bot, Scalar tau[], - Scalar work[]) + Scalar work[], + const ordinal_type lwork) override { - const Ordinal numRows = Ordinal(2) * R_top.extent (1); - const Ordinal numCols = R_top.extent (1); + const ordinal_type numRows = ordinal_type(2) * R_top.extent (1); + const ordinal_type numCols = R_top.extent (1); A_buf_.reshape (numRows, numCols); deep_copy (A_buf_, Scalar {}); - MatView A_buf_top (numCols, numCols, - &A_buf_(0, 0), - A_buf_.stride(1)); - MatView A_buf_bot (numCols, numCols, - &A_buf_(numCols, 0), - A_buf_.stride(1)); + auto A_buf_tb = partition_2x1 (A_buf_.view (), numCols); // Copy the inputs into the compute buffer. Only touch the // upper triangles of R_top and R_bot, since they each may be // views of some cache block (where the strict lower triangle // contains things we don't want to include in the // factorization). 
- copy_upper_triangle (numCols, numCols, - A_buf_top.data(), A_buf_top.stride(1), - R_top.data(), R_top.stride(1)); - copy_upper_triangle (numCols, numCols, - A_buf_bot.data(), A_buf_bot.stride(1), - R_bot.data(), R_bot.stride(1)); - - const int lwork = static_cast (numCols); + copy_upper_triangle (A_buf_tb.first, R_top); + copy_upper_triangle (A_buf_tb.second, R_bot); + lapack_.compute_QR (numRows, numCols, A_buf_.data(), A_buf_.stride(1), tau, work, lwork); @@ -261,63 +279,49 @@ namespace TSQR { // two n by n row blocks of A_buf_ (this means we don't have to // zero out the strict lower triangles), and only touch the // upper triangles of R_top and R_bot. - copy_upper_triangle (numCols, numCols, - R_top.data(), R_top.stride(1), - A_buf_top.data(), A_buf_top.stride(1)); - copy_upper_triangle (numCols, numCols, - R_bot.data(), R_bot.stride(1), - A_buf_bot.data(), A_buf_bot.stride(1)); + copy_upper_triangle (R_top, A_buf_tb.first); + copy_upper_triangle (R_bot, A_buf_tb.second); } void apply_pair (const ApplyType& apply_type, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const Scalar R_bot[], - const Ordinal ldr_bot, + const MatView& R_bot, const Scalar tau[], - Scalar C_top[], - const Ordinal ldc_top, - Scalar C_bot[], - const Ordinal ldc_bot, - Scalar work[]) + const MatView& C_top, + const MatView& C_bot, + Scalar work[], + const ordinal_type lwork) override { - const Ordinal numRows = Ordinal(2) * ncols_Q; + const ordinal_type ncols_C = C_top.extent (1); + const ordinal_type ncols_Q = R_bot.extent (1); + const ordinal_type numRows = ordinal_type(2) * ncols_Q; A_buf_.reshape (numRows, ncols_Q); deep_copy (A_buf_, Scalar {}); - copy_upper_triangle (ncols_Q, ncols_Q, - &A_buf_(ncols_Q, 0), A_buf_.stride(1), - R_bot, ldr_bot); - C_buf_.reshape (numRows, ncols_C); - - using view_type = MatView; - view_type C_top_view (ncols_Q, ncols_C, C_top, ldc_top); - view_type C_buf_top (ncols_Q, ncols_C, - C_buf_.data (), C_buf_.stride (1)); - deep_copy (C_buf_top, 
C_top_view); + auto A_buf_tb = partition_2x1 (A_buf_.view (), ncols_Q); + copy_upper_triangle (A_buf_tb.second, R_bot); - view_type C_bot_view (ncols_Q, ncols_C, C_bot, ldc_bot); - view_type C_buf_bot (ncols_Q, ncols_C, - &C_buf_(ncols_Q, 0), C_buf_.stride (1)); - deep_copy (C_buf_bot, C_bot_view); + C_buf_.reshape (numRows, ncols_C); + auto C_buf_tb = partition_2x1 (C_buf_.view (), ncols_Q); + deep_copy (C_buf_tb.first, C_top); + deep_copy (C_buf_tb.second, C_bot); - const int lwork = ncols_Q; const std::string trans = apply_type.toString (); - lapack_.apply_Q_factor ('L', trans[0], numRows, ncols_C, ncols_Q, - A_buf_.data(), A_buf_.stride(1), tau, - C_buf_.data(), C_buf_.stride(1), + lapack_.apply_Q_factor ('L', trans[0], numRows, ncols_C, + ncols_Q, A_buf_.data (), + A_buf_.stride (1), tau, + C_buf_.data (), C_buf_.stride (1), work, lwork); // Copy back the results. - deep_copy (C_top_view, C_buf_top); - deep_copy (C_bot_view, C_buf_bot); + deep_copy (C_top, C_buf_tb.first); + deep_copy (C_bot, C_buf_tb.second); } private: Impl::Lapack lapack_; - Matrix A_buf_; - Matrix C_buf_; + Matrix A_buf_; + Matrix C_buf_; }; } // namespace TSQR -#endif // __TSQR_CombineDefault_hpp +#endif // TSQR_COMBINEDEFAULT_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineFactory.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineFactory.hpp new file mode 100644 index 000000000000..e2f1dbc289e8 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_CombineFactory.hpp @@ -0,0 +1,105 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos: Node API and Parallel Node Kernels +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// ************************************************************************ +//@HEADER + +/// \file Tsqr_CombineFactory.hpp +/// \brief Factory for creating TSQR::Combine instances. + +#ifndef TSQR_COMBINEFACTORY_HPP +#define TSQR_COMBINEFACTORY_HPP + +#include "Tsqr_CombineDefault.hpp" +#include "Tsqr_CombineNative.hpp" +#include "Teuchos_TestForException.hpp" +#include +#include + +namespace TSQR { + /// \class CombineFactory + /// \brief Factory for creating Combine instances.
+ /// \author Mark Hoemmen + template + class CombineFactory { + public: + /// \brief Given the maximum number of columns in either the + /// matrix to factor, or the matrix to which to apply a Q factor + /// or compute an explicit Q factor, return an appropriate + /// Combine implementation. + static std::unique_ptr> + create (const Ordinal maxNumCols) + { + // FIXME (mfh 19 Dec 2019) This _should_ depend on the BLAS + // implementation. + constexpr Ordinal blas_3_threshold = 32; + if (maxNumCols >= blas_3_threshold) { + using impl_type = CombineDefault; + // NOTE (mfh 19 Dec 2019) We can't use std::make_unique yet, + // because it requires C++14. + return std::unique_ptr (new impl_type); + } + else { + using impl_type = CombineNative; + return std::unique_ptr (new impl_type); + } + } + + static std::unique_ptr> + create (const std::string& combineType) + { + if (combineType == "CombineNative" || + combineType == "Native") { + using impl_type = CombineNative; + return std::unique_ptr (new impl_type); + } + else if (combineType == "CombineDefault" || + combineType == "Default") { + using impl_type = CombineDefault; + return std::unique_ptr (new impl_type); + } + else { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::invalid_argument, "TSQR::CombineFactory: " + "Invalid Combine subclass name \"" << combineType << + "\"."); + } + } + }; + +} // namespace TSQR + +#endif // TSQR_COMBINEFACTORY_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp index 8e44d0fe8b75..c8d5cc759be6 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp @@ -38,10 +38,10 @@ //@HEADER /// \file Tsqr_CombineNative.hpp -/// \brief Interface to C++ back end of \c TSQR::Combine. -/// -#ifndef __TSQR_CombineNative_hpp -#define __TSQR_CombineNative_hpp +/// \brief Interface to C++ back end of TSQR::Combine. 
+ +#ifndef TSQR_COMBINENATIVE_HPP +#define TSQR_COMBINENATIVE_HPP #include "Teuchos_ScalarTraits.hpp" #include "Tsqr_ApplyType.hpp" @@ -57,30 +57,36 @@ namespace TSQR { /// \class CombineNative /// \brief Interface to C++ back end of TSQR::Combine /// - /// \c TSQR::Combine has three implementations: \c CombineDefault, - /// CombineNative, and \c CombineFortran. CombineNative, - /// implemented in this file, is a fully C++ (therefore "native," as - /// opposed to \c CombineFortran (implemented in Fortran) or \c - /// CombineNative (implemented by wrappers around LAPACK calls)) - /// implementation. + /// TSQR::Combine has two implementations: CombineDefault and + /// CombineNative. (It used to have CombineFortran as well, which + /// was a Fortran 9x implementation wrapped in C++ wrappers. I got + /// rid of that because it complicated Trilinos' build system to + /// have to ask whether the Fortran compiler could handle Fortran + /// 9x.) CombineNative, implemented in this file, is a "fully" C++ + /// (therefore "native") implementation of Combine. (I'm ignoring + /// calls to some BLAS functions.) /// - /// \warning CombineNative has no complex-arithmetic implementation + /// \note CombineNative has no complex-arithmetic implementation /// yet. It's not hard to implement this (use LAPACK's ZGEQR2(P) /// and ZUNM2R as models), but it will take time that the author /// doesn't have at the moment. 
- /// - template< class Ordinal, class Scalar, bool isComplex = Teuchos::ScalarTraits< Scalar >::isComplex > - class CombineNative - { + template::isComplex> + class CombineNative : public Combine { public: - typedef Scalar scalar_type; - typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type; - typedef Ordinal ordinal_type; + using ordinal_type = Ordinal; + using scalar_type = Scalar; private: - typedef CombineDefault combine_default_type; + using mag_type = + typename Teuchos::ScalarTraits::magnitudeType; + using combine_default_type = + CombineDefault; public: + ~CombineNative () override = default; + /// Whether or not the QR factorizations computed by methods of /// this class produce an R factor with all nonnegative diagonal /// entries. It depends on LAPACK because this implementation @@ -88,358 +94,345 @@ namespace TSQR { /// Householder reflectors; only LAPACK versions >= 3.2 have one /// of {LARFGP, LARFP}, which is necessary to ensure that the BETA /// output of the function is always nonnegative. - static bool QR_produces_R_factor_with_nonnegative_diagonal() { - return combine_default_type::QR_produces_R_factor_with_nonnegative_diagonal(); + bool + QR_produces_R_factor_with_nonnegative_diagonal () const override + { + return default_. + QR_produces_R_factor_with_nonnegative_diagonal (); + } + + ordinal_type + work_size (const ordinal_type /* num_rows_Q */, + const ordinal_type num_cols_Q, + const ordinal_type num_cols_C) const override + { + return num_cols_Q < num_cols_C ? 
num_cols_C : num_cols_Q; } void - factor_first (const MatView& A, + factor_first (const MatView& A, Scalar tau[], - Scalar work[]) const + Scalar work[], + const ordinal_type lwork) override { - return default_.factor_first (A, tau, work); + return default_.factor_first (A, tau, work, lwork); } void apply_first (const ApplyType& applyType, - const MatView& A, + const MatView& A, const Scalar tau[], - const MatView& C, - Scalar work[]) + const MatView& C, + Scalar work[], + const ordinal_type lwork) override { - return default_.apply_first (applyType, A, tau, C, work); + return default_.apply_first (applyType, A, tau, C, work, lwork); } void - apply_inner (const ApplyType& applyType, - const Ordinal m, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const Scalar A[], - const Ordinal lda, - const Scalar tau[], - Scalar C_top[], - const Ordinal ldc_top, - Scalar C_bot[], - const Ordinal ldc_bot, - Scalar work[]) const; + factor_inner (const MatView& R, + const MatView& A, + Scalar tau[], + Scalar work[], + const ordinal_type lwork) override; void - factor_inner (const MatView& R, - const MatView& A, - Scalar tau[], - Scalar work[]) const; + apply_inner (const ApplyType& applyType, + const MatView& A, + const Scalar tau[], + const MatView& C_top, + const MatView& C_bot, + Scalar work[], + const ordinal_type lwork) override; void - factor_pair (const MatView& R_top, - const MatView& R_bot, + factor_pair (const MatView& R_top, + const MatView& R_bot, Scalar tau[], - Scalar work[]) const; + Scalar work[], + const ordinal_type lwork) override; void apply_pair (const ApplyType& applyType, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const Scalar R_bot[], - const Ordinal ldr_bot, + const MatView& R_bot, const Scalar tau[], - Scalar C_top[], - const Ordinal ldc_top, - Scalar C_bot[], - const Ordinal ldc_bot, - Scalar work[]) const; + const MatView& C_top, + const MatView& C_bot, + Scalar work[], + const ordinal_type lwork) override; private: - mutable 
combine_default_type default_; + combine_default_type default_; }; - //! Specialization of CombineNative for the real-arithmetic case. - template< class Ordinal, class Scalar > - class CombineNative< Ordinal, Scalar, false > - { - private: - using memory_space = Kokkos::HostSpace; -#ifdef KOKKOS_ENABLE_SERIAL - using execution_space = Kokkos::Serial; -#else // NOT KOKKOS_ENABLE_SERIAL - using execution_space = Kokkos::HostSpace::execution_space; -#endif // KOKKOS_ENABLE_SERIAL - + template + class CombineNative : + public Combine { public: - typedef Scalar scalar_type; - typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type; - typedef Ordinal ordinal_type; - using device_type = Kokkos::Device; + using ordinal_type = Ordinal; + using scalar_type = Scalar; private: - typedef CombineDefault combine_default_type; + using mag_type = + typename Teuchos::ScalarTraits::magnitudeType; + using execution_space = Kokkos::DefaultHostExecutionSpace; + using memory_space = Kokkos::HostSpace; + using device_type = Kokkos::Device; + template + using matrix_type = + Kokkos::View>; + template + using vector_type = + Kokkos::View>; void - GER (const magnitude_type alpha, - const Kokkos::View& x, - const Kokkos::View& y, - const Kokkos::View& A) const; + GER (const mag_type alpha, + const vector_type& x, + const vector_type& y, + const matrix_type& A) const; void - LARFG (const Ordinal n, + LARFG (const ordinal_type n, scalar_type& alpha, - const Kokkos::View& x, + const vector_type& x, scalar_type& tau) const { - constexpr Ordinal incx {1}; + constexpr ordinal_type incx {1}; Impl::Lapack lapack; lapack.LARFG (n, alpha, x.data (), incx, tau); } - magnitude_type - LAPY2 (const scalar_type& x, const scalar_type& y) const - { - using KAT = Kokkos::ArithTraits; - if (KAT::isNan (x)) { - return x; - } - else if (KAT::isNan (y)) { - return y; - } - else { - const magnitude_type xabs = KAT::abs (x); - const magnitude_type yabs = KAT::abs (y); - const scalar_type w = 
xabs >= yabs ? xabs : yabs; // max (xabs, yabs); - const scalar_type z = xabs <= yabs ? xabs : yabs; // min (xabs, yabs); - - if (z == KAT::zero ()) { - return w; - } - else { - const scalar_type z_div_w = z / w; - return w * KAT::sqrt (KAT::one () + z_div_w * z_div_w); - } - } - } - void GEMV (const char trans[], const scalar_type alpha, - const Kokkos::View& A, - const Kokkos::View& x, + const matrix_type& A, + const vector_type& x, const scalar_type beta, - const Kokkos::View& y) const; + const vector_type& y) const; void - factor_pair (const Kokkos::View& R_top, - const Kokkos::View& R_bot, - const Kokkos::View& tau_view, - const Kokkos::View& work_view) const; + factor_pair (const matrix_type& R_top, + const matrix_type& R_bot, + const vector_type& tau_view, + const vector_type& work_view) const; void - factor_inner (const Kokkos::View& R_view, - const Kokkos::View& A_view, - const Kokkos::View& tau_view, - const Kokkos::View& work_view) const; + factor_inner (const matrix_type& R_view, + const matrix_type& A_view, + const vector_type& tau_view, + const vector_type& work_view) const; void apply_pair (const ApplyType& applyType, - const Kokkos::View& R_bot, // ncols_Q - const Kokkos::View& tau_view, - const Kokkos::View& C_top, // ncols_C - const Kokkos::View& C_bot, - const Kokkos::View& work_view) const; + const matrix_type& R_bot, // ncols_Q + const vector_type& tau_view, + const matrix_type& C_top, // ncols_C + const matrix_type& C_bot, + const vector_type& work_view) const; void apply_inner (const ApplyType& applyType, - const Kokkos::View& A, - const Kokkos::View& tau, - const Kokkos::View& C_top, - const Kokkos::View& C_bot, - const Kokkos::View& work) const; + const matrix_type& A, + const vector_type& tau, + const matrix_type& C_top, + const matrix_type& C_bot, + const vector_type& work) const; public: - CombineNative () = default; + ~CombineNative () override = default; - static bool QR_produces_R_factor_with_nonnegative_diagonal() { - return 
combine_default_type::QR_produces_R_factor_with_nonnegative_diagonal(); + bool + QR_produces_R_factor_with_nonnegative_diagonal () const override + { + return default_. + QR_produces_R_factor_with_nonnegative_diagonal (); + } + + ordinal_type + work_size (const ordinal_type /* num_rows_Q */, + const ordinal_type num_cols_Q, + const ordinal_type num_cols_C) const override + { + return num_cols_Q < num_cols_C ? num_cols_C : num_cols_Q; } void - factor_first (const MatView& A, + factor_first (const MatView& A, Scalar tau[], - Scalar work[]) const + Scalar work[], + const ordinal_type lwork) override { - return default_.factor_first (A, tau, work); + return default_.factor_first (A, tau, work, lwork); } void apply_first (const ApplyType& applyType, - const MatView& A, + const MatView& A, const Scalar tau[], - const MatView& C, - Scalar work[]) + const MatView& C, + Scalar work[], + const ordinal_type lwork) override { - return default_.apply_first (applyType, A, tau, C, work); + return default_.apply_first (applyType, A, tau, C, work, lwork); } void - factor_inner (const MatView& R, - const MatView& A, + factor_inner (const MatView& R, + const MatView& A, Scalar tau[], - Scalar work[]) const; - + Scalar work[], + const ordinal_type lwork) override; void apply_inner (const ApplyType& applyType, - const Ordinal m, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const Scalar A[], - const Ordinal lda, + const MatView& A, const Scalar tau[], - Scalar C_top[], - const Ordinal ldc_top, - Scalar C_bot[], - const Ordinal ldc_bot, - Scalar work[]) const; + const MatView& C_top, + const MatView& C_bot, + Scalar work[], + const ordinal_type lwork) override; + void - factor_pair (const MatView& R_top, - const MatView& R_bot, + factor_pair (const MatView& R_top, + const MatView& R_bot, Scalar tau[], - Scalar work[]) const; - + Scalar work[], + const ordinal_type lwork) override; void apply_pair (const ApplyType& applyType, - const Ordinal ncols_C, - const Ordinal ncols_Q, - 
const scalar_type R_bot[], - const Ordinal ldr_bot, - const scalar_type tau[], - scalar_type C_top[], - const Ordinal ldc_top, - scalar_type C_bot[], - const Ordinal ldc_bot, - scalar_type work[]) const; + const MatView& R_bot, + const Scalar tau[], + const MatView& C_top, + const MatView& C_bot, + Scalar work[], + const ordinal_type lwork) override; private: - mutable combine_default_type default_; + CombineDefault default_; }; - - /// "Forward declaration" for the complex-arithmetic case. - /// - template< class Ordinal, class Scalar > - class CombineNative< Ordinal, Scalar, true > - { + //! Specialization of CombineNative for complex Scalar. + template + class CombineNative : + public Combine { public: - typedef Scalar scalar_type; - typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type; - typedef Ordinal ordinal_type; + using ordinal_type = Ordinal; + using scalar_type = Scalar; private: - typedef CombineDefault combine_default_type; + using mag_type = + typename Teuchos::ScalarTraits::magnitudeType; public: - static bool QR_produces_R_factor_with_nonnegative_diagonal() { - return combine_default_type::QR_produces_R_factor_with_nonnegative_diagonal(); + ~CombineNative () override = default; + + bool + QR_produces_R_factor_with_nonnegative_diagonal () const override + { + return default_. + QR_produces_R_factor_with_nonnegative_diagonal (); + } + + ordinal_type + work_size (const ordinal_type /* num_rows_Q */, + const ordinal_type num_cols_Q, + const ordinal_type num_cols_C) const override + { + return num_cols_Q < num_cols_C ? 
num_cols_C : num_cols_Q; } void - factor_first (const MatView& A, + factor_first (const MatView& A, Scalar tau[], - Scalar work[]) const + Scalar work[], + const ordinal_type lwork) override { - return default_.factor_first (A, tau, work); + return default_.factor_first (A, tau, work, lwork); } void apply_first (const ApplyType& applyType, - const MatView& A, + const MatView& A, const Scalar tau[], - const MatView& C, - Scalar work[]) + const MatView& C, + Scalar work[], + const ordinal_type lwork) override { - return default_.apply_first (applyType, A, tau, C, work); + return default_.apply_first (applyType, A, tau, C, work, lwork); } void - apply_inner (const ApplyType& applyType, - const Ordinal m, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const Scalar A[], - const Ordinal lda, - const Scalar tau[], - Scalar C_top[], - const Ordinal ldc_top, - Scalar C_bot[], - const Ordinal ldc_bot, - Scalar work[]) const + factor_inner (const MatView& R, + const MatView& A, + Scalar tau[], + Scalar work[], + const ordinal_type lwork) override { - return default_.apply_inner (applyType, m, ncols_C, ncols_Q, - A, lda, tau, - C_top, ldc_top, C_bot, ldc_bot, - work); + return default_.factor_inner (R, A, tau, work, lwork); } void - factor_inner (const MatView& R, - const MatView& A, - Scalar tau[], - Scalar work[]) const + apply_inner (const ApplyType& applyType, + const MatView& A, + const Scalar tau[], + const MatView& C_top, + const MatView& C_bot, + Scalar work[], + const ordinal_type lwork) override { - return default_.factor_inner (R, A, tau, work); + return default_.apply_inner (applyType, A, tau, + C_top, C_bot, work, lwork); } void - factor_pair (const MatView& R_top, - const MatView& R_bot, + factor_pair (const MatView& R_top, + const MatView& R_bot, Scalar tau[], - Scalar work[]) const + Scalar work[], + const ordinal_type lwork) override { - return default_.factor_pair (R_top, R_bot, tau, work); + return default_.factor_pair (R_top, R_bot, tau, work, lwork); 
} void apply_pair (const ApplyType& applyType, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const Scalar R_bot[], - const Ordinal ldr_bot, + const MatView& R_bot, const Scalar tau[], - Scalar C_top[], - const Ordinal ldc_top, - Scalar C_bot[], - const Ordinal ldc_bot, - Scalar work[]) const + const MatView& C_top, + const MatView& C_bot, + Scalar work[], + const ordinal_type lwork) override { - return default_.apply_pair (applyType, ncols_C, ncols_Q, - R_bot, ldr_bot, tau, - C_top, ldc_top, C_bot, ldc_bot, - work); + return default_.apply_pair (applyType, R_bot, tau, + C_top, C_bot, work, lwork); } private: - mutable combine_default_type default_; + CombineDefault default_; }; - - template< class Ordinal, class Scalar > + template void - CombineNative< Ordinal, Scalar, false >:: - GER (const magnitude_type alpha, - const Kokkos::View& x, - const Kokkos::View& y, - const Kokkos::View& A) const + CombineNative:: + GER (const mag_type alpha, + const vector_type& x, + const vector_type& y, + const matrix_type& A) const { constexpr scalar_type ZERO {0.0}; - const Ordinal m = A.extent (0); - const Ordinal n = A.extent (1); + const ordinal_type m = A.extent (0); + const ordinal_type n = A.extent (1); - constexpr Ordinal incy {1}; - //Ordinal jy = (incy > 0) ? 1 : 1 - (n-1) * incy; - Ordinal jy = 1; + constexpr ordinal_type incy {1}; + //ordinal_type jy = (incy > 0) ? 
1 : 1 - (n-1) * incy; + ordinal_type jy = 1; - for (Ordinal j = 0; j < n; ++j) { + for (ordinal_type j = 0; j < n; ++j) { if (y[jy-1] != ZERO) { const scalar_type temp = alpha * y[jy-1]; - for (Ordinal i = 0; i < m; ++i) { + for (ordinal_type i = 0; i < m; ++i) { A(i,j) = A(i,j) + x[i] * temp; } } @@ -447,23 +440,22 @@ namespace TSQR { } } - - template< class Ordinal, class Scalar > + template void - CombineNative< Ordinal, Scalar, false >:: + CombineNative:: GEMV (const char trans[], const scalar_type alpha, - const Kokkos::View& A, - const Kokkos::View& x, + const matrix_type& A, + const vector_type& x, const scalar_type beta, - const Kokkos::View& y) const + const vector_type& y) const { - using y_vec_type = Kokkos::View; - using x_vec_type = Kokkos::View; - using range_type = std::pair; + using y_vec_type = vector_type; + using x_vec_type = vector_type; + using range_type = std::pair; - const Ordinal m = A.extent (0); - const Ordinal n = A.extent (1); + const ordinal_type m = A.extent (0); + const ordinal_type n = A.extent (1); const bool no_trans = (trans[0] == 'N' || trans[0] == 'n'); x_vec_type x_view = Kokkos::subview (x, range_type (0, no_trans ? 
n : m)); @@ -472,36 +464,36 @@ namespace TSQR { KokkosBlas::gemv (trans, alpha, A, x_view, beta, y_view); } - template< class Ordinal, class Scalar > + template void - CombineNative< Ordinal, Scalar, false >:: - factor_inner (const Kokkos::View& R_view, - const Kokkos::View& A_view, - const Kokkos::View& tau_view, - const Kokkos::View& work_view) const + CombineNative:: + factor_inner (const matrix_type& R_view, + const matrix_type& A_view, + const vector_type& tau_view, + const vector_type& work_view) const { using Kokkos::ALL; using Kokkos::subview; - using range_type = std::pair; + using range_type = std::pair; constexpr scalar_type ZERO {0.0}; constexpr scalar_type ONE {1.0}; + const ordinal_type m = A_view.extent (0); + const ordinal_type n = A_view.extent (1); - const Ordinal m = A_view.extent (0); - const Ordinal n = A_view.extent (1); - - for (Ordinal k = 0; k < n; ++k) { + for (ordinal_type k = 0; k < n; ++k) { work_view(k) = ZERO; } - for (Ordinal k = 0; k < n-1; ++k) { + for (ordinal_type k = 0; k < n-1; ++k) { Scalar& R_kk = R_view(k, k); auto A_1k = subview (A_view, ALL (), k); - auto A_1kp1 = subview (A_view, range_type (0, m), range_type (k+1, n)); + auto A_1kp1 = + subview (A_view, range_type (0, m), range_type (k+1, n)); this->LARFG (m + 1, R_kk, A_1k, tau_view[k]); this->GEMV ("T", ONE, A_1kp1, A_1k, ZERO, work_view); - for (Ordinal j = k+1; j < n; ++j) { + for (ordinal_type j = k+1; j < n; ++j) { Scalar& R_kj = R_view(k, j); work_view(j-k-1) += R_kj; @@ -515,58 +507,60 @@ namespace TSQR { this->LARFG (m+1, R_nn, A_1n, tau_view[n-1]); } - - template< class Ordinal, class Scalar > + template void - CombineNative< Ordinal, Scalar, false >:: - factor_inner (const MatView& R, - const MatView& A, + CombineNative:: + factor_inner (const MatView& R, + const MatView& A, Scalar tau[], - Scalar work[]) const + Scalar work[], + const ordinal_type lwork) { using Kokkos::ALL; using Kokkos::subview; - using mat_type = - Kokkos::View; - using nonconst_vec_type = 
- Kokkos::View; - using range_type = std::pair; - - mat_type A_full (A.data(), A.stride(1), A.extent(1)); - mat_type A_view = subview (A_full, range_type (0, A.extent(0)), ALL ()); - mat_type R_full (R.data(), R.stride(1), R.extent(1)); - mat_type R_view = subview (R_full, range_type (0, R.extent(1)), ALL ()); - nonconst_vec_type tau_view (tau, R.extent(1)); - nonconst_vec_type work_view (work, R.extent(1)); + using mat_type = matrix_type; + using nonconst_vec_type = vector_type; + using range = std::pair; + + const ordinal_type numRows (A.extent (0)); + const ordinal_type A_numCols (A.extent (1)); + const ordinal_type lda (A.stride (1)); + const ordinal_type R_numCols (R.extent (1)); + + mat_type A_full (A.data (), lda, A_numCols); + mat_type A_view = subview (A_full, range (0, numRows), ALL ()); + mat_type R_full (R.data (), R.stride (1), R_numCols); + mat_type R_view = subview (R_full, range (0, R_numCols), ALL ()); + nonconst_vec_type tau_view (tau, R_numCols); + nonconst_vec_type work_view (work, lwork); this->factor_inner (R_view, A_view, tau_view, work_view); } - template< class Ordinal, class Scalar > + template void - CombineNative< Ordinal, Scalar, false >:: + CombineNative:: apply_inner (const ApplyType& applyType, - const Kokkos::View& A, - const Kokkos::View& tau, - const Kokkos::View& C_top, - const Kokkos::View& C_bot, - const Kokkos::View& work) const + const matrix_type& A, + const vector_type& tau, + const matrix_type& C_top, + const matrix_type& C_bot, + const vector_type& work) const { using Kokkos::ALL; using Kokkos::subview; - using const_vec_type = - Kokkos::View; + using const_vec_type = vector_type; constexpr scalar_type ZERO {0.0}; - const Ordinal m = A.extent (0); - const Ordinal ncols_Q = A.extent (1); - const Ordinal ncols_C = C_top.extent (1); + const ordinal_type m = A.extent (0); + const ordinal_type ncols_Q = A.extent (1); + const ordinal_type ncols_C = C_top.extent (1); - for (Ordinal i = 0; i < ncols_C; ++i) { + for (ordinal_type 
i = 0; i < ncols_C; ++i) { work(i) = ZERO; } - Ordinal j_start, j_end, j_step; + ordinal_type j_start, j_end, j_step; if (applyType == ApplyType::NoTranspose) { j_start = ncols_Q - 1; j_end = -1; // exclusive @@ -577,18 +571,18 @@ namespace TSQR { j_end = ncols_Q; // exclusive j_step = +1; } - for (Ordinal j = j_start; j != j_end; j += j_step) { + for (ordinal_type j = j_start; j != j_end; j += j_step) { const_vec_type A_1j = subview (A, ALL (), j); //blas.GEMV ("T", m, ncols_C, ONE, C_bot, ldc_bot, A_1j, 1, ZERO, &y[0], 1); - for (Ordinal i = 0; i < ncols_C; ++i) { + for (ordinal_type i = 0; i < ncols_C; ++i) { work(i) = ZERO; - for (Ordinal k = 0; k < m; ++k) { + for (ordinal_type k = 0; k < m; ++k) { work(i) += A_1j(k) * C_bot(k, i); } work(i) += C_top(j, i); } - for (Ordinal k = 0; k < ncols_C; ++k) { + for (ordinal_type k = 0; k < ncols_C; ++k) { C_top(j, k) -= tau[j] * work(k); } @@ -596,70 +590,69 @@ namespace TSQR { } } - template< class Ordinal, class Scalar > + template void - CombineNative< Ordinal, Scalar, false >:: + CombineNative:: apply_inner (const ApplyType& applyType, - const Ordinal m, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const Scalar A[], - const Ordinal lda, + const MatView& A, const Scalar tau[], - Scalar C_top[], - const Ordinal ldc_top, - Scalar C_bot[], - const Ordinal ldc_bot, - Scalar work[]) const + const MatView& C_top, + const MatView& C_bot, + Scalar work[], + const ordinal_type lwork) { using Kokkos::ALL; using Kokkos::subview; - using const_mat_type = - Kokkos::View; - using nonconst_mat_type = - Kokkos::View; - using const_vec_type = - Kokkos::View; - using nonconst_vec_type = - Kokkos::View; - using range_type = std::pair; - - const_mat_type A_full (A, lda, ncols_Q); + using const_mat_type = matrix_type; + using nonconst_mat_type = matrix_type; + using const_vec_type = vector_type; + using nonconst_vec_type = vector_type; + using range_type = std::pair; + + const ordinal_type m = A.extent (0); + const ordinal_type 
ncols_Q = A.extent (1); + const ordinal_type ncols_C = C_top.extent (1); + + const_mat_type A_full (A.data (), A.stride (1), ncols_Q); auto A_view = subview (A_full, range_type (0, m), ALL ()); - nonconst_mat_type C_top_full (C_top, ldc_top, ncols_C); + nonconst_mat_type C_top_full + (C_top.data (), C_top.stride (1), ncols_C); auto C_top_view = subview (C_top_full, range_type (0, m), ALL ()); - nonconst_mat_type C_bot_full (C_bot, ldc_bot, ncols_C); + nonconst_mat_type C_bot_full + (C_bot.data (), C_bot.stride (1), ncols_C); auto C_bot_view = subview (C_bot_full, range_type (0, m), ALL ()); const_vec_type tau_view (tau, ncols_Q); - nonconst_vec_type work_view (work, ncols_C); + nonconst_vec_type work_view (work, lwork); - this->apply_inner (applyType, A_view, tau_view, C_top_view, C_bot_view, work_view); + this->apply_inner (applyType, A_view, tau_view, C_top_view, + C_bot_view, work_view); } - template< class Ordinal, class Scalar > + template void - CombineNative< Ordinal, Scalar, false >:: - factor_pair (const Kokkos::View& R_top, - const Kokkos::View& R_bot, - const Kokkos::View& tau_view, - const Kokkos::View& work_view) const + CombineNative:: + factor_pair (const matrix_type& R_top, + const matrix_type& R_bot, + const vector_type& tau_view, + const vector_type& work_view) const { using Kokkos::ALL; using Kokkos::subview; - using range_type = std::pair; + using range_type = std::pair; constexpr scalar_type ZERO {0.0}; constexpr scalar_type ONE {1.0}; - const Ordinal n = R_top.extent (0); - for (Ordinal k = 0; k < n; ++k) { + const ordinal_type n = R_top.extent (0); + for (ordinal_type k = 0; k < n; ++k) { work_view(k) = ZERO; } - for (Ordinal k = 0; k < n-1; ++k) { + for (ordinal_type k = 0; k < n-1; ++k) { scalar_type& R_top_kk = R_top(k, k); auto R_bot_1k = subview (R_bot, ALL (), k); - auto R_bot_1kp1 = subview (R_bot, range_type (0, k+1), range_type (k+1, n)); + auto R_bot_1kp1 = + subview (R_bot, range_type (0, k+1), range_type (k+1, n)); // k+2: 1 
element in R_top (R_top(k,k)), and k+1 elements in // R_bot (R_bot(1:k,k), in 1-based indexing notation). @@ -669,7 +662,7 @@ namespace TSQR { this->GEMV ("T", ONE, R_bot_1kp1, R_bot_1k, ZERO, work_view); - for (Ordinal j = k+1; j < n; ++j) { + for (ordinal_type j = k+1; j < n; ++j) { scalar_type& R_top_kj = R_top(k, j); work_view(j-k-1) += R_top_kj; R_top_kj -= tau_view[k] * work_view(j-k-1); @@ -685,106 +678,113 @@ namespace TSQR { } - template< class Ordinal, class Scalar > + template void CombineNative:: - factor_pair (const MatView& R_top, - const MatView& R_bot, + factor_pair (const MatView& R_top, + const MatView& R_bot, Scalar tau[], - Scalar work[]) const + Scalar work[], + const ordinal_type lwork) { using Kokkos::ALL; using Kokkos::subview; - using range_type = std::pair; + using range_type = std::pair; - const Ordinal numCols = R_top.extent (1); - Kokkos::View R_top_full + const ordinal_type numCols = R_top.extent (1); + matrix_type R_top_full (R_top.data(), R_top.stride (1), numCols); - Kokkos::View R_bot_full + matrix_type R_bot_full (R_bot.data(), R_bot.stride (1), R_bot.extent (1)); - Kokkos::View tau_view - (tau, numCols); - Kokkos::View work_view - (work, numCols); + vector_type tau_view (tau, numCols); + vector_type work_view (work, lwork); if (R_top.stride(1) == numCols) { if (R_bot.stride(1) == numCols) { - this->factor_pair (R_top_full, R_bot_full, tau_view, work_view); + this->factor_pair (R_top_full, R_bot_full, tau_view, + work_view); } else { - auto R_bot_view = subview (R_bot_full, range_type (0, numCols), ALL ()); - this->factor_pair (R_top_full, R_bot_view, tau_view, work_view); + auto R_bot_view = + subview (R_bot_full, range_type (0, numCols), ALL ()); + this->factor_pair (R_top_full, R_bot_view, tau_view, + work_view); } } else { - auto R_top_view = subview (R_top_full, range_type (0, numCols), ALL ()); + auto R_top_view = + subview (R_top_full, range_type (0, numCols), ALL ()); if (R_bot.stride(1) == numCols) { - this->factor_pair 
(R_top_view, R_bot_full, tau_view, work_view); + this->factor_pair (R_top_view, R_bot_full, tau_view, + work_view); } else { - auto R_bot_view = subview (R_bot_full, range_type (0, numCols), ALL ()); - this->factor_pair (R_top_view, R_bot_view, tau_view, work_view); + auto R_bot_view = + subview (R_bot_full, range_type (0, numCols), ALL ()); + this->factor_pair (R_top_view, R_bot_view, tau_view, + work_view); } } } - - template< class Ordinal, class Scalar > + template void - CombineNative< Ordinal, Scalar, false >:: + CombineNative:: apply_pair (const ApplyType& applyType, - const Ordinal ncols_C, - const Ordinal ncols_Q, - const scalar_type R_bot[], - const Ordinal ldr_bot, - const scalar_type tau[], - scalar_type C_top[], - const Ordinal ldc_top, - scalar_type C_bot[], - const Ordinal ldc_bot, - scalar_type work[]) const + const MatView& R_bot, + const Scalar tau[], + const MatView& C_top, + const MatView& C_bot, + Scalar work[], + const ordinal_type lwork) { using Kokkos::ALL; using Kokkos::subview; - using range_type = std::pair; - using const_mat_type = - Kokkos::View; - using nonconst_mat_type = - Kokkos::View; - using const_vec_type = - Kokkos::View; - using nonconst_vec_type = - Kokkos::View; - - const_mat_type R_bot_full (R_bot, ldr_bot, ncols_Q); - nonconst_mat_type C_top_full (C_top, ldc_top, ncols_C); - nonconst_mat_type C_bot_full (C_bot, ldc_bot, ncols_C); + using range_type = std::pair; + using const_mat_type = matrix_type; + using nonconst_mat_type = matrix_type; + using const_vec_type = vector_type; + using nonconst_vec_type = vector_type; + + const ordinal_type ncols_Q = R_bot.extent (1); + const ordinal_type ncols_C = C_top.extent (1); + const_mat_type R_bot_full + (R_bot.data (), R_bot.stride (1), ncols_Q); + nonconst_mat_type C_top_full + (C_top.data (), C_top.stride (1), ncols_C); + nonconst_mat_type C_bot_full + (C_bot.data (), C_bot.stride (1), ncols_C); const_vec_type tau_view (tau, ncols_Q); - nonconst_vec_type work_view (work, ncols_C); 
- - auto R_bot_view = subview (R_bot_full, range_type (0, ncols_Q), ALL ()); - auto C_top_view = subview (C_top_full, range_type (0, ncols_C), ALL ()); - auto C_bot_view = subview (C_bot_full, range_type (0, ncols_C), ALL ()); - this->apply_pair (applyType, R_bot_view, tau_view, C_top_view, C_bot_view, work_view); + nonconst_vec_type work_view (work, lwork); + + auto R_bot_view = + subview (R_bot_full, range_type (0, ncols_Q), ALL ()); + auto C_top_view = + subview (C_top_full, range_type (0, ncols_C), ALL ()); + auto C_bot_view = + subview (C_bot_full, range_type (0, ncols_C), ALL ()); + this->apply_pair (applyType, R_bot_view, tau_view, + C_top_view, C_bot_view, work_view); } - template< class Ordinal, class Scalar > + template void - CombineNative< Ordinal, Scalar, false >:: + CombineNative:: apply_pair (const ApplyType& applyType, - const Kokkos::View& R_bot, // ncols_Q - const Kokkos::View& tau_view, - const Kokkos::View& C_top, // ncols_C - const Kokkos::View& C_bot, - const Kokkos::View& work_view) const + const matrix_type& R_bot, // ncols_Q + const vector_type& tau_view, + const matrix_type& C_top, // ncols_C + const matrix_type& C_bot, + const vector_type& work_view) const { - using const_vec_type = - Kokkos::View; + using Kokkos::ALL; + using Kokkos::subview; + using const_vec_type = vector_type; constexpr scalar_type ZERO {0.0}; - const Ordinal ncols_C = C_top.extent (1); - const Ordinal ncols_Q = R_bot.extent (1); + const ordinal_type ncols_C = C_top.extent (1); + const ordinal_type ncols_Q = R_bot.extent (1); - Ordinal j_start, j_end, j_step; + ordinal_type j_start, j_end, j_step; if (applyType == ApplyType::NoTranspose) { j_start = ncols_Q - 1; j_end = -1; // exclusive @@ -795,29 +795,29 @@ namespace TSQR { j_end = ncols_Q; // exclusive j_step = +1; } - for (Ordinal j_Q = j_start; j_Q != j_end; j_Q += j_step) { + for (ordinal_type j_Q = j_start; j_Q != j_end; j_Q += j_step) { // Using Householder reflector stored in column j_Q of R_bot - 
const_vec_type R_bot_col = Kokkos::subview (R_bot, Kokkos::ALL (), j_Q); + const_vec_type R_bot_col = subview (R_bot, ALL (), j_Q); // In 1-based indexing notation, with k in 1, 2, ..., ncols_C // (inclusive): (Output is length ncols_C row vector) // // work(1:j) := R_bot(1:j,j)' * C_bot(1:j, 1:ncols_C) - C_top(j, 1:ncols_C) - for (Ordinal j_C = 0; j_C < ncols_C; ++j_C) { + for (ordinal_type j_C = 0; j_C < ncols_C; ++j_C) { // For each column j_C of [C_top; C_bot], update row j_Q // of C_top and rows 1:j_Q of C_bot. (Again, this is in // 1-based indexing notation. scalar_type work_j_C = ZERO; - const_vec_type C_bot_col = Kokkos::subview (C_bot, Kokkos::ALL (), j_C); + const_vec_type C_bot_col = subview (C_bot, ALL (), j_C); - for (Ordinal k = 0; k <= j_Q; ++k) + for (ordinal_type k = 0; k <= j_Q; ++k) { work_j_C += R_bot_col(k) * C_bot_col(k); - + } work_j_C += C_top(j_Q, j_C); work_view(j_C) = work_j_C; } - for (Ordinal j_C = 0; j_C < ncols_C; ++j_C) { + for (ordinal_type j_C = 0; j_C < ncols_C; ++j_C) { C_top(j_Q, j_C) -= tau_view[j_Q] * work_view(j_C); } this->GER (-tau_view[j_Q], R_bot_col, work_view, C_bot); @@ -825,6 +825,4 @@ namespace TSQR { } } // namespace TSQR - - -#endif // __TSQR_CombineNative_hpp +#endif // TSQR_COMBINENATIVE_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineNodeTsqr.hpp new file mode 100644 index 000000000000..28b41c7bf640 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_CombineNodeTsqr.hpp @@ -0,0 +1,334 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos: Node API and Parallel Node Kernels +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// ************************************************************************ +//@HEADER + +/// \file Tsqr_CombineNodeTsqr.hpp +/// \brief Declaration and definition of an implementation of NodeTsqr +/// (intranode TSQR) that just uses Combine for all the operations +/// on an MPI process. 
+ +#ifndef TSQR_COMBINENODETSQR_HPP +#define TSQR_COMBINENODETSQR_HPP + +#include "Tsqr_NodeTsqr.hpp" +#include "Tsqr_Impl_CombineUser.hpp" +#include "Tsqr_Impl_SystemBlas.hpp" +#include "Teuchos_TypeNameTraits.hpp" +#include + +namespace TSQR { + namespace Impl { + template + using span = Kokkos::View>; + + template + class CombineNodeFactorOutput : + public NodeFactorOutput { + public: + CombineNodeFactorOutput (std::vector&& tau) : + tau_ (tau) + {} + ~CombineNodeFactorOutput () override = default; + span tau () const { + return span (tau_.data (), tau_.size ()); + } + private: + std::vector tau_; + }; + } // namespace Impl + + /// \class CombineNodeTsqr + /// \brief Implementation of NodeTsqr (intranode TSQR) that just + /// uses Combine for all the operations on an MPI process. + template + class CombineNodeTsqr : + public NodeTsqr, + private Impl::CombineUser { + private: + using base_type = NodeTsqr; + using my_factor_output_type = + Impl::CombineNodeFactorOutput; + + public: + using ordinal_type = typename base_type::ordinal_type; + using scalar_type = typename base_type::scalar_type; + using mat_view_type = typename base_type::mat_view_type; + using const_mat_view_type = + typename base_type::const_mat_view_type; + using magnitude_type = typename base_type::magnitude_type; + using factor_output_type = typename base_type::factor_output_type; + + ~CombineNodeTsqr () override = default; + + Teuchos::RCP + getValidParameters () const override { + return Teuchos::parameterList ("CombineNodeTsqr"); + } + + void + setParameterList (const Teuchos::RCP&) override + {} + + bool ready() const override { + return true; + } + + size_t cache_size_hint() const override { + return size_t (0); + } + + std::string description () const override { + using Teuchos::TypeNameTraits; + std::ostringstream os; + os << "CombineNodeTsqr::name() << ", Scalar=" + << TypeNameTraits::name() << ">: Intranode " + "Intraprocess TSQR based on TSQR::Combine"; + return os.str(); + } + + 
private: + void + factorImpl (const mat_view_type& R, + const mat_view_type& A, + std::vector& tau) const + { + const ordinal_type ncols = A.extent (1); + TEUCHOS_ASSERT( R.extent (0) == ncols && + R.extent (1) == ncols ); + auto& combine = this->getCombine (ncols); + const ordinal_type lwork = + combine.work_size (A.extent (0), ncols, ncols); + std::vector work (lwork); + combine.factor_first (A, tau.data (), work.data (), lwork); + + // Copy the R factor resulting from the factorization out of the + // topmost block of A) into the R output argument. + deep_copy (R, Scalar {}); + copy_upper_triangle (R, A); + } + + public: + Teuchos::RCP + factor (const ordinal_type nrows, + const ordinal_type ncols, + Scalar A[], + const ordinal_type lda, + Scalar R[], + const ordinal_type ldr, + const bool /* contiguousCacheBlocks */) const override + { + // The "contiguous cache blocks" option does nothing here, since + // we just defer to an internal library that expects + // column-major matrices. 
+ mat_view_type A_view (nrows, ncols, A, lda); + mat_view_type R_view (ncols, ncols, R, ldr); + std::vector tau (ncols); + factorImpl (R_view, A_view, tau); + using Teuchos::rcp; + return rcp (new my_factor_output_type (std::move (tau))); + } + + void + apply (const ApplyType& applyType, + const ordinal_type nrows, + const ordinal_type ncols_Q, + const Scalar Q[], + const ordinal_type ldq, + const factor_output_type& factorOutput, + const ordinal_type ncols_C, + Scalar C[], + const ordinal_type ldc, + const bool /* contiguousCacheBlocks */) const override + { + const char prefix[] = "TSQR::CombineNodeTsqr::apply: "; + + // Quick exit and error tests + if (ncols_Q == 0 || ncols_C == 0 || nrows == 0) { + return; + } + else if (ldc < nrows) { + std::ostringstream os; + os << prefix << "ldc (= " << ldc << ") < nrows (= " + << nrows << ")"; + throw std::invalid_argument (os.str()); + } + else if (ldq < nrows) { + std::ostringstream os; + os << prefix << "ldq (= " << ldq << ") < nrows (= " + << nrows << ")"; + throw std::invalid_argument (os.str()); + } + + const my_factor_output_type& output = [&] () { + const my_factor_output_type* output_ptr = + dynamic_cast (&factorOutput); + if (output_ptr == nullptr) { + using Teuchos::demangleName; + using Teuchos::TypeNameTraits; + using Teuchos::typeName; + std::ostringstream os; + os << prefix << "Input factor_output_type object was not " + "created by the same type of NodeTsqr object as this " + "one. 
This object has type " << typeName (*this) << + " and its subclass of factor_output_type has type " << + TypeNameTraits::name () << ", but " + "the input factor_output_type object has dynamic type " + << demangleName (typeid (factorOutput).name ()); + throw std::invalid_argument (os.str ()); + } + return *output_ptr; + } (); + + auto& combine = this->getCombine (std::max (ncols_Q, ncols_C)); + const ordinal_type lwork = + combine.work_size (nrows, ncols_C, ncols_C); + std::vector work (lwork); + + const_mat_view_type Q_view (nrows, ncols_Q, Q, ldq); + mat_view_type C_view (nrows, ncols_C, C, ldc); + const auto tau = output.tau (); + combine.apply_first (applyType, Q_view, tau.data (), + C_view, work.data (), lwork); + } + + void + explicit_Q (const ordinal_type nrows, + const ordinal_type ncols_Q, + const Scalar Q[], + const ordinal_type ldq, + const factor_output_type& factorOutput, + const ordinal_type ncols_C, + Scalar C[], + const ordinal_type ldc, + const bool contiguousCacheBlocks) const override + { + mat_view_type C_view (nrows, ncols_C, C, ldc); + deep_copy (C_view, Scalar {}); + this->set_diagonal_entries_to_one (C_view); + // Apply the Q factor to C, to extract the first ncols_C columns + // of Q in explicit form. 
+ apply (ApplyType::NoTranspose, + nrows, ncols_Q, Q, ldq, factorOutput, + ncols_C, C, ldc, contiguousCacheBlocks); + } + + void + cache_block (const ordinal_type /* nrows */, + const ordinal_type /* ncols */, + Scalar /* A_out */ [], + const Scalar /* A_in */ [], + const ordinal_type /* lda_in */) const override + {} + + void + un_cache_block (const ordinal_type /* nrows */, + const ordinal_type /* ncols */, + Scalar /* A_out */ [], + const ordinal_type /* lda_out */, + const Scalar /* A_in */ []) const override + {} + + void + Q_times_B (const ordinal_type nrows, + const ordinal_type ncols, + Scalar Q[], + const ordinal_type ldq, + const Scalar B[], + const ordinal_type ldb, + const bool /* contiguousCacheBlocks */) const override + { + using Teuchos::NO_TRANS; + + // We don't do any other error checking here (e.g., matrix + // dimensions), though it would be a good idea to do so. + + // Take the easy exit if available. + if (ncols == 0 || nrows == 0) { + return; + } + + Impl::SystemBlas blas; + mat_view_type Q_view (nrows, ncols, Q, ldq); + // GEMM doesn't like its input and output arguments to alias + // each other, so we use a (deep) copy. + Matrix Q_copy (Q_view); + + // Q_view := Q_copy * B. + blas.GEMM (NO_TRANS, NO_TRANS, + nrows, ncols, ncols, + Scalar (1.0), Q_copy.data (), Q_copy.stride (1), + B, ldb, + Scalar {}, Q_view.data (), Q_view.stride (1)); + } + + void + fill_with_zeros (const ordinal_type nrows, + const ordinal_type ncols, + Scalar A[], + const ordinal_type lda, + const bool /* contiguousCacheBlocks */) const override + { + mat_view_type A_view (nrows, ncols, A, lda); + deep_copy (A_view, Scalar {}); + } + + protected: + const_mat_view_type + const_top_block (const const_mat_view_type& C, + const bool /* contiguousCacheBlocks */) const override + { + return C; // For this class, "cache blocking" does nothing. 
+ } + + public: + bool + QR_produces_R_factor_with_nonnegative_diagonal () const override + { + // FIXME (19 Dec 2019) If the combine type is dynamic, we can't + // answer this question without knowing the number of columns. + // Just guess for now. + constexpr ordinal_type fakeNumCols = 10; + auto& c = this->getCombine (fakeNumCols); + return c.QR_produces_R_factor_with_nonnegative_diagonal (); + } + }; +} // namespace TSQR + +#endif // TSQR_COMBINENODETSQR_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp index 341b22ae9d32..790160667e58 100644 --- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp @@ -42,7 +42,7 @@ #include "Tsqr_Random_NormalGenerator.hpp" #include "Tsqr_Random_MatrixGenerator.hpp" -#include "Tsqr_Combine.hpp" +#include "Tsqr_CombineFactory.hpp" #include "Tsqr_LocalVerify.hpp" #include "Tsqr_Matrix.hpp" #include "Tsqr_Util.hpp" @@ -59,36 +59,50 @@ namespace TSQR { namespace Test { + template + void + fill_with_identity_columns (const MatView& A) + { + deep_copy (A, Scalar {}); + const Ordinal numCols = A.extent (1); + // FIXME (mfh 08 Dec 2019) Eventually stop writing to Matrix or + // MatView entries on host, for eventual GPU-ization. + for (Ordinal j = 0; j < numCols; ++j) { + A(j,j) = Scalar (1.0); + } + } + template - static void + void generateSingularValues (NormalGenType& magGen, std::vector& sigma, const Ordinal numValues) { - typedef MagnitudeType magnitude_type; - const magnitude_type machEps = - std::numeric_limits::epsilon(); + using mag_type = MagnitudeType; + const mag_type machEps = + std::numeric_limits::epsilon(); sigma.resize (numValues); // Relative amount by which to perturb each singular value. The // perturbation will be multiplied by a normal(0,1) pseudorandom // number drawn from magGen. 
- const magnitude_type perturbationFactor = magnitude_type(10) * machEps; - - sigma[0] = magnitude_type (1); - for (Ordinal k = 1; k < numValues; ++k) - { - const magnitude_type perturbation = perturbationFactor * magGen(); - const magnitude_type beforePerturb = sigma[k-1] / magnitude_type(2); - const magnitude_type candidate = beforePerturb + perturbation; - - // If adding the perturbation to beforePerturb would result - // in a nonpositive number, subtract instead. - if (candidate <= magnitude_type(0)) - sigma[k] = beforePerturb - perturbation; - else - sigma[k] = candidate; + const mag_type perturbationFactor = mag_type(10) * machEps; + + sigma[0] = mag_type (1); + for (Ordinal k = 1; k < numValues; ++k) { + const mag_type perturbation = perturbationFactor * magGen(); + const mag_type beforePerturb = sigma[k-1] / mag_type(2); + const mag_type candidate = beforePerturb + perturbation; + + // If adding the perturbation to beforePerturb would result + // in a nonpositive number, subtract instead. 
+ if (candidate <= mag_type {}) { + sigma[k] = beforePerturb - perturbation; + } + else { + sigma[k] = candidate; } + } } static void @@ -98,41 +112,42 @@ namespace TSQR { using std::endl; const char prefix[] = "%"; - cout << prefix - << "method" - << ",kernel" + cout << prefix << "kernel" + << ",combiner" << ",scalarType" << ",numRows" << ",numCols" + << ",frobA" << ",absFrobResid" << ",absFrobOrthog" - << ",frobA" << endl; } template static void - printR1R2results (const std::string& datatype, + printR1R2results (const std::string& combinerName, + const std::string& scalarName, const int numCols, const std::vector& results) { using std::cout; using std::endl; - cout << "Combine" - << "," << "R1R2" - << "," << datatype + cout << "R1R2" + << "," << combinerName + << "," << scalarName << "," << (2*numCols) << "," << numCols + << "," << results[2] << "," << results[0] << "," << results[1] - << "," << results[2] << endl; } template static void - printR3Aresults (const std::string& datatype, + printR3Aresults (const std::string& combinerName, + const std::string& scalarName, const int numRows, const int numCols, const std::vector& results) @@ -140,62 +155,68 @@ namespace TSQR { using std::cout; using std::endl; - cout << "Combine" - << "," << "R3A" - << "," << datatype + cout << "R3A" + << "," << combinerName + << "," << scalarName << "," << numRows << "," << numCols + << "," << results[5] << "," << results[3] << "," << results[4] - << "," << results[5] << endl; } template static void - printResults (const std::string& datatype, + printResults (const std::string& combinerName, + const std::string& scalarName, const int numRows, const int numCols, - const std::vector& results, - const bool printFieldNames) + const std::vector& results) + { + printR1R2results (combinerName, scalarName, numCols, results); + printR3Aresults (combinerName, scalarName, + numRows, numCols, results); + } + + static void + printSimSeqTsqrFieldNames () { - if (printFieldNames) - 
printCombineFieldNames(); - printR1R2results (datatype, numCols, results); - printR3Aresults (datatype, numRows, numCols, results); + using std::cout; + using std::endl; + + const char prefix[] = "%"; + cout << prefix + << "method" + << ",combiner" + << ",scalarType" + << ",numRows" + << ",numCols" + << ",frobA" + << ",absFrobResid" + << ",absFrobOrthog" + << endl; } template static void - printSimSeqTsqrResults (const std::string& datatype, + printSimSeqTsqrResults (const std::string& combinerName, + const std::string& scalarName, const int numRows, const int numCols, - const std::vector& results, - const bool printFieldNames) + const std::vector& results) { using std::cout; using std::endl; - if (printFieldNames) - { - const char prefix[] = "%"; - cout << prefix - << "method" - << ",scalarType" - << ",numRows" - << ",numCols" - << ",absFrobResid" - << ",absFrobOrthog" - << ",frobA" - << endl; - } cout << "CombineSimSeqTsqr" - << "," << datatype + << "," << combinerName + << "," << scalarName << "," << numRows << "," << numCols + << "," << results[2] << "," << results[0] << "," << results[1] - << "," << results[2] << endl; } @@ -204,7 +225,8 @@ namespace TSQR { printMatrix (std::ostream& out, const MatrixViewType& A) { - print_local_matrix (out, A.extent(0), A.extent(1), A.data(), A.stride(1)); + print_local_matrix (out, A.extent(0), A.extent(1), + A.data(), A.stride(1)); } template @@ -218,8 +240,10 @@ namespace TSQR { const MatrixViewType& Q, const MatrixViewType& R) { - return local_verify (A.extent(0), A.extent(1), A.data(), A.stride(1), - Q.data(), Q.stride(1), R.data(), R.stride(1)); + return local_verify (A.extent(0), A.extent(1), + A.data(), A.stride(1), + Q.data(), Q.stride(1), + R.data(), R.stride(1)); } /// \brief Test accuracy of TSQR::Combine @@ -230,13 +254,17 @@ namespace TSQR { /// 2. [R; A] where R is ncols by ncols upper triangular, and A is /// nrows by ncols general dense. 
/// - /// \return ($\|A - QR\|_F$, $\|I - Q^* Q\|_F$, $\|A\|_F$) for each - /// test problem (so, a vector of six elements). + /// Print ($\|A - QR\|_F$, $\|I - Q^* Q\|_F$, $\|A\|_F$) for each + /// test problem (6 numbers in total). /// - template - static std::vector::magnitudeType> + template + void verifyCombineTemplate (TSQR::Random::NormalGenerator& gen, TSQR::Random::NormalGenerator::magnitudeType>& magGen, + CombineType& combiner, + const std::string& combinerName, const Ordinal numRows, const Ordinal numCols, const bool debug) @@ -251,11 +279,11 @@ namespace TSQR { using std::vector; typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; + typedef typename STS::magnitudeType mag_type; typedef NormalGenerator normgen_type; typedef MatrixGenerator matgen_type; typedef Matrix matrix_type; - typedef vector results_type; + typedef vector results_type; if (numRows < numCols) { ostringstream os; @@ -271,37 +299,25 @@ namespace TSQR { // Generate four different sets of singular values. Randomly // perturb them, but make sure all are positive. 
// - vector< magnitude_type > sigma_R1 (numCols); - vector< magnitude_type > sigma_R2 (numCols); - vector< magnitude_type > sigma_R3 (numCols); - vector< magnitude_type > sigma_A (numCols); + vector sigma_R1 (numCols); + vector sigma_R2 (numCols); + vector sigma_R3 (numCols); + vector sigma_A (numCols); generateSingularValues (magGen, sigma_R1, numCols); generateSingularValues (magGen, sigma_R2, numCols); generateSingularValues (magGen, sigma_R3, numCols); generateSingularValues (magGen, sigma_A, numCols); - matrix_type R1 (numCols, numCols, Scalar(0)); - matrix_type R2 (numCols, numCols, Scalar(0)); - matrix_type R3 (numCols, numCols, Scalar(0)); - matrix_type A (numRows, numCols, Scalar(0)); + matrix_type R1 (numCols, numCols, Scalar{}); + matrix_type R2 (numCols, numCols, Scalar{}); + matrix_type R3 (numCols, numCols, Scalar{}); + matrix_type A (numRows, numCols, Scalar{}); matgen_type matgen (gen); matgen.fill_random_R (numCols, R1.data(), R1.stride(1), &sigma_R1[0]); matgen.fill_random_R (numCols, R2.data(), R2.stride(1), &sigma_R2[0]); matgen.fill_random_R (numCols, R3.data(), R3.stride(1), &sigma_R3[0]); matgen.fill_random_svd (numRows, numCols, A.data(), A.stride(1), &sigma_A[0]); - if (false && debug) { - cerr << endl << "First test problem:" << endl; - print_local_matrix (cerr, numCols, numCols, R1.data(), R1.stride(1)); - print_local_matrix (cerr, numCols, numCols, R2.data(), R2.stride(1)); - cerr << endl; - - cerr << endl << "Second test problem:" << endl; - print_local_matrix (cerr, numCols, numCols, R3.data(), R3.stride(1)); - print_local_matrix (cerr, numRows, numCols, A.data(), A.stride(1)); - cerr << endl; - } - // Space to put the original test problem, expressed as one // dense matrix rather than in two blocks. These will be deep // copies of the test problems, since the test problem matrices @@ -324,18 +340,13 @@ namespace TSQR { } // Space to put the explicit Q factors. 
- matrix_type Q_R1R2 (Ordinal(2) * numCols, numCols, Scalar(0)); - matrix_type Q_R3A (numRows + numCols, numCols, Scalar(0)); - - // Fill the explicit Q factor matrices with the first numCols - // columns of the identity matrix. - for (Ordinal k = 0; k < numCols; ++k) { - // FIXME (mfh 26 Nov 2019) Eventually we want to get away from - // direct modification of the entries of a Matrix or MatView, - // in favor of only doing so with a Kokkos kernel or TPL. - Q_R1R2(k, k) = Scalar(1.0); - Q_R3A(k, k) = Scalar(1.0); - } + matrix_type Q_R1R2 (Ordinal(2) * numCols, numCols, Scalar {}); + auto Q_R1_Q_R2 = partition_2x1 (Q_R1R2.view (), numCols); + matrix_type Q_R3A (numCols + numRows, numCols, Scalar {}); + auto Q_R3_A = partition_2x1 (Q_R3A.view (), numCols); + + fill_with_identity_columns (Q_R1R2.view ()); + fill_with_identity_columns (Q_R3A.view ()); // tau factor arrays, one for each factorization test. vector tau_R1R2 (numCols); @@ -343,7 +354,9 @@ namespace TSQR { // Workspace array for factorization and applying the Q factor. // We recycle this workspace for all tests. 
- vector work (numCols); + const Ordinal lwork = + combiner.work_size (numRows, numCols, numCols); + vector work (lwork); if (debug) { cerr << endl << "----------------------------------------" << endl @@ -351,48 +364,47 @@ namespace TSQR { << "qr( [R1; R2] ), with R1 and R2 " << numCols << " by " << numCols << endl << endl; } - Combine combiner; - combiner.factor_pair (R1.view(), R2.view(), - tau_R1R2.data(), work.data()); - combiner.apply_pair (ApplyType("N"), numCols, numCols, - R2.data(), R2.stride(1), tau_R1R2.data(), - &Q_R1R2(0, 0), Q_R1R2.stride(1), - &Q_R1R2(numCols, 0), Q_R1R2.stride(1), - work.data()); + combiner.factor_pair (R1.view (), R2.view (), + tau_R1R2.data (), work.data (), lwork); + combiner.apply_pair (ApplyType ("N"), R2.view (), + tau_R1R2.data (), + Q_R1_Q_R2.first, Q_R1_Q_R2.second, + work.data (), lwork); if (debug) { cerr << "Results of first test problem:" << endl; cerr << "-- Copy of test problem:" << endl; - print_local_matrix (cerr, A_R1R2.extent(0), A_R1R2.extent(1), - A_R1R2.data(), A_R1R2.stride(1)); + print_local_matrix (cerr, A_R1R2.extent (0), + A_R1R2.extent (1), A_R1R2.data (), + A_R1R2.stride (1)); cerr << endl << "-- Q factor:" << endl; - print_local_matrix (cerr, Q_R1R2.extent(0), Q_R1R2.extent(1), - Q_R1R2.data(), Q_R1R2.stride(1)); + print_local_matrix (cerr, Q_R1R2.extent (0), + Q_R1R2.extent (1), Q_R1R2.data (), + Q_R1R2.stride (1)); cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, R1.extent(0), R1.extent(1), - R1.data(), R1.stride(1)); + print_local_matrix (cerr, R1.extent (0), R1.extent (1), + R1.data (), R1.stride (1)); cerr << endl; } const results_type firstResults = - local_verify (A_R1R2.extent(0), A_R1R2.extent(1), - A_R1R2.data(), A_R1R2.stride(1), - Q_R1R2.data(), Q_R1R2.stride(1), - R1.data(), R1.stride(1)); + local_verify (A_R1R2.extent (0), A_R1R2.extent (1), + A_R1R2.data (), A_R1R2.stride (1), + Q_R1R2.data (), Q_R1R2.stride (1), + R1.data (), R1.stride (1)); if (debug) { cerr << "\\| 
A - Q*R \\|_F = " << firstResults[0] << endl << "\\| I - Q'*Q \\|_F = " << firstResults[1] << endl << "\\| A \\|_A = " << firstResults[2] << endl; - cerr << endl << "----------------------------------------" << endl - << "TSQR::Combine second test problem:" << endl - << "qr( [R3; A] ), with R3 " << numCols << " by " << numCols - << " and A " << numRows << " by " << numCols << endl << endl; + cerr << endl << "----------------------------------------" + << endl << "TSQR::Combine second test problem:" << endl + << "qr( [R3; A] ), with R3 " << numCols << " by " + << numCols << " and A " << numRows << " by " << numCols + << endl << endl; } - combiner.factor_inner (R3.view(), A.view(), - tau_R3A.data(), work.data()); - combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols, - A.data(), A.stride(1), tau_R3A.data(), - &Q_R3A(0, 0), Q_R3A.stride(1), - &Q_R3A(numCols, 0), Q_R3A.stride(1), - work.data()); + combiner.factor_inner (R3.view (), A.view (), + tau_R3A.data (), work.data (), lwork); + combiner.apply_inner (ApplyType ("N"), A.view (), + tau_R3A.data (), Q_R3_A.first, + Q_R3_A.second, work.data (), lwork); if (debug) { cerr << "Results of second test problem:" << endl; cerr << "-- Copy of test problem:" << endl; @@ -416,7 +428,7 @@ namespace TSQR { << "\\| I - Q'*Q \\|_F = " << secondResults[1] << endl << "\\| A \\|_A = " << secondResults[2] << endl; } - vector finalResults; + vector finalResults; finalResults.push_back (firstResults[0]); finalResults.push_back (firstResults[1]); finalResults.push_back (firstResults[2]); @@ -424,14 +436,74 @@ namespace TSQR { finalResults.push_back (secondResults[0]); finalResults.push_back (secondResults[1]); finalResults.push_back (secondResults[2]); - return finalResults; + + const std::string scalarName = + Teuchos::TypeNameTraits::name (); + printResults (combinerName, scalarName, numRows, numCols, + finalResults); + } + + template + void + verifyCombineTemplateAllCombiners (std::vector& iseed, + const Ordinal numRows, + 
const Ordinal numCols, + const bool debug) + { + using mag_type = + typename Teuchos::ScalarTraits::magnitudeType; + const std::string scalarName = + Teuchos::TypeNameTraits::name (); + + Random::NormalGenerator normgenS (iseed); + Random::NormalGenerator normgenM (iseed); + + using factory_type = CombineFactory; + { + const std::string combinerName ("Native"); + auto combiner = factory_type::create (combinerName); + TEUCHOS_ASSERT( combiner.get () != nullptr ); + // Make sure it's the right type. + using expected_type = CombineNative; + expected_type* combinerPtr = + dynamic_cast (combiner.get ()); + TEUCHOS_ASSERT( combinerPtr != nullptr ); + verifyCombineTemplate (normgenS, normgenM, *combiner, + combinerName, numRows, numCols, + debug); + } + { + const std::string combinerName ("Default"); + auto combiner = factory_type::create (combinerName); + TEUCHOS_ASSERT( combiner.get () != nullptr ); + // Make sure it's the right type. + using expected_type = CombineDefault; + expected_type* combinerPtr = + dynamic_cast (combiner.get ()); + TEUCHOS_ASSERT( combinerPtr != nullptr ); + verifyCombineTemplate (normgenS, normgenM, *combiner, + combinerName, numRows, numCols, + debug); + } + + // Fetch the pseudorandom seed from the previous test. + // + // Even though normgenS and normgenM each updated the random + // seed independently, for now we just fetch the updated seed + // from normgenS. This should still produce reproducible + // results. + normgenS.getSeed (iseed); } //! 
Simulate one combine step of Sequential TSQR - template - static std::vector::magnitudeType> + template + std::vector::magnitudeType> verifyCombineSeqTemplate (TSQR::Random::NormalGenerator& gen, TSQR::Random::NormalGenerator::magnitudeType>& magGen, + CombineType& combiner, const Ordinal numRows, const Ordinal numCols, const bool debug) @@ -446,12 +518,12 @@ namespace TSQR { using std::vector; typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; + typedef typename STS::magnitudeType mag_type; typedef NormalGenerator< Ordinal, Scalar > normgen_type; typedef MatrixGenerator< Ordinal, Scalar, normgen_type > matgen_type; typedef Matrix matrix_type; typedef MatView mat_view_type; - typedef vector results_type; + typedef vector results_type; if (numRows < numCols) { ostringstream os; @@ -464,32 +536,24 @@ namespace TSQR { } // Generate two different sets of singular values. - vector< magnitude_type > sigma_A1 (numCols); - vector< magnitude_type > sigma_A2 (numCols); + vector sigma_A1 (numCols); + vector sigma_A2 (numCols); generateSingularValues (magGen, sigma_A1, numCols); generateSingularValues (magGen, sigma_A2, numCols); - // Matrix consisting of two cache blocks. - matrix_type A (Ordinal(2)*numRows, numCols, Scalar(0)); + // Matrix consisting of two "cache blocks." + matrix_type A (Ordinal(2)*numRows, numCols, Scalar{}); + auto A1_A2 = partition_2x1 (A, numRows); // Views of the two cache blocks. - mat_view_type A1 (numRows, numCols, &A(0,0), A.stride(1)); - mat_view_type A2 (numRows, numCols, &A(numRows,0), A.stride(1)); + mat_view_type A1 = A1_A2.first; + mat_view_type A2 = A1_A2.second; // Fill the two cache blocks with random test problems. 
matgen_type matgen (gen); - matgen.fill_random_svd (numRows, numCols, A1.data(), A1.stride(1), &sigma_A1[0]); - matgen.fill_random_svd (numRows, numCols, A2.data(), A2.stride(1), &sigma_A2[0]); - - if (false && debug) { - cerr << endl << "Test problem:" << endl; - cerr << endl << "Original matrix:" << endl; - printMatrix (cerr, A); - cerr << endl << "First cache block:" << endl; - printMatrix (cerr, A1); - cerr << endl << "Second cache block:" << endl; - printMatrix (cerr, A2); - cerr << endl; - } + matgen.fill_random_svd (numRows, numCols, A1.data(), + A1.stride(1), sigma_A1.data ()); + matgen.fill_random_svd (numRows, numCols, A2.data(), + A2.stride(1), sigma_A2.data ()); // Copy of the resulting test problem, stored as one dense // matrix rather than as two blocks. We will use A_copy to @@ -498,20 +562,10 @@ namespace TSQR { matrix_type A_copy (A); // Space to put the explicit Q factor. - matrix_type Q (Ordinal(2) * numRows, numCols, Scalar(0)); - - // Fill Q with the first numCols columns of the identity matrix. - for (Ordinal k = 0; k < numCols; ++k) { - // FIXME (mfh 26 Nov 2019) I'm assuming I can write to the - // Matrix or MatView on host, outside of Kokkos. TSQR always - // assumed this, but if we want to use Kokkos, we'll need to - // get rid of that assumption. - Q(k, k) = Scalar(1.0); - } - - // Two cache blocks (as views) of Q. - mat_view_type Q1 (numRows, numCols, &Q(0,0), Q.stride(1)); - mat_view_type Q2 (numRows, numCols, &Q(numRows,0), Q.stride(1)); + matrix_type Q (Ordinal(2) * numRows, numCols, Scalar {}); + fill_with_identity_columns (Q.view ()); + // Two "cache blocks" (as views) of Q. + auto Q1_Q2 = partition_2x1 (Q.view (), numRows); // Two tau factor arrays, one for each cache block. vector tau1 (numCols); @@ -519,21 +573,23 @@ namespace TSQR { // Workspace array for factorization and applying the Q factor. // We recycle this workspace for all tests. 
- vector work (numCols); + const Ordinal lwork = + combiner.work_size (numRows, numCols, numCols); + vector work (lwork); if (debug) { - cerr << endl << "----------------------------------------" << endl - << "TSQR::Combine SequentialTsqr simulation with 2 cache blocks:" - << endl << "qr( [A1; A2] ), with A1 and A2 being each " - << numRows << " by " << numCols << endl << endl; + cerr << endl << "----------------------------------------" + << endl << "TSQR::Combine SequentialTsqr simulation with 2 " + "cache blocks:" << endl << "qr( [A1; A2] ), with A1 and A2 " + "each " << numRows << " by " << numCols << endl << endl; } - Combine combiner; // qr( A1 ) - combiner.factor_first (A1, tau1.data(), work.data()); + combiner.factor_first (A1, tau1.data (), work.data (), lwork); // View of numCols by numCols upper triangle of A1. mat_view_type R1 (numCols, numCols, A1.data(), A1.stride(1)); // qr( [R1; A2] ) - combiner.factor_inner (R1, A2, tau2.data(), work.data()); + combiner.factor_inner (R1, A2, tau2.data (), + work.data (), lwork); // Extract (a deep copy of) the R factor. matrix_type R (R1); // Zero out everything below the diagonal of R. @@ -549,14 +605,11 @@ namespace TSQR { // Compute the explicit Q factor, by starting with A2 and // (working up the matrix A,) finishing with A1. 
- combiner.apply_inner (ApplyType::NoTranspose, - numRows, numCols, numCols, - A2.data(), A2.stride(1), tau2.data(), - Q1.data(), Q1.stride(1), - Q2.data(), Q2.stride(1), work.data()); - combiner.apply_first (ApplyType::NoTranspose, - A1, tau1.data(), - Q1, work.data()); + combiner.apply_inner (ApplyType::NoTranspose, A2, tau2.data (), + Q1_Q2.first, Q1_Q2.second, + work.data (), lwork); + combiner.apply_first (ApplyType::NoTranspose, A1, tau1.data (), + Q1_Q2.first, work.data (), lwork); if (debug) { cerr << "Results of first test problem:" << endl; cerr << "-- Test matrix A:" << endl; @@ -588,9 +641,9 @@ namespace TSQR { { using TSQR::Random::NormalGenerator; using std::cerr; -#ifdef HAVE_KOKKOSTSQR_COMPLEX +#ifdef HAVE_TPETRATSQR_COMPLEX using std::complex; -#endif // HAVE_KOKKOSTSQR_COMPLEX +#endif // HAVE_TPETRATSQR_COMPLEX using std::cout; using std::endl; using std::pair; @@ -613,130 +666,122 @@ namespace TSQR { iseed[2] = 0; iseed[3] = 1; - // Whether to print the field (i.e., column) names for the - // output data. - bool doPrintFieldNames = printFieldNames; - if (! simulateSequentialTsqr) { + printCombineFieldNames (); if (testReal) { { - NormalGenerator normgenS (iseed); - const vector resultsS = - verifyCombineTemplate (normgenS, normgenS, numRows, - numCols, debug); - // Only print field names (if at all) once per run, for - // the first data type. - printResults (string("float"), numRows, numCols, - resultsS, doPrintFieldNames); - // Print field names at most once. - doPrintFieldNames = false; - // Fetch the pseudorandom seed from the previous test. 
- normgenS.getSeed (iseed); + using scalar_type = float; + verifyCombineTemplateAllCombiners + (iseed, numRows, numCols, debug); } { - NormalGenerator normgenD (iseed); - const vector resultsD = - verifyCombineTemplate (normgenD, normgenD, numRows, - numCols, debug); - printResults (string("double"), numRows, numCols, - resultsD, doPrintFieldNames); - doPrintFieldNames = false; - normgenD.getSeed (iseed); + using scalar_type = double; + verifyCombineTemplateAllCombiners + (iseed, numRows, numCols, debug); } } - - if (testComplex) + if (testComplex) { +#ifdef HAVE_TPETRATSQR_COMPLEX { -#ifdef HAVE_KOKKOSTSQR_COMPLEX - { - NormalGenerator > normgenC (iseed); - NormalGenerator normgenS (iseed); - const vector resultsC = - verifyCombineTemplate (normgenC, normgenS, numRows, - numCols, debug); - printResults (string("complex"), numRows, numCols, - resultsC, doPrintFieldNames); - doPrintFieldNames = false; - // Even though normgenC and normgenS each updated the - // random seed independently, for now we just fetch the - // updated seed from normgenC. This should still - // produce reproducible results. 
- normgenC.getSeed (iseed); - } - { - NormalGenerator > normgenZ (iseed); - NormalGenerator normgenD (iseed); - const vector resultsZ = - verifyCombineTemplate (normgenZ, normgenD, numRows, - numCols, debug); - printResults (string("complex"), numRows, numCols, - resultsZ, doPrintFieldNames); - doPrintFieldNames = false; - normgenZ.getSeed (iseed); - } -#else // NOT HAVE_KOKKOSTSQR_COMPLEX - TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, - "Trilinos was not built with " - "complex arithmetic support"); -#endif // HAVE_KOKKOSTSQR_COMPLEX + using scalar_type = std::complex; + verifyCombineTemplateAllCombiners + (iseed, numRows, numCols, debug); } + { + using scalar_type = std::complex; + verifyCombineTemplateAllCombiners + (iseed, numRows, numCols, debug); + } +#else // NOT HAVE_TPETRATSQR_COMPLEX + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::logic_error, "You set testComplex=true, but " + "Trilinos was not built with complex arithmetic support " + "enabled."); +#endif // HAVE_TPETRATSQR_COMPLEX + } } else { // simulateSequentialTsqr + printSimSeqTsqrFieldNames (); if (testReal) { { - NormalGenerator normgenS (iseed); - const vector resultsS = - verifyCombineSeqTemplate (normgenS, normgenS, numRows, - numCols, debug); - printSimSeqTsqrResults (string("float"), numRows, numCols, - resultsS, doPrintFieldNames); - doPrintFieldNames = false; + using scalar_type = float; + + NormalGenerator normgenS (iseed); + auto combiner = + CombineFactory::create (numCols); + const std::string combinerName ("?"); + const auto results = + verifyCombineSeqTemplate (normgenS, normgenS, *combiner, + numRows, numCols, debug); + const std::string scalarName = + Teuchos::TypeNameTraits::name (); + printSimSeqTsqrResults (combinerName, scalarName, + numRows, numCols, results); normgenS.getSeed (iseed); } { - NormalGenerator normgenD (iseed); - const vector resultsD = - verifyCombineSeqTemplate (normgenD, normgenD, numRows, - numCols, debug); - printSimSeqTsqrResults (string("double"), 
numRows, numCols, - resultsD, doPrintFieldNames); - doPrintFieldNames = false; - normgenD.getSeed (iseed); + using scalar_type = double; + + NormalGenerator normgenS (iseed); + auto combiner = + CombineFactory::create (numCols); + const std::string combinerName ("?"); + const auto results = + verifyCombineSeqTemplate (normgenS, normgenS, *combiner, + numRows, numCols, debug); + const std::string scalarName = + Teuchos::TypeNameTraits::name (); + printSimSeqTsqrResults (combinerName, scalarName, + numRows, numCols, results); + normgenS.getSeed (iseed); } } if (testComplex) { -#ifdef HAVE_KOKKOSTSQR_COMPLEX +#ifdef HAVE_TPETRATSQR_COMPLEX { - NormalGenerator > normgenC (iseed); - NormalGenerator normgenS (iseed); - const vector resultsC = - verifyCombineSeqTemplate (normgenC, normgenS, numRows, - numCols, debug); - printSimSeqTsqrResults (string("complex"), numRows, numCols, - resultsC, doPrintFieldNames); - doPrintFieldNames = false; - normgenC.getSeed (iseed); + using scalar_type = complex; + using mag_type = float; + + NormalGenerator normgenS (iseed); + NormalGenerator normgenM (iseed); + auto combiner = + CombineFactory::create (numCols); + const std::string combinerName ("?"); + const auto results = + verifyCombineSeqTemplate (normgenS, normgenM, *combiner, + numRows, numCols, debug); + const std::string scalarName = + Teuchos::TypeNameTraits::name (); + printSimSeqTsqrResults (combinerName, scalarName, + numRows, numCols, results); + normgenS.getSeed (iseed); } { - NormalGenerator > normgenZ (iseed); - NormalGenerator normgenD (iseed); - const vector resultsZ = - verifyCombineSeqTemplate (normgenZ, normgenD, numRows, - numCols, debug); - printSimSeqTsqrResults (string("complex"), numRows, - numCols, resultsZ, doPrintFieldNames); - doPrintFieldNames = false; - normgenZ.getSeed (iseed); + using scalar_type = complex; + using mag_type = double; + + NormalGenerator normgenS (iseed); + NormalGenerator normgenM (iseed); + auto combiner = + CombineFactory::create 
(numCols); + const std::string combinerName ("?"); + const auto results = + verifyCombineSeqTemplate (normgenS, normgenM, *combiner, + numRows, numCols, debug); + const std::string scalarName = + Teuchos::TypeNameTraits::name (); + printSimSeqTsqrResults (combinerName, scalarName, + numRows, numCols, results); + normgenS.getSeed (iseed); } -#else // NOT HAVE_KOKKOSTSQR_COMPLEX - TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, - "Trilinos was not built with " - "complex arithmetic support"); -#endif // HAVE_KOKKOSTSQR_COMPLEX +#else // NOT HAVE_TPETRATSQR_COMPLEX + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::logic_error, "Trilinos was not built with " + "complex arithmetic support."); +#endif // HAVE_TPETRATSQR_COMPLEX } } } } // namespace Test } // namespace TSQR - diff --git a/packages/tpetra/tsqr/src/Tsqr_CuSolverNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_CuSolverNodeTsqr.hpp new file mode 100644 index 000000000000..3f9ef926cc34 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_CuSolverNodeTsqr.hpp @@ -0,0 +1,1053 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos: Node API and Parallel Node Kernels +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// ************************************************************************ +//@HEADER + +/// \file Tsqr_CuSolverNodeTsqr.hpp +/// \brief Declaration and definition of CuSolverNodeTsqr. + +#ifndef TSQR_CUSOLVERNODETSQR_HPP +#define TSQR_CUSOLVERNODETSQR_HPP + +#include "TpetraTSQR_config.h" + +#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER) +#include "Tsqr_NodeTsqr.hpp" +#include "Tsqr_Impl_CuBlas.hpp" +#include "Tsqr_Impl_CuSolver.hpp" +#include "Kokkos_ArithTraits.hpp" +#include +#include + +#define TSQR_IMPL_CATCH( message ) \ + catch (std::exception& e) { \ + threw = true; \ + err = std::unique_ptr (new std::ostringstream); \ + *err << prefix << message << std::endl << e.what (); \ + } \ + TEUCHOS_TEST_FOR_EXCEPTION \ + (threw, std::runtime_error, \ + (err.get () == nullptr ? 
"Unknown error" : err->str ())); \ + do {} while (false) + +#define TSQR_IMPL_CHECK_LAST_CUDA_ERROR( location ) \ + do { \ + cudaError_t errCode = cudaGetLastError (); \ + if (errCode != cudaSuccess ) { \ + const char* errorString = cudaGetErrorString (errCode); \ + TEUCHOS_TEST_FOR_EXCEPTION \ + (true, std::runtime_error, "At \"" << (location) << "\", " \ + "CUDA is in the following error state: " << errorString); \ + } \ + } while (false) + +namespace TSQR { + namespace Impl { + + using cusolver_memory_space = Kokkos::CudaSpace; + using cusolver_execution_space = Kokkos::Cuda; + using host_device_type = Kokkos::Device< + Kokkos::DefaultHostExecutionSpace, Kokkos::HostSpace>; + + // Mapping from Scalar to Kokkos value type. + // e.g., Scalar=std::complex -> Kokkos::complex. + + template + using non_const_kokkos_value_type = typename Kokkos::ArithTraits< + typename std::remove_const::type + >::val_type; + + template + using kokkos_view_value_type = typename std::conditional< + std::is_const::value, + const non_const_kokkos_value_type, + non_const_kokkos_value_type + >::type; + + // vector_type, device_vector_type, and host_vector_type + + template + using vector_type = Kokkos::View; + + template + using device_vector_type = vector_type; + + template + using host_vector_type = vector_type; + + template + void + reallocDeviceVectorIfNeeded (device_vector_type& vec, + const char label[], + const size_t minSize) + { + using Kokkos::view_alloc; + using Kokkos::WithoutInitializing; + + if (size_t (vec.size ()) < minSize) { + vec = device_vector_type (); + auto alloc = view_alloc (std::string (label), WithoutInitializing); + vec = device_vector_type (alloc, minSize); + } + } + + // vec_view_type & device_vec_view_type + + template + using vec_view_type = + Kokkos::View>; + + template + using device_vec_view_type = vec_view_type; + + // matrix_type & device_matrix_type + + template + using matrix_type = Kokkos::View; + + template + using device_matrix_type = matrix_type; + 
+ // mat_view_type, device_mat_view_type, & host_mat_view_type + + template + using mat_view_type = + Kokkos::View>; + + template + using device_mat_view_type = + mat_view_type; + + template + using host_mat_view_type = mat_view_type; + + // get_mat_view, get_host_mat_view, & get_device_mat_view + + template + static mat_view_type, MemorySpace> + get_mat_view (const size_t nrows, + const size_t ncols, + Scalar A[], + const size_t lda) + { + static_assert + (! std::is_const >::value, + "non_const_kokkos_value_type is const."); + using KVVT = kokkos_view_value_type; // preserves const + static_assert + ((std::is_const::value && std::is_const::value) || + (! std::is_const::value && ! std::is_const::value), + "kokkos_view_value_type failed to preserve const-ness."); + KVVT* A_raw = reinterpret_cast (A); + + mat_view_type A_full (A_raw, lda, ncols); + const std::pair rowRange (0, nrows); + return Kokkos::subview (A_full, rowRange, Kokkos::ALL ()); + } + + template + static host_mat_view_type> + get_host_mat_view (const size_t nrows, + const size_t ncols, + Scalar A[], + const size_t lda) + { + return get_mat_view + (nrows, ncols, A, lda); + } + + template + static host_mat_view_type> + get_host_mat_view (const MatView& A_host) + { + const size_t nrows (A_host.extent (0)); + const size_t ncols (A_host.extent (1)); + const size_t lda (A_host.stride (1)); + return get_mat_view + (nrows, ncols, A_host.data (), lda); + } + + template + static device_mat_view_type> + get_device_mat_view (const size_t nrows, + const size_t ncols, + Scalar A[], + const size_t lda) + { + return get_mat_view (nrows, ncols, A, lda); + } + + /// \brief Given rank-1 backing storage, return a device matrix + /// view with the given dimensions (numRows by numCols), that + /// has contiguous storage. Reallocate storage if needed. + /// + /// "Contiguous storage" means that if A is the matrix view + /// result, then A.stride(1) == A.extent(0). 
+ template + device_mat_view_type + get_contiguous_device_mat_view (device_vector_type& storage, + const size_t numRows, + const size_t numCols) + { + const char prefix[] = "TSQR::Impl::get_contiguous_device_mat_view: "; + + TSQR_IMPL_CHECK_LAST_CUDA_ERROR( prefix ); + + const size_t currentStorageSize (storage.extent (0)); + const size_t requiredStorageSize = numRows * numCols; + if (currentStorageSize < requiredStorageSize) { + // It costs about as much to allocate 8B on device as 800B. + constexpr size_t minStorageSize = 100; + const size_t newStorageSize = + std::max (minStorageSize, requiredStorageSize); + + // Free it first, so that two allocations won't coexist. + storage = device_vector_type (); + using Kokkos::view_alloc; + using Kokkos::WithoutInitializing; + const char label[] = "matrixStorage"; + + TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::Impl::get_contiguous_device_mat_view: Right before allocating" ); + + try { + storage = device_vector_type + (view_alloc (std::string (label), WithoutInitializing), + newStorageSize); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << "Allocating rank-1 " + "View of size " << newStorageSize << " to represent a " + << numRows << " x " << numCols << " matrix threw: " + << std::endl << e.what ()); + } + } + return device_mat_view_type (storage.data (), + numRows, numCols); + } + + template + host_mat_view_type + get_contiguous_host_mat_view (host_vector_type& storage, + const size_t numRows, + const size_t numCols) + { + const char prefix[] = "TSQR::Impl::get_contiguous_host_mat_view: "; + + const size_t currentStorageSize (storage.extent (0)); + const size_t requiredStorageSize = numRows * numCols; + if (currentStorageSize < requiredStorageSize) { + // It costs about as much to allocate 8B on host as 800B. 
+ constexpr size_t minStorageSize = 100; + const size_t newStorageSize = + std::max (minStorageSize, requiredStorageSize); + + // Free it first, so that two allocations won't coexist. + storage = host_vector_type (); + using Kokkos::view_alloc; + using Kokkos::WithoutInitializing; + const char label[] = "hostMatrixStorage"; + + try { + storage = host_vector_type + (view_alloc (std::string (label), WithoutInitializing), + newStorageSize); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << "Allocating rank-1 " + "host View of size " << newStorageSize << " to store a " + << numRows << " x " << numCols << " matrix threw: " + << std::endl << e.what ()); + } + } + return host_mat_view_type (storage.data (), + numRows, numCols); + } + + // info_type & const_info_type + + using info_type = Kokkos::View; + using const_info_type = Kokkos::View; + + template + class CuSolverNodeFactorOutput : + public NodeFactorOutput + { + public: + //using cuda_value_type = typename Impl::CudaValue::type; + using kokkos_value_type = non_const_kokkos_value_type; + using const_tau_type = device_vector_type; + using const_unmanaged_tau_type = + device_vec_view_type; + + CuSolverNodeFactorOutput (const const_tau_type& tau, + const const_info_type& info) : + tau_ (tau), info_ (info) + {} + + const_unmanaged_tau_type tau () const { return tau_; } + + int info () const { + int info_h = 0; + Kokkos::deep_copy (info_h, info_); + return info_h; + } + + private: + const_tau_type tau_; + const_info_type info_; + }; + + template + class SetDiagonalEntriesToOne { + static_assert (! 
std::is_const::value, + "SetDiagonalEntriesToOne requires a View of nonconst."); + public: + SetDiagonalEntriesToOne + (const device_mat_view_type& A) : A_ (A) {} + KOKKOS_INLINE_FUNCTION void + operator() (const IndexType j) const { + A_(j,j) = ScalarType (1.0); + } + private: + device_mat_view_type A_; + }; + + template + void + set_diagonal_entries_to_one + (const device_mat_view_type& A) + { + static_assert (! std::is_const::value, + "set_diagonal_entries_to_one requires a View of nonconst."); + using LO = + typename std::make_signed::type; + const LO ncols = std::min (A.extent (0), A.extent (1)); + using Kokkos::RangePolicy; + RangePolicy range (0, ncols); + Kokkos::parallel_for + ("set_diagonal_entries_to_one", range, + SetDiagonalEntriesToOne (A)); + } + + } // namespace Impl + + /// \class CuSolverNodeTsqr + /// \brief NodeTsqr implementation based on cuSOLVER. + /// \author Mark Hoemmen + template + class CuSolverNodeTsqr : public NodeTsqr + { + private: + using base_type = NodeTsqr; + using my_factor_output_type = + Impl::CuSolverNodeFactorOutput; + using kokkos_value_type = + Impl::non_const_kokkos_value_type; + + public: + using ordinal_type = typename base_type::ordinal_type; + using scalar_type = typename base_type::scalar_type; + using factor_output_type = typename base_type::factor_output_type; + + CuSolverNodeTsqr () = default; + + Teuchos::RCP + getValidParameters () const override + { + return Teuchos::parameterList ("NodeTsqr"); + } + + void + setParameterList + (const Teuchos::RCP&) override + {} + + std::string description () const override { + return "CuSolverNodeTsqr"; + } + + bool wants_device_memory () const override { return true; } + + bool ready () const override { + return true; + } + + bool + QR_produces_R_factor_with_nonnegative_diagonal () const override + { + return false; + } + + size_t cache_size_hint () const override { + return 0; + } + + private: + using tau_type = Impl::device_vector_type; + + // must return owning, since 
we'll pass off to factor output + tau_type + get_tau (const LocalOrdinal numCols) const + { + using Impl::reallocDeviceVectorIfNeeded; + Impl::reallocDeviceVectorIfNeeded (tau_, "tau", size_t (numCols)); + return tau_; + } + + using work_type = Impl::device_vector_type; + using nonowning_work_type = + Impl::device_vec_view_type; + + nonowning_work_type + get_work_for_geqrf (const LocalOrdinal numRows, + const LocalOrdinal numCols, + Scalar A[], + const LocalOrdinal lda) const + { + using TSQR::Impl::CuSolver; + using TSQR::Impl::CuSolverHandle; + + auto info = get_info (); + CuSolver solver + {CuSolverHandle::getSingleton (), info.data ()}; + const int lwork = + solver.compute_QR_lwork (numRows, numCols, A, lda); + // Avoid constant reallocation by setting a minimum lwork. + constexpr int min_lwork = 128; + const int new_lwork = lwork < min_lwork ? min_lwork : lwork; + using Impl::reallocDeviceVectorIfNeeded; + reallocDeviceVectorIfNeeded (work_, "work", new_lwork); + return nonowning_work_type (work_); + } + + nonowning_work_type + get_work_for_apply_Q_factor (const ApplyType& apply_type, + const LocalOrdinal nrows, + const LocalOrdinal ncols_C, + const LocalOrdinal ncols_Q, + const Scalar A[], + const LocalOrdinal lda, + const Scalar tau[], + Scalar C[], + const LocalOrdinal ldc) const + { + using TSQR::Impl::CuSolver; + using TSQR::Impl::CuSolverHandle; + + auto info = get_info (); + CuSolver solver + {CuSolverHandle::getSingleton (), info.data ()}; + const char side = 'L'; + const char trans = apply_type.toString ()[0]; + const int lwork = + solver.apply_Q_factor_lwork (side, trans, + nrows, ncols_C, ncols_Q, + A, lda, tau, C, ldc); + // Avoid constant reallocation by setting a minimum lwork. + constexpr int min_lwork = 128; + const int new_lwork = lwork < min_lwork ? 
min_lwork : lwork; + using Impl::reallocDeviceVectorIfNeeded; + reallocDeviceVectorIfNeeded (work_, "work", new_lwork); + return nonowning_work_type (work_); + } + + // must return owning, since we'll pass off to factor output + Impl::info_type + get_info () const + { + if (info_.data () == nullptr) { + info_ = Impl::info_type ("info"); + } + // "get last error" model will avoid doing multiple info allocations. + return info_; + } + + Impl::device_mat_view_type + get_Q_copy (const LocalOrdinal nrows, + const LocalOrdinal ncols, + const Scalar Q[], // DEVICE MEMORY + const LocalOrdinal ldq) const + { + using Impl::get_contiguous_device_mat_view; + auto Q_copy = + get_contiguous_device_mat_view (matrixStorage_, nrows, ncols); + auto Q_view = Impl::get_device_mat_view (nrows, ncols, Q, ldq); + // NOTE (mfh 17 Dec 2019) We're copying device to device, so the + // Kokkos::deep_copy noncontiguity problem does not apply. + Kokkos::deep_copy (Q_copy, Q_view); + return Q_copy; + } + + Impl::device_mat_view_type + get_B_copy (const LocalOrdinal nrows_and_ncols, + const Scalar B[], // HOST MEMORY + const LocalOrdinal ldb) const + { + auto B_copy = + Impl::get_contiguous_device_mat_view (matrixStorage_, + nrows_and_ncols, + nrows_and_ncols); + // Use copy_from_host, which knows how to avoid the + // Kokkos::deep_copy noncontiguity problem. 
+ Scalar* B_copy_raw = reinterpret_cast (B_copy.data ()); + const LocalOrdinal B_copy_stride (B_copy.extent (1)); + MatView B_copy_matview + (nrows_and_ncols, nrows_and_ncols, B_copy_raw, B_copy_stride); + MatView B_matview + (nrows_and_ncols, nrows_and_ncols, B, ldb); + this->copy_from_host (B_copy_matview, B_matview); + return B_copy; + } + + void + extract_R (const LocalOrdinal nrows, + const LocalOrdinal ncols, + const Scalar A[], // DEVICE POINTER + const LocalOrdinal lda, + Scalar R[], // HOST POINTER + const LocalOrdinal ldr, + const bool /* contiguous_cache_blocks */) const + { + using std::endl; + const char prefix[] = "TSQR::CuSolverNodeTsqr::extract_R: "; + + TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "Top of TSQR::CuSolverNodeTsqr::extract_R" ); + + std::unique_ptr err; + bool threw = false; + + using Impl::get_device_mat_view; + using a_view_type = decltype (get_device_mat_view + (nrows, ncols, A, lda)); + a_view_type A_view; + try { + A_view = get_device_mat_view + (nrows, ncols, A, lda); + } + TSQR_IMPL_CATCH( "get_device_mat_view of A threw: " ); + + auto R_view = + Impl::get_host_mat_view (ncols, ncols, R, ldr); + + try { + // Fill R (including lower triangle) with zeros. + //Kokkos::deep_copy (R_view, kokkos_value_type {}); + + // The above code throws the following exception, even though + // R_view is most definitely a host View: + // + // TSQR::CuSolverNodeTsqr::extract_R: + // Kokkos::deep_copy(R_view, 0) threw an exception: + // cudaDeviceSynchronize() error( cudaErrorIllegalAddress): an + // illegal memory access was encountered + // .../kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp:120 + + MatView R_mv (ncols, ncols, R, ldr); + deep_copy (R_mv, Scalar {}); + } + TSQR_IMPL_CATCH( "Kokkos::deep_copy(R_view, 0.0) threw: " ); + + TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::CuSolverNodeTsqr::extract_R, " + "after deep_copy(R_mv, 0.0)" ); + + // Copy out the upper triangle of the R factor from A into R. 
+ // + // The following (pseudo)code often does not work: + // + // auto A_view_top = subview(A_view, {0, ncols}, ALL()); + // Kokkos::deep_copy(R_view, A_view_top); + // + // Kokkos throws an exception, claiming "no available copy + // mechanism." This is probably because A_view is not packed. + // This means that cudaMemcpy won't work, so Kokkos must execute + // a kernel to copy the data. However, that kernel must be able + // to access both Views. In this case, it (thinks it) can't, + // because R_view is a HostSpace View and A_view_top is a device + // View (even though it may be a CudaUVMSpace View). + + using Kokkos::ALL; + using Kokkos::subview; + using LO = LocalOrdinal; + const std::pair rowRange (0, ncols); + auto A_view_top = subview (A_view, rowRange, ALL ()); + + if (size_t (A_view_top.stride (1)) == size_t (A_view_top.extent (0))) { + try { + Kokkos::deep_copy (R_view, A_view_top); + } + TSQR_IMPL_CATCH( "Kokkos::deep_copy(R_view, A_view_top) " + "for contiguous A_view_top threw: "); + TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::CuSolverNodeTsqr::extract_R, " + "after attempting " + "Kokkos::deep_copy(R_view, A_view_top) " + "with contiguous A_view_top" ); + } + else { // A_view_top is NOT contiguous + // Packed device version of R. 
+ Impl::device_mat_view_type R_copy; + try { + using Impl::get_contiguous_device_mat_view; + R_copy = get_contiguous_device_mat_view (matrixStorage_, + ncols, ncols); + } + TSQR_IMPL_CATCH( "R_copy = get_contiguous_device_mat_view threw: " ); + + TEUCHOS_ASSERT( size_t (R_copy.extent (0)) == size_t (ncols) ); + TEUCHOS_ASSERT( size_t (R_copy.extent (1)) == size_t (ncols) ); + TEUCHOS_ASSERT( size_t (R_copy.stride (1)) == size_t (ncols) ); + + try { + Kokkos::deep_copy (R_copy, A_view_top); + } + TSQR_IMPL_CATCH( "Kokkos::deep_copy(R_copy, A_view_top) threw: "); + try { + Kokkos::deep_copy (R_view, R_copy); + } + TSQR_IMPL_CATCH( "Kokkos::deep_copy(R_view, R_copy) threw: "); + } + + try { + for (LO j = 0; j < ncols; ++j) { + auto R_j = subview (R_view, ALL (), j); + for (LO i = j + LO(1); i < LO (R_j.extent(0)); ++i) { + R_j(i) = kokkos_value_type {}; + } + } + } + TSQR_IMPL_CATCH( "Filling lower triangle of R_view with zeros threw: "); + } + + public: + Teuchos::RCP + factor (const LocalOrdinal nrows, + const LocalOrdinal ncols, + Scalar A[], + const LocalOrdinal lda, + Scalar R[], + const LocalOrdinal ldr, + const bool contigCacheBlocks) const override + { + TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::CuSolverNodeTsqr::factor (top)" ); + + // It's a common case to call factor() again and again with the + // same pointers. In that case, it's wasteful for us to + // allocate a new tau array each time, especially since most + // users want explicit Q anyway (and thus will never see tau). + auto tau = get_tau (ncols); + // FIXME (mfh 11 Dec 2019) TSQR::Impl::CuBlas takes + // std::complex, but Kokkos::View stores Kokkos::complex. We're + // assuming they have the same alignment here, but all of Tpetra + // assumes that. 
+ Scalar* tau_raw = reinterpret_cast (tau.data ()); + auto work = get_work_for_geqrf (nrows, ncols, A, lda); + Scalar* work_raw = reinterpret_cast (work.data ()); + const int lwork (work.extent (0)); + auto info = get_info (); + + using TSQR::Impl::CuSolver; + using TSQR::Impl::CuSolverHandle; + CuSolver solver + {CuSolverHandle::getSingleton (), info.data ()}; + + TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::CuSolverNodeTsqr::factor, " + "before solver.compute_QR" ); + try { + solver.compute_QR (nrows, ncols, A, lda, tau_raw, + work_raw, lwork); + } + catch (std::exception& e) { + std::ostringstream err; + err << "TSQR::CuSolverNodeTsqr::factor: CuSolver::compute_QR " + "threw an exception: " << std::endl << e.what (); + throw std::runtime_error (err.str ()); + } + TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::CuSolverNodeTsqr::factor, " + "after solver.compute_QR, " + "before extract_R" ); + try { + this->extract_R (nrows, ncols, A, lda, R, ldr, + contigCacheBlocks); + } + catch (std::exception& e) { + std::ostringstream err; + err << "TSQR::CuSolverNodeTsqr::factor: extract_R " + "threw an exception: " << std::endl << e.what (); + throw std::runtime_error (err.str ()); + } + + TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::CuSolverNodeTsqr::factor, " + "after extract_R" ); + return Teuchos::rcp (new my_factor_output_type (tau, info)); + } + + private: + const my_factor_output_type& + get_my_factor_output (const factor_output_type& factor_output) const + { + const char prefix[] = "TSQR::CuSolverNodeTsqr: "; + + const my_factor_output_type* output_ptr = + dynamic_cast (&factor_output); + if (output_ptr == nullptr) { + const std::string this_name = Teuchos::typeName (*this); + const std::string factor_output_type_name = + Teuchos::TypeNameTraits::name (); + const std::string dynamic_type_name = + Teuchos::demangleName (typeid (factor_output).name ()); + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::invalid_argument, prefix << "Input " + "factor_output_type object was not created by the 
same " + "type of CuSolverNodeTsqr object as this one. This " + "object has type " << this_name << " and its subclass of " + "factor_output_type has type " << factor_output_type_name + << ", but the input factor_output_type object has dynamic " + "type " << dynamic_type_name << "."); + } + return *output_ptr; + } + + public: + void + apply (const ApplyType& apply_type, + const LocalOrdinal nrows, + const LocalOrdinal ncols_Q, + const Scalar Q[], + const LocalOrdinal ldq, + const factor_output_type& factor_output, + const LocalOrdinal ncols_C, + Scalar C[], + const LocalOrdinal ldc, + const bool contigCacheBlocks) const override + { + const char prefix[] = "TSQR::CuSolverNodeTsqr::apply: "; + + // Quick exit and error tests + if (ncols_Q == 0 || ncols_C == 0 || nrows == 0) { + return; + } + else if (ldc < nrows) { + std::ostringstream os; + os << prefix << "ldc (= " << ldc << ") < nrows (= " + << nrows << ")"; + throw std::invalid_argument (os.str()); + } + else if (ldq < nrows) { + std::ostringstream os; + os << prefix << "ldq (= " << ldq << ") < nrows (= " + << nrows << ")"; + throw std::invalid_argument (os.str()); + } + + const char side = 'L'; + const char trans = apply_type.toString ()[0]; + auto tau = get_my_factor_output (factor_output).tau (); + // FIXME (mfh 11 Dec 2019) TSQR::Impl::CuBlas takes + // std::complex, but Kokkos::View stores Kokkos::complex. We're + // assuming they have the same alignment here, but all of Tpetra + // assumes that. 
+ const Scalar* tau_raw = + reinterpret_cast (tau.data ()); + auto work = + get_work_for_apply_Q_factor (apply_type, + nrows, ncols_C, ncols_Q, + Q, ldq, tau_raw, C, ldc); + Scalar* work_raw = reinterpret_cast (work.data ()); + const int lwork (work.extent (0)); + auto info = get_info (); + + using TSQR::Impl::CuSolver; + using TSQR::Impl::CuSolverHandle; + CuSolver solver + {CuSolverHandle::getSingleton (), info.data ()}; + solver.apply_Q_factor (side, trans, + nrows, ncols_C, ncols_Q, + Q, ldq, tau_raw, + C, ldc, + work_raw, lwork); + } + + /// \brief Copy from a host matrix, to "native" NodeTsqr device + /// storage. + virtual void + copy_from_host (const MatView& C_dev, + const MatView& C_host) const + { + const char prefix[] = + "TSQR::CuSolverNodeTsqr::copy_from_host: "; + + const size_t nrows (C_dev.extent (0)); + const size_t ncols (C_dev.extent (1)); + TEUCHOS_ASSERT( nrows == size_t (C_host.extent (0)) ); + TEUCHOS_ASSERT( ncols == size_t (C_host.extent (1)) ); + + auto C_dev_view = Impl::get_device_mat_view + (nrows, ncols, C_dev.data (), C_dev.stride (1)); + auto C_host_view = Impl::get_host_mat_view + (nrows, ncols, C_host.data (), C_host.stride (1)); + + // NOTE (mfh 17 Dec 2019) If C_host is contiguous, that is, if + // C_host.stride(1) == C_host.extent(0), then we can + // Kokkos::deep_copy directly. Otherwise, Kokkos::deep_copy + // will throw an exception, claiming "no available copy + // mechanism." This is because cudaMemcpy won't work, so Kokkos + // must execute a kernel to copy the data. (Kokkos doesn't seem + // to exploit any of the various 2-D or 3-D array copying + // functions that CUDA provides.) That kernel must be able to + // access both Views. We deal with this with a fall-back path + // that uses temporary contiguous storage. + + if (C_dev_view.stride (1) == C_dev_view.extent (0) && + C_host_view.stride (1) == C_host_view.extent (0)) { + // Both Views are contiguous. 
+ try { + Kokkos::deep_copy (C_dev_view, C_host_view); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << + "Kokkos::deep_copy(C_dev_view, C_host_view) (both " + "contiguous) threw: " << e.what ()); + } + } + else { + // We need to make a contiguous copy of host storage. + auto C_host_copy = Impl::get_contiguous_host_mat_view + (hostMatrixStorage_, nrows, ncols); + TEUCHOS_ASSERT( C_host_copy.stride (1) == + C_host_copy.extent (0) ); + try { + Kokkos::deep_copy (C_host_copy, C_host_view); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << + "Kokkos::deep_copy(C_host_copy, C_host_view) threw: " + << e.what ()); + } + + if (C_dev_view.stride (1) == C_dev_view.extent (0)) { + try { + Kokkos::deep_copy (C_dev_view, C_host_copy); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << + "Kokkos::deep_copy(C_dev_view, C_host_copy) threw: " + << e.what ()); + } + } + else { + auto C_dev_copy = Impl::get_contiguous_device_mat_view + (matrixStorage_, nrows, ncols); + try { + Kokkos::deep_copy (C_dev_copy, C_host_copy); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << + "Kokkos::deep_copy(C_dev_copy, C_host_copy) threw: " + << e.what ()); + } + try { + Kokkos::deep_copy (C_dev_view, C_dev_copy); + } + catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << + "Kokkos::deep_copy(C_dev_view, C_dev_copy) threw: " + << e.what ()); + } + } + } + } + + /// \brief Copy from "native" NodeTsqr device storage, to a packed + /// host matrix. 
+ Matrix + copy_to_host + (const MatView& C) const override + { + using LO = LocalOrdinal; + const LO nrows (C.extent (0)); + const LO ncols (C.extent (1)); + const LO ldc (C.stride (1)); + auto C_dev = + Impl::get_device_mat_view (nrows, ncols, + C.data (), ldc); + Matrix C_copy (nrows, ncols); + auto C_host = Impl::get_host_mat_view (C_copy.view ()); + + // NOTE (mfh 17 Dec 2019) Directly calling + // Kokkos::deep_copy(C_host, C_dev) may not necessarily work, + // since C_dev need not be contiguous. In that case, Kokkos + // would throw an exception, claiming "no available copy + // mechanism." The work-around is to create a packed device + // View, copy C_dev into it, then copy the packed View to + // C_host. + try { + Kokkos::deep_copy (C_host, C_dev); + } + catch (std::exception& /* e */) { + auto C_dev_copy = + Impl::get_contiguous_device_mat_view (matrixStorage_, + nrows, ncols); + Kokkos::deep_copy (C_dev_copy, C_dev); + Kokkos::deep_copy (C_host, C_dev_copy); + } + return C_copy; + } + + /// \brief Fill C (DEVICE MEMORY) with the first C.extent(1) + /// columns of the identity matrix. Assume that C has already + /// been pre-filled with zeros. 
+ void + set_diagonal_entries_to_one + (const MatView& C) const override + { + auto C_view = + Impl::get_device_mat_view (C.extent (0), C.extent (1), + C.data (), C.stride (1)); + Impl::set_diagonal_entries_to_one (C_view); + } + + void + explicit_Q (const LocalOrdinal nrows, + const LocalOrdinal ncols_Q, + const Scalar Q[], // DEVICE MEMORY + const LocalOrdinal ldq, + const factor_output_type& factor_output, + const LocalOrdinal ncols_C, + Scalar C[], // DEVICE MEMORY + const LocalOrdinal ldc, + const bool contigCacheBlocks) const override + { + using Impl::get_device_mat_view; + auto C_view = get_device_mat_view (nrows, ncols_C, C, ldc); + using IST = Impl::non_const_kokkos_value_type; + deep_copy (C_view, IST {}); + Impl::set_diagonal_entries_to_one (C_view); + apply (ApplyType::NoTranspose, + nrows, ncols_Q, Q, ldq, factor_output, + ncols_C, C, ldc, contigCacheBlocks); + } + + void + Q_times_B (const LocalOrdinal nrows, + const LocalOrdinal ncols, + Scalar Q[], // DEVICE MEMORY + const LocalOrdinal ldq, + const Scalar B[], // HOST MEMORY + const LocalOrdinal ldb, + const bool /* contigCacheBlocks */) const override + { + // Take the easy exit if available. + if (ncols == 0 || nrows == 0) { + return; + } + + // _GEMM doesn't permit the in/out matrix to alias either of the + // two input matrices, so we must make a copy. + auto Q_copy = get_Q_copy (nrows, ncols, Q, ldq); + + // We assume that B is in host memory, so we need to copy it to + // device before we can use cuBLAS. + auto B_copy = get_B_copy (ncols, B, ldb); + + constexpr Scalar ZERO {}; + constexpr Scalar ONE (1.0); + + using TSQR::Impl::CuBlas; + using TSQR::Impl::CuBlasHandle; + CuBlas blas {CuBlasHandle::getSingleton ()}; + + const char transa = 'N'; + const char transb = 'N'; + // FIXME (mfh 11 Dec 2019) TSQR::Impl::CuBlas takes + // std::complex, but Kokkos::View stores Kokkos::complex. We're + // assuming they have the same alignment here, but all of Tpetra + // assumes that. 
+ const Scalar* Q_copy_raw = + reinterpret_cast (Q_copy.data ()); + const int Q_copy_stride (Q_copy.stride (1)); + blas.gemm (transa, transb, nrows, ncols, ncols, + ONE, Q_copy_raw, Q_copy_stride, + B, ldb, ZERO, Q, ldq); + } + + void + cache_block (const LocalOrdinal /* nrows */, + const LocalOrdinal /* ncols */, + Scalar /* A_out */ [], + const Scalar /*A_in */ [], + const LocalOrdinal /* lda_in */) const override + {} + + void + un_cache_block (const LocalOrdinal /* nrows */, + const LocalOrdinal /* ncols */, + Scalar /* A_out */[], + const LocalOrdinal /* lda_out */, + const Scalar /* A_in */ []) const override + {} + + void + fill_with_zeros (const LocalOrdinal nrows, + const LocalOrdinal ncols, + Scalar A[], + const LocalOrdinal lda, + const bool /* contigCacheBlocks */) const override + { + auto A_view = Impl::get_device_mat_view (nrows, ncols, A, lda); + Kokkos::deep_copy (A_view, kokkos_value_type {}); + } + + private: + mutable tau_type tau_; + mutable work_type work_; + mutable Impl::info_type info_; + mutable Impl::device_vector_type matrixStorage_; + mutable Impl::host_vector_type hostMatrixStorage_; + }; + +} // namespace TSQR + +#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER +#endif // TSQR_CUSOLVERNODETSQR_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp index 39aba991f8cc..a0933b4cad5d 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp @@ -40,8 +40,8 @@ /// \file Tsqr_DistTsqr.hpp /// \brief Internode part of TSQR. 
/// -#ifndef __TSQR_Tsqr_DistTsqr_hpp -#define __TSQR_Tsqr_DistTsqr_hpp +#ifndef TSQR_DISTTSQR_HPP +#define TSQR_DISTTSQR_HPP #include "Tsqr_DistTsqrHelper.hpp" #include "Tsqr_DistTsqrRB.hpp" @@ -64,12 +64,16 @@ namespace TSQR { template class DistTsqr : public Teuchos::ParameterListAcceptorDefaultBase { public: - typedef Scalar scalar_type; - typedef LocalOrdinal ordinal_type; - typedef MatView mat_view_type; - typedef std::vector > VecVec; - typedef std::pair FactorOutput; - typedef int rank_type; + using scalar_type = Scalar; + using ordinal_type = LocalOrdinal; + + private: + using VecVec = std::vector>; + + public: + using mat_view_type = MatView; + using FactorOutput = std::pair; + using rank_type = int; /// \brief Constructor (that accepts a parameter list). /// @@ -125,10 +129,10 @@ namespace TSQR { /// communicator, if the latter is an MPI communicator. If it's a /// serial "communicator," the rank is always zero. rank_type rank() const { - TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error, - "Before using DistTsqr computational methods, " - "you must first call init() with a valid " - "MessengerBase instance."); + TEUCHOS_TEST_FOR_EXCEPTION + (! ready (), std::logic_error, "Before using DistTsqr " + "computational methods, you must first call init() with a " + "valid MessengerBase instance."); return messenger_->rank(); } @@ -138,18 +142,14 @@ namespace TSQR { /// communicator, if the latter is an MPI communicator. If it's a /// serial "communicator," the size is always one. rank_type size() const { - TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error, - "Before using DistTsqr computational methods, " - "you must first call init() with a valid " - "MessengerBase instance."); + TEUCHOS_TEST_FOR_EXCEPTION + (! ready (), std::logic_error, "Before using DistTsqr " + "computational methods, you must first call init() with a " + "valid MessengerBase instance."); return messenger_->size(); } - /// \brief Destructor. 
- /// - /// The destructor doesn't need to do anything, thanks to smart - /// pointers. - virtual ~DistTsqr () {} + virtual ~DistTsqr () = default; /// \brief Does the R factor have a nonnegative diagonal? /// @@ -159,14 +159,16 @@ namespace TSQR { /// negative entries. This Boolean tells you whether DistTsqr /// promises to compute an R factor whose diagonal entries are all /// nonnegative. - bool QR_produces_R_factor_with_nonnegative_diagonal () const { - TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error, - "Before using DistTsqr computational methods, " - "you must first call init() with a valid " - "MessengerBase instance."); - typedef Combine combine_type; - return combine_type::QR_produces_R_factor_with_nonnegative_diagonal() && - reduceBroadcastImpl_->QR_produces_R_factor_with_nonnegative_diagonal(); + bool + QR_produces_R_factor_with_nonnegative_diagonal () const + { + TEUCHOS_TEST_FOR_EXCEPTION + (! ready (), std::logic_error, "Before using DistTsqr " + "computational methods, you must first call init() with a " + "valid MessengerBase instance."); + TEUCHOS_ASSERT( reduceBroadcastImpl_.getRawPtr () != nullptr ); + return reduceBroadcastImpl_-> + QR_produces_R_factor_with_nonnegative_diagonal (); } /// \brief Internode TSQR with explicit Q factor. @@ -198,10 +200,10 @@ namespace TSQR { mat_view_type Q_mine, const bool forceNonnegativeDiagonal=false) { - TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error, - "Before using DistTsqr computational methods, " - "you must first call init() with a valid " - "MessengerBase instance."); + TEUCHOS_TEST_FOR_EXCEPTION + (! ready (), std::logic_error, "Before using DistTsqr " + "computational methods, you must first call init() with a " + "valid MessengerBase instance."); reduceBroadcastImpl_->factorExplicit (R_mine, Q_mine, forceNonnegativeDiagonal); } @@ -214,10 +216,10 @@ namespace TSQR { void getFactorExplicitTimings (std::vector& stats) const { - TEUCHOS_TEST_FOR_EXCEPTION(! 
ready(), std::logic_error, - "Before using DistTsqr computational methods, " - "you must first call init() with a valid " - "MessengerBase instance."); + TEUCHOS_TEST_FOR_EXCEPTION + (! ready (), std::logic_error, "Before using DistTsqr " + "computational methods, you must first call init() with a " + "valid MessengerBase instance."); reduceBroadcastImpl_->getStats (stats); } @@ -229,10 +231,10 @@ namespace TSQR { void getFactorExplicitTimingLabels (std::vector& labels) const { - TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error, - "Before using DistTsqr computational methods, " - "you must first call init() with a valid " - "MessengerBase instance."); + TEUCHOS_TEST_FOR_EXCEPTION + (! ready (), std::logic_error, "Before using DistTsqr " + "computational methods, you must first call init() with a " + "valid MessengerBase instance."); reduceBroadcastImpl_->getStatsLabels (labels); } @@ -262,10 +264,10 @@ namespace TSQR { FactorOutput factor (mat_view_type R_mine) { - TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error, - "Before using DistTsqr computational methods, " - "you must first call init() with a valid " - "MessengerBase instance."); + TEUCHOS_TEST_FOR_EXCEPTION + (! 
ready (), std::logic_error, "Before using DistTsqr " + "computational methods, you must first call init() with a " + "valid MessengerBase instance."); VecVec Q_factors, tau_arrays; DistTsqrHelper helper; const ordinal_type ncols = R_mine.extent(1); @@ -278,9 +280,13 @@ namespace TSQR { const int P = messenger_->size(); const int my_rank = messenger_->rank(); const int first_tag = 0; - std::vector work (ncols); - helper.factor_helper (ncols, R_local, my_rank, 0, P-1, first_tag, - messenger_.get(), Q_factors, tau_arrays, work); + + const ordinal_type lwork = helper.work_size (ncols); + std::vector work (lwork); + helper.factor_helper (ncols, R_local, my_rank, 0, P-1, + first_tag, messenger_.get (), + Q_factors, tau_arrays, + work.data (), lwork); deep_copy (R_mine, R_local_view); return std::make_pair (Q_factors, tau_arrays); } @@ -294,10 +300,10 @@ namespace TSQR { const ordinal_type ldc_mine, const FactorOutput& factor_output) { - TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error, - "Before using DistTsqr computational methods, " - "you must first call init() with a valid " - "MessengerBase instance."); + TEUCHOS_TEST_FOR_EXCEPTION + (! 
ready (), std::logic_error, "Before using DistTsqr " + "computational methods, you must first call init() with a " + "valid MessengerBase instance."); const bool transposed = apply_type.transposed(); TEUCHOS_TEST_FOR_EXCEPTION(transposed, std::logic_error, "DistTsqr: Applying Q^T or Q^H has not yet " @@ -306,18 +312,20 @@ namespace TSQR { const int my_rank = messenger_->rank(); const int first_tag = 0; std::vector C_other (ncols_C * ncols_C); - std::vector work (ncols_C); + DistTsqrHelper helper; + const ordinal_type lwork = helper.work_size (ncols_C); + std::vector work (lwork); const VecVec& Q_factors = factor_output.first; const VecVec& tau_arrays = factor_output.second; // assert (Q_factors.size() == tau_arrays.size()); const int cur_pos = Q_factors.size() - 1; - DistTsqrHelper helper; - helper.apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, - C_other.data(), my_rank, 0, P-1, first_tag, - messenger_.get(), Q_factors, tau_arrays, cur_pos, - work); + + helper.apply_helper (apply_type, ncols_C, ncols_Q, C_mine, + ldc_mine, C_other.data (), my_rank, 0, P-1, + first_tag, messenger_.get (), Q_factors, + tau_arrays, cur_pos, work.data (), lwork); } //! Apply the result of \c factor() to compute the explicit Q factor. @@ -327,20 +335,18 @@ namespace TSQR { const ordinal_type ldq_mine, const FactorOutput& factor_output) { - TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error, - "Before using DistTsqr computational methods, " - "you must first call init() with a valid " - "MessengerBase instance."); - const int myRank = messenger_->rank (); - + TEUCHOS_TEST_FOR_EXCEPTION + (! 
ready (), std::logic_error, "Before using DistTsqr " + "computational methods, you must first call init() with a " + "valid MessengerBase instance."); MatView Q_mine_view (ncols_Q, ncols_Q, Q_mine, ldq_mine); deep_copy (Q_mine_view, scalar_type {}); + + const int myRank = messenger_->rank (); if (myRank == 0) { for (ordinal_type j = 0; j < ncols_Q; ++j) { - // FIXME (26 Nov 2019) Eventually, we only want to write to - // a matrix through a Kokkos kernel or a TPL. - Q_mine[j + j*ldq_mine] = scalar_type (1.0); + Q_mine_view(j,j) = scalar_type (1.0); } } apply (ApplyType::NoTranspose, ncols_Q, ncols_Q, @@ -348,17 +354,18 @@ namespace TSQR { } private: - Teuchos::RCP > messenger_; - Teuchos::RCP > reduceBroadcastImpl_; + Teuchos::RCP> messenger_; + Teuchos::RCP> reduceBroadcastImpl_; /// \brief Whether this object is ready to perform computations. /// /// It is not ready until after \c init() has been called. bool ready() const { - return ! messenger_.is_null() && ! reduceBroadcastImpl_.is_null(); + return ! messenger_.is_null () && + ! 
reduceBroadcastImpl_.is_null (); } }; } // namespace TSQR -#endif // __TSQR_Tsqr_DistTsqr_hpp +#endif // TSQR_DISTTSQR_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp index 67ecc2b31e06..6bb60a160535 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp @@ -37,12 +37,12 @@ // ************************************************************************ //@HEADER -#ifndef __TSQR_Tsqr_DistTsqrHelper_hpp -#define __TSQR_Tsqr_DistTsqrHelper_hpp +#ifndef TSQR_DISTTSQRHELPER_HPP +#define TSQR_DISTTSQRHELPER_HPP #include "Tsqr_MatView.hpp" #include "Tsqr_MessengerBase.hpp" -#include "Tsqr_Combine.hpp" +#include "Tsqr_Impl_CombineUser.hpp" #include "Tsqr_Util.hpp" #include // std::min, std::max @@ -59,53 +59,61 @@ namespace TSQR { /// The only reason to mess with this class is if you want to change /// how the internode part of TSQR is implemented. template - class DistTsqrHelper { + class DistTsqrHelper : + private Impl::CombineUser { public: - DistTsqrHelper () {} + using ordinal_type = LocalOrdinal; + using scalar_type = Scalar; + + ordinal_type work_size (const ordinal_type ncols) { + auto& combine = this->getCombine (ncols); + return combine.work_size (2*ncols, ncols, ncols); + } void - factor_pair (const LocalOrdinal ncols, - std::vector< Scalar >& R_mine, - const LocalOrdinal P_mine, - const LocalOrdinal P_other, - const LocalOrdinal tag, - MessengerBase* const messenger, - std::vector >& Q_factors, - std::vector >& tau_arrays, - std::vector& work) + factor_pair (const ordinal_type ncols, + std::vector& R_mine, + const ordinal_type P_mine, + const ordinal_type P_other, + const ordinal_type tag, + MessengerBase* const messenger, + std::vector>& Q_factors, + std::vector>& tau_arrays, + scalar_type work[], + const ordinal_type lwork) { using std::endl; using std::ostringstream; using std::vector; - - if (P_mine == P_other) + using LO = 
ordinal_type; + if (P_mine == P_other) { return; // nothing to do - + } const int P_top = std::min (P_mine, P_other); const int P_bot = std::max (P_mine, P_other); - const LocalOrdinal nelts = ncols * ncols; - const LocalOrdinal ldr = ncols; - MatView R_mine_view + const LO nelts = ncols * ncols; + const LO ldr = ncols; + MatView R_mine_view (ncols, ncols, R_mine.data (), ldr); - vector< Scalar > R_other (nelts); - MatView R_other_view + vector R_other (nelts); + MatView R_other_view (ncols, ncols, R_other.data (), ldr); - vector< Scalar > tau (ncols); + vector tau (ncols); // Send and receive R factor. - messenger->swapData (R_mine.data(), R_other.data(), + messenger->swapData (R_mine.data (), R_other.data (), nelts, P_other, tag); - Combine combine; + auto& combine = this->getCombine (ncols); if (P_mine == P_top) { combine.factor_pair (R_mine_view, R_other_view, - tau.data(), work.data()); + tau.data(), work, lwork); Q_factors.push_back (R_other); tau_arrays.push_back (tau); } else if (P_mine == P_bot) { combine.factor_pair (R_other_view, R_mine_view, - tau.data(), work.data()); + tau.data (), work, lwork); Q_factors.push_back (R_mine); // Make sure that the "bottom" processor gets the current R // factor, which is returned in R_mine. 
@@ -116,255 +124,265 @@ namespace TSQR { ostringstream os; os << "Should never get here: P_mine (= " << P_mine << ") not one of P_top, P_bot = " << P_top << ", " << P_bot; - throw std::logic_error (os.str()); + throw std::logic_error (os.str ()); } } void - factor_helper (const LocalOrdinal ncols, - std::vector< Scalar >& R_mine, - const LocalOrdinal my_rank, - const LocalOrdinal P_first, - const LocalOrdinal P_last, - const LocalOrdinal tag, - MessengerBase< Scalar >* const messenger, - std::vector< std::vector< Scalar > >& Q_factors, - std::vector< std::vector< Scalar > >& tau_arrays, - std::vector< Scalar >& work) + factor_helper (const ordinal_type ncols, + std::vector& R_mine, + const ordinal_type my_rank, + const ordinal_type P_first, + const ordinal_type P_last, + const ordinal_type tag, + MessengerBase* const messenger, + std::vector>& Q_factors, + std::vector>& tau_arrays, + scalar_type work[], + const ordinal_type lwork) { using std::endl; using std::ostringstream; using std::vector; - if (P_last <= P_first) + if (P_last <= P_first) { return; - else - { - const int P = P_last - P_first + 1; - // Whether the interval [P_first, P_last] has an even number of - // elements. Our interval splitting scheme ensures that the - // interval [P_first, P_mid - 1] always has an even number of - // elements. - const bool b_even = (P % 2 == 0); - // We split the interval [P_first, P_last] into 2 intervals: - // [P_first, P_mid-1], and [P_mid, P_last]. We bias the - // splitting procedure so that the lower interval always has an - // even number of processor ranks, and never has fewer processor - // ranks than the higher interval. - const int P_mid = b_even ? 
(P_first + P/2) : (P_first + P/2 + 1); - - if (my_rank < P_mid) // Interval [P_first, P_mid-1] - { - factor_helper (ncols, R_mine, my_rank, P_first, P_mid - 1, - tag + 1, messenger, Q_factors, tau_arrays, work); - - // If there aren't an even number of processors in the - // original interval, then the last processor in the lower - // interval has to skip this round. - if (b_even || my_rank < P_mid - 1) { - const int my_offset = my_rank - P_first; - const int P_other = P_mid + my_offset; - if (P_other < P_mid || P_other > P_last) { - throw std::logic_error ("P_other not in [P_mid,P_last] range"); - } - factor_pair (ncols, R_mine, my_rank, P_other, tag, - messenger, Q_factors, tau_arrays, work); - } - // If I'm skipping this round, get the "current" R factor - // from P_mid. - if (! b_even && my_rank == P_mid - 1) { - const int theTag = 142; // magic constant - messenger->recv (&R_mine[0], ncols*ncols, P_mid, theTag); - } - } - else // Interval [P_mid, P_last] - { - factor_helper (ncols, R_mine, my_rank, P_mid, P_last, - tag + 1, messenger, Q_factors, tau_arrays, work); - - const int my_offset = my_rank - P_mid; - const int P_other = P_first + my_offset; - - if (P_other < P_first || P_other >= P_mid) - throw std::logic_error ("P_other not in [P_first,P_mid-1] range"); - factor_pair (ncols, R_mine, my_rank, P_other, tag, - messenger, Q_factors, tau_arrays, work); - - // If Proc P_mid-1 is skipping this round, Proc P_mid will - // send it the "current" R factor. - if (! b_even) - { - const int theTag = 142; // magic constant - messenger->send (R_mine.data(), ncols*ncols, P_mid-1, theTag); - } + } + else { + const int P = P_last - P_first + 1; + // Whether the interval [P_first, P_last] has an even number + // of elements. Our interval splitting scheme ensures that + // the interval [P_first, P_mid - 1] always has an even number + // of elements. 
+ const bool b_even = (P % 2 == 0); + // We split the interval [P_first, P_last] into 2 intervals: + // [P_first, P_mid-1], and [P_mid, P_last]. We bias the + // splitting procedure so that the lower interval always has + // an even number of processor ranks, and never has fewer + // processor ranks than the higher interval. + const int P_mid = b_even ? (P_first + P/2) : (P_first + P/2 + 1); + + if (my_rank < P_mid) { // Interval [P_first, P_mid-1] + factor_helper (ncols, R_mine, my_rank, P_first, P_mid - 1, + tag + 1, messenger, Q_factors, tau_arrays, + work, lwork); + + // If there aren't an even number of processors in the + // original interval, then the last processor in the lower + // interval has to skip this round. + if (b_even || my_rank < P_mid - 1) { + const int my_offset = my_rank - P_first; + const int P_other = P_mid + my_offset; + if (P_other < P_mid || P_other > P_last) { + throw std::logic_error ("P_other not in [P_mid,P_last] range"); } + factor_pair (ncols, R_mine, my_rank, P_other, tag, + messenger, Q_factors, tau_arrays, + work, lwork); + } + // If I'm skipping this round, get the "current" R factor + // from P_mid. + if (! b_even && my_rank == P_mid - 1) { + const int theTag = 142; // magic constant + messenger->recv (R_mine.data (), ncols*ncols, P_mid, + theTag); + } } + else { // Interval [P_mid, P_last] + factor_helper (ncols, R_mine, my_rank, P_mid, P_last, + tag + 1, messenger, Q_factors, tau_arrays, + work, lwork); + const int my_offset = my_rank - P_mid; + const int P_other = P_first + my_offset; + + if (P_other < P_first || P_other >= P_mid) { + throw std::logic_error ("P_other not in [P_first," + "P_mid-1] range"); + } + factor_pair (ncols, R_mine, my_rank, P_other, tag, + messenger, Q_factors, tau_arrays, work, lwork); + + // If Proc P_mid-1 is skipping this round, Proc P_mid will + // send it the "current" R factor. + if (! 
b_even) { + const int theTag = 142; // magic constant + messenger->send (R_mine.data(), ncols*ncols, P_mid-1, theTag); + } + } + } } void apply_pair (const ApplyType& apply_type, - const LocalOrdinal ncols_C, - const LocalOrdinal ncols_Q, - Scalar C_mine[], - const LocalOrdinal ldc_mine, - Scalar C_other[], // contiguous ncols_C x ncols_C scratch - const LocalOrdinal P_mine, - const LocalOrdinal P_other, - const LocalOrdinal tag, - MessengerBase< Scalar >* const messenger, - const std::vector< Scalar >& Q_cur, - const std::vector< Scalar >& tau_cur, - std::vector< Scalar >& work) + const ordinal_type ncols_C, + const ordinal_type ncols_Q, + scalar_type C_mine[], + const ordinal_type ldc_mine, + scalar_type C_other[], // contiguous ncols_C x ncols_C scratch + const ordinal_type P_mine, + const ordinal_type P_other, + const ordinal_type tag, + MessengerBase* const messenger, + const std::vector& Q_cur, + const std::vector& tau_cur, + scalar_type work[], + const ordinal_type lwork) { using std::endl; using std::ostringstream; using std::vector; + using LO = ordinal_type; + using const_mat_view_type = MatView; + using mat_view_type = MatView; - if (P_mine == P_other) + if (P_mine == P_other) { return; // nothing to do - + } const int P_top = std::min (P_mine, P_other); const int P_bot = std::max (P_mine, P_other); - - const LocalOrdinal nelts = ncols_C * ncols_C; - const LocalOrdinal ldq = ncols_Q; - const LocalOrdinal ldc_other = ncols_C; + const LO nelts = ncols_C * ncols_C; + const LO ldq = ncols_Q; + const LO ldc_other = ncols_C; // Send and receive C_mine resp. C_other to the other processor of // the pair. 
- messenger->swapData (&C_mine[0], &C_other[0], nelts, P_other, tag); + messenger->swapData (C_mine, C_other, nelts, P_other, tag); - Combine< LocalOrdinal, Scalar > combine; - if (P_mine == P_top) - combine.apply_pair (apply_type, ncols_C, ncols_Q, &Q_cur[0], ldq, - &tau_cur[0], C_mine, ldc_mine, C_other, ldc_other, - &work[0]); - else if (P_mine == P_bot) - combine.apply_pair (apply_type, ncols_C, ncols_Q, &Q_cur[0], ldq, - &tau_cur[0], C_other, ldc_other, C_mine, ldc_mine, - &work[0]); - else - { - ostringstream os; - os << "Should never get here: P_mine (= " << P_mine - << ") not one of P_top, P_bot = " << P_top << ", " << P_bot; - throw std::logic_error (os.str()); - } + const_mat_view_type Q_bot + (ncols_Q, ncols_Q, Q_cur.data (), ldq); + auto& combine = this->getCombine (std::max (ncols_Q, ncols_C)); + if (P_mine == P_top) { + mat_view_type C_top (ncols_Q, ncols_C, C_mine, ldc_mine); + mat_view_type C_bot (ncols_Q, ncols_C, C_other, ldc_other); + combine.apply_pair (apply_type, Q_bot, tau_cur.data (), + C_top, C_bot, work, lwork); + } + else if (P_mine == P_bot) { + mat_view_type C_top (ncols_Q, ncols_C, C_other, ldc_other); + mat_view_type C_bot (ncols_Q, ncols_C, C_mine, ldc_mine); + combine.apply_pair (apply_type, Q_bot, tau_cur.data (), + C_top, C_bot, work, lwork); + } + else { + ostringstream os; + os << "Should never get here: P_mine (= " << P_mine + << ") not one of P_top, P_bot = " << P_top << ", " + << P_bot; + throw std::logic_error (os.str ()); + } } void apply_helper (const ApplyType& apply_type, - const LocalOrdinal ncols_C, - const LocalOrdinal ncols_Q, - Scalar C_mine[], - const LocalOrdinal ldc_mine, - Scalar C_other[], // contiguous ncols_C x ncols_C scratch - const LocalOrdinal my_rank, - const LocalOrdinal P_first, - const LocalOrdinal P_last, - const LocalOrdinal tag, - MessengerBase< Scalar >* const messenger, - const std::vector< std::vector< Scalar > >& Q_factors, - const std::vector< std::vector< Scalar > >& tau_arrays, - const 
LocalOrdinal cur_pos, - std::vector< Scalar >& work) + const ordinal_type ncols_C, + const ordinal_type ncols_Q, + scalar_type C_mine[], + const ordinal_type ldc_mine, + scalar_type C_other[], // contiguous ncols_C x ncols_C scratch + const ordinal_type my_rank, + const ordinal_type P_first, + const ordinal_type P_last, + const ordinal_type tag, + MessengerBase* const messenger, + const std::vector>& Q_factors, + const std::vector>& tau_arrays, + const ordinal_type cur_pos, + scalar_type work[], + const ordinal_type lwork) { using std::endl; using std::ostringstream; using std::vector; - if (P_last <= P_first) + if (P_last <= P_first) { return; - else - { - const int P = P_last - P_first + 1; - // Whether the interval [P_first, P_last] has an even number of - // elements. Our interval splitting scheme ensures that the - // interval [P_first, P_mid - 1] always has an even number of - // elements. - const bool b_even = (P % 2 == 0); - // We split the interval [P_first, P_last] into 2 intervals: - // [P_first, P_mid-1], and [P_mid, P_last]. We bias the - // splitting procedure so that the lower interval always has an - // even number of processor ranks, and never has fewer processor - // ranks than the higher interval. - const int P_mid = b_even ? (P_first + P/2) : (P_first + P/2 + 1); - - if (my_rank < P_mid) // Interval [P_first, P_mid - 1] - { - const bool b_participating = b_even || my_rank < P_mid - 1; - - if (cur_pos < 0) - { - ostringstream os; - os << "On Proc " << my_rank << ": cur_pos (= " << cur_pos - << ") < 0; lower interval [" << P_first << "," << (P_mid-1) - << "]; original interval [" << P_first << "," << P_last - << "]" << endl; - throw std::logic_error (os.str()); - } - - // If there aren't an even number of processors in the - // original interval, then the last processor in the lower - // interval has to skip this round. 
Since we skip this - // round, don't decrement cur_pos (else we'll skip an entry - // and eventually fall off the front of the array. - int new_cur_pos; - if (b_even || my_rank < P_mid - 1) - { - if (! b_participating) - throw std::logic_error("Should never get here"); - - const int my_offset = my_rank - P_first; - const int P_other = P_mid + my_offset; - // assert (P_mid <= P_other && P_other <= P_last); - if (P_other < P_mid || P_other > P_last) - throw std::logic_error("Should never get here"); - - apply_pair (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, - C_other, my_rank, P_other, tag, messenger, - Q_factors[cur_pos], tau_arrays[cur_pos], work); - new_cur_pos = cur_pos - 1; - } - else - { - if (b_participating) - throw std::logic_error("Should never get here"); - - new_cur_pos = cur_pos; - } - apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, - C_other, my_rank, P_first, P_mid - 1, tag + 1, - messenger, Q_factors, tau_arrays, new_cur_pos, - work); + } + else { + const int P = P_last - P_first + 1; + // Whether the interval [P_first, P_last] has an even number + // of elements. Our interval splitting scheme ensures that + // the interval [P_first, P_mid - 1] always has an even number + // of elements. + const bool b_even = (P % 2 == 0); + // We split the interval [P_first, P_last] into 2 intervals: + // [P_first, P_mid-1], and [P_mid, P_last]. We bias the + // splitting procedure so that the lower interval always has + // an even number of processor ranks, and never has fewer + // processor ranks than the higher interval. + const int P_mid = b_even ? 
(P_first + P/2) : (P_first + P/2 + 1); + + if (my_rank < P_mid) { // Interval [P_first, P_mid - 1] + const bool b_participating = b_even || my_rank < P_mid - 1; + + if (cur_pos < 0) { + ostringstream os; + os << "On Proc " << my_rank << ": cur_pos (= " << cur_pos + << ") < 0; lower interval [" << P_first << "," << (P_mid-1) + << "]; original interval [" << P_first << "," << P_last + << "]" << endl; + throw std::logic_error (os.str()); + } + + // If there aren't an even number of processors in the + // original interval, then the last processor in the lower + // interval has to skip this round. Since we skip this + // round, don't decrement cur_pos (else we'll skip an entry + // and eventually fall off the front of the array. + int new_cur_pos; + if (b_even || my_rank < P_mid - 1) { + if (! b_participating) { + throw std::logic_error("Should never get here"); } - else - { - if (cur_pos < 0) - { - ostringstream os; - os << "On Proc " << my_rank << ": cur_pos (= " << cur_pos - << ") < 0; upper interval [" << P_mid << "," << P_last - << "]; original interval [" << P_first << "," << P_last - << "]" << endl; - throw std::logic_error (os.str()); - } - const int my_offset = my_rank - P_mid; - const int P_other = P_first + my_offset; - // assert (0 <= P_other && P_other < P_mid); - apply_pair (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, - C_other, my_rank, P_other, tag, messenger, - Q_factors[cur_pos], tau_arrays[cur_pos], work); - apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, - C_other, my_rank, P_mid, P_last, tag + 1, - messenger, Q_factors, tau_arrays, cur_pos - 1, - work); + const int my_offset = my_rank - P_first; + const int P_other = P_mid + my_offset; + // assert (P_mid <= P_other && P_other <= P_last); + if (P_other < P_mid || P_other > P_last) { + throw std::logic_error("Should never get here"); + } + apply_pair (apply_type, ncols_C, ncols_Q, C_mine, + ldc_mine, C_other, my_rank, P_other, + tag, messenger, Q_factors[cur_pos], + 
tau_arrays[cur_pos], work, lwork); + new_cur_pos = cur_pos - 1; + } + else { + if (b_participating) { + throw std::logic_error("Should never get here"); } + new_cur_pos = cur_pos; + } + apply_helper (apply_type, ncols_C, ncols_Q, C_mine, + ldc_mine, C_other, my_rank, P_first, + P_mid - 1, tag + 1, messenger, Q_factors, + tau_arrays, new_cur_pos, work, lwork); } + else { + if (cur_pos < 0) { + ostringstream os; + os << "On Proc " << my_rank << ": cur_pos (= " << cur_pos + << ") < 0; upper interval [" << P_mid << "," << P_last + << "]; original interval [" << P_first << "," << P_last + << "]" << endl; + throw std::logic_error (os.str ()); + } + + const int my_offset = my_rank - P_mid; + const int P_other = P_first + my_offset; + // assert (0 <= P_other && P_other < P_mid); + apply_pair (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, + C_other, my_rank, P_other, tag, messenger, + Q_factors[cur_pos], tau_arrays[cur_pos], + work, lwork); + apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine, + C_other, my_rank, P_mid, P_last, tag + 1, + messenger, Q_factors, tau_arrays, cur_pos - 1, + work, lwork); + } + } } }; } // namespace TSQR -#endif // __TSQR_Tsqr_DistTsqrHelper_hpp +#endif // TSQR_DISTTSQRHELPER_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp index 10035b80c6df..472fd700142c 100644 --- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp @@ -39,11 +39,11 @@ //@HEADER */ -#ifndef __TSQR_DistTsqrRB_hpp -#define __TSQR_DistTsqrRB_hpp +#ifndef TSQR_DISTTSQRRB_HPP +#define TSQR_DISTTSQRRB_HPP #include "Tsqr_ApplyType.hpp" -#include "Tsqr_Combine.hpp" +#include "Tsqr_Impl_CombineUser.hpp" #include "Tsqr_Matrix.hpp" #include "Tsqr_StatTimeMonitor.hpp" @@ -129,7 +129,6 @@ namespace TSQR { }; } // namespace details - /// \class DistTsqrRB /// \brief Reduce-and-Broadcast (RB) version of DistTsqr. 
/// \author Mark Hoemmen @@ -146,15 +145,15 @@ namespace TSQR { /// broadcast. The implicit Q factor data stay on the MPI process /// where they were computed. template - class DistTsqrRB { + class DistTsqrRB : private Impl::CombineUser { public: - typedef LocalOrdinal ordinal_type; - typedef Scalar scalar_type; - typedef typename Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type; - typedef MatView mat_view_type; - typedef Matrix matrix_type; - typedef int rank_type; - typedef Combine combine_type; + using ordinal_type = LocalOrdinal; + using scalar_type = Scalar; + using magnitude_type = + typename Teuchos::ScalarTraits::magnitudeType; + using mat_view_type = MatView; + using matrix_type = Matrix; + using rank_type = int; /// \brief Constructor /// @@ -193,10 +192,10 @@ namespace TSQR { /// timings from factorExplicit(). The vector gets resized if /// necessary to fit all the labels. void - getStatsLabels (std::vector< std::string >& labels) const + getStatsLabels (std::vector& labels) const { const int numTimers = 5; - labels.resize (std::max (labels.size(), static_cast(numTimers))); + labels.resize (std::max (labels.size (), size_t (numTimers))); labels[0] = totalTime_->name(); labels[1] = reduceCommTime_->name(); @@ -208,7 +207,12 @@ namespace TSQR { /// Whether or not all diagonal entries of the R factor computed /// by the QR factorization are guaranteed to be nonnegative. bool QR_produces_R_factor_with_nonnegative_diagonal () const { - return combine_type::QR_produces_R_factor_with_nonnegative_diagonal(); + // FIXME (20 Dec 2019) If the combine type is dynamic, we can't + // answer this question without knowing the number of columns. + // Just guess for now. 
+ constexpr LocalOrdinal fakeNumCols = 10; + auto& c = this->getCombine (fakeNumCols); + return c.QR_produces_R_factor_with_nonnegative_diagonal (); } /// \brief Internode TSQR with explicit Q factor @@ -244,25 +248,23 @@ namespace TSQR { // R_mine has columns, but Q_mine may have any number of // columns. (It depends on how many columns of the explicit Q // factor we want to compute.) - if (R_mine.extent(0) < R_mine.extent(1)) - { - std::ostringstream os; - os << "R factor input has fewer rows (" << R_mine.extent(0) - << ") than columns (" << R_mine.extent(1) << ")"; - // This is a logic error because TSQR users should not be - // calling this method directly. - throw std::logic_error (os.str()); - } - else if (Q_mine.extent(0) != R_mine.extent(1)) - { - std::ostringstream os; - os << "Q factor input must have the same number of rows as the R " - "factor input has columns. Q has " << Q_mine.extent(0) - << " rows, but R has " << R_mine.extent(1) << " columns."; - // This is a logic error because TSQR users should not be - // calling this method directly. - throw std::logic_error (os.str()); - } + if (R_mine.extent(0) < R_mine.extent(1)) { + std::ostringstream os; + os << "R factor input has fewer rows (" << R_mine.extent(0) + << ") than columns (" << R_mine.extent(1) << ")"; + // This is a logic error because TSQR users should not be + // calling this method directly. + throw std::logic_error (os.str()); + } + else if (Q_mine.extent(0) != R_mine.extent(1)) { + std::ostringstream os; + os << "Q factor input must have the same number of rows as the R " + "factor input has columns. Q has " << Q_mine.extent(0) + << " rows, but R has " << R_mine.extent(1) << " columns."; + // This is a logic error because TSQR users should not be + // calling this method directly. + throw std::logic_error (os.str()); + } // The factorization is a recursion over processors [P_first, P_last]. 
const rank_type P_mine = messenger_->rank(); @@ -389,13 +391,13 @@ namespace TSQR { recv_R (R_other, P_mid); std::vector tau (numCols); - // Don't shrink the workspace array; doing so may - // require expensive reallocation every time we send / - // receive data. - resizeWork (numCols); - combine_.factor_pair (R_mine, R_other.view (), - tau.data(), work_.data()); + auto& combine = this->getCombine (numCols); + const ordinal_type lwork = + combine.work_size (2 * numCols, numCols, numCols); + work_.resize (lwork); + combine.factor_pair (R_mine, R_other.view (), + tau.data (), work_.data (), lwork); QFactors.push_back (R_other); tauArrays.push_back (tau); } @@ -413,9 +415,11 @@ namespace TSQR { const rank_type P_first, const rank_type P_last, const rank_type curpos, - std::vector< matrix_type >& QFactors, - std::vector< std::vector< scalar_type > >& tauArrays) + std::vector& QFactors, + std::vector>& tauArrays) { + using LO = LocalOrdinal; + if (P_last < P_first) { std::ostringstream os; os << "explicitQBroadcast: interval [P_first=" << P_first @@ -444,8 +448,8 @@ namespace TSQR { throw std::logic_error (os.str()); } // Q_impl, tau: implicitly stored local Q factor. - matrix_type& Q_impl = QFactors[curpos]; - std::vector& tau = tauArrays[curpos]; + auto Q_bot = QFactors[curpos].view (); + const scalar_type* tau = tauArrays[curpos].data (); // Apply implicitly stored local Q factor to // [Q_mine; @@ -453,13 +457,18 @@ namespace TSQR { // where Q_other = zeros(Q_mine.extent(0), Q_mine.extent(1)). // Overwrite both Q_mine and Q_other with the result. 
deep_copy (Q_other, scalar_type {}); - combine_.apply_pair (ApplyType::NoTranspose, - Q_mine.extent(1), Q_impl.extent(1), - Q_impl.data(), Q_impl.stride(1), - tau.data(), - Q_mine.data(), Q_mine.stride(1), - Q_other.data(), Q_other.stride(1), - work_.data()); + + const LO pair_nrows + (Q_mine.extent (0) + Q_other.extent (0)); + const LO pair_ncols (Q_mine.extent (1)); + auto& combine = this->getCombine (pair_ncols); + const LO lwork = + combine.work_size (pair_nrows, pair_ncols, pair_ncols); + if (lwork > LO (work_.size ())) { + work_.resize (lwork); + } + combine.apply_pair (ApplyType::NoTranspose, Q_bot, tau, + Q_mine, Q_other, work_.data (), lwork); // Send the resulting Q_other, and the final R factor, to P_mid. send_Q_R (Q_other, R_mine, P_mid); newpos = curpos - 1; @@ -476,9 +485,9 @@ namespace TSQR { newpos, QFactors, tauArrays); } else { // Interval [P_mid, P_last] - explicitQBroadcast (R_mine, Q_mine, Q_other, - P_mine, P_mid, P_last, - newpos, QFactors, tauArrays); + explicitQBroadcast (R_mine, Q_mine, Q_other, + P_mine, P_mid, P_last, + newpos, QFactors, tauArrays); } } } @@ -499,14 +508,15 @@ namespace TSQR { // Don't shrink the workspace array; doing so would still be // correct, but may require reallocation of data when it needs // to grow again. - resizeWork (numElts); + work_.resize (numElts); // Pack the Q data into the workspace array. - mat_view_type Q_contig (Q.extent(0), Q.extent(1), work_.data(), Q.extent(0)); + mat_view_type Q_contig (Q.extent (0), Q.extent (1), + work_.data (), Q.extent (0)); deep_copy (Q_contig, Q); // Pack the R data into the workspace array. pack_R (R, &work_[Q_size]); - messenger_->send (work_.data(), numElts, destProc, 0); + messenger_->send (work_.data (), numElts, destProc, 0); } template< class MatrixType1, class MatrixType2 > @@ -525,12 +535,13 @@ namespace TSQR { // Don't shrink the workspace array; doing so would still be // correct, but may require reallocation of data when it needs // to grow again. 
- resizeWork (numElts); + work_.resize (numElts); - messenger_->recv (work_.data(), numElts, srcProc, 0); + messenger_->recv (work_.data (), numElts, srcProc, 0); // Unpack the C data from the workspace array. - deep_copy (Q, mat_view_type (Q.extent(0), Q.extent(1), work_.data(), Q.extent(0))); + deep_copy (Q, mat_view_type (Q.extent (0), Q.extent (1), + work_.data (), Q.extent (0))); // Unpack the R data from the workspace array. unpack_R (R, &work_[Q_size]); } @@ -547,10 +558,10 @@ namespace TSQR { // Don't shrink the workspace array; doing so would still be // correct, but may require reallocation of data when it needs // to grow again. - resizeWork (numElts); + work_.resize (numElts); // Pack the R data into the workspace array. - pack_R (R, work_.data()); - messenger_->send (work_.data(), numElts, destProc, 0); + pack_R (R, work_.data ()); + messenger_->send (work_.data (), numElts, destProc, 0); } template< class MatrixType > @@ -565,23 +576,26 @@ namespace TSQR { // Don't shrink the workspace array; doing so would still be // correct, but may require reallocation of data when it needs // to grow again. - resizeWork (numElts); - messenger_->recv (work_.data(), numElts, srcProc, 0); + work_.resize (numElts); + messenger_->recv (work_.data (), numElts, srcProc, 0); // Unpack the R data from the workspace array. - unpack_R (R, work_.data()); + unpack_R (R, work_.data ()); } template< class MatrixType > static void unpack_R (MatrixType& R, const scalar_type buf[]) { + // FIXME (mfh 08 Dec 2019) Rewrite to use deep_copy; we don't + // want to access Matrix or MatView entries on host directly any + // more. 
ordinal_type curpos = 0; - for (ordinal_type j = 0; j < R.extent(1); ++j) - { - scalar_type* const R_j = &R(0, j); - for (ordinal_type i = 0; i <= j; ++i) - R_j[i] = buf[curpos++]; + for (ordinal_type j = 0; j < R.extent(1); ++j) { + scalar_type* const R_j = &R(0, j); + for (ordinal_type i = 0; i <= j; ++i) { + R_j[i] = buf[curpos++]; } + } } template< class ConstMatrixType > @@ -589,37 +603,33 @@ namespace TSQR { pack_R (const ConstMatrixType& R, scalar_type buf[]) { ordinal_type curpos = 0; - for (ordinal_type j = 0; j < R.extent(1); ++j) - { - const scalar_type* const R_j = &R(0, j); - for (ordinal_type i = 0; i <= j; ++i) - buf[curpos++] = R_j[i]; + for (ordinal_type j = 0; j < R.extent(1); ++j) { + const scalar_type* const R_j = &R(0, j); + for (ordinal_type i = 0; i <= j; ++i) { + buf[curpos++] = R_j[i]; } - } - - void - resizeWork (const ordinal_type numElts) - { - typedef typename std::vector< scalar_type >::size_type vec_size_type; - work_.resize (std::max (work_.size(), static_cast< vec_size_type >(numElts))); + } } private: - combine_type combine_; - Teuchos::RCP< MessengerBase< scalar_type > > messenger_; - std::vector< scalar_type > work_; + Teuchos::RCP> messenger_; + std::vector work_; // Timers for various phases of the factorization. Time is // cumulative over all calls of factorExplicit(). 
- Teuchos::RCP< Teuchos::Time > totalTime_; - Teuchos::RCP< Teuchos::Time > reduceCommTime_; - Teuchos::RCP< Teuchos::Time > reduceTime_; - Teuchos::RCP< Teuchos::Time > bcastCommTime_; - Teuchos::RCP< Teuchos::Time > bcastTime_; - - TimeStats totalStats_, reduceCommStats_, reduceStats_, bcastCommStats_, bcastStats_; + Teuchos::RCP totalTime_; + Teuchos::RCP reduceCommTime_; + Teuchos::RCP reduceTime_; + Teuchos::RCP bcastCommTime_; + Teuchos::RCP bcastTime_; + + TimeStats totalStats_; + TimeStats reduceCommStats_; + TimeStats reduceStats_; + TimeStats bcastCommStats_; + TimeStats bcastStats_; }; } // namespace TSQR -#endif // __TSQR_DistTsqrRB_hpp +#endif // TSQR_DISTTSQRRB_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp index 89f91f788cdc..f9d3647e3a21 100644 --- a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp @@ -37,17 +37,20 @@ // ************************************************************************ //@HEADER -#ifndef __TSQR_Test_FullTsqrTest_hpp -#define __TSQR_Test_FullTsqrTest_hpp +#ifndef TSQR_TEST_FULLTSQRTEST_HPP +#define TSQR_TEST_FULLTSQRTEST_HPP #include "Tsqr.hpp" +#include "Tsqr_NodeTsqrFactory.hpp" #include "Tsqr_Random_NormalGenerator.hpp" #include "Tsqr_Random_GlobalMatrix.hpp" +#include "Tsqr_SequentialTsqr.hpp" #include "Tsqr_TestSetup.hpp" #include "Tsqr_GlobalVerify.hpp" #include "Tsqr_TeuchosMessenger.hpp" #include "Tsqr_TestUtils.hpp" #include "Teuchos_ScalarTraits.hpp" +#include "Teuchos_TypeNameTraits.hpp" #include #include @@ -56,93 +59,181 @@ namespace TSQR { namespace Test { - /// \class TsqrInaccurate - /// \brief Signals that a TSQR test failed due to insufficient accuracy. - class TsqrInaccurate : public std::exception { - public: - //! Constructor - TsqrInaccurate (const std::string& msg) : msg_ (msg) {} - - //! The error message - const char* what() const throw() { return msg_.c_str(); } - - //! 
Destructor (declared virtual for memory safety of subclasses). - virtual ~TsqrInaccurate() throw() {} - - private: - std::string msg_; - }; + template + using kokkos_value_type = typename std::conditional< + std::is_const::value, + const typename Kokkos::ArithTraits< + typename std::remove_const::type>::val_type, + typename Kokkos::ArithTraits::val_type + >::type; + + template + Kokkos::View**, + Kokkos::LayoutLeft, Kokkos::HostSpace, + Kokkos::MemoryTraits> + getHostMatrixView (const MatView& A) + { + using Kokkos::ALL; + using Kokkos::subview; + using IST = kokkos_value_type; + using host_mat_view_type = + Kokkos::View>; + + const size_t nrows (A.extent (0)); + const size_t ncols (A.extent (1)); + const size_t lda (A.stride (1)); + IST* A_raw = reinterpret_cast (A.data ()); + host_mat_view_type A_full (A_raw, lda, ncols); + const std::pair rowRange (0, nrows); + return Kokkos::subview (A_full, rowRange, Kokkos::ALL ()); + } + + template + Kokkos::View::val_type**, + Kokkos::LayoutLeft> + getDeviceMatrixCopy (const MatView& A, + const std::string& label) + { + using Kokkos::view_alloc; + using Kokkos::WithoutInitializing; + using IST = typename Kokkos::ArithTraits::val_type; + using device_matrix_type = + Kokkos::View; + + const size_t nrows (A.extent (0)); + const size_t ncols (A.extent (1)); + device_matrix_type A_dev + (view_alloc (label, WithoutInitializing), nrows, ncols); + auto A_host = getHostMatrixView (A); + Kokkos::deep_copy (A_dev, A_host); + return A_dev; + } /// \class FullTsqrVerifier - /// \brief Test (correctness and) accuracy of Tsqr for one Scalar type. + /// \brief Test (correctness and) accuracy of Tsqr for one Scalar + /// type. /// \author Mark Hoemmen /// - /// This class is meant to be used only by \c - /// FullTsqrVerifierCaller. It performs one accuracy test of \c - /// Tsqr for the given Scalar type (that is, the type of the - /// matrix entries). An accuracy test is also a correctness test. 
- /// This test computes accuracy bounds for both orthogonality and - /// forward errors, and if those bounds are exceeded and the - /// failIfInaccurate option is enabled, the test will throw a \c + /// \tparam Scalar Type of each matrix entry. + /// + /// This class is meant to be used only by FullTsqrVerifierCaller. + /// It performs one accuracy test of Tsqr for the given Scalar + /// type. An accuracy test is also a correctness test. This test + /// computes accuracy bounds for both orthogonality and forward + /// errors, and if those bounds are exceeded and the + /// failIfInaccurate option is enabled, the test will throw a /// TsqrInaccurate exception. /// - /// The test takes a \c Teuchos::ParameterList input. For a + /// The test takes a Teuchos::ParameterList input. For a /// ParameterList with all parameters, their default values, and - /// documentation, see the relevant class method in \c + /// documentation, see the relevant class method in /// FullTsqrVerifierCaller. - /// - /// This class currently only tests the version of Tsqr that is - /// the composition of NodeTsqrType=SequentialTsqr and - /// DistTsqrType=DistTsqr. This should suffice to test - /// correctness, as long as the other NodeTsqrType possibilities - /// (such as TbbTsqr) are tested separately. 
- /// template class FullTsqrVerifier { public: - typedef Scalar scalar_type; - typedef int ordinal_type; - typedef SequentialTsqr node_tsqr_type; - typedef DistTsqr dist_tsqr_type; - typedef Tsqr tsqr_type; + using scalar_type = Scalar; + using ordinal_type = int; + using node_tsqr_type = NodeTsqr; + using dist_tsqr_type = DistTsqr; + using tsqr_type = Tsqr; private: + static Teuchos::RCP + getNodeTsqr (const Teuchos::RCP& testParams, + const int myRank, + const bool verbose, + const std::string inputPrefix) + { + using Teuchos::RCP; + using Teuchos::rcp; + using Teuchos::rcp_implicit_cast; + using std::cerr; + using std::endl; + using device_type = + Kokkos::DefaultExecutionSpace::device_type; + const char cacheSizeHintParamName[] = "Cache Size Hint"; + const std::string prefix = inputPrefix + " "; + + auto nodeTsqrParams = Teuchos::parameterList ("NodeTsqr"); + + size_t cacheSizeHint = 0; + if (testParams->isType (cacheSizeHintParamName)) { + cacheSizeHint = + testParams->get (cacheSizeHintParamName); + nodeTsqrParams->set (cacheSizeHintParamName, cacheSizeHint); + } + else if (testParams->isType (cacheSizeHintParamName)) { + cacheSizeHint = static_cast + (testParams->get (cacheSizeHintParamName)); + nodeTsqrParams->set (cacheSizeHintParamName, cacheSizeHint); + } + + std::string nodeTsqrName ("Default"); + if (testParams->isType ("NodeTsqr")) { + nodeTsqrName = testParams->get ("NodeTsqr"); + } + if (myRank == 0 && verbose) { + cerr << prefix << "getNodeTsqr:" << endl + << prefix << " - NodeTsqr: " << nodeTsqrName << endl + << prefix << " - Cache Size Hint: " << cacheSizeHint + << endl; + } + + RCP nodeTsqr; + using node_tsqr_factory_type = TSQR::NodeTsqrFactory< + scalar_type, ordinal_type, device_type>; + nodeTsqr = node_tsqr_factory_type::getNodeTsqr (nodeTsqrName); + TEUCHOS_ASSERT( ! 
nodeTsqr.is_null () ); + + if (myRank == 0 && verbose) { + using execution_space = device_type::execution_space; + const std::string spaceName = + Teuchos::TypeNameTraits::name (); + const std::string myPrefix = prefix + " * "; + + cerr << myPrefix << "execution_space: " << spaceName << endl + << myPrefix << "concurrency: " + << execution_space ().concurrency () << endl + << myPrefix << "Requested NodeTsqr subclass type: " + << nodeTsqrName << endl + << myPrefix << "Actual NodeTsqr subclass type: " + << Teuchos::typeName (*nodeTsqr) << endl; + } + return nodeTsqr; + } //! Instantiate and return a (full) Tsqr instance. static Teuchos::RCP getTsqr (const Teuchos::RCP& testParams, - const Teuchos::RCP >& comm) + const Teuchos::RCP >& comm, + const bool verbose) { - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::rcp_implicit_cast; using Teuchos::RCP; using Teuchos::rcp; + using Teuchos::rcp_implicit_cast; + using std::cerr; + using std::endl; + const int myRank = comm->getRank (); - const size_t cacheSizeHint = testParams->get ("cacheSizeHint"); - //const int numTasks = testParams->get ("numTasks"); - - //RCP tsqrParams = parameterList ("NodeTsqr"); - //tsqrParams->set ("Cache Size Hint", cacheSizeHint); - //tsqrParams->set ("Num Tasks", numCores); - - // TODO (mfh 21 Oct 2011) Some node_tsqr_type classes need a - // Kokkos Node instance. SequentialTsqr doesn't, so this code - // should be fine for now. 
- RCP seqTsqr = rcp (new node_tsqr_type (cacheSizeHint)); + const std::string prefix (" "); - RCP > scalarMess = + if (myRank == 0 && verbose) { + cerr << prefix << "- Set up TSQR::Tsqr instance" << endl; + } + auto nodeTsqr = + getNodeTsqr (testParams, myRank, verbose, prefix); + auto scalarMess = rcp (new TeuchosMessenger (comm)); - RCP > scalarMessBase = - rcp_implicit_cast > (scalarMess); - RCP distTsqr = rcp (new dist_tsqr_type); + auto scalarMessBase = + rcp_implicit_cast> (scalarMess); + RCP distTsqr (new dist_tsqr_type); distTsqr->init (scalarMessBase); - return rcp (new tsqr_type (seqTsqr, distTsqr)); + return rcp (new tsqr_type (nodeTsqr, distTsqr)); } public: - /// \brief Run the test for the Scalar type. /// /// \param comm [in] Communicator over which to run the test. @@ -151,7 +242,9 @@ namespace TSQR { /// \param randomSeed [in/out] On input: the random seed for /// LAPACK's pseudorandom number generator. On output: the /// updated random seed. - static void + /// + /// \return Whether the test passed. + static bool run (const Teuchos::RCP >& comm, const Teuchos::RCP& testParams, std::vector& randomSeed) @@ -159,31 +252,52 @@ namespace TSQR { using std::cerr; using std::cout; using std::endl; - using Teuchos::arcp; using Teuchos::ParameterList; using Teuchos::parameterList; using Teuchos::RCP; using Teuchos::rcp; - using Teuchos::rcp_const_cast; using Teuchos::rcp_implicit_cast; - typedef Matrix matrix_type; - typedef MatView mat_view_type; - typedef typename tsqr_type::FactorOutput factor_output_type; - - const int myRank = Teuchos::rank (*comm); - const int numProcs = Teuchos::size (*comm); - - // Construct TSQR implementation instance. - RCP tsqr = getTsqr (testParams, comm); + using matrix_type = Matrix; + using mat_view_type = MatView; + + bool success = true; + + TEUCHOS_ASSERT( ! comm.is_null () ); + TEUCHOS_ASSERT( ! 
testParams.is_null () ); + + const int myRank = comm->getRank (); + const int numProcs = comm->getSize (); + const bool verbose = testParams->get ("verbose"); + const ordinal_type numRowsLocal = + testParams->get ("numRowsLocal"); + const ordinal_type numCols = + testParams->get ("numCols"); + //const int numCores = testParams->get ("numCores"); + const bool contiguousCacheBlocks = + testParams->get ("contiguousCacheBlocks"); + const bool testFactorExplicit = + testParams->get ("testFactorExplicit"); + const bool testRankRevealing = + testParams->get ("testRankRevealing"); + + if (myRank == 0 && verbose) { + cerr << "Full TSQR test: Scalar=" + << Teuchos::TypeNameTraits::name () << endl + << " - Command-line arguments:" << endl + << " * numRowsLocal: " << numRowsLocal << endl + << " * numCols: " << numCols << endl + << " * contiguousCacheBlocks: " + << (contiguousCacheBlocks ? "true" : "false") << endl + << " * testFactorExplicit: " + << (testFactorExplicit ? "true" : "false") << endl + << " * testRankRevealing: " + << (testRankRevealing ? "true" : "false") << endl + << " * verbose: " + << (verbose ? "true" : "false") << endl; + } - // Fetch test parameters from the input parameter list. - const ordinal_type numRowsLocal = testParams->get ("numRowsLocal"); - const ordinal_type numCols = testParams->get ("numCols"); - const int numCores = testParams->get ("numCores"); - const bool contiguousCacheBlocks = testParams->get ("contiguousCacheBlocks"); - const bool testFactorExplicit = testParams->get ("testFactorExplicit"); - const bool testRankRevealing = testParams->get ("testRankRevealing"); - const bool debug = testParams->get ("debug"); + RCP tsqr = getTsqr (testParams, comm, verbose); + TEUCHOS_ASSERT( ! tsqr.is_null () ); // Space for each process's local part of the test problem. 
// A_local, A_copy, and Q_local are distributed matrices, and @@ -193,7 +307,7 @@ namespace TSQR { matrix_type Q_local (numRowsLocal, numCols); matrix_type R (numCols, numCols); - // Start out by filling the test problem with zeros. + // Start by filling the test problem with zeros. deep_copy (A_local, Scalar {}); deep_copy (A_copy, Scalar {}); deep_copy (Q_local, Scalar {}); @@ -222,15 +336,21 @@ namespace TSQR { // We need a Messenger for Ordinal-type data, so that we can // build a global random test matrix. - RCP> ordinalMessenger = - rcp_implicit_cast> (rcp (new TeuchosMessenger (comm))); + auto ordinalMessenger = + rcp_implicit_cast> + (rcp (new TeuchosMessenger (comm))); // We also need a Messenger for Scalar-type data. The TSQR // implementation already constructed one, but it's OK to // construct another one; TeuchosMessenger is just a thin // wrapper over the Teuchos::Comm object. - RCP> scalarMessenger = - rcp_implicit_cast> (rcp (new TeuchosMessenger (comm))); + auto scalarMessenger = + rcp_implicit_cast> + (rcp (new TeuchosMessenger (comm))); + + if (myRank == 0 && verbose) { + cerr << " - Generate test problem" << endl; + } { // Generate a global distributed matrix (whose part local to @@ -239,68 +359,201 @@ namespace TSQR { using TSQR::Random::randomGlobalMatrix; mat_view_type A_local_view (A_local.extent(0), A_local.extent(1), - A_local.data(), A_local.stride(1)); + A_local.data(), + A_local.stride(1)); const magnitude_type* const singVals = singularValues.data(); - randomGlobalMatrix (&gen, A_local_view, singVals, - ordinalMessenger.getRawPtr(), - scalarMessenger.getRawPtr()); + randomGlobalMatrix (&gen, A_local_view, singVals, + ordinalMessenger.getRawPtr(), + scalarMessenger.getRawPtr()); } // Save the pseudorandom number generator's seed for any later // tests. The generator keeps its own copy of the seed and // updates it internally, so we have to ask for its copy. 
gen.getSeed (randomSeed); + if (myRank == 0 && verbose) { + cerr << "-- tsqr->wants_device_memory() = " + << (tsqr->wants_device_memory () ? "true" : "false") + << endl; + } + + using IST = + typename Kokkos::ArithTraits::val_type; + using device_matrix_type = + Kokkos::View; + + auto A_h = getHostMatrixView (A_local.view ()); + auto A_copy_h = getHostMatrixView (A_copy.view ()); + auto Q_h = getHostMatrixView (Q_local.view ()); + device_matrix_type A_d; + device_matrix_type A_copy_d; + device_matrix_type Q_d; + if (tsqr->wants_device_memory ()) { + A_d = getDeviceMatrixCopy (A_local.view (), "A_d"); + // Don't copy A_copy yet; see below. + A_copy_d = device_matrix_type ("A_copy_d", + numRowsLocal, numCols); + Q_d = device_matrix_type ("Q_d", numRowsLocal, numCols); + } + // If specified in the test parameters, rearrange cache blocks // in the copy. Otherwise, just copy the test problem into // A_copy. The factorization overwrites the input matrix, so // we have to make a copy in order to validate the final // result. - if (contiguousCacheBlocks) { - tsqr->cache_block (numRowsLocal, numCols, A_copy.data(), - A_local.data(), A_local.stride(1)); - if (debug) { - Teuchos::barrier (*comm); - if (myRank == 0) - cerr << "-- Finished Tsqr::cache_block" << endl; + + if (! 
contiguousCacheBlocks) { + if (myRank == 0 && verbose) { + cerr << " - Copy A into A_copy" << endl; + } + deep_copy (A_copy, A_local); + if (tsqr->wants_device_memory ()) { + deep_copy (A_copy_d, A_d); } } else { - deep_copy (A_copy, A_local); + if (myRank == 0 && verbose) { + cerr << " - Copy A into A_copy via cache_block" << endl; + } + if (tsqr->wants_device_memory ()) { + Scalar* A_copy_d_raw = + reinterpret_cast (A_copy_d.data ()); + const Scalar* A_d_raw = + reinterpret_cast (A_d.data ()); + tsqr->cache_block (numRowsLocal, numCols, A_copy_d_raw, + A_d_raw, A_d.stride (1)); + deep_copy (A_copy_h, A_copy_d); + } + else { + tsqr->cache_block (numRowsLocal, numCols, A_copy.data (), + A_local.data (), A_local.stride (1)); + } + if (myRank == 0 && verbose) { + cerr << " - Finished cache-blocking the test problem" + << endl; + } } - // "factorExplicit" is an alternate, hopefully faster way of - // factoring the matrix, when only the explicit Q factor is - // wanted. if (testFactorExplicit) { - tsqr->factorExplicitRaw (A_copy.extent (0), A_copy.extent (1), - A_copy.data (), A_copy.stride (1), - Q_local.data (), Q_local.stride (1), - R.data (), R.stride (1), - contiguousCacheBlocks); - if (debug) { - Teuchos::barrier (*comm); - if (myRank == 0) - cerr << "-- Finished Tsqr::factorExplicit" << endl; + if (myRank == 0 && verbose) { + cerr << " - Call factorExplicitRaw" << endl; + } + try { + if (tsqr->wants_device_memory ()) { + Scalar* A_raw = + reinterpret_cast (A_copy_d.data ()); + Scalar* Q_raw = reinterpret_cast (Q_d.data ()); + tsqr->factorExplicitRaw (A_copy_d.extent (0), + A_copy_d.extent (1), + A_raw, + A_copy_d.stride (1), + Q_raw, + Q_d.stride (1), + R.data (), R.stride (1), + contiguousCacheBlocks); + if (myRank == 0 && verbose) { + cerr << " - Finished factorExplicitRaw; now " + "deep_copy(Q_h, Q_d)" << endl; + } + deep_copy (Q_h, Q_d); + } + else { + Scalar* A_raw = A_copy.data (); + Scalar* Q_raw = Q_local.data (); + tsqr->factorExplicitRaw (A_copy.extent 
(0), + A_copy.extent (1), + A_raw, + A_copy.stride (1), + Q_raw, + Q_local.stride (1), + R.data (), R.stride (1), + contiguousCacheBlocks); + if (myRank == 0 && verbose) { + cerr << " - Finished factorExplicitRaw" << endl; + } + } + } + catch (std::exception& e) { + std::ostringstream os; + os << "Proc " << myRank << " threw an exception: " + << e.what () << endl; + cerr << os.str (); + MPI_Abort (MPI_COMM_WORLD, -1); + } + + bool found_nonzero_in_R = false; + for (ordinal_type j = 0; j < numCols; ++j) { + for (ordinal_type i = 0; i < numCols; ++i) { + if (R(i,j) != scalar_type {}) { + found_nonzero_in_R = true; + } + } + } + + if (! found_nonzero_in_R) { + success = false; + if (myRank == 0) { + const std::string prefix + (verbose ? " - *** " : "*** "); + const std::string scalarName = + Teuchos::TypeNameTraits::name (); + cerr << prefix << "For Scalar=" << scalarName + << ": R factor resulting from factorExplicitRaw " + << "is zero." << endl; + } } } else { - // Factor the (copy of the) matrix. - factor_output_type factorOutput = - tsqr->factor (numRowsLocal, numCols, A_copy.data(), A_copy.stride(1), - R.data(), R.stride(1), contiguousCacheBlocks); - if (debug) { - Teuchos::barrier (*comm); - if (myRank == 0) - cerr << "-- Finished Tsqr::factor" << endl; + if (myRank == 0 && verbose) { + cerr << " - Call factor" << endl; } - // Compute the explicit Q factor in Q_local. 
- tsqr->explicit_Q (numRowsLocal, numCols, A_copy.data(), A_copy.stride(1), - factorOutput, numCols, Q_local.data(), Q_local.stride(1), - contiguousCacheBlocks); - if (debug) { - Teuchos::barrier (*comm); - if (myRank == 0) - cerr << "-- Finished Tsqr::explicit_Q" << endl; + auto factorOutput = [&] () { + if (tsqr->wants_device_memory ()) { + Scalar* A_raw = + reinterpret_cast (A_copy_d.data ()); + auto result = + tsqr->factor (numRowsLocal, numCols, + A_raw, A_copy_d.stride (1), + R.data (), R.stride (1), + contiguousCacheBlocks); + deep_copy (A_copy_h, A_copy_d); + return result; + } + else { + // Host path: A_copy_d is never allocated here, so factor the host copy A_copy. + Scalar* A_raw = A_copy.data (); + return tsqr->factor (numRowsLocal, numCols, + A_raw, A_copy.stride (1), + R.data (), R.stride (1), + contiguousCacheBlocks); + } + } (); + + if (myRank == 0 && verbose) { + cerr << " - Finished factor; call explicit_Q" << endl; + } + if (tsqr->wants_device_memory ()) { + const Scalar* A_raw = + reinterpret_cast (A_copy_d.data ()); + Scalar* Q_raw = reinterpret_cast (Q_d.data ()); + tsqr->explicit_Q (numRowsLocal, numCols, + A_raw, A_copy_d.stride (1), + factorOutput, numCols, + Q_raw, Q_d.stride (1), + contiguousCacheBlocks); + deep_copy (Q_h, Q_d); + } + else { + const Scalar* A_raw = A_copy.data (); + Scalar* Q_raw = Q_local.data (); + tsqr->explicit_Q (numRowsLocal, numCols, + A_raw, A_copy.stride (1), + factorOutput, numCols, + Q_raw, Q_local.stride (1), + contiguousCacheBlocks); + } + if (myRank == 0 && verbose) { + cerr << " - Finished explicit_Q" << endl; } } @@ -318,12 +571,23 @@ namespace TSQR { // tolerance of zero to test the purported rank with the // actual numerical rank. 
const magnitude_type tol = STM::zero(); - const ordinal_type rank = - tsqr->revealRankRaw (Q_local.extent (0), Q_local.extent (1), - Q_local.data (), Q_local.stride (1), - R.data (), R.stride (1), tol, - contiguousCacheBlocks); - + if (myRank == 0 && verbose) { + cerr << " - Call revealRankRaw" << endl; + } + const ordinal_type rank = [&] () { + Scalar* Q_raw = tsqr->wants_device_memory () ? + reinterpret_cast (Q_d.data ()) : + Q_local.data (); + const ordinal_type ldq = tsqr->wants_device_memory () ? + Q_d.stride (1) : Q_local.stride (1); + return tsqr->revealRankRaw (numRowsLocal, numCols, + Q_raw, ldq, + R.data (), R.stride (1), + tol, contiguousCacheBlocks); + } (); + if (myRank == 0 && verbose) { + cerr << " - Finished revealRankRaw" << endl; + } magnitude_type two_to_the_numCols = STM::one(); for (int k = 0; k < numCols; ++k) { const magnitude_type two = STM::one() + STM::one(); @@ -333,22 +597,19 @@ namespace TSQR { // rounding error (so the test only fails if something is // really broken). if (two_to_the_numCols > magnitude_type(10) * STM::eps ()) { - TEUCHOS_TEST_FOR_EXCEPTION( - rank != numCols, std::logic_error, "The matrix of " << numCols - << " columns should have full numerical rank, but Tsqr reports " - "that it has rank " << rank << ". Please report this bug to " - "the Kokkos developers."); - if (debug) { - Teuchos::barrier (*comm); - if (myRank == 0) - cerr << "-- Tested rank-revealing capability" << endl; + TEUCHOS_TEST_FOR_EXCEPTION + (rank != numCols, std::logic_error, "The matrix of " << + numCols << " columns should have full numerical rank, " + "but Tsqr reports that it has rank " << rank << ". 
" + "Please report this bug to the Kokkos developers."); + if (myRank == 0 && verbose) { + cerr << " - Tested rank-revealing capability" << endl; } } else { - if (debug) { - Teuchos::barrier (*comm); - if (myRank == 0) - cerr << "-- Not testing rank-revealing capability; too many columns" << endl; + if (myRank == 0 && verbose) { + cerr << " - Not testing rank-revealing capability; " + "too many columns" << endl; } } } @@ -356,29 +617,49 @@ namespace TSQR { // were used. This is only necessary because global_verify() // doesn't currently support contiguous cache blocks. if (contiguousCacheBlocks) { - // We can use A_copy as scratch space for - // un-cache-blocking Q_local, since we're done using - // A_copy for other things. - tsqr->un_cache_block (numRowsLocal, numCols, A_copy.data(), - A_copy.stride(1), Q_local.data()); - // Overwrite Q_local with the un-cache-blocked Q factor. - deep_copy (Q_local, A_copy); - if (debug) { - Teuchos::barrier (*comm); - if (myRank == 0) - cerr << "-- Finished Tsqr::un_cache_block" << endl; + // Use A_copy(_d) as scratch for un-cache-blocking Q_local. + if (myRank == 0 && verbose) { + cerr << " - Call Tsqr::un_cache_block" << endl; + } + if (tsqr->wants_device_memory ()) { + Scalar* A_copy_d_raw = + reinterpret_cast (A_copy_d.data ()); + const Scalar* Q_d_raw = + reinterpret_cast (Q_d.data ()); + tsqr->un_cache_block (numRowsLocal, numCols, + A_copy_d_raw, + A_copy_d.stride (1), + Q_d_raw); + deep_copy (Q_h, A_copy_d); + } + else { + tsqr->un_cache_block (numRowsLocal, numCols, + A_copy.data (), + A_copy.stride (1), + Q_local.data ()); + deep_copy (Q_local, A_copy); + } + if (myRank == 0 && verbose) { + cerr << " - Finished Tsqr::un_cache_block" << endl; + } + } + else { + if (tsqr->wants_device_memory ()) { + deep_copy (Q_h, Q_d); } } - // Test accuracy of the factorization. 
- const std::vector results = - global_verify (numRowsLocal, numCols, A_local.data(), A_local.stride(1), - Q_local.data(), Q_local.stride(1), R.data(), R.stride(1), + if (myRank == 0 && verbose) { + cerr << " - Call global_verify" << endl; + } + const auto results = + global_verify (numRowsLocal, numCols, + A_local.data(), A_local.stride(1), + Q_local.data(), Q_local.stride(1), + R.data(), R.stride(1), scalarMessenger.getRawPtr()); - if (debug) { - Teuchos::barrier (*comm); - if (myRank == 0) - cerr << "-- Finished global_verify" << endl; + if (myRank == 0 && verbose) { + cerr << " - Finished global_verify" << endl; } // Print the results on Proc 0. @@ -390,7 +671,6 @@ namespace TSQR { << ",numRowsLocal" << ",numCols" << ",numProcs" - << ",numCores" << ",cacheSizeHint" << ",contiguousCacheBlocks" << ",absFrobResid" @@ -401,12 +681,13 @@ namespace TSQR { testParams->set ("printFieldNames", false); } if (testParams->get ("printResults")) { + const std::string scalarName = + Teuchos::TypeNameTraits::name (); cout << "Tsqr" - << "," << Teuchos::TypeNameTraits::name() + << "," << scalarName << "," << numRowsLocal << "," << numCols << "," << numProcs - << "," << numCores << "," << tsqr->cache_size_hint() << "," << contiguousCacheBlocks << "," << results[0] @@ -414,7 +695,7 @@ namespace TSQR { << "," << results[2] << endl; } - } // if (myRank == 0) + } // If requested, check accuracy and fail if results are not // sufficiently accurate. @@ -447,28 +728,52 @@ namespace TSQR { magnitude_type(10*numCols*numCols) * STM::eps(); // Avoid division by zero. - const magnitude_type relResidError = - results[0] / (results[2] == STM::zero() ? STM::one() : results[2]); - TEUCHOS_TEST_FOR_EXCEPTION( - relResidError > relResidBound, TsqrInaccurate, "Full Tsqr " - "has an inaccurate relative residual ||A - QR||_F" - << (results[2] == STM::zero() ? 
" / ||A||_F" : "") - << " = " << relResidError << ", which is greater than the bound " - << relResidBound << " by a factor of " - << relResidError / relResidBound << "."); + const magnitude_type relResidError = results[0] / + (results[2] == STM::zero() ? STM::one() : results[2]); + + if (relResidError > relResidBound) { + success = false; + if (myRank == 0) { + const std::string prefix + (verbose ? " - *** " : "*** "); + const std::string scalarName = + Teuchos::TypeNameTraits::name (); + const std::string relResStr + (results[2] == STM::zero() ? "" : " / ||A||_F"); + cerr << prefix << "For Scalar=" << scalarName + << ": Inaccurate residual ||A - QR||_F" + // Print the " / ||A||_F" suffix once, and only when we actually divided by it. + << relResStr + << " = " << relResidError << "." << endl + << prefix << "It's greater than the bound " + << relResidBound << " by a factor of " + << relResidError / relResidBound << "." << endl; + } + } const magnitude_type orthoError = results[1]; - TEUCHOS_TEST_FOR_EXCEPTION( - orthoError > orthoBound, TsqrInaccurate, - "Full Tsqr has an inaccurate orthogonality measure ||I - Q^* Q||_F" - << results[1] << " = " << orthoError << ", which is greater than " - "the bound " << orthoBound << " by a factor of " - << orthoError / orthoBound << "."); + if (orthoError > orthoBound) { + success = false; + if (myRank == 0) { + const std::string prefix + (verbose ? " - *** " : "*** "); + const std::string scalarName = + Teuchos::TypeNameTraits::name (); + cerr << prefix << "For Scalar=" << scalarName + << ": Inaccurate orthogonality measure " + << "||I - Q^* Q||_F = " << orthoError << "." + << endl << prefix << "It's greater than the bound " + << orthoBound << " by a factor of " + << orthoError / orthoBound << "." << endl; + } + } } // if (the tests should fail on inaccuracy) + return success; } }; /// \class FullTsqrVerifierCallerImpl - /// \brief This class implements a "function template specialization." 
+ /// \brief This class implements a "function template + /// specialization." /// \author Mark Hoemmen /// /// We want to make FullTsqrVerifierCaller::run() a template @@ -489,7 +794,7 @@ namespace TSQR { template class FullTsqrVerifierCallerImpl { public: - static void + static bool run (const Teuchos::RCP >& comm, const Teuchos::RCP& testParams, std::vector& randomSeed); @@ -499,17 +804,21 @@ namespace TSQR { // Partial specialization for Cons. // template - class FullTsqrVerifierCallerImpl > { + class FullTsqrVerifierCallerImpl> + { public: - static void + static bool run (const Teuchos::RCP >& comm, const Teuchos::RCP& testParams, std::vector& randomSeed) { - typedef CarType car_type; - typedef CdrType cdr_type; - FullTsqrVerifier::run (comm, testParams, randomSeed); - FullTsqrVerifierCallerImpl::run (comm, testParams, randomSeed); + using car_type = FullTsqrVerifier; + using cdr_type = FullTsqrVerifierCallerImpl; + const bool success1 = + car_type::run (comm, testParams, randomSeed); + const bool success2 = + cdr_type::run (comm, testParams, randomSeed); + return success1 && success2; } }; @@ -519,22 +828,23 @@ namespace TSQR { template<> class FullTsqrVerifierCallerImpl { public: - static void + static bool run (const Teuchos::RCP >&, const Teuchos::RCP&, std::vector&) { - // We're at the end of the type list, so do nothing. + return true; } }; /// \class FullTsqrVerifierCaller - /// \brief Invokes FullTsqrVerifier::run() over all Scalar types in a type list. + /// \brief Invokes FullTsqrVerifier::run() over all Scalar types + /// in a type list. /// \author Mark Hoemmen /// /// Use this class to test the full TSQR implementation in Tsqr. /// It will test Tsqr over a list of Scalar types that you define, - /// using \c Cons and \c NullCons. + /// using Cons and NullCons. 
class FullTsqrVerifierCaller { public: /// \typedef ordinal_type @@ -556,7 +866,7 @@ namespace TSQR { RCP plist = parameterList ("FullTsqrVerifier"); const size_t cacheSizeHint = 0; - const int numCores = 1; + // const int numCores = 1; const ordinal_type numRowsLocal = 100; const ordinal_type numCols = 10; const bool contiguousCacheBlocks = false; @@ -565,67 +875,67 @@ namespace TSQR { const bool printFieldNames = true; const bool printResults = true; const bool failIfInaccurate = true; - const bool debug = false; + const std::string nodeTsqr ("Default"); + const bool verbose = false; // Parameters for configuring Tsqr itself. - plist->set ("cacheSizeHint", cacheSizeHint, + plist->set ("Cache Size Hint", cacheSizeHint, "Cache size hint in bytes. " "Zero means TSQR picks a reasonable default."); - plist->set ("numCores", numCores, - "Number of partition(s) to use for TbbTsqr (if " - "applicable). Must be a positive integer."); // Parameters for testing Tsqr. plist->set ("numRowsLocal", numRowsLocal, - "Number of rows per (MPI) process in the test matrix. " - "Must be >= the number of columns."); + "Number of rows per (MPI) process in the test " + "matrix. 
Must be >= the number of columns."); plist->set ("numCols", numCols, "Number of columns in the test matrix."); plist->set ("contiguousCacheBlocks", contiguousCacheBlocks, - "Whether to test the factorization with contiguously " - "stored cache blocks."); + "Whether to test the factorization with " + "contiguously stored cache blocks."); plist->set ("testFactorExplicit", testFactorExplicit, - "Whether to test TSQR's factorExplicit() (a hopefully " - "faster path than calling factor() and explicit_Q() in " - "sequence)."); + "Whether to test TSQR's factorExplicit() (a " + "hopefully faster path than calling factor() and " + "explicit_Q() in sequence)."); plist->set ("testRankRevealing", testRankRevealing, "Whether to test TSQR's rank-revealing capability."); plist->set ("printFieldNames", printFieldNames, - "Whether to print field names (this is only done once, " - "for all Scalar types tested)."); + "Whether to print field names (this is only done " + "once, for all Scalar types tested)."); plist->set ("printResults", printResults, "Whether to print test results."); plist->set ("failIfInaccurate", failIfInaccurate, "Whether to fail the test if the factorization " "is not sufficiently accurate."); - plist->set ("debug", debug, - "Whether to print debugging output."); + plist->set ("NodeTsqr", nodeTsqr, "NodeTsqr subclass to use; " + "\"Default\" means let TSQR pick it"); + plist->set ("verbose", verbose, + "Whether to print verbose debugging output."); return plist; } - /// \brief Run TsqrVerifier::run() for every type in the type list. + /// \brief Run TsqrVerifier::run() for every type in the type + /// list. /// - /// TypeListType should be either a \c NullCons (representing an + /// TypeListType should be either a NullCons (representing an /// empty type list, in which case this function does nothing), - /// or a \c Cons (whose CarType is a Scalar type to test, and - /// whose CdrType is either a NullCons or a Cons). 
+ /// or a Cons (whose CarType is a Scalar type to test, and whose + /// CdrType is either a NullCons or a Cons). /// /// \param testParams [in/out] List of parameters for all tests - /// to run. Call \c getValidParameterList() to get a valid - /// list of parameters with default values and documentation. + /// to run. Call getValidParameterList() to get a valid list + /// of parameters with default values and documentation. /// template - void + bool run (const Teuchos::RCP& testParams) { // Using a class with a static method is a way to implement // "partial specialization of function templates" (which by // itself is not allowed in C++). - typedef FullTsqrVerifierCallerImpl impl_type; - impl_type::run (comm_, testParams, randomSeed_); + using impl_type = FullTsqrVerifierCallerImpl; + return impl_type::run (comm_, testParams, randomSeed_); } - /// \brief Full constructor. /// /// \param comm [in] Communicator (with one or more processes) @@ -660,17 +970,19 @@ namespace TSQR { static std::vector validateRandomSeed (const std::vector& seed) { - TEUCHOS_TEST_FOR_EXCEPTION( - seed.size () < 4, std::invalid_argument, "Invalid random seed: " - "Need an array of four integers."); - for (std::vector::size_type k = 0; k < seed.size (); ++k) { - TEUCHOS_TEST_FOR_EXCEPTION( - seed[k] < 0 || seed[k] > 4095, std::invalid_argument, "Invalid " - "random seed: Each of the four integers must be in [0, 4095]."); + TEUCHOS_TEST_FOR_EXCEPTION + (seed.size () < 4, std::invalid_argument, "Invalid random " + "seed: Need an array of four integers, but you gave us " + << seed.size () << " of them."); + for (size_t k = 0; k < seed.size (); ++k) { + TEUCHOS_TEST_FOR_EXCEPTION + (seed[k] < 0 || seed[k] > 4095, std::invalid_argument, + "seed[" << k << "]=" << seed[k] << " is invalid. 
" + "Each of the four seeds must be in [0, 4095]."); } - TEUCHOS_TEST_FOR_EXCEPTION( - seed[3] % 2 != 1, std::invalid_argument, "Invalid random seed: " - "The last of the four integers must be odd."); + TEUCHOS_TEST_FOR_EXCEPTION + (seed[3] % 2 != 1, std::invalid_argument, "seed[3]=" + << seed[3] << " is invalid: it must be odd."); return seed; } @@ -691,7 +1003,7 @@ namespace TSQR { /// /// This communicator may include one or more processes. /// MPI is not required (it may be a "serial communicator"). - Teuchos::RCP > comm_; + Teuchos::RCP> comm_; /// \brief The seed for LAPACK's pseudorandom number generator. /// @@ -704,5 +1016,4 @@ namespace TSQR { } // namespace Test } // namespace TSQR -#endif // __TSQR_Test_FullTsqrTest_hpp - +#endif // TSQR_TEST_FULLTSQRTEST_HPP diff --git a/packages/tpetra/tsqr/src/TsqrFactory_TbbTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CombineUser.hpp similarity index 55% rename from packages/tpetra/tsqr/src/TsqrFactory_TbbTsqr.hpp rename to packages/tpetra/tsqr/src/Tsqr_Impl_CombineUser.hpp index 4e5d22e1403c..fab3efa79671 100644 --- a/packages/tpetra/tsqr/src/TsqrFactory_TbbTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CombineUser.hpp @@ -37,54 +37,49 @@ // ************************************************************************ //@HEADER -#ifndef __TSQR_Trilinos_TsqrFactory_TbbTsqr_hpp -#define __TSQR_Trilinos_TsqrFactory_TbbTsqr_hpp +#ifndef TSQR_COMBINEUSER_HPP +#define TSQR_COMBINEUSER_HPP -/// \file TsqrFactory_TbbTsqr.hpp -/// -/// \warning Trilinos users should _not_ include this file directly. - -#include "Tsqr_ConfigDefs.hpp" - -#ifdef HAVE_KOKKOSTSQR_TBB -# include "TbbTsqr.hpp" -#endif // HAVE_KOKKOSTSQR_TBB +#include "Tsqr_CombineFactory.hpp" namespace TSQR { - namespace Trilinos { +namespace Impl { -#ifdef HAVE_KOKKOSTSQR_TBB - /// \class TbbTsqrFactory - /// \brief Subclass of TsqrFactory that uses \c TSQR::TBB::TbbTsqr. 
- /// \author Mark Hoemmen - /// - /// \tparam LO "LocalOrdinal": the type of indices into the - /// node-local part of the matrix. - /// - /// \tparam S "Scalar": the type of entries in the node-local part - /// of the matrix. - /// - /// All of this class' public methods, other than the constructor - /// and destructor, are implemented in the parent class. - template - class TbbTsqrFactory : - public TsqrFactory, DistTsqr > { - public: - // Help C++ pull in the typedefs from the base class. C++ needs - // help when both the base and the derived classes are - // templated. - typedef typename base_type::node_tsqr_type node_tsqr_type; - typedef typename base_type::dist_tsqr_type dist_tsqr_type; - typedef typename base_type::tsqr_type tsqr_type; - typedef typename base_type::scalar_messenger_type scalar_messenger_type; +/// \class CombineUser +/// \brief Private base class for TSQR classes that use Combine. +/// +/// Classes that use Combine should inherit privately from this class, +/// in order to reuse getCombine. +template +class CombineUser { +public: + /// \brief Given the maximum number of columns that the caller + /// intends to give to Combine functions, return the best choice + /// of Combine implementation. + Combine& + getCombine (const LocalOrdinal maxNumCols) const { + if (combine_.get () == nullptr) { + using factory_type = CombineFactory; + combine_ = factory_type::create (maxNumCols); + } + return *combine_; + } - TbbTsqrFactory () {} - virtual ~TbbTsqrFactory () {} - }; -#endif // HAVE_KOKKOSTSQR_TBB + //! Return a specific Combine implementation. 
+ Combine& + getCombine (const std::string& combineType) const { + if (combine_.get () == nullptr) { + using factory_type = CombineFactory; + combine_ = factory_type::create (combineType); + } + return *combine_; + } - } // namespace Trilinos -} // namespace TSQR +private: + mutable std::unique_ptr> combine_; +}; +} // namespace Impl +} // namespace TSQR -#endif // __TSQR_Trilinos_TsqrFactory_TbbTsqr_hpp +#endif // TSQR_COMBINEUSER_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlas.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlas.cpp new file mode 100644 index 000000000000..4a7fdaccf368 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlas.cpp @@ -0,0 +1,149 @@ +#include "Tsqr_Impl_CuBlas.hpp" +#if defined(HAVE_TPETRATSQR_CUBLAS) +#include "Tsqr_Impl_CuBlasHandle.hpp" +#include "Tsqr_Impl_CuTypes.hpp" +#include "Teuchos_Assert.hpp" + +namespace TSQR { +namespace Impl { + +template +class RawCuBlas {}; + +template<> +class RawCuBlas { +public: + using impl_scalar_type = double; + + static cublasStatus_t + gemm (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + const int m, const int n, const int k, + const impl_scalar_type* alpha, + const impl_scalar_type* A, const int lda, + const impl_scalar_type* B, const int ldb, + const impl_scalar_type* beta, + impl_scalar_type* C, const int ldc) + { + return cublasDgemm (handle, transa, transb, m, n, k, + alpha, A, lda, B, ldb, beta, C, ldc); + } +}; + +template<> +class RawCuBlas { +public: + using impl_scalar_type = float; + + static cublasStatus_t + gemm (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + const int m, const int n, const int k, + const impl_scalar_type* alpha, + const impl_scalar_type* A, const int lda, + const impl_scalar_type* B, const int ldb, + const impl_scalar_type* beta, + impl_scalar_type* C, const int ldc) + { + return cublasSgemm (handle, transa, transb, m, n, k, + alpha, A, lda, B, ldb, beta, C, ldc); + } +}; + +#if 
defined(HAVE_TPETRATSQR_COMPLEX) +template<> +class RawCuBlas>::type> { +public: + using impl_scalar_type = CudaValue>::type; + + static cublasStatus_t + gemm (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + const int m, const int n, const int k, + const impl_scalar_type* alpha, + const impl_scalar_type* A, const int lda, + const impl_scalar_type* B, const int ldb, + const impl_scalar_type* beta, + impl_scalar_type* C, const int ldc) + { + return cublasZgemm (handle, transa, transb, m, n, k, + alpha, A, lda, B, ldb, beta, C, ldc); + } +}; + +template<> +class RawCuBlas>::type> { +public: + using impl_scalar_type = CudaValue>::type; + + static cublasStatus_t + gemm (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + const int m, const int n, const int k, + const impl_scalar_type* alpha, + const impl_scalar_type* A, const int lda, + const impl_scalar_type* B, const int ldb, + const impl_scalar_type* beta, + impl_scalar_type* C, const int ldc) + { + return cublasCgemm (handle, transa, transb, m, n, k, + alpha, A, lda, B, ldb, beta, C, ldc); + } +}; +#endif // defined(HAVE_TPETRATSQR_COMPLEX) + +template +CuBlas::CuBlas (CuBlasHandle handle) : + handle_ (handle) {} + +template +void +CuBlas:: +gemm (const char transa, + const char transb, + const int m, const int n, const int k, + const Scalar alpha, + const Scalar* A, const int lda, + const Scalar* B, const int ldb, + const Scalar beta, + Scalar* C, const int ldc) +{ + auto rawHandle = + reinterpret_cast (handle_.getHandle ()); + const cublasOperation_t cuTransa = cuBlasTrans (transa); + const cublasOperation_t cuTransb = cuBlasTrans (transb); + + using IST = typename CudaValue::type; + const IST alpha_raw = CudaValue::makeValue (alpha); + const IST* A_raw = reinterpret_cast (A); + const IST* B_raw = reinterpret_cast (B); + const IST beta_raw = CudaValue::makeValue (beta); + IST* C_raw = reinterpret_cast (C); + + using impl_type = RawCuBlas; + // 
https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-gemm + // says that alpha and beta may be host or device pointers. + const auto status = + impl_type::gemm (rawHandle, cuTransa, cuTransb, + m, n, k, + &alpha_raw, A_raw, lda, + B_raw, ldb, + &beta_raw, C_raw, ldc); + TEUCHOS_ASSERT( status == CUBLAS_STATUS_SUCCESS ); +} + +template class CuBlas; +template class CuBlas; +#if defined(HAVE_TPETRATSQR_COMPLEX) +template class CuBlas>; +template class CuBlas>; +#endif // defined(HAVE_TPETRATSQR_COMPLEX) + +} // namespace Impl +} // namespace TSQR + +#endif // HAVE_TPETRATSQR_CUBLAS diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlas.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlas.hpp new file mode 100644 index 000000000000..08ef1c989878 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlas.hpp @@ -0,0 +1,44 @@ +#ifndef TSQR_IMPL_CUBLAS_HPP +#define TSQR_IMPL_CUBLAS_HPP + +#include "TpetraTSQR_config.h" +#if defined(HAVE_TPETRATSQR_CUBLAS) +# include "Tsqr_Impl_CuBlasHandle.hpp" +# if defined(HAVE_TPETRATSQR_COMPLEX) +# include +# endif // HAVE_TPETRATSQR_COMPLEX + +namespace TSQR { +namespace Impl { + +template +class CuBlas { +public: + CuBlas (CuBlasHandle handle); + + void + gemm (const char transa, + const char transb, + const int m, const int n, const int k, + const Scalar alpha, + const Scalar* A, const int lda, + const Scalar* B, const int ldb, + const Scalar beta, + Scalar* C, const int ldc); + +private: + CuBlasHandle handle_; +}; + +extern template class CuBlas; +extern template class CuBlas; +#if defined(HAVE_TPETRATSQR_COMPLEX) +extern template class CuBlas>; +extern template class CuBlas>; +#endif // defined(HAVE_TPETRATSQR_COMPLEX) + +} // namespace Impl +} // namespace TSQR + +#endif // HAVE_TPETRATSQR_CUBLAS +#endif // TSQR_IMPL_CUBLAS_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlasHandle.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlasHandle.cpp new file mode 100644 index 000000000000..352fe743b725 --- /dev/null +++ 
b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlasHandle.cpp @@ -0,0 +1,38 @@ +#include "Tsqr_Impl_CuBlasHandle.hpp" + +#ifdef HAVE_TPETRATSQR_CUBLAS +#include "Kokkos_Core.hpp" +#include "Teuchos_Assert.hpp" +#include + +namespace TSQR { +namespace Impl { + +cublasHandle_t cuBlasRawHandle_ = nullptr; + +CuBlasHandle::CuBlasHandle (void* handle) : + handle_ (handle) +{} + +CuBlasHandle CuBlasHandle::getSingleton () +{ + static int called_before = 0; + if (called_before == 0) { + auto finalizer = [] () { + if (cuBlasRawHandle_ != nullptr) { + (void) cublasDestroy (cuBlasRawHandle_); + cuBlasRawHandle_ = nullptr; + } + }; + Kokkos::push_finalize_hook (finalizer); + auto status = cublasCreate (&cuBlasRawHandle_); + TEUCHOS_ASSERT( status == CUBLAS_STATUS_SUCCESS ); + called_before = 1; + } + TEUCHOS_ASSERT( cuBlasRawHandle_ != nullptr ); + return CuBlasHandle (cuBlasRawHandle_); +} + +} // namespace Impl +} // namespace TSQR +#endif // HAVE_TPETRATSQR_CUBLAS diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlasHandle.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlasHandle.hpp new file mode 100644 index 000000000000..05899aaeb28d --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlasHandle.hpp @@ -0,0 +1,33 @@ +#ifndef TSQR_IMPL_CUBLASHANDLE_HPP +#define TSQR_IMPL_CUBLASHANDLE_HPP + +#include "TpetraTSQR_config.h" +#ifdef HAVE_TPETRATSQR_CUBLAS + +namespace TSQR { +namespace Impl { + +class CuBlasHandle { +private: + // This is actually a cublasHandle_t, which is a pointer type. + void* handle_ {nullptr}; + + CuBlasHandle (void* handle); + +public: + static CuBlasHandle getSingleton (); + + // This is not really encapsulation, because the "handle" type is + // just a pointer. However, it lets us define cuBlas wrapper + // functions without needing to make them friends of CuBlasHandle. 
+ void* getHandle () const { + return handle_; + } +}; + +} // namespace Impl +} // namespace TSQR + +#endif // HAVE_TPETRATSQR_CUBLAS + +#endif // TSQR_IMPL_CUBLASHANDLE_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolver.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolver.cpp new file mode 100644 index 000000000000..e4f01e920285 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolver.cpp @@ -0,0 +1,603 @@ +#include "Tsqr_Impl_CuSolver.hpp" +#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER) +#include "Tsqr_Impl_CuSolverHandle.hpp" +#include "Tsqr_Impl_CuTypes.hpp" +#include "Teuchos_Assert.hpp" + +namespace TSQR { +namespace Impl { + +template +class RawCuSolver {}; + +template<> +class RawCuSolver { +public: + using impl_scalar_type = double; + + static cusolverStatus_t + compute_QR_lwork (cusolverDnHandle_t handle, + int m, + int n, + impl_scalar_type* A, + int lda, + int *lwork) + { + return cusolverDnDgeqrf_bufferSize (handle, m, n, A, lda, lwork); + } + + static cusolverStatus_t + compute_QR (cusolverDnHandle_t handle, + int m, + int n, + impl_scalar_type* A, + int lda, + impl_scalar_type* tau, + impl_scalar_type* work, + int lwork, + int* info) + { + return cusolverDnDgeqrf (handle, m, n, A, lda, tau, + work, lwork, info); + } + + static cusolverStatus_t + apply_Q_factor_lwork (cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const impl_scalar_type* A, + int lda, + const impl_scalar_type* tau, + const impl_scalar_type* C, + int ldc, + int *lwork) + { + return cusolverDnDormqr_bufferSize (handle, side, trans, + m, n, k, A, lda, tau, + C, ldc, lwork); + } + + static cusolverStatus_t + apply_Q_factor (cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const impl_scalar_type* A, + int lda, + const impl_scalar_type* tau, + impl_scalar_type* C, + int ldc, + impl_scalar_type* work, + int lwork, + int* devInfo) + { 
+ return cusolverDnDormqr (handle, side, trans, m, n, k, + A, lda, tau, C, ldc, + work, lwork, devInfo); + } + + static cusolverStatus_t + compute_explicit_Q_lwork (cusolverDnHandle_t handle, + int m, + int n, + int k, + const impl_scalar_type *A, + int lda, + const impl_scalar_type *tau, + int *lwork) + { + return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, + tau, lwork); + } + + static cusolverStatus_t + compute_explicit_Q (cusolverDnHandle_t handle, + int m, + int n, + int k, + impl_scalar_type *A, + int lda, + const impl_scalar_type *tau, + impl_scalar_type *work, + int lwork, + int *devInfo) + { + return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, + work, lwork, devInfo); + } +}; + +template<> +class RawCuSolver { +public: + using impl_scalar_type = float; + + static cusolverStatus_t + compute_QR_lwork (cusolverDnHandle_t handle, + int m, + int n, + impl_scalar_type* A, + int lda, + int *lwork) + { + return cusolverDnSgeqrf_bufferSize (handle, m, n, A, lda, lwork); + } + + static cusolverStatus_t + compute_QR (cusolverDnHandle_t handle, + int m, + int n, + impl_scalar_type* A, + int lda, + impl_scalar_type* tau, + impl_scalar_type* work, + int lwork, + int* info) + { + return cusolverDnSgeqrf (handle, m, n, A, lda, tau, + work, lwork, info); + } + + static cusolverStatus_t + apply_Q_factor_lwork (cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const impl_scalar_type* A, + int lda, + const impl_scalar_type* tau, + const impl_scalar_type* C, + int ldc, + int *lwork) + { + return cusolverDnSormqr_bufferSize (handle, side, trans, + m, n, k, A, lda, tau, + C, ldc, lwork); + } + + static cusolverStatus_t + apply_Q_factor (cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const impl_scalar_type* A, + int lda, + const impl_scalar_type* tau, + impl_scalar_type* C, + int ldc, + impl_scalar_type* work, + int lwork, + int* devInfo) + { + return 
cusolverDnSormqr (handle, side, trans, m, n, k, + A, lda, tau, C, ldc, + work, lwork, devInfo); + } + + static cusolverStatus_t + compute_explicit_Q_lwork (cusolverDnHandle_t handle, + int m, + int n, + int k, + const impl_scalar_type *A, + int lda, + const impl_scalar_type *tau, + int *lwork) + { + return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, + tau, lwork); + } + + static cusolverStatus_t + compute_explicit_Q (cusolverDnHandle_t handle, + int m, + int n, + int k, + impl_scalar_type *A, + int lda, + const impl_scalar_type *tau, + impl_scalar_type *work, + int lwork, + int *devInfo) + { + return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, + work, lwork, devInfo); + } +}; + +#if defined(HAVE_TPETRATSQR_COMPLEX) +template<> +class RawCuSolver>::type> { +public: + using impl_scalar_type = CudaValue>::type; + + static cusolverStatus_t + compute_QR_lwork (cusolverDnHandle_t handle, + int m, + int n, + impl_scalar_type* A, + int lda, + int *lwork) + { + return cusolverDnZgeqrf_bufferSize (handle, m, n, A, lda, lwork); + } + + static cusolverStatus_t + compute_QR (cusolverDnHandle_t handle, + int m, + int n, + impl_scalar_type* A, + int lda, + impl_scalar_type* tau, + impl_scalar_type* work, + int lwork, + int* info) + { + return cusolverDnZgeqrf (handle, m, n, A, lda, tau, + work, lwork, info); + } + + static cusolverStatus_t + apply_Q_factor_lwork (cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const impl_scalar_type* A, + int lda, + const impl_scalar_type* tau, + const impl_scalar_type* C, + int ldc, + int *lwork) + { + return cusolverDnZunmqr_bufferSize (handle, side, trans, + m, n, k, A, lda, tau, + C, ldc, lwork); + } + + static cusolverStatus_t + apply_Q_factor (cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const impl_scalar_type* A, + int lda, + const impl_scalar_type* tau, + impl_scalar_type* C, + int ldc, + impl_scalar_type* 
work, + int lwork, + int* devInfo) + { + return cusolverDnZunmqr (handle, side, trans, m, n, k, + A, lda, tau, C, ldc, + work, lwork, devInfo); + } + + static cusolverStatus_t + compute_explicit_Q_lwork (cusolverDnHandle_t handle, + int m, + int n, + int k, + const impl_scalar_type *A, + int lda, + const impl_scalar_type *tau, + int *lwork) + { + return cusolverDnZungqr_bufferSize(handle, m, n, k, A, lda, + tau, lwork); + } + + static cusolverStatus_t + compute_explicit_Q (cusolverDnHandle_t handle, + int m, + int n, + int k, + impl_scalar_type *A, + int lda, + const impl_scalar_type *tau, + impl_scalar_type *work, + int lwork, + int *devInfo) + { + return cusolverDnZungqr(handle, m, n, k, A, lda, tau, + work, lwork, devInfo); + } +}; + +template<> +class RawCuSolver>::type> { +public: + using impl_scalar_type = CudaValue>::type; + + static cusolverStatus_t + compute_QR_lwork (cusolverDnHandle_t handle, + int m, + int n, + impl_scalar_type* A, + int lda, + int *lwork) + { + return cusolverDnCgeqrf_bufferSize (handle, m, n, A, lda, lwork); + } + + static cusolverStatus_t + compute_QR (cusolverDnHandle_t handle, + int m, + int n, + impl_scalar_type* A, + int lda, + impl_scalar_type* tau, + impl_scalar_type* work, + int lwork, + int* info) + { + return cusolverDnCgeqrf (handle, m, n, A, lda, tau, + work, lwork, info); + } + + static cusolverStatus_t + apply_Q_factor_lwork (cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const impl_scalar_type* A, + int lda, + const impl_scalar_type* tau, + const impl_scalar_type* C, + int ldc, + int *lwork) + { + return cusolverDnCunmqr_bufferSize (handle, side, trans, + m, n, k, A, lda, tau, + C, ldc, lwork); + } + + static cusolverStatus_t + apply_Q_factor (cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const impl_scalar_type* A, + int lda, + const impl_scalar_type* tau, + impl_scalar_type* C, + int ldc, + 
impl_scalar_type* work, + int lwork, + int* devInfo) + { + return cusolverDnCunmqr (handle, side, trans, m, n, k, + A, lda, tau, C, ldc, + work, lwork, devInfo); + } + + static cusolverStatus_t + compute_explicit_Q_lwork (cusolverDnHandle_t handle, + int m, + int n, + int k, + const impl_scalar_type *A, + int lda, + const impl_scalar_type *tau, + int *lwork) + { + return cusolverDnCungqr_bufferSize(handle, m, n, k, A, lda, + tau, lwork); + } + + static cusolverStatus_t + compute_explicit_Q (cusolverDnHandle_t handle, + int m, + int n, + int k, + impl_scalar_type *A, + int lda, + const impl_scalar_type *tau, + impl_scalar_type *work, + int lwork, + int *devInfo) + { + return cusolverDnCungqr(handle, m, n, k, A, lda, tau, + work, lwork, devInfo); + } +}; +#endif // defined(HAVE_TPETRATSQR_COMPLEX) + +template +CuSolver::CuSolver (CuSolverHandle handle, int* const info) : + handle_ (handle), info_ (info) +{} + +template +int +CuSolver:: +compute_QR_lwork (const int nrows, + const int ncols, + Scalar A[], + const int lda) const +{ + auto rawHandle = + reinterpret_cast (handle_.getHandle ()); + int lwork = 0; + + using IST = typename CudaValue::type; + IST* A_raw = reinterpret_cast (A); + + using impl_type = RawCuSolver; + const auto status = + impl_type::compute_QR_lwork (rawHandle, nrows, ncols, + A_raw, lda, &lwork); + TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS ); + return lwork; +} + +template +void +CuSolver:: +compute_QR (const int nrows, + const int ncols, + Scalar A[], + const int lda, + Scalar tau[], + Scalar work[], + const int lwork) const +{ + auto rawHandle = + reinterpret_cast (handle_.getHandle ()); + + using IST = typename CudaValue::type; + IST* A_raw = reinterpret_cast (A); + IST* tau_raw = reinterpret_cast (tau); + IST* work_raw = reinterpret_cast (work); + + using impl_type = RawCuSolver; + const auto status = + impl_type::compute_QR (rawHandle, nrows, ncols, A_raw, lda, + tau_raw, work_raw, lwork, info_); + TEUCHOS_ASSERT( status == 
CUSOLVER_STATUS_SUCCESS ); +} + +template +int +CuSolver:: +apply_Q_factor_lwork (const char side, + const char trans, + const int nrows, + const int ncols_C, + const int ncols_Q, + const Scalar Q[], + const int ldq, + const Scalar tau[], + Scalar C[], + const int ldc) const +{ + auto rawHandle = + reinterpret_cast (handle_.getHandle ()); + const cublasSideMode_t cuSide = cuBlasSide (side); + const cublasOperation_t cuTrans = cuBlasTrans (trans); + int lwork = 0; + + using IST = typename CudaValue::type; + const IST* Q_raw = reinterpret_cast (Q); + const IST* tau_raw = reinterpret_cast (tau); + const IST* C_raw = reinterpret_cast (C); + + using impl_type = RawCuSolver; + const auto status = + impl_type::apply_Q_factor_lwork (rawHandle, cuSide, cuTrans, + nrows, ncols_C, ncols_Q, + Q_raw, ldq, tau_raw, + C_raw, ldc, &lwork); + TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS ); + return lwork; +} + +template +void +CuSolver:: +apply_Q_factor (const char side, + const char trans, + const int nrows, + const int ncols_C, + const int ncols_Q, + const Scalar Q[], + const int ldq, + const Scalar tau[], + Scalar C[], + const int ldc, + Scalar work[], + const int lwork) const +{ + auto rawHandle = + reinterpret_cast (handle_.getHandle ()); + const cublasSideMode_t cuSide = cuBlasSide (side); + const cublasOperation_t cuTrans = cuBlasTrans (trans); + + using IST = typename CudaValue::type; + const IST* Q_raw = reinterpret_cast (Q); + const IST* tau_raw = reinterpret_cast (tau); + IST* C_raw = reinterpret_cast (C); + IST* work_raw = reinterpret_cast (work); + + using impl_type = RawCuSolver; + const auto status = + impl_type::apply_Q_factor (rawHandle, cuSide, cuTrans, + nrows, ncols_C, ncols_Q, + Q_raw, ldq, tau_raw, C_raw, ldc, + work_raw, lwork, info_); + TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS ); +} + +template +int +CuSolver:: +compute_explicit_Q_lwork(const int m, const int n, const int k, + Scalar A[], const int lda, + const Scalar tau[]) const +{ + auto 
rawHandle = + reinterpret_cast (handle_.getHandle ()); + int lwork = 0; + + using IST = typename CudaValue::type; + const IST* A_raw = reinterpret_cast (A); + const IST* tau_raw = reinterpret_cast (tau); + + using impl_type = RawCuSolver; + const auto status = + impl_type::compute_explicit_Q_lwork (rawHandle, m, n, k, + A_raw, lda, tau_raw, &lwork); + TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS ); + return lwork; +} + +template +void +CuSolver:: +compute_explicit_Q(const int m, const int n, const int k, + Scalar A[], const int lda, + const Scalar tau[], + Scalar work[], const int lwork) const +{ + auto rawHandle = + reinterpret_cast (handle_.getHandle ()); + using IST = typename CudaValue::type; + IST* A_raw = reinterpret_cast (A); + const IST* tau_raw = reinterpret_cast (tau); + IST* work_raw = reinterpret_cast (work); + + using impl_type = RawCuSolver; + const auto status = + impl_type::compute_explicit_Q (rawHandle, m, n, k, A_raw, lda, + tau_raw, work_raw, lwork, info_); + TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS ); +} + +template class CuSolver; +template class CuSolver; +#if defined(HAVE_TPETRATSQR_COMPLEX) +template class CuSolver>; +template class CuSolver>; +#endif // defined(HAVE_TPETRATSQR_COMPLEX) + +} // namespace Impl +} // namespace TSQR + +#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolver.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolver.hpp new file mode 100644 index 000000000000..7123b8d4479c --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolver.hpp @@ -0,0 +1,91 @@ +#ifndef TSQR_IMPL_CUSOLVER_HPP +#define TSQR_IMPL_CUSOLVER_HPP + +#include "TpetraTSQR_config.h" +#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER) +#include "Tsqr_Impl_CuBlasHandle.hpp" +#include "Tsqr_Impl_CuSolverHandle.hpp" +#if defined(HAVE_TPETRATSQR_COMPLEX) +# include +#endif // HAVE_TPETRATSQR_COMPLEX +#include "Tsqr_Impl_RawQR.hpp" + +namespace TSQR { 
+namespace Impl { + +template +class CuSolver : public RawQR { +public: + CuSolver(CuSolverHandle handle, int* const info); + + virtual bool wants_device_memory () const { return true; } + + int + compute_QR_lwork(const int nrows, + const int ncols, + Scalar A_raw[], + const int lda) const override; + + void + compute_QR(const int nrows, + const int ncols, + Scalar A[], + const int lda, + Scalar tau[], + Scalar work[], + const int lwork) const override; + + int + apply_Q_factor_lwork(const char side, + const char trans, + const int nrows, + const int ncols_C, + const int ncols_Q, + const Scalar Q[], + const int ldq, + const Scalar tau[], + Scalar C[], + const int ldc) const override; + + void + apply_Q_factor(const char side, + const char trans, + const int nrows, + const int ncols_C, + const int ncols_Q, + const Scalar Q[], + const int ldq, + const Scalar tau[], + Scalar C[], + const int ldc, + Scalar work[], + const int lwork) const override; + + int + compute_explicit_Q_lwork(const int m, const int n, const int k, + Scalar A[], const int lda, + const Scalar tau[]) const override; + + void + compute_explicit_Q(const int m, const int n, const int k, + Scalar A[], const int lda, + const Scalar tau[], + Scalar work[], const int lwork) const override; + +private: + CuSolverHandle handle_; + int* info_; // DEVICE MEMORY +}; + +extern template class CuSolver; +extern template class CuSolver; +#if defined(HAVE_TPETRATSQR_COMPLEX) +extern template class CuSolver>; +extern template class CuSolver>; +#endif // defined(HAVE_TPETRATSQR_COMPLEX) + +} // namespace Impl +} // namespace TSQR + +#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER +#endif // TSQR_IMPL_CUSOLVER_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolverHandle.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolverHandle.cpp new file mode 100644 index 000000000000..23be0a6cec51 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolverHandle.cpp @@ -0,0 +1,38 @@ +#include 
"Tsqr_Impl_CuSolverHandle.hpp" + +#ifdef HAVE_TPETRATSQR_CUSOLVER +#include "Kokkos_Core.hpp" +#include "Teuchos_Assert.hpp" +#include + +namespace TSQR { +namespace Impl { + +cusolverDnHandle_t cuSolverRawHandle_ = nullptr; + +CuSolverHandle::CuSolverHandle (void* handle) : + handle_ (handle) +{} + +CuSolverHandle CuSolverHandle::getSingleton () +{ + static int called_before = 0; + if (called_before == 0) { + auto finalizer = [] () { + if (cuSolverRawHandle_ != nullptr) { + (void) cusolverDnDestroy (cuSolverRawHandle_); + cuSolverRawHandle_ = nullptr; + } + }; + Kokkos::push_finalize_hook (finalizer); + auto status = cusolverDnCreate (&cuSolverRawHandle_); + TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS ); + called_before = 1; + } + TEUCHOS_ASSERT( cuSolverRawHandle_ != nullptr ); + return CuSolverHandle (cuSolverRawHandle_); +} + +} // namespace Impl +} // namespace TSQR +#endif // HAVE_TPETRATSQR_CUSOLVER diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolverHandle.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolverHandle.hpp new file mode 100644 index 000000000000..802f81e3c742 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolverHandle.hpp @@ -0,0 +1,33 @@ +#ifndef TSQR_IMPL_CUSOLVERHANDLE_HPP +#define TSQR_IMPL_CUSOLVERHANDLE_HPP + +#include "TpetraTSQR_config.h" +#ifdef HAVE_TPETRATSQR_CUSOLVER + +namespace TSQR { +namespace Impl { + +class CuSolverHandle { +private: + // This is actually a cusolverDnHandle_t, which is a pointer type. + void* handle_ {nullptr}; + + CuSolverHandle (void* handle); + +public: + static CuSolverHandle getSingleton (); + + // This is not really encapsulation, because the "handle" type is + // just a pointer. However, it lets us define cuSolver wrapper + // functions without needing to make them friends of CuSolverHandle. 
+ void* getHandle () const { + return handle_; + } +}; + +} // namespace Impl +} // namespace TSQR + +#endif // HAVE_TPETRATSQR_CUSOLVER + +#endif // TSQR_IMPL_CUSOLVERHANDLE_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuTypes.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuTypes.cpp new file mode 100644 index 000000000000..edccc391d01a --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuTypes.cpp @@ -0,0 +1,33 @@ +#include "Tsqr_Impl_CuTypes.hpp" +#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER) + +namespace TSQR { +namespace Impl { + +cublasSideMode_t cuBlasSide (const char side) +{ + if (side == 'L' || side == 'l') { + return CUBLAS_SIDE_LEFT; + } + else { + return CUBLAS_SIDE_RIGHT; + } +} + +cublasOperation_t cuBlasTrans (const char trans) +{ + if (trans == 'C' || trans == 'c') { + return CUBLAS_OP_C; + } + else if (trans == 'T' || trans == 't') { + return CUBLAS_OP_T; + } + else { + return CUBLAS_OP_N; + } +} + +} // namespace Impl +} // namespace TSQR + +#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuTypes.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuTypes.hpp new file mode 100644 index 000000000000..6f271895dc08 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuTypes.hpp @@ -0,0 +1,96 @@ +#ifndef TSQR_IMPL_CUTYPES_HPP +#define TSQR_IMPL_CUTYPES_HPP + +#include "TpetraTSQR_config.h" +#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER) +#include // for cublasSideMode_t etc. 
+#include +#if defined(HAVE_TPETRATSQR_COMPLEX) +# include +#endif // HAVE_TPETRATSQR_COMPLEX + +namespace TSQR { +namespace Impl { + +template +struct CudaValue {}; + +template<> +struct CudaValue { + using type = double; + + static type makeValue (const double x) { + return x; + } + + static bool arrayCorrectlyAligned (const double* const /* x */) { + return true; + } +}; + +template<> +struct CudaValue { + using type = float; + + static type makeValue (const float x) { + return x; + } + + static bool arrayCorrectlyAligned (const double* const /* x */) { + return true; + } +}; + +#if defined(HAVE_TPETRATSQR_COMPLEX) +// FIXME (mfh 10 Dec 2019) CUDA's built-in complex types must be +// aligned to the whole type, not just to double or float (as with +// std::complex or (currently) Kokkos::complex). +template<> +struct CudaValue> { + using type = cuDoubleComplex; + + static type makeValue (const std::complex x) { + return make_cuDoubleComplex (std::real (x), std::imag (x)); + } + + static bool + arrayCorrectlyAligned (const std::complex* const x) + { + // CUDA requires arrays of complex to be aligned to the full type, + // not just to one of the two numbers (as with std::complex). + constexpr size_t requiredAlignment = + sizeof (std::complex); + return x == nullptr || + reinterpret_cast (x) % requiredAlignment == 0; + } +}; + +template<> +struct CudaValue> { + using type = cuFloatComplex; + + static type makeValue (const std::complex x) { + return make_cuFloatComplex (std::real (x), std::imag (x)); + } + + static bool + arrayCorrectlyAligned (const std::complex* const x) + { + // CUDA requires arrays of complex to be aligned to the full type, + // not just to one of the two numbers (as with std::complex). 
+ constexpr size_t requiredAlignment = + sizeof (std::complex); + return x == nullptr || + reinterpret_cast (x) % requiredAlignment == 0; + } +}; +#endif // defined(HAVE_TPETRATSQR_COMPLEX) + +cublasSideMode_t cuBlasSide (const char side); +cublasOperation_t cuBlasTrans (const char trans); + +} // namespace Impl +} // namespace TSQR + +#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER +#endif // TSQR_IMPL_CUTYPES_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp index 51d105b6bc68..fed10d62136e 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp @@ -6,119 +6,215 @@ namespace TSQR { namespace Impl { -#define TSQR_IMPL_LAPACK_IMPL( Scalar ) \ -void Lapack:: \ -LARNV(const int idist, int seed[], const int n, \ - value_type v[]) const \ -{ \ - Teuchos::LAPACK lapack; \ - lapack.LARNV(idist, seed, n, v); \ -} \ - \ -void Lapack:: \ -POTRF(const char UPLO, const int n, \ - value_type A[], const int lda) const \ -{ \ - Teuchos::LAPACK lapack; \ - int info = 0; \ - lapack.POTRF(UPLO, n, A, lda, &info); \ - if (info != 0) { \ - std::ostringstream os; \ - os << "LAPACK POTRF (Cholesky factorization) " \ - << "failed with INFO = " << info << "."; \ - throw std::logic_error (os.str ()); \ - } \ -} \ - \ -void Lapack:: \ -GESVD(const char JOBU, const char JOBVT, \ - const int m, const int n, \ - value_type A[], const int lda, \ - magnitude_type S[], value_type U[], const int ldu, \ - value_type V[], const int ldv, \ - value_type WORK[], const int lwork, \ - magnitude_type RWORK[]) const \ -{ \ - Teuchos::LAPACK lapack; \ - int info = 0; \ - lapack.GESVD(JOBU, JOBVT, m, n, A, lda, S, \ - U, ldu, V, ldv, WORK, lwork, RWORK, &info); \ - if (info != 0) { \ - std::ostringstream os; \ - os << "LAPACK GESVD (singular value decomposition) " \ - << "failed with INFO = " << info << "."; \ - throw std::logic_error (os.str ()); \ - } \ -} \ - \ -void Lapack:: \ 
-LARFG(const int n, value_type& alpha, value_type x[], \ - const int incx, value_type& tau) const \ -{ \ - Teuchos::LAPACK lapack; \ - lapack.LARFG(n, &alpha, x, incx, &tau); \ -} \ - \ -void Lapack:: \ -compute_QR(const int m, const int n, value_type A[], const int lda, \ - value_type TAU[], value_type WORK[], const int lwork) const \ -{ \ - Teuchos::LAPACK lapack; \ - int info = 0; \ - lapack.GEQRF(m, n, A, lda, TAU, WORK, lwork, &info); \ - if (info != 0) { \ - std::ostringstream os; \ - os << "LAPACK GEQRF (QR factorization) failed with INFO = " \ - << info << "."; \ - throw std::logic_error (os.str()); \ - } \ -} \ - \ -void Lapack:: \ -apply_Q_factor(const char SIDE, const char TRANS, \ - const int m, const int n, const int k, \ - const value_type A[], const int lda, \ - const value_type TAU[], \ - value_type C[], const int ldc, \ - value_type WORK[], const int lwork) const \ -{ \ - Teuchos::LAPACK lapack; \ - int info = 0; \ - value_type* A_nc = const_cast(A); \ - lapack.UNMQR(SIDE, TRANS, m, n, k, A_nc, lda, TAU, C, ldc, WORK, \ - lwork, &info); \ - if (info != 0) { \ - std::ostringstream os; \ - os << "LAPACK UNMQR (apply Q factor from GEQRF) failed with " \ - "INFO = " << info << "."; \ - throw std::logic_error (os.str()); \ - } \ -} \ - \ -void Lapack:: \ -compute_explicit_Q(const int m, const int n, const int k, \ - value_type A[], const int lda, \ - const value_type TAU[], value_type WORK[], \ - const int lwork) const \ -{ \ - Teuchos::LAPACK lapack; \ - int info = 0; \ - lapack.UNGQR(m, n, k, A, lda, TAU, WORK, lwork, &info); \ - if (info != 0) { \ - std::ostringstream os; \ - os << "LAPACK UNGQR (compute explicit Q factor from GEQRF) " \ - "failed with INFO = " << info << "."; \ - throw std::logic_error (os.str()); \ - } \ +template +void Lapack:: +LARNV(const int idist, int seed[], const int n, + value_type v[]) const +{ + Teuchos::LAPACK lapack; + lapack.LARNV(idist, seed, n, v); } -TSQR_IMPL_LAPACK_IMPL( float ) -TSQR_IMPL_LAPACK_IMPL( double ) 
+template +void Lapack:: +POTRF(const char UPLO, const int n, + value_type A[], const int lda) const +{ + Teuchos::LAPACK lapack; + int info = 0; + lapack.POTRF(UPLO, n, A, lda, &info); + if (info != 0) { + std::ostringstream os; + os << "LAPACK POTRF (Cholesky factorization) " + << "failed with INFO = " << info << "."; + throw std::logic_error (os.str ()); + } +} + +template +void Lapack:: +GESVD(const char JOBU, const char JOBVT, + const int m, const int n, + value_type A[], const int lda, + magnitude_type S[], value_type U[], const int ldu, + value_type V[], const int ldv, + value_type WORK[], const int lwork, + magnitude_type RWORK[]) const +{ + Teuchos::LAPACK lapack; + int info = 0; + lapack.GESVD(JOBU, JOBVT, m, n, A, lda, S, + U, ldu, V, ldv, WORK, lwork, RWORK, &info); + if (info != 0) { + std::ostringstream os; + os << "LAPACK GESVD (singular value decomposition) " + << "failed with INFO = " << info << "."; + throw std::logic_error (os.str ()); + } +} + +template +void Lapack:: +LARFG(const int n, value_type& alpha, value_type x[], + const int incx, value_type& tau) const +{ + Teuchos::LAPACK lapack; + lapack.LARFG(n, &alpha, x, incx, &tau); +} + +template +int Lapack:: +compute_QR_lwork (const int m, const int n, + value_type A[], const int lda) const +{ + Teuchos::LAPACK lapack; + Scalar WORK {}; + int lwork = -1; + int info = 0; + lapack.GEQRF(m, n, A, lda, nullptr, &WORK, lwork, &info); + if (info != 0) { + std::ostringstream os; + os << "LAPACK GEQRF (QR factorization) LWORK query " + "failed with INFO = " << info << "."; + throw std::logic_error (os.str ()); + } + using STS = Teuchos::ScalarTraits; + using mag_type = typename STS::magnitudeType; + lwork = mag_type (STS::real (WORK)); + if (lwork < mag_type {}) { + std::ostringstream os; + os << "LAPACK GEQRF (QR factorization) LWORK query " + "returned INFO=0, but WORK=" << lwork << " < 0."; + throw std::logic_error (os.str ()); + } + return lwork; +} + +template +void Lapack:: +compute_QR(const int 
m, const int n, value_type A[], const int lda, + value_type TAU[], value_type WORK[], const int lwork) const +{ + Teuchos::LAPACK lapack; + int info = 0; + lapack.GEQRF(m, n, A, lda, TAU, WORK, lwork, &info); + if (info != 0) { + std::ostringstream os; + os << "LAPACK GEQRF (QR factorization) failed with INFO = " + << info << "."; + throw std::logic_error (os.str()); + } +} + +template +int Lapack:: +apply_Q_factor_lwork(const char SIDE, const char TRANS, + const int m, const int n, const int k, + const value_type A[], const int lda, + const value_type TAU[], + value_type C[], const int ldc) const +{ + Teuchos::LAPACK lapack; + value_type WORK {}; + int lwork = -1; + int info = 0; + value_type* A_nc = const_cast(A); + lapack.UNMQR(SIDE, TRANS, m, n, k, A_nc, lda, TAU, C, ldc, &WORK, + lwork, &info); + if (info != 0) { + std::ostringstream os; + os << "LAPACK UNMQR (apply Q factor from GEQRF) LWORK query " + "failed with INFO = " << info << "."; + throw std::logic_error (os.str()); + } + using STS = Teuchos::ScalarTraits; + using mag_type = typename STS::magnitudeType; + lwork = mag_type (STS::real (WORK)); + if (lwork < mag_type {}) { + std::ostringstream os; + os << "LAPACK UNMQR (apply Q factor from GEQRF) LWORK query " + "returned INFO=0, but WORK=" << lwork << " < 0."; + throw std::logic_error (os.str ()); + } + return lwork; +} + +template +void Lapack:: +apply_Q_factor(const char SIDE, const char TRANS, + const int m, const int n, const int k, + const value_type A[], const int lda, + const value_type TAU[], + value_type C[], const int ldc, + value_type WORK[], const int lwork) const +{ + Teuchos::LAPACK lapack; + int info = 0; + value_type* A_nc = const_cast(A); + lapack.UNMQR(SIDE, TRANS, m, n, k, A_nc, lda, TAU, C, ldc, WORK, + lwork, &info); + if (info != 0) { + std::ostringstream os; + os << "LAPACK UNMQR (apply Q factor from GEQRF) failed with " + "INFO = " << info << "."; + throw std::logic_error (os.str()); + } +} + +template +int Lapack:: 
+compute_explicit_Q_lwork (const int m, const int n, const int k, + value_type A[], const int lda, + const value_type TAU[]) const +{ + Teuchos::LAPACK lapack; + Scalar WORK {}; + int lwork = -1; + int info = 0; + lapack.UNGQR(m, n, k, A, lda, TAU, &WORK, lwork, &info); + if (info != 0) { + std::ostringstream os; + os << "LAPACK UNGQR (compute explicit Q factor from GEQRF) " + "LWORK query failed with INFO = " << info << "."; + throw std::logic_error (os.str()); + } + using STS = Teuchos::ScalarTraits; + using mag_type = typename STS::magnitudeType; + lwork = mag_type (STS::real (WORK)); + if (lwork < mag_type {}) { + std::ostringstream os; + os << "LAPACK UNGQR (compute explicit Q factor form GEQRF) " + "LWORK query returned INFO=0, but WORK=" << lwork << " < 0."; + throw std::logic_error (os.str ()); + } + return lwork; +} + +template +void Lapack:: +compute_explicit_Q(const int m, const int n, const int k, + value_type A[], const int lda, + const value_type TAU[], value_type WORK[], + const int lwork) const +{ + Teuchos::LAPACK lapack; + int info = 0; + lapack.UNGQR(m, n, k, A, lda, TAU, WORK, lwork, &info); + if (info != 0) { + std::ostringstream os; + os << "LAPACK UNGQR (compute explicit Q factor from GEQRF) " + "failed with INFO = " << info << "."; + throw std::logic_error (os.str()); + } +} + +template class Lapack; +template class Lapack; -#ifdef HAVE_KOKKOSTSQR_COMPLEX -TSQR_IMPL_LAPACK_IMPL( std::complex ) -TSQR_IMPL_LAPACK_IMPL( std::complex ) -#endif // HAVE_KOKKOSTSQR_COMPLEX +#ifdef HAVE_TPETRATSQR_COMPLEX +template class Lapack>; +template class Lapack>; +#endif // HAVE_TPETRATSQR_COMPLEX } // namespace Impl } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp index 392f2aa4f6c4..8dc20b55b4d5 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp @@ -8,73 +8,92 @@ namespace TSQR { namespace Impl { +/// \brief 
Implementation of RawQR that uses the system's LAPACK +/// library via Teuchos::LAPACK. +/// +/// This class provides functions not in RawQR for the sake of +/// CombineNative. CombineNative needs LARFG, but it's not properly +/// part of RawQR. It doesn't make sense to launch a device kernel +/// from host for every column of the matrix, especially not when +/// cuSOLVER already has all the needed QR factorization and apply Q +/// factor functions. template -class Lapack {}; - -// CombineNative needs LARFG, but it's not properly part of RawQR. -// RawQR needs to be able to wrap lots of different functions, -// including whatever cuSOLVER provides. It doesn't make sense to -// launch a device kernel from host for ever column of the matrix, -// especially not when cuSOLVER already has all the needed QR -// factorization and apply Q factor functions. - -#define TSQR_IMPL_LAPACK_DECL( Scalar ) \ -template<> \ -class Lapack : public RawQR { \ -public: \ - using value_type = Scalar; \ - using magnitude_type = decltype(std::abs(Scalar{})); \ - \ - ~Lapack() = default; \ - \ - void \ - compute_QR(const int m, const int n, value_type A[], \ - const int lda, value_type TAU[], value_type WORK[], \ - const int lwork) const override; \ - \ - void \ - apply_Q_factor(const char SIDE, const char TRANS, \ - const int m, const int n, const int k, \ - const value_type A[], const int lda, \ - const value_type TAU[], \ - value_type C[], const int ldc, \ - value_type WORK[], const int lwork) const override; \ - \ - void \ - compute_explicit_Q(const int m, const int n, const int k, \ - value_type A[], const int lda, \ - const value_type TAU[], value_type WORK[], \ - const int lwork) const override; \ - \ - void \ - GESVD(const char JOBU, const char JOBVT, \ - const int m, const int n, \ - value_type A[], const int lda, \ - magnitude_type S[], value_type U[], const int ldu, \ - value_type V[], const int ldv, \ - value_type WORK[], const int lwork, \ - magnitude_type RWORK[]) const; \ - \ 
- void \ - LARFG(const int n, value_type& alpha, value_type x[], \ - const int incx, value_type& tau) const; \ - \ - void \ - POTRF(const char UPLO, const int n, \ - value_type A[], const int lda) const; \ - \ - void \ - LARNV(const int idist, int seed[], const int n, \ - value_type v[]) const; \ +class Lapack : public RawQR { +public: + using value_type = Scalar; + using magnitude_type = decltype(std::abs(Scalar{})); + + // NOTE (mfh 22 Dec 2019) I would normally write "= default;" here, + // but Intel 17 appears to have a bug that requires an explicit + // nondefault definition. See discussion here: + // + // https://github.com/trilinos/Trilinos/pull/6488#issuecomment-568351758 + ~Lapack() override {} + + int + compute_QR_lwork(const int m, const int n, + value_type A[], const int lda) const override; + + void + compute_QR(const int m, const int n, value_type A[], + const int lda, value_type TAU[], value_type WORK[], + const int lwork) const override; + + int + apply_Q_factor_lwork(const char SIDE, const char TRANS, + const int m, const int n, const int k, + const value_type A[], const int lda, + const value_type TAU[], + value_type C[], const int ldc) const override; + + void + apply_Q_factor(const char SIDE, const char TRANS, + const int m, const int n, const int k, + const value_type A[], const int lda, + const value_type TAU[], + value_type C[], const int ldc, + value_type WORK[], const int lwork) const override; + + int + compute_explicit_Q_lwork(const int m, const int n, const int k, + value_type A[], const int lda, + const value_type TAU[]) const override; + + void + compute_explicit_Q(const int m, const int n, const int k, + value_type A[], const int lda, + const value_type TAU[], value_type WORK[], + const int lwork) const override; + + void + GESVD(const char JOBU, const char JOBVT, + const int m, const int n, + value_type A[], const int lda, + magnitude_type S[], value_type U[], const int ldu, + value_type V[], const int ldv, + value_type WORK[], const 
int lwork, + magnitude_type RWORK[]) const; + + void + LARFG(const int n, value_type& alpha, value_type x[], + const int incx, value_type& tau) const; + + void + POTRF(const char UPLO, const int n, + value_type A[], const int lda) const; + + void + LARNV(const int idist, int seed[], const int n, + value_type v[]) const; }; -TSQR_IMPL_LAPACK_DECL( float ) -TSQR_IMPL_LAPACK_DECL( double ) +extern template class Lapack; +extern template class Lapack; -#ifdef HAVE_KOKKOSTSQR_COMPLEX -TSQR_IMPL_LAPACK_DECL( std::complex ) -TSQR_IMPL_LAPACK_DECL( std::complex ) -#endif // HAVE_KOKKOSTSQR_COMPLEX +#ifdef HAVE_TPETRATSQR_COMPLEX +extern template class Lapack>; +extern template class Lapack>; +#endif // HAVE_TPETRATSQR_COMPLEX } // namespace Impl } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp index 307aa103e9a9..f078bb72dec9 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp @@ -13,12 +13,6 @@ namespace Impl { /// CUDA stream instance (cudaStream_t) and a cuSOLVER handle /// (cusolverDnHandle_t). /// -/// WORK size query ("LWORK query") happens as in LAPACK, by passing -/// in lwork = -1. A cuSOLVER Implementation would just check if -/// lwork is -1, and call cusolverDn?geqrf_bufferSize in that case -/// (replace the question mark with S, D, C, or Z as appropriate for -/// the Scalar type). -/// /// Methods are virtual because they are meant to be called from host. /// (For the CUDA case, we plan to make cuSOLVER calls from host; we /// don't need to call QR from device.) @@ -29,6 +23,18 @@ class RawQR { virtual ~RawQR() = default; + /// \brief Whether the subclass takes arrays and pointers as + /// "device" (GPU) memory. + /// + /// Unlike with NodeTsqr, this means all array and pointers, + /// not just "large" ones. + virtual bool wants_device_memory() const { return false; } + + //! Get recommended work array size for compute_QR. 
+ virtual int + compute_QR_lwork(const int m, const int n, + value_type A[], const int lda) const = 0; + //! Compute QR factorization of a general m by n matrix A. virtual void compute_QR(const int m, const int n, @@ -36,6 +42,14 @@ class RawQR { value_type TAU[], value_type WORK[], const int lwork) const = 0; + //! Get recommended work array size for apply_Q_factor. + virtual int + apply_Q_factor_lwork(const char SIDE, const char TRANS, + const int m, const int n, const int k, + const value_type A[], const int lda, + const value_type TAU[], + value_type C[], const int ldc) const = 0; + /// \brief Apply Householder reflectors. /// /// Overwrite the general complex m by n matrix C with the product @@ -52,6 +66,12 @@ class RawQR { value_type C[], const int ldc, value_type WORK[], const int lwork) const = 0; + //! Get recommended work array size for compute_explicit_Q. + virtual int + compute_explicit_Q_lwork(const int m, const int n, const int k, + value_type A[], const int lda, + const value_type TAU[]) const = 0; + /// \brief Compute explicit QR factor from QR factorization (GEQRF). 
/// /// Generate the m by n matrix Q with orthonormal (or unitary, if diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp index bc19ef78be03..25219f6d28b7 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp @@ -90,10 +90,10 @@ TRSM(const Teuchos::ESide side, const Teuchos::EUplo uplo, \ TSQR_IMPL_SYSTEMBLAS_IMPL( float ) TSQR_IMPL_SYSTEMBLAS_IMPL( double ) -#ifdef HAVE_KOKKOSTSQR_COMPLEX +#ifdef HAVE_TPETRATSQR_COMPLEX TSQR_IMPL_SYSTEMBLAS_IMPL( std::complex ) TSQR_IMPL_SYSTEMBLAS_IMPL( std::complex ) -#endif // HAVE_KOKKOSTSQR_COMPLEX +#endif // HAVE_TPETRATSQR_COMPLEX } // namespace Impl } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp index 1e49ddc266c8..7b1599e41df1 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp @@ -58,10 +58,10 @@ public: \ TSQR_IMPL_SYSTEMBLAS_DECL( float ) TSQR_IMPL_SYSTEMBLAS_DECL( double ) -#ifdef HAVE_KOKKOSTSQR_COMPLEX +#ifdef HAVE_TPETRATSQR_COMPLEX TSQR_IMPL_SYSTEMBLAS_DECL( std::complex ) TSQR_IMPL_SYSTEMBLAS_DECL( std::complex ) -#endif // HAVE_KOKKOSTSQR_COMPLEX +#endif // HAVE_TPETRATSQR_COMPLEX } // namespace Impl } // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp deleted file mode 100644 index 71b823b19558..000000000000 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp +++ /dev/null @@ -1,1728 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -/// \file Tsqr_KokkosNodeTsqr.hpp -/// \brief Parallel intranode TSQR implemented using Kokkos::parallel_for. 
- -#ifndef __TSQR_KokkosNodeTsqr_hpp -#define __TSQR_KokkosNodeTsqr_hpp - -#include "Tsqr_CacheBlocker.hpp" -#include "Tsqr_Combine.hpp" -#include "Tsqr_NodeTsqr.hpp" -#include "Tsqr_Impl_SystemBlas.hpp" - -#include "Teuchos_ParameterListAcceptorDefaultBase.hpp" -#include "Kokkos_Core.hpp" - -namespace TSQR { - namespace details { - /// \brief Half-exclusive range of my partition's cache block indices. - /// - /// \c FactorFirstPass (used by the factor() method of \c - /// KokkosNodeTsqr) breaks up the matrix into contiguous - /// partitions of row blocks. The index argument of Kokkos' - /// parallel_for is the (zero-based) partition index. This - /// function returns the half-exclusive range of the cache block - /// indices belonging to the partition partitionIndex. - /// - /// \param numRows [in] Number of rows in the matrix. - /// \param numCols [in] Number of columns in the matrix. - /// \param partitionIndex [in] Zero-based index of the partition. - /// This is specifically an int and not a LocalOrdinal, because - /// partition indices are arguments to Kokkos Node API methods - /// parallel_for and parallel_reduce. Cache block indices are - /// of LocalOrdinal type and should not be mixed with partition - /// indices, even though in most cases LocalOrdinal == int. - /// \param numPartitions [in] Total number of partitions; a - /// positive integer. - /// \param strategy [in] The cache blocking strategy to use. - /// - /// \return (start cache block index, end cache block index). - /// This is a half-exclusive range: it does not include the end - /// point. Thus, if the two indices are equal, the range is - /// empty. 
- template - std::pair - cacheBlockIndexRange (const LocalOrdinal numRows, - const LocalOrdinal numCols, - const int partitionIndex, - const int numPartitions, - const CacheBlockingStrategy& strategy) - { - using LO = LocalOrdinal; - // The input index is a zero-based index of the current - // partition (not the "current cache block" -- a partition - // contains zero or more cache blocks). If the input index is - // out of range, then return, since there is nothing to do. - // - // The nice thing about partitioning over cache blocks is that - // the cache blocking strategy guarantees that exactly one of - // the following is true: - // - // 1. The partition is empty (contains zero cache blocks) - // 2. All cache blocks in the partition are valid (none - // contains more columns than rows) - - // Return an empty partition (an empty cache block range) if - // the partition index is out of range. - if (partitionIndex >= numPartitions) { - return {0, 0}; - } - - const LO numRowsCacheBlock = - strategy.cache_block_num_rows (numCols); - const LO numCacheBlocks = - strategy.num_cache_blocks (numRows, numCols, numRowsCacheBlock); - - // Figure out how many cache blocks my partition contains. If - // the number of partitions doesn't evenly divide the number - // of cache blocks, we spread out the remainder among the - // first few threads. - const LO quotient = numCacheBlocks / numPartitions; - const LO remainder = numCacheBlocks - quotient * numPartitions; - const LO myNumCacheBlocks = (partitionIndex < remainder) ? - (quotient + 1) : quotient; - - // If there are no cache blocks, there is nothing to factor. - // Return an empty cache block range to indicate this. - if (myNumCacheBlocks == 0) { - return {0, 0}; - } - - // Index of my first cache block (inclusive). - const LO myFirstCacheBlockIndex = (partitionIndex < remainder) ? 
- partitionIndex * (quotient+1) : - remainder * (quotient+1) + (partitionIndex - remainder) * quotient; - // Index of my last cache block (exclusive). - const LO myLastCacheBlockIndex = (partitionIndex+1 < remainder) ? - (partitionIndex+1) * (quotient+1) : - remainder * (quotient+1) + (partitionIndex+1 - remainder) * quotient; - TEUCHOS_TEST_FOR_EXCEPTION - (myLastCacheBlockIndex <= myFirstCacheBlockIndex, - std::logic_error, "Partition " << (partitionIndex+1) << " of " - << numPartitions << ": My range of cache block indices [" - << myFirstCacheBlockIndex << ", " << myLastCacheBlockIndex - << ") is empty."); - return {myFirstCacheBlockIndex, myLastCacheBlockIndex}; - } - - - /// \class FactorFirstPass - /// \brief First pass of KokkosNodeTsqr's factorization. - /// \author Mark Hoemmen - template - class FactorFirstPass { - public: - typedef MatView mat_view_type; - - private: - mat_view_type A_; - // While tauArrays_ is shared among tasks (i.e., partitions), - // there are no race conditions among entries, since each - // partition writes its own entry. Ditto for topBlocks_. - std::vector >& tauArrays_; - std::vector& topBlocks_; - CacheBlockingStrategy strategy_; - int numPartitions_; - bool contiguousCacheBlocks_; - - std::vector - factorFirstCacheBlock (Combine& combine, - const mat_view_type& A_top, - std::vector& work) const - { - std::vector tau (A_top.extent(1)); - - // We should only call this if A_top.extent(1) > 0 and therefore - // work.size() > 0, but we've already checked for that, so we - // don't have to check again. 
- combine.factor_first (A_top, tau.data(), work.data()); - return tau; - } - - std::vector - factorCacheBlock (Combine& combine, - const mat_view_type& A_top, - const mat_view_type& A_cur, - std::vector& work) const - { - std::vector tau (A_top.extent(1)); - - // We should only call this if A_top.extent(1) > 0 and therefore - // tau.size() > 0 and work.size() > 0, but we've already - // checked for that, so we don't have to check again. - combine.factor_inner (A_top, A_cur, tau.data(), work.data()); - return tau; - } - - /// \brief Factor the given cache block range using sequential TSQR. - /// - /// \param cbIndices [in] Half-exclusive range of cache block indices. - /// \param partitionIndex [in] Zero-based index of my partition. - /// - /// \return A view of the top block of the cache block range. - mat_view_type - factor (const std::pair cbIndices, - const int partitionIndex) const - { - const char suffix[] = " Please report this bug to the Tpetra developers."; - using cb_range_type = CacheBlockRange; - - // Workspace is created here, because it must not be shared - // among threads. - std::vector work (A_.extent(1)); - - // Range of cache blocks to factor. - cb_range_type cbRange (A_, strategy_, cbIndices.first, - cbIndices.second, contiguousCacheBlocks_); - // Iterator in the forward direction over the range of cache - // blocks to factor. - typedef typename CacheBlockRange::iterator range_iter_type; - range_iter_type cbIter = cbRange.begin(); - - // Remember the top (first) block. - mat_view_type A_top = *cbIter; - if (A_top.empty ()) { - return A_top; - } - TEUCHOS_TEST_FOR_EXCEPTION - (cbIndices.first >= cbIndices.second, std::logic_error, - "FactorFirstPass::factor: A_top is not empty, but the " - "cache block index range " << cbIndices.first << "," - << cbIndices.second << " is empty." << suffix); - - // Current cache block index. - LocalOrdinal curTauIdx = cbIndices.first; - - // Factor the first cache block. 
- Combine combine; - tauArrays_[curTauIdx++] = factorFirstCacheBlock (combine, A_top, work); - - // Move past the first cache block. - ++cbIter; - - // Number of cache block(s) we have factored thus far. - LocalOrdinal count = 1; - - // Factor the remaining cache block(s). - range_iter_type cbEnd = cbRange.end(); - while (cbIter != cbEnd) { - mat_view_type A_cur = *cbIter; - // Iteration over cache blocks of a partition should - // always result in nonempty cache blocks. - TEUCHOS_TEST_FOR_EXCEPTION - (A_cur.empty (), std::logic_error, "FactorFirstPass::factor: " - "The current cache block (the " << count << "-th to factor in the " - "range [" << cbIndices.first << "," << cbIndices.second << ") of " - "cache block indices) in partition " << (partitionIndex+1) << " " - "(out of " << numPartitions_ << " partitions) is empty." << suffix); - TEUCHOS_TEST_FOR_EXCEPTION - (static_cast(curTauIdx) >= tauArrays_.size(), - std::logic_error, "FactorFirstPass::factor: curTauIdx (= " - << curTauIdx << ") >= tauArrays_.size() (= " - << tauArrays_.size() << ")." << suffix); - tauArrays_[curTauIdx++] = - factorCacheBlock (combine, A_top, A_cur, work); - ++count; - ++cbIter; - } - return A_top; - } - - public: - /// \brief Constructor - /// - /// \param A [in/out] On input: View of the matrix to factor. - /// On output: (Part of) the implicitly stored Q factor. - /// (The other part is tauArrays.) - /// \param tauArrays [out] Where to write the "TAU" arrays - /// (implicit factorization results) for each cache block. - /// (TAU is what LAPACK's QR factorization routines call this - /// array; see the LAPACK documentation for an explanation.) - /// Indexed by the cache block index; one TAU array per cache - /// block. - /// \param strategy [in] Cache blocking strategy to use. - /// \param numPartitions [in] Number of partitions (positive - /// integer), and therefore the maximum parallelism available - /// to the algorithm. 
Oversubscribing processors is OK, but - /// should not be done to excess. This is an int, and not a - /// LocalOrdinal, because it is the argument to Kokkos' - /// parallel_for. - /// \param contiguousCacheBlocks [in] Whether the cache blocks - /// of A are stored contiguously. - FactorFirstPass (const mat_view_type& A, - std::vector >& tauArrays, - std::vector& topBlocks, - const CacheBlockingStrategy& strategy, - const int numPartitions, - const bool contiguousCacheBlocks = false) : - A_ (A), - tauArrays_ (tauArrays), - topBlocks_ (topBlocks), - strategy_ (strategy), - numPartitions_ (numPartitions), - contiguousCacheBlocks_ (contiguousCacheBlocks) - { - TEUCHOS_TEST_FOR_EXCEPTION(A_.empty(), std::logic_error, - "TSQR::FactorFirstPass constructor: A is empty. " - "Please report this bug to the Kokkos developers."); - TEUCHOS_TEST_FOR_EXCEPTION(numPartitions < 1, std::logic_error, - "TSQR::FactorFirstPass constructor: numPartitions " - "must be positive, but numPartitions = " - << numPartitions << ". Please report this bug to " - "the Kokkos developers."); - } - - /// \brief First pass of intranode TSQR factorization. - /// - /// Invoked by Kokkos' parallel_for template method. This - /// routine parallelizes over contiguous partitions of the - /// matrix. Each partition in turn contains cache blocks. - /// Partitions do not break up cache blocks. (This ensures that - /// the cache blocking scheme is the same as that used by - /// SequentialTsqr, as long as the cache blocking strategies are - /// the same. However, the implicit Q factor is not compatible - /// with that of SequentialTsqr.) - /// - /// This method also saves a view of the top block of the - /// partition in the topBlocks_ array. This is useful for the - /// next factorization pass. - /// - /// \param partitionIndex [in] Zero-based index of the - /// partition. If greater than or equal to the number of - /// partitions, this routine does nothing. 
- void operator() (const int partitionIndex) const - { - if (partitionIndex < 0 || partitionIndex >= numPartitions_ || A_.empty ()) { - return; - } - else { - const std::pair cbIndices = - cacheBlockIndexRange (A_.extent(0), A_.extent(1), partitionIndex, - numPartitions_, strategy_); - // It's legitimate, though suboptimal, for some partitions - // not to get any work to do (in this case, not to get any - // cache blocks to factor). - if (cbIndices.second <= cbIndices.first) { - return; - } else { - topBlocks_[partitionIndex] = factor (cbIndices, partitionIndex); - } - } - } - }; - - /// \class ApplyFirstPass - /// \brief "First" pass of applying KokkosNodeTsqr's implicit Q factor. - /// \author Mark Hoemmen - /// - /// We call this ApplyFirstPass as a reminder that this algorithm - /// has the same form as FactorFirstPass and uses the results of - /// the latter, even though ApplyFirstPass is really the last pass - /// of applying the implicit Q factor. - template - class ApplyFirstPass { - public: - using const_mat_view_type = MatView; - using mat_view_type = MatView; - - private: - ApplyType applyType_; - const_mat_view_type Q_; - const std::vector >& tauArrays_; - const std::vector& topBlocks_; - mat_view_type C_; - CacheBlockingStrategy strategy_; - int numPartitions_; - bool explicitQ_, contiguousCacheBlocks_; - - void - applyFirstCacheBlock (Combine& combine, - const ApplyType& applyType, - const const_mat_view_type& Q_top, - const std::vector& tau, - const mat_view_type& C_top, - std::vector& work) const - { - TEUCHOS_TEST_FOR_EXCEPTION(tau.size() < static_cast (Q_top.extent(1)), - std::logic_error, - "ApplyFirstPass::applyFirstCacheBlock: tau.size() " - "(= " << tau.size() << ") < number of columns " - << Q_top.extent(1) << " in the Q factor. Please " - "report this bug to the Kokkos developers."); - - // If we get this far, it's fair to assume that we have - // checked whether tau and work have nonzero lengths. 
- combine.apply_first (applyType, Q_top, tau.data(), - C_top, work.data()); - } - - void - applyCacheBlock (Combine& combine, - const ApplyType& applyType, - const const_mat_view_type& Q_cur, - const std::vector& tau, - const mat_view_type& C_top, - const mat_view_type& C_cur, - std::vector& work) const - { - TEUCHOS_TEST_FOR_EXCEPTION - (tau.size() < static_cast (Q_cur.extent(1)), - std::logic_error, "ApplyFirstPass::applyCacheBlock: tau.size() " - "(= " << tau.size() << ") < number of columns " - << Q_cur.extent(1) << " in the Q factor." - " Please report this bug to the Tpetra developers."); - - // If we get this far, it's fair to assume that we have - // checked whether tau and work have nonzero lengths. - combine.apply_inner (applyType, C_cur.extent(0), C_cur.extent(1), - Q_cur.extent(1), Q_cur.data(), Q_cur.stride(1), - tau.data(), - C_top.data(), C_top.stride(1), - C_cur.data(), C_cur.stride(1), - work.data()); - } - - /// \fn apply - /// \brief Apply the sequential part of the implicit Q factor to C. - /// - /// \param applyType [in] Whether we are applying Q, Q^T, or Q^H. - /// \param cbIndices [in] Half-exclusive range of cache block - /// indices. - /// \param partitionIndex [in] The argument to \c operator(); the - /// index of the partition which instance of ApplyFirstPass - /// is currently processing. - void - apply (const ApplyType& applyType, - const std::pair cbIndices, - const int partitionIndex) const - { - using const_range_type = CacheBlockRange; - using range_type = CacheBlockRange; - const char suffix[] = " Please report this bug to the Tpetra developers."; - - if (cbIndices.first >= cbIndices.second) { - return; // My range of cache blocks is empty; nothing to do - } - - // Q_range: Range of cache blocks in the Q factor. - // C_range: Range of cache blocks in the matrix C. 
- const_range_type Q_range (Q_, strategy_, - cbIndices.first, cbIndices.second, - contiguousCacheBlocks_); - range_type C_range (C_, strategy_, - cbIndices.first, cbIndices.second, - contiguousCacheBlocks_); - TEUCHOS_TEST_FOR_EXCEPTION - (Q_range.empty(), std::logic_error, - "Q_range is empty, but the range of cache block " - "indices [" << cbIndices.first << ", " - << cbIndices.second << ") is not empty." << suffix); - TEUCHOS_TEST_FOR_EXCEPTION - (C_range.empty(), std::logic_error, - "C_range is empty, but the range of cache block " - "indices [" << cbIndices.first << ", " - << cbIndices.second << ") is not empty." << suffix); - - // Task-local workspace array of length C_.extent(1). Workspace - // must be per task, else there will be race conditions as - // different tasks attempt to write to and read from the same - // workspace simultaneously. - std::vector work (C_.extent(1)); - - Combine combine; - if (applyType.transposed ()) { - auto Q_rangeIter = Q_range.begin(); - auto C_rangeIter = C_range.begin(); - TEUCHOS_TEST_FOR_EXCEPTION - (Q_rangeIter == Q_range.end(), std::logic_error, - "The Q cache block range claims to be nonempty, " - "but the iterator range is empty." << suffix); - TEUCHOS_TEST_FOR_EXCEPTION - (C_rangeIter == C_range.end(), std::logic_error, - "The C cache block range claims to be nonempty, " - "but the iterator range is empty." << suffix); - - // Q_top: Topmost cache block in the cache block range of Q. - // C_top: Topmost cache block in the cache block range of C. - const_mat_view_type Q_top = *Q_rangeIter; - mat_view_type C_top = *C_rangeIter; - if (explicitQ_) { - deep_copy (C_top, Scalar {}); - if (partitionIndex == 0) { - for (LocalOrdinal j = 0; j < C_top.extent(1); ++j) { - C_top(j,j) = Scalar (1.0); - } - } - } - LocalOrdinal curTauIndex = cbIndices.first; - - // Apply the first block. - applyFirstCacheBlock (combine, applyType, Q_top, - tauArrays_[curTauIndex++], C_top, work); - - // Apply the rest of the blocks, if any. 
- ++Q_rangeIter; - ++C_rangeIter; - while (Q_rangeIter != Q_range.end ()) { - TEUCHOS_TEST_FOR_EXCEPTION - (C_rangeIter == C_range.end(), std::logic_error, - "When applying Q^T or Q^H to C: The Q cache " - "block iterator is not yet at the end, but " - "the C cache block iterator is." << suffix); - const_mat_view_type Q_cur = *Q_rangeIter; - mat_view_type C_cur = *C_rangeIter; - ++Q_rangeIter; - ++C_rangeIter; - if (explicitQ_) { - deep_copy (C_cur, Scalar {}); - } - applyCacheBlock (combine, applyType, Q_cur, - tauArrays_[curTauIndex++], - C_top, C_cur, work); - } - } - else { - // Q_top: Topmost cache block in the cache block range of Q. - // C_top: Topmost cache block in the cache block range of C. - const_mat_view_type Q_top = *(Q_range.begin()); - mat_view_type C_top = *(C_range.begin()); - - if (explicitQ_) { - // We've already filled the top ncols x ncols block of - // C_top with data (that's the result of applying the - // internode part of the Q factor via DistTsqr). However, - // we still need to fill the rest of C_top (everything but - // the top ncols rows of C_top) with zeros. - mat_view_type C_top_rest (C_top.extent(0) - C_top.extent(1), - C_top.extent(1), - C_top.data() + C_top.extent(1), - C_top.stride(1)); - deep_copy (C_top_rest, Scalar {}); - } - LocalOrdinal curTauIndex = cbIndices.second-1; - - // When applying Q (rather than Q^T or Q^H), we apply the - // cache blocks in reverse order. - typename const_range_type::iterator Q_rangeIter = Q_range.rbegin(); - typename range_type::iterator C_rangeIter = C_range.rbegin(); - TEUCHOS_TEST_FOR_EXCEPTION - (Q_rangeIter == Q_range.rend(), std::logic_error, - "The Q cache block range claims to be nonempty, " - "but the iterator range is empty." << suffix); - TEUCHOS_TEST_FOR_EXCEPTION - (C_rangeIter == C_range.rend(), std::logic_error, - "The C cache block range claims to be nonempty, " - "but the iterator range is empty." 
<< suffix); - - // Equality of cache block range iterators only tests the - // cache block index, not reverse-ness. This means we can - // compare a reverse-direction iterator (Q_rangeIter) with - // a forward-direction iterator (Q_range.begin()). - // - // We do this because we need to handle the topmost block - // of Q_range separately (applyFirstCacheBlock(), rather - // than applyCacheBlock()). - while (Q_rangeIter != Q_range.begin ()) { - const_mat_view_type Q_cur = *Q_rangeIter; - mat_view_type C_cur = *C_rangeIter; - - if (explicitQ_) { - deep_copy (C_cur, Scalar {}); - } - TEUCHOS_TEST_FOR_EXCEPTION - (curTauIndex < cbIndices.first, std::logic_error, - "curTauIndex=" << curTauIndex << " out of valid " - "range [" << cbIndices.first << "," - << cbIndices.second << ")." << suffix); - applyCacheBlock (combine, applyType, Q_cur, - tauArrays_[curTauIndex--], - C_top, C_cur, work); - ++Q_rangeIter; - ++C_rangeIter; - } - TEUCHOS_TEST_FOR_EXCEPTION - (curTauIndex < cbIndices.first, std::logic_error, - "curTauIndex=" << curTauIndex << " out of valid range " - "[" << cbIndices.first << "," << cbIndices.second << ")." - << suffix); - // Apply the first block. - applyFirstCacheBlock (combine, applyType, Q_top, - tauArrays_[curTauIndex--], C_top, work); - } - } - - public: - /// \brief Constructor - /// - /// \param applyType [in] Whether we are applying Q, Q^T, or Q^H. - /// \param A [in/out] On input: View of the matrix to factor. - /// On output: (Part of) the implicitly stored Q factor. - /// (The other part is tauArrays.) - /// \param tauArrays [in] Where to write the "TAU" arrays - /// (implicit factorization results) for each cache block. - /// (TAU is what LAPACK's QR factorization routines call this - /// array; see the LAPACK documentation for an explanation.) - /// Indexed by the cache block index; one TAU array per cache - /// block. - /// \param strategy [in] Cache blocking strategy to use. 
- /// \param numPartitions [in] Number of partitions (positive - /// integer), and therefore the maximum parallelism available - /// to the algorithm. Oversubscribing processors is OK, but - /// should not be done to excess. This is an int, and not a - /// LocalOrdinal, because it is the argument to Kokkos' - /// parallel_for. - /// \param contiguousCacheBlocks [in] Whether the cache blocks - /// of A are stored contiguously. - ApplyFirstPass (const ApplyType& applyType, - const const_mat_view_type& Q, - const std::vector>& tauArrays, - const std::vector& topBlocks, - const mat_view_type& C, - const CacheBlockingStrategy& strategy, - const int numPartitions, - const bool explicitQ = false, - const bool contiguousCacheBlocks = false) : - applyType_ (applyType), - Q_ (Q), - tauArrays_ (tauArrays), - topBlocks_ (topBlocks), - C_ (C), - strategy_ (strategy), - numPartitions_ (numPartitions), - explicitQ_ (explicitQ), - contiguousCacheBlocks_ (contiguousCacheBlocks) - {} - - /// \brief First pass of applying intranode TSQR's implicit Q factor. - /// - /// Invoked by Kokkos' parallel_for template method. This - /// routine parallelizes over contiguous partitions of the C - /// matrix. Each partition in turn contains cache blocks. We - /// take care not to break up the cache blocks among partitions; - /// this ensures that the cache blocking scheme is the same as - /// SequentialTsqr uses. (However, the implicit Q factor is not - /// compatible with that of SequentialTsqr.) - /// - /// \param partitionIndex [in] Zero-based index of the partition - /// which this instance of ApplyFirstPass is currently - /// processing. If greater than or equal to the number of - /// partitions, this routine does nothing. 
- void operator() (const int partitionIndex) const - { - const char prefix[] = "TSQR::ApplyFirstPass::operator(): "; - const char suffix[] = " Please report this bug to the Tpetra developers."; - - if (partitionIndex < 0 || partitionIndex >= numPartitions_ || - Q_.empty () || C_.empty ()) { - return; - } - - // We use the same cache block indices for Q and for C. - std::pair cbIndices = - cacheBlockIndexRange (Q_.extent(0), Q_.extent(1), partitionIndex, - numPartitions_, strategy_); - if (cbIndices.second <= cbIndices.first) - return; - { - std::pair cbInds (size_t (cbIndices.first), - size_t (cbIndices.second)); - TEUCHOS_TEST_FOR_EXCEPTION - (cbIndices.first < LocalOrdinal(0), std::logic_error, - prefix << "cacheBlockIndexRange(" << Q_.extent (0) << ", " - << Q_.extent(1) << ", " << partitionIndex << ", " - << numPartitions_ << ", strategy) returned a cache block " - "range " << cbIndices.first << "," << cbIndices.second << - " with negative starting index." << suffix); - TEUCHOS_TEST_FOR_EXCEPTION - (cbInds.second > tauArrays_.size (), std::logic_error, - prefix << "cacheBlockIndexRange(" << Q_.extent (0) << ", " - << Q_.extent(1) << ", " << partitionIndex << ", " - << numPartitions_ << ", strategy) returned a cache block " - "range" << cbIndices.first << "," << cbIndices.second << - " with starting index larger than the number of tau " - "arrays " << tauArrays_.size () << "." << suffix); - } - apply (applyType_, cbIndices, partitionIndex); - } - }; - - /// \class CacheBlockFunctor - /// \brief Kokkos functor for KokkosNodeTsqr's (un_)cache_block() methods. 
- /// \author Mark Hoemmen - template - class CacheBlockFunctor { - private: - using const_mat_view_type = MatView; - using mat_view_type = MatView; - using const_range_type = CacheBlockRange; - using range_type = CacheBlockRange; - - const_mat_view_type A_in_; - mat_view_type A_out_; - CacheBlockingStrategy strategy_; - int numPartitions_; - bool unblock_; - - /// \brief Copy one range of cache blocks into another. - /// - /// \param cbInputRange [in] Range of input cache blocks. - /// \param cbOutputRange [out] Range of output cache blocks. - void copyRange (const_range_type& cbInputRange, - range_type& cbOutputRange) const - { - typedef typename const_range_type::iterator input_iter_type; - typedef typename range_type::iterator output_iter_type; - - input_iter_type inputIter = cbInputRange.begin(); - output_iter_type outputIter = cbOutputRange.begin(); - - input_iter_type inputEnd = cbInputRange.end(); - // TODO (mfh 29 Jun 2012) In a debug build, check in the loop - // below whether outputIter == cbOutputRange.end(). If so, - // throw std::logic_error. Don't declare outputEnd unless - // we're in a debug build, because otherwise the compiler may - // report warnings (gcc 4.5 doesn't; gcc 4.6 does). - // output_iter_type outputEnd = cbOutputRange.end(); - - while (inputIter != inputEnd) { - const_mat_view_type A_in_cur = *inputIter; - mat_view_type A_out_cur = *outputIter; - deep_copy (A_out_cur, A_in_cur); - ++inputIter; - ++outputIter; - } - } - - public: - /// \brief Constructor - /// - /// \param A_in [in] The matrix to (un-)cache-block. - /// \param A_out [in/out] Result of (un-)cache-blocking the - /// matrix A_in. - /// \param strategy [in] Cache blocking strategy. - /// \param numPartitions [in] Number of partitions; maximum - /// available parallelism. - /// \param unblock [in] If false, cache-block A_in (a matrix in - /// column-major order) into A_out. If true, un-cache-block - /// A_in into A_out (a matrix in column-major order). 
- CacheBlockFunctor (const const_mat_view_type A_in, - const mat_view_type A_out, - const CacheBlockingStrategy& strategy, - const int numPartitions, - const bool unblock) : - A_in_ (A_in), - A_out_ (A_out), - strategy_ (strategy), - numPartitions_ (numPartitions), - unblock_ (unblock) - { - TEUCHOS_TEST_FOR_EXCEPTION - (A_in_.extent(0) != A_out_.extent(0) || - A_in_.extent(1) != A_out_.extent(1), - std::invalid_argument, - "A_in and A_out do not have the same dimensions: " - "A_in is " << A_in_.extent(0) << " by " - << A_in_.extent(1) << ", but A_out is " - << A_out_.extent(0) << " by " - << A_out_.extent(1) << "."); - TEUCHOS_TEST_FOR_EXCEPTION - (numPartitions_ < 1, std::invalid_argument, - "The number of partitions " << numPartitions_ - << " is not a positive integer."); - } - - /// \brief Method called by Kokkos::parallel_for. - /// - /// \param partitionIndex [in] Zero-based index of the partition - /// of the matrix. We parallelize over partitions. - /// Partitions respect cache blocks. - void operator() (const int partitionIndex) const - { - if (partitionIndex < 0 || partitionIndex >= numPartitions_ || - A_in_.empty()) { - return; - } - else { - using index_range_type = std::pair; - const index_range_type cbIndices = - cacheBlockIndexRange (A_in_.extent (0), A_in_.extent (1), - partitionIndex, numPartitions_, strategy_); - // It's perfectly legal for a partitioning to assign zero - // cache block indices to a particular partition. In that - // case, this task has nothing to do. - if (cbIndices.first >= cbIndices.second) { - return; - } - else { - // If unblock_ is false, then A_in_ is in column-major - // order, and we want to cache-block it into A_out_. If - // unblock_ is true, then A_in_ is cache-blocked, and we - // want to un-cache-block it into A_out_ (a matrix in - // column-major order). 
- const_range_type inputRange (A_in_, strategy_, cbIndices.first, - cbIndices.second, unblock_); - range_type outputRange (A_out_, strategy_, cbIndices.first, - cbIndices.second, ! unblock_); - copyRange (inputRange, outputRange); - } - } - } - }; - - /// \class MultFunctor - /// \brief Kokkos functor for \c KokkosNodeTsqr::Q_times_B(). - /// \author Mark Hoemmen - template - class MultFunctor { - private: - using const_mat_view_type = MatView; - using mat_view_type = MatView; - using range_type = CacheBlockRange; - - mat_view_type Q_; - const_mat_view_type B_; - CacheBlockingStrategy strategy_; - int numPartitions_; - bool contiguousCacheBlocks_; - - // This uses SystemBlas for now. - // In the future, we may want to use a TPL. - // That means we could switch to RawBlas. - void - multBlock (Impl::SystemBlas& blas, - const mat_view_type& Q_cur, - Matrix& Q_temp) const - { - using Teuchos::NO_TRANS; - const LocalOrdinal numCols = Q_cur.extent (1); - - // GEMM doesn't like aliased arguments, so we use a copy. We - // only copy the current cache block, rather than all of Q; - // this saves memory. - Q_temp.reshape (Q_cur.extent (0), numCols); - deep_copy (Q_temp, Q_cur); - - // Q_cur := Q_temp * B. - blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.extent(0), numCols, numCols, - Scalar (1.0), - Q_temp.data(), Q_temp.stride(1), B_.data(), B_.stride(1), - Scalar(0), Q_cur.data(), Q_cur.stride(1)); - } - - /// \brief Multiply (in place) each cache block in the range by B_. - /// - /// \param cbRange [in/out] Range of cache blocks. - void multRange (range_type& cbRange) const - { - typedef typename range_type::iterator iter_type; - iter_type iter = cbRange.begin(); - iter_type end = cbRange.end(); - - // Temporary storage for the BLAS' matrix-matrix multiply - // routine (which forbids aliasing of any input argument and - // the output argument). 
- Matrix Q_temp; - Impl::SystemBlas blas; - while (iter != end) { - mat_view_type Q_cur = *iter; - multBlock (blas, Q_cur, Q_temp); - ++iter; - } - } - - public: - /// \brief Constructor - /// - /// \param Q [in/out] Matrix to multiply in place by B. - /// \param B [in] \f$Q := Q * B\f$. - /// \param strategy [in] Cache-blocking strategy. - /// \param numPartitions [in] Number of partitions of the matrix - /// Q; maximum available parallelism. - /// \param contiguousCacheBlocks [in] Whether the cache blocks - /// of Q are stored contiguously. - MultFunctor (const mat_view_type Q, - const const_mat_view_type B, - const CacheBlockingStrategy& strategy, - const int numPartitions, - const bool contiguousCacheBlocks) : - Q_ (Q), - B_ (B), - strategy_ (strategy), - numPartitions_ (numPartitions), - contiguousCacheBlocks_ (contiguousCacheBlocks) - {} - - /// \brief Method called by Kokkos' parallel_for. - /// - /// \param partitionIndex [in] Zero-based index of the partition - /// of the matrix. We parallelize over partitions. - /// Partitions respect cache blocks. - void operator() (const int partitionIndex) const - { - if (partitionIndex < 0 || partitionIndex >= numPartitions_ || - Q_.empty ()) { - return; - } - else { - typedef std::pair index_range_type; - const index_range_type cbIndices = - cacheBlockIndexRange (Q_.extent (0), Q_.extent (1), partitionIndex, - numPartitions_, strategy_); - if (cbIndices.first >= cbIndices.second) { - return; - } - else { - range_type range (Q_, strategy_, cbIndices.first, - cbIndices.second, contiguousCacheBlocks_); - multRange (range); - } - } - } - }; - - /// \class FillFunctor - /// \brief Kokkos functor for \c KokkosNodeTsqr::fill_with_zeros(). - /// \author Mark Hoemmen - template - class FillFunctor { - private: - using mat_view_type = MatView; - using range_type = CacheBlockRange; - - mat_view_type A_; - CacheBlockingStrategy strategy_; - const Scalar value_; - int numPartitions_; - bool contiguousCacheBlocks_; - - //! 
Fill (in place) each cache block in the range with value. - void fillRange (range_type& cbRange, const Scalar value) const - { - typedef typename range_type::iterator iter_type; - iter_type iter = cbRange.begin(); - iter_type end = cbRange.end(); - while (iter != end) { - mat_view_type A_cur = *iter; - deep_copy (A_cur, value); - ++iter; - } - } - - public: - /// \brief Constructor - /// - /// \param A [in/out] Matrix to fill with the value. - /// \param strategy [in] Cache-blocking strategy. - /// \param value [in] The value with which to fill A. - /// \param numPartitions [in] Number of partitions of - /// the matrix A; maximum available parallelism. - /// \param contiguousCacheBlocks [in] Whether the cache - /// blocks of A are stored contiguously. - FillFunctor (const mat_view_type A, - const CacheBlockingStrategy& strategy, - const Scalar value, - const int numPartitions, - const bool contiguousCacheBlocks) : - A_ (A), - strategy_ (strategy), - value_ (value), - numPartitions_ (numPartitions), - contiguousCacheBlocks_ (contiguousCacheBlocks) - {} - - /// \brief Method called by Kokkos' parallel_for. - /// - /// \param partitionIndex [in] Zero-based index of the partition - /// of the matrix. We parallelize over partitions. - /// Partitions respect cache blocks. - void operator() (const int partitionIndex) const - { - if (partitionIndex < 0 || partitionIndex >= numPartitions_ || - A_.empty ()) { - return; - } - else { - typedef std::pair index_range_type; - const index_range_type cbIndices = - cacheBlockIndexRange (A_.extent(0), A_.extent(1), partitionIndex, - numPartitions_, strategy_); - if (cbIndices.first >= cbIndices.second) { - return; - } - else { - range_type range (A_, strategy_, cbIndices.first, - cbIndices.second, contiguousCacheBlocks_); - fillRange (range, value_); - } - } - } - }; - } // namespace details - - /// \class KokkosNodeTsqrFactorOutput - /// \brief Part of KokkosNodeTsqr's implicit Q representation. 
- /// \author Mark Hoemmen - /// - /// The \c KokkoNodeTsqr::factor() method represents the Q factor of - /// the matrix A implicitly. Part of that representation is in the - /// A matrix on output, and the other part is returned as an object - /// of this type. The apply() and explicit_Q() methods need both - /// parts of the implicit Q representation in order to do their - /// work. - template - struct KokkosNodeTsqrFactorOutput { - typedef MatView mat_view_type; - - /// \brief Constructor - /// - /// \param theNumCacheBlocks [in] Total number of cache blocks - /// (over all partitions). - /// \param theNumPartitions [in] Number of partitions. This is - /// an int because partition indices are ints, and the latter - /// are ints because they end up as range arguments to Kokkos' - /// parallel_for. - KokkosNodeTsqrFactorOutput (const size_t theNumCacheBlocks, - const int theNumPartitions) : - firstPassTauArrays (theNumCacheBlocks) - { - // Protect the cast to size_t from a negative number of - // partitions. - TEUCHOS_TEST_FOR_EXCEPTION(theNumPartitions < 1, std::invalid_argument, - "TSQR::KokkosNodeTsqrFactorOutput: Invalid number of " - "partitions " << theNumPartitions << "; number of " - "partitions must be a positive integer."); - // If there's only one partition, we don't even need a second - // pass (it's just sequential TSQR), and we don't need a TAU - // array for the top partition. - secondPassTauArrays.resize (size_t (theNumPartitions-1)); - topBlocks.resize (size_t (theNumPartitions)); - } - - //! Total number of cache blocks in the matrix (over all partitions). - int numCacheBlocks() const { return firstPassTauArrays.size(); } - - //! Number of partitions of the matrix; max available parallelism. - int numPartitions() const { return topBlocks.size(); } - - //! TAU arrays from the first pass; one per cache block. - std::vector> firstPassTauArrays; - - /// \brief TAU arrays from the second pass. 
- /// - /// There is one TAU array per partition, except for the topmost - /// partition. - /// - /// For now, KokkosNodeTsqr::factor() uses only two passes over - /// the matrix. firstPassTauArrays contains the result of the - /// pass over cache blocks, and secondPassTauArrays contains the - /// result of combining the upper triangular R factors from the - /// first pass. Later, we may add more passes, in which case we - /// will likely combine firstPassTauArrays and secondPassTauArrays - /// into a single std::vector (variable number of passes) or - /// Teuchos::Tuple (fixed number of passes). - std::vector> secondPassTauArrays; - - /// \brief Views of the topmost cache blocks in each partition. - /// - /// One entry for each partition. - std::vector topBlocks; - }; - - /// \class KokkosNodeTsqr - /// \brief Intranode (within an MPI process) TSQR parallelized using - /// Kokkos::DefaultHostExecutionSpace. - /// \author Mark Hoemmen - /// - /// \tparam LocalOrdinal The type of indices in the (node-local) - /// matrix. - /// - /// \tparam Scalar The type of entries in the (node-local) matrix. - /// - /// This implementation of the intranode part of TSQR factors the - /// matrix in two passes. The first pass parallelizes over - /// partitions, doing Sequential TSQR over each partition. The - /// second pass combines the R factors from the partitions, and is - /// not currently parallel. Thus, the overall algorithm is similar - /// to that of TbbTsqr, except that: - ///
    - ///
  • TbbTsqr partitions differently; KokkosNodeTsqr's partitions - /// use the same layout of cache blocks as SequentialTsqr, - /// whereas TbbTsqr uses a different layout.
  • - ///
  • TbbTsqr reduces the R factors in parallel; it only needs - /// one "pass."
  • - ///
- template - class KokkosNodeTsqr : - public NodeTsqr>, - public Teuchos::ParameterListAcceptorDefaultBase - { - public: - typedef LocalOrdinal local_ordinal_type; - typedef Scalar scalar_type; - - using const_mat_view_type = MatView; - using mat_view_type = MatView; - - /// \typedef FactorOutput - /// \brief Part of the implicit Q representation returned by factor(). - typedef typename NodeTsqr >::factor_output_type FactorOutput; - - /// \brief Constructor (with user-specified parameters). - /// - /// \param params [in/out] List of parameters. Missing parameters - /// will be filled in with default values. - KokkosNodeTsqr (const Teuchos::RCP& params = Teuchos::null) - { - setParameterList (params); - } - - /// \brief Whether this object is ready to perform computations. - bool ready() const { - return true; - } - - /// \brief One-line description of this object. - /// - /// This implements Teuchos::Describable::description(). - std::string description () const { - using Teuchos::TypeNameTraits; - std::ostringstream os; - os << "KokkosNodeTsqr::name() - << ", Scalar=" - << TypeNameTraits::name() - << ">: \"Cache Size Hint\"=" << strategy_.cache_size_hint() - << ", \"Size of Scalar\"=" << strategy_.size_of_scalar() - << ", \"Num Tasks\"=" << numPartitions_; - return os.str(); - } - - /// \brief Validate and read in parameters. - /// - /// \param paramList [in/out] On input: non-null parameter list - /// containing zero or more of the parameters in \c - /// getValidParameters(). On output: missing parameters (i.e., - /// parameters in \c getValidParameters() but not in the input - /// list) are filled in with default values. 
- void - setParameterList (const Teuchos::RCP& paramList) - { - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::RCP; - using Teuchos::rcp; - - RCP plist; - if (paramList.is_null()) { - plist = rcp (new ParameterList (*getValidParameters ())); - } - else { - plist = paramList; - plist->validateParametersAndSetDefaults (*getValidParameters ()); - } - // Get values of parameters. We do this "transactionally" so - // that (except for validation and filling in defaults above) - // this method has the strong exception guarantee (it either - // returns, or throws an exception with no externally visible - // side effects). - size_t cacheSizeHint, sizeOfScalar; - int numPartitions; - try { - cacheSizeHint = plist->get ("Cache Size Hint"); - sizeOfScalar = plist->get ("Size of Scalar"); - numPartitions = plist->get ("Num Tasks"); - } - catch (Teuchos::Exceptions::InvalidParameter& e) { - std::ostringstream os; - os << "Failed to read default parameters after setting defaults. Pleas" - "e report this bug to the Kokkos developers. Original exception mess" - "age: " << e.what(); - TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str()); - } - numPartitions_ = numPartitions; - - // Recreate the cache blocking strategy. - typedef CacheBlockingStrategy strategy_type; - strategy_ = strategy_type (cacheSizeHint, sizeOfScalar); - - // Save the input parameter list. - setMyParamList (plist); - } - - /// \brief Default valid parameter list. - /// - /// The returned list contains all parameters accepted by \c - /// KokkosNodeTsqr, with their default values and documentation. - Teuchos::RCP - getValidParameters() const - { - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::RCP; - - if (defaultParams_.is_null()) { - RCP params = parameterList ("Intranode TSQR"); - params->set ("Cache Size Hint", - static_cast(0), - std::string("Cache size in bytes; a hint for TSQR. 
Set to t" - "he size of the largest private cache per CPU co" - "re, or the fraction of shared cache per core. " - "If zero, we pick a reasonable default.")); - params->set ("Size of Scalar", - sizeof(Scalar), - std::string ("Size in bytes of the Scalar type. In most " - "cases, the default sizeof(Scalar) is fine. " - "Set a non-default value only when Scalar's " - "data is dynamically allocated (such as for a " - "type with precision variable at run time).")); - - // The number of partitions is an int rather than a - // LocalOrdinal, to ensure that it is always stored with the - // same type, despite the type of LocalOrdinal. Besides, Kokkos - // wants an int anyway. - params->set ("Num Tasks", - defaultNumPartitions (), - std::string ("Number of partitions; the maximum available pa" - "rallelelism in intranode TSQR. Slight oversub" - "scription is OK; undersubscription may have a " - "performance cost.")); - defaultParams_ = params; - } - return defaultParams_; - } - - FactorOutput - factor (const LocalOrdinal numRows, - const LocalOrdinal numCols, - Scalar A[], - const LocalOrdinal lda, - Scalar R[], - const LocalOrdinal ldr, - const bool contiguousCacheBlocks) const - { - mat_view_type A_view (numRows, numCols, A, lda); - mat_view_type R_view (numCols, numCols, R, ldr); - return factorImpl (A_view, R_view, contiguousCacheBlocks); - } - - void - apply (const ApplyType& applyType, - const LocalOrdinal nrows, - const LocalOrdinal ncols_Q, - const Scalar Q[], - const LocalOrdinal ldq, - const FactorOutput& factorOutput, - const LocalOrdinal ncols_C, - Scalar C[], - const LocalOrdinal ldc, - const bool contiguousCacheBlocks) const - { - const_mat_view_type Q_view (nrows, ncols_Q, Q, ldq); - mat_view_type C_view (nrows, ncols_C, C, ldc); - applyImpl (applyType, Q_view, factorOutput, C_view, - false, contiguousCacheBlocks); - } - - void - explicit_Q (const LocalOrdinal nrows, - const LocalOrdinal ncols_Q, - const Scalar Q[], - const LocalOrdinal ldq, - const 
FactorOutput& factorOutput, - const LocalOrdinal ncols_C, - Scalar C[], - const LocalOrdinal ldc, - const bool contiguousCacheBlocks) const - { - const_mat_view_type Q_view (nrows, ncols_Q, Q, ldq); - mat_view_type C_view (nrows, ncols_C, C, ldc); - applyImpl (ApplyType::NoTranspose, Q_view, factorOutput, - C_view, true, contiguousCacheBlocks); - } - - bool QR_produces_R_factor_with_nonnegative_diagonal () const { - return combine_.QR_produces_R_factor_with_nonnegative_diagonal (); - } - - size_t cache_size_hint() const { - return strategy_.cache_size_hint(); - } - - void - fill_with_zeros (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A[], - const LocalOrdinal lda, - const bool contiguousCacheBlocks) const - { - mat_view_type A_view (nrows, ncols, A, lda); - - using functor_type = details::FillFunctor; - const Scalar ZERO {}; - functor_type functor (A_view, strategy_, ZERO, numPartitions_, - contiguousCacheBlocks); - using execution_space = Kokkos::DefaultHostExecutionSpace; - Kokkos::RangePolicy> - range (0, numPartitions_); - Kokkos::parallel_for ("KokkosNodeTsqr::fill_with_zeros", range, functor); - } - - void - cache_block (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A_out[], - const Scalar A_in[], - const LocalOrdinal lda_in) const - { - const_mat_view_type A_in_view (nrows, ncols, A_in, lda_in); - - // The leading dimension of A_out doesn't matter here, since its - // cache blocks are to be stored contiguously. We set it - // arbitrarily to a sensible value. 
- mat_view_type A_out_view (nrows, ncols, A_out, nrows); - - using functor_type = details::CacheBlockFunctor; - functor_type functor (A_in_view, A_out_view, strategy_, - numPartitions_, false); - using execution_space = Kokkos::DefaultHostExecutionSpace; - Kokkos::RangePolicy> - range (0, numPartitions_); - Kokkos::parallel_for ("KokkosNodeTsqr::cache_block", range, functor); - } - - void - un_cache_block (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A_out[], - const LocalOrdinal lda_out, - const Scalar A_in[]) const - { - // The leading dimension of A_in doesn't matter here, since its - // cache blocks are contiguously stored. We set it arbitrarily - // to a sensible value. - const_mat_view_type A_in_view (nrows, ncols, A_in, nrows); - mat_view_type A_out_view (nrows, ncols, A_out, lda_out); - - using functor_type = details::CacheBlockFunctor; - functor_type functor (A_in_view, A_out_view, strategy_, - numPartitions_, true); - using execution_space = Kokkos::DefaultHostExecutionSpace; - Kokkos::RangePolicy> - range (0, numPartitions_); - Kokkos::parallel_for ("KokkosNodeTsqr::un_cache_block", range, functor); - } - - void - Q_times_B (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar Q[], - const LocalOrdinal ldq, - const Scalar B[], - const LocalOrdinal ldb, - const bool contiguousCacheBlocks) const - { - mat_view_type Q_view (nrows, ncols, Q, ldq); - const_mat_view_type B_view (ncols, ncols, B, ldb); - - using functor_type = details::MultFunctor; - functor_type functor (Q_view, B_view, strategy_, numPartitions_, - contiguousCacheBlocks); - using execution_space = Kokkos::DefaultHostExecutionSpace; - Kokkos::RangePolicy> - range (0, numPartitions_); - Kokkos::parallel_for ("KokkosNodeTsqr::Q_times_B", range, functor); - } - - private: - //! Implementation of fundamental TSQR kernels. - Combine combine_; - - //! Workspace for Combine operations. - mutable std::vector work_; - - //! Cache blocking strategy. 
- CacheBlockingStrategy strategy_; - - /// \brief Number of partitions; max available parallelism. - /// - /// The number of partitions is an int rather than a LocalOrdinal, - /// to ensure that it is always stored in the ParameterList with - /// the same type, despite the type of LocalOrdinal. Besides, - /// Kokkos wants an int anyway. - int numPartitions_; - - //! Default parameter list (set by \c getValidParameters()). - mutable Teuchos::RCP defaultParams_; - - //! Default number of partitions. - int - defaultNumPartitions () const - { - return Kokkos::DefaultHostExecutionSpace::concurrency (); - } - - FactorOutput - factorImpl (mat_view_type A, - mat_view_type R, - const bool contiguousCacheBlocks) const - { - const char prefix[] = "KokkosNodeTsqr::factorImpl: "; - const char suffix[] = " Please report this bug to the Tpetra developers."; - using LO = LocalOrdinal; - using execution_space = Kokkos::DefaultHostExecutionSpace; - Kokkos::RangePolicy> - range (0, numPartitions_); - - if (A.empty ()) { - TEUCHOS_TEST_FOR_EXCEPTION - (! R.empty (), std::logic_error, prefix << "A is empty, " - "but R is not." << suffix); - return FactorOutput (0, 0); - } - const LO numRowsPerCacheBlock = - strategy_.cache_block_num_rows (A.extent(1)); - const LO numCacheBlocks = - strategy_.num_cache_blocks (A.extent(0), A.extent(1), numRowsPerCacheBlock); - // - // Compute the first factorization pass (over partitions). - // - FactorOutput result (numCacheBlocks, numPartitions_); - using first_pass_type = details::FactorFirstPass; - first_pass_type firstPass (A, result.firstPassTauArrays, - result.topBlocks, strategy_, - numPartitions_, contiguousCacheBlocks); - Kokkos::parallel_for ("KokkosNodeTsqr::factorImpl::firstPass", - range, firstPass); - - // Each partition collected a view of its top block, where that - // partition's R factor is stored. The second pass reduces - // those R factors. We do this on one thread to avoid the - // overhead of parallelizing it. 
If the typical use case is - // oversubscription, you should parallelize this step with - // multiple passes. Note that we can't use parallel_reduce, - // because the tree topology matters. - factorSecondPass (result.topBlocks, result.secondPassTauArrays, - numPartitions_); - - // The "topmost top block" contains the resulting R factor. - const mat_view_type& R_top = result.topBlocks[0]; - TEUCHOS_TEST_FOR_EXCEPTION - (R_top.empty (), std::logic_error, prefix << "After " - "factorSecondPass: result.topBlocks[0] is an empty view." - << suffix); - mat_view_type R_top_square (R_top.extent(1), R_top.extent(1), - R_top.data(), R_top.stride(1)); - deep_copy (R, Scalar {}); - // Only copy the upper triangle of R_top into R. - copy_upper_triangle (R.extent(1), R.extent(1), R.data(), R.stride(1), - R_top.data(), R_top.stride(1)); - return result; - } - - void - applyImpl (const ApplyType& applyType, - const const_mat_view_type& Q, - const FactorOutput& factorOutput, - const mat_view_type& C, - const bool explicitQ, - const bool contiguousCacheBlocks) const - { - const char prefix[] = "KokkosNodeTsqr::applyImpl: "; - const char suffix[] = " Please report this bug to the Tpetra developers."; - using LO = LocalOrdinal; - using details::cacheBlockIndexRange; - using first_pass_type = details::ApplyFirstPass; - using execution_space = Kokkos::DefaultHostExecutionSpace; - - TEUCHOS_TEST_FOR_EXCEPTION - (numPartitions_ != factorOutput.numPartitions(), - std::invalid_argument, prefix << "KokkosNodeTsqr's number " - "of partitions " << numPartitions_ << " does not match the " - "given factorOutput's number of partitions " - << factorOutput.numPartitions() << ". This likely means " - "that the given factorOutput object comes from a different " - "instance of KokkosNodeTsqr." 
<< suffix); - const int numParts = numPartitions_; - first_pass_type firstPass (applyType, Q, - factorOutput.firstPassTauArrays, - factorOutput.topBlocks, C, strategy_, - numParts, explicitQ, - contiguousCacheBlocks); - // Get a view of each partition's top block of the C matrix. - std::vector topBlocksOfC (numParts); - { - using index_range_type = std::pair; - using blocker_type = CacheBlocker; - blocker_type C_blocker (C.extent(0), C.extent(1), strategy_); - - // For each partition, collect its top block of C. - for (int partIdx = 0; partIdx < numParts; ++partIdx) { - const index_range_type cbIndices = - cacheBlockIndexRange (C.extent(0), C.extent(1), partIdx, - numParts, strategy_); - if (cbIndices.first >= cbIndices.second) { - topBlocksOfC[partIdx] = mat_view_type (0, 0, nullptr, 0); - } else { - topBlocksOfC[partIdx] = - C_blocker.get_cache_block (C, cbIndices.first, - contiguousCacheBlocks); - } - } - } - - Kokkos::RangePolicy> - range(0, numPartitions_); - if (applyType.transposed ()) { - Kokkos::parallel_for ("KokkosNodeTsqr::applyImpl::firstPass", - range, firstPass); - applySecondPass (applyType, factorOutput, topBlocksOfC, - strategy_, explicitQ); - } - else { - applySecondPass (applyType, factorOutput, topBlocksOfC, - strategy_, explicitQ); - Kokkos::parallel_for ("KokkosNodeTsqr::applyImpl::firstPass", - range, firstPass); - } - } - - std::vector - factorPair (const mat_view_type& R_top, - const mat_view_type& R_bot) const - { - TEUCHOS_TEST_FOR_EXCEPTION - (R_top.empty (), std::logic_error, "R_top is empty!"); - TEUCHOS_TEST_FOR_EXCEPTION - (R_bot.empty(), std::logic_error, "R_bot is empty!"); - TEUCHOS_TEST_FOR_EXCEPTION - (work_.size() == 0, std::logic_error, - "Workspace array work_ has length zero."); - TEUCHOS_TEST_FOR_EXCEPTION - (work_.size() < size_t (R_top.extent(1)), std::logic_error, - "Workspace array work_ has length = " << work_.size() - << " < R_top.extent(1) = " << R_top.extent(1) << "."); - - std::vector tau (R_top.extent (1)); - - 
// Our convention for such helper methods is for the immediate - // parent to allocate workspace (the work_ array in this case). - // - // The statement below only works if R_top and R_bot have a - // nonzero (and the same) number of columns, but we have already - // checked that above. - combine_.factor_pair (R_top, R_bot, tau.data(), work_.data()); - return tau; - } - - void - factorSecondPass (std::vector& topBlocks, - std::vector >& tauArrays, - const int numPartitions) const - { - const char prefix[] = "KokkosNodeTsqr::factorSecondPass: "; - const char suffix[] = " Please report this bug to the Tpetra developers."; - - if (numPartitions <= 1) - return; // Done! - TEUCHOS_TEST_FOR_EXCEPTION - (topBlocks.size () < size_t (numPartitions), std::logic_error, - prefix << "topBlocks.size() (= " << topBlocks.size() << ") " - "< numPartitions (= " << numPartitions << ")." << suffix); - TEUCHOS_TEST_FOR_EXCEPTION - (tauArrays.size () < size_t (numPartitions-1), - std::logic_error, prefix << "topBlocks.size() (= " - << topBlocks.size() << ") < numPartitions-1 (= " - << (numPartitions-1) << ")." << suffix); - // The top partition (partition index zero) should always be - // nonempty if we get this far, so its top block should also be - // nonempty. - TEUCHOS_TEST_FOR_EXCEPTION - (topBlocks[0].empty(), std::logic_error, - prefix << "topBlocks[0] is empty." << suffix); - // However, other partitions besides the top one might be empty, - // in which case their top blocks will be empty. We skip over - // the empty partitions in the loop below. - work_.resize (size_t (topBlocks[0].extent(1))); - for (int partIdx = 1; partIdx < numPartitions; ++partIdx) { - if (! 
topBlocks[partIdx].empty ()) { - tauArrays[partIdx-1] = factorPair (topBlocks[0], topBlocks[partIdx]); - } - } - } - - void - applyPair (const ApplyType& applyType, - const mat_view_type& R_bot, - const std::vector& tau, - const mat_view_type& C_top, - const mat_view_type& C_bot) const - { - // Our convention for such helper methods is for the immediate - // parent to allocate workspace (the work_ array in this case). - // - // The statement below only works if C_top, R_bot, and C_bot - // have a nonzero (and the same) number of columns, but we have - // already checked that above. - combine_.apply_pair (applyType, C_top.extent(1), R_bot.extent(1), - R_bot.data(), R_bot.stride(1), tau.data(), - C_top.data(), C_top.stride(1), - C_bot.data(), C_bot.stride(1), work_.data()); - } - - void - applySecondPass (const ApplyType& applyType, - const FactorOutput& factorOutput, - std::vector& topBlocksOfC, - const CacheBlockingStrategy& strategy, - const bool explicitQ) const - { - const char prefix[] = "KokkosNodeTsqr::applySecondPass: "; - const char suffix[] = " Please report this bug to the Tpetra developers."; - - const int numParts = factorOutput.numPartitions(); - if (numParts <= 1) - return; // Done! - TEUCHOS_TEST_FOR_EXCEPTION - (topBlocksOfC.size () != size_t (numParts), std::logic_error, - prefix << "topBlocksOfC.size() (= " << topBlocksOfC.size() - << ") != number of partitions (= " << numParts << ")." - << suffix); - TEUCHOS_TEST_FOR_EXCEPTION - (factorOutput.secondPassTauArrays.size () != size_t (numParts-1), - std::logic_error, prefix << - "factorOutput.secondPassTauArrays.size() (= " - << factorOutput.secondPassTauArrays.size() - << ") != number of partitions minus 1 (= " - << (numParts-1) << ")." << suffix); - const LocalOrdinal numCols = topBlocksOfC[0].extent(1); - work_.resize (size_t (numCols)); - - // Top blocks of C are the whole cache blocks. We only want to - // affect the top ncols x ncols part of each of those blocks in - // this method. 
- mat_view_type C_top_square (numCols, numCols, topBlocksOfC[0].data(), - topBlocksOfC[0].stride(1)); - if (applyType.transposed ()) { - // Don't include the topmost (index 0) partition in the - // iteration; that corresponds to C_top_square. - for (int partIdx = 1; partIdx < numParts; ++partIdx) { - // It's legitimate for some partitions not to have any - // cache blocks. In that case, their top block will be - // empty, and we can skip over them. - const mat_view_type& C_cur = topBlocksOfC[partIdx]; - if (! C_cur.empty()) { - mat_view_type C_cur_square (numCols, numCols, C_cur.data (), - C_cur.stride (1)); - // If explicitQ: We've already done the first pass and - // filled the top blocks of C. - applyPair (applyType, factorOutput.topBlocks[partIdx], - factorOutput.secondPassTauArrays[partIdx-1], - C_top_square, C_cur_square); - } - } - } else { - // In non-transposed mode, when computing the first - // C.extent(1) columns of the explicit Q factor, intranode - // TSQR would run after internode TSQR (i.e., DistTsqr) - // (even if only running on a single node in non-MPI mode). - // Therefore, internode TSQR is responsible for filling the - // top block of this node's part of the C matrix. - // - // Don't include the topmost partition in the iteration; - // that corresponds to C_top_square. - for (int partIdx = numParts - 1; partIdx > 0; --partIdx) { - // It's legitimate for some partitions not to have any - // cache blocks. In that case, their top block will be - // empty, and we can skip over them. - const mat_view_type& C_cur = topBlocksOfC[partIdx]; - if (! C_cur.empty()) { - mat_view_type C_cur_square (numCols, numCols, - C_cur.data (), - C_cur.stride (1)); - // The "first" pass (actually the last, only named - // "first" by analogy with factorFirstPass()) will - // fill the rest of these top blocks. For now, we - // just fill the top n x n part of the top blocks - // with zeros. 
- if (explicitQ) { - deep_copy (C_cur_square, Scalar {}); - } - applyPair (applyType, factorOutput.topBlocks[partIdx], - factorOutput.secondPassTauArrays[partIdx-1], - C_top_square, C_cur_square); - } - } - } - } - - protected: - - /// \brief Return the topmost cache block of the matrix C. - /// - /// NodeTsqr's top_block() method must be implemented using its - /// subclasses' const_top_block() method. This is because - /// top_block() is a template method, and template methods cannot - /// be virtual. - /// - /// \param C [in] View of a matrix, with at least as many rows as - /// columns. - /// \param contiguous_cache_blocks [in] Whether the cache blocks - /// of C are stored contiguously. - /// - /// \return View of the topmost cache block of the matrix C. - const_mat_view_type - const_top_block (const const_mat_view_type& C, - const bool contiguous_cache_blocks) const - { - typedef CacheBlocker blocker_type; - blocker_type blocker (C.extent(0), C.extent(1), strategy_); - - // C_top_block is a view of the topmost cache block of C. - // C_top_block should have >= ncols rows, otherwise either cache - // blocking is broken or the input matrix C itself had fewer - // rows than columns. - const_mat_view_type C_top = blocker.top_block (C, contiguous_cache_blocks); - return C_top; - } - }; -} // namespace TSQR - -#endif // __TSQR_KokkosNodeTsqr_hpp diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp deleted file mode 100644 index ab3f0411d22d..000000000000 --- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp +++ /dev/null @@ -1,511 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_Test_KokkosNodeTsqrTest_hpp -#define __TSQR_Test_KokkosNodeTsqrTest_hpp - -#include "Tsqr_nodeTestProblem.hpp" -#include "Tsqr_verifyTimerConcept.hpp" -#include "Tsqr_Random_NormalGenerator.hpp" -#include "Tsqr_LocalVerify.hpp" -#include "Tsqr_Matrix.hpp" -#include "Tsqr_KokkosNodeTsqr.hpp" -#include "Teuchos_ScalarTraits.hpp" -#include "Teuchos_Time.hpp" -#include "Teuchos_TypeNameTraits.hpp" -#include -#include -#include -#include - -namespace TSQR { - namespace Test { - /// \fn verifyKokkosNodeTsqr - /// \brief Test accuracy of KokkosNodeTsqr's QR factorization. - /// - /// Test the accuracy of KokkosNodeTsqr's QR factorization on a - /// numRows by numCols matrix, and print results to stdout. - /// - /// \param gen [in/out] Pseudorandom number generator for the - /// normal(0,1) distribution. - /// \param numRows [in] Number of rows in the test matrix. - /// \param numCols [in] Number of columns in the test matrix. - /// \param numPartitions [in] Number of parallel partitions (must - /// be a positive integer). - /// \param cacheSizeHint [in] Cache size hint, in bytes. Zero - /// means pick a reasonable default. - /// \param contiguousCacheBlocks [in] Whether cache blocks in the - /// matrix to factor should be stored contiguously. - /// \param printFieldNames [in] If humanReadable is true, this is - /// ignored; otherwise, whether to print a line of field names - /// before the line of output. - /// \param humanReadable [in] Whether to print output that is easy - /// for humans to read, or instead to print output that is easy - /// for a script to parse. - /// \param debug [in] Whether to print extra debugging output to - /// stderr. 
- template - void - verifyKokkosNodeTsqr (TSQR::Random::NormalGenerator& gen, - const Ordinal numRows, - const Ordinal numCols, - const int numPartitions, - const size_t cacheSizeHint, - const bool contiguousCacheBlocks, - const bool printFieldNames, - const bool humanReadable, - const bool debug) - { - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::RCP; - using Teuchos::TypeNameTraits; - using std::cerr; - using std::cout; - using std::endl; - using node_tsqr_type = TSQR::KokkosNodeTsqr; - typedef typename node_tsqr_type::FactorOutput factor_output_type; - typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; - // typedef Teuchos::Time timer_type; - typedef Matrix matrix_type; - typedef MatView mat_view_type; - - const std::string scalarTypeName = TypeNameTraits::name(); - - // Set up TSQR implementation. - RCP params = parameterList ("Intranode TSQR"); - params->set ("Cache Size Hint", cacheSizeHint); - params->set ("Num Tasks", numPartitions); - node_tsqr_type actor (params); - if (debug) { - cerr << actor.description() << endl; - if (contiguousCacheBlocks) { - cerr << "-- Test with contiguous cache blocks" << endl; - } - } - - // Allocate space for test problem. 
- matrix_type A (numRows, numCols); - matrix_type A_copy (numRows, numCols); - matrix_type Q (numRows, numCols); - matrix_type R (numCols, numCols); - if (std::numeric_limits::has_quiet_NaN) { - deep_copy (A, std::numeric_limits::quiet_NaN()); - deep_copy (A_copy, std::numeric_limits::quiet_NaN()); - deep_copy (Q, std::numeric_limits::quiet_NaN()); - deep_copy (R, std::numeric_limits::quiet_NaN()); - } - else { - deep_copy (A, Scalar {}); - deep_copy (A_copy, Scalar {}); - deep_copy (Q, Scalar {}); - deep_copy (R, Scalar {}); - } - const Ordinal lda = numRows; - const Ordinal ldq = numRows; - const Ordinal ldr = numCols; - - // Create a test problem - nodeTestProblem (gen, numRows, numCols, A.data(), A.stride(1), true); - - if (debug) { - cerr << "-- Generated test problem" << endl; - // Don't print the matrix if it's too big. - if (A.extent(0) <= 30) { - cerr << "A = " << endl; - print_local_matrix (cerr, A.extent(0), A.extent(1), - A.data(), A.stride(1)); - cerr << endl << endl; - } - } - - // Copy A into A_copy, since TSQR overwrites the input. If - // specified, rearrange the data in A_copy so that the data in - // each cache block is contiguously stored. - if (! contiguousCacheBlocks) { - deep_copy (A_copy, A); - if (debug) { - cerr << "-- Copied test problem from A into A_copy" << endl; - // Don't print the matrix if it's too big. - if (A_copy.extent(0) <= 30) { - cerr << "A_copy = " << endl; - print_local_matrix (cerr, A_copy.extent(0), A_copy.extent(1), - A_copy.data(), A_copy.stride(1)); - cerr << endl << endl; - } - } - } - else { - actor.cache_block (numRows, numCols, A_copy.data(), A.data(), A.stride(1)); - if (debug) { - cerr << "-- Reorganized test matrix to have contiguous " - "cache blocks" << endl; - // Don't print the matrix if it's too big. 
- if (A_copy.extent(0) <= 30) { - cerr << "A_copy = " << endl; - print_local_matrix (cerr, A_copy.extent(0), A_copy.extent(1), - A_copy.data(), A_copy.stride(1)); - cerr << endl << endl; - } - } - - // Verify cache blocking, when in debug mode. - if (debug) { - matrix_type A2 (numRows, numCols); - if (std::numeric_limits::has_quiet_NaN) { - deep_copy (A2, std::numeric_limits::quiet_NaN()); - } - - actor.un_cache_block (numRows, numCols, A2.data(), A2.stride(1), A_copy.data()); - if (matrix_equal (A, A2)) { - if (debug) - cerr << "-- Cache blocking test succeeded!" << endl; - } - else { - if (debug) { - cerr << "*** Cache blocking test failed! A != A2 ***" - << endl << endl; - // Don't print the matrices if they are too big. - if (A.extent(0) <= 30 && A2.extent(0) <= 30) { - cerr << "A = " << endl; - print_local_matrix (cerr, A.extent(0), A.extent(1), - A.data(), A.stride(1)); - cerr << endl << "A2 = " << endl; - print_local_matrix (cerr, A2.extent(0), A2.extent(1), - A2.data(), A2.stride(1)); - cerr << endl; - } - } - throw std::logic_error ("Cache blocking failed"); - } - } - } - - // Fill R with zeros, since the factorization may not - // necessarily overwrite the strict lower triangle of R. - if (debug) { - cerr << "-- Filling R with zeros" << endl; - } - deep_copy (R, Scalar {}); - - if (debug) { - cerr << "-- Calling factor()" << endl; - } - - // Factor the matrix and compute the explicit Q factor - factor_output_type factor_output = - actor.factor (numRows, numCols, A_copy.data(), A_copy.stride(1), - R.data(), R.stride(1), contiguousCacheBlocks); - if (debug) { - cerr << "-- Finished factor()" << endl; - cerr << "-- Calling explicit_Q()" << endl; - } - - // KokkosNodeTsqr isn't designed to be used by itself, so we - // have to help it along by filling the top ncols x ncols - // entries with the first ncols columns of the identity matrix. 
- { - mat_view_type Q_top = - actor.top_block (Q.view (), contiguousCacheBlocks); - mat_view_type Q_top_square (Q_top.extent(1), Q_top.extent(1), - Q_top.data(), Q_top.stride(1)); - deep_copy (Q_top_square, Scalar {}); - for (Ordinal j = 0; j < Q_top_square.extent(1); ++j) { - Q_top_square(j,j) = Scalar (1.0); - } - } - actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.stride(1), - factor_output, numCols, Q.data(), Q.stride(1), - contiguousCacheBlocks); - if (debug) { - cerr << "-- Finished explicit_Q()" << endl; - } - - // "Un"-cache-block the output Q (the explicit Q factor), if - // contiguous cache blocks were used. This is only necessary - // because local_verify() doesn't currently support contiguous - // cache blocks. - if (contiguousCacheBlocks) { - // Use A_copy as temporary storage for un-cache-blocking Q. - actor.un_cache_block (numRows, numCols, A_copy.data(), - A_copy.stride(1), Q.data()); - deep_copy (Q, A_copy); - if (debug) { - cerr << "-- Un-cache-blocked output Q factor" << endl; - } - } - - // Print out the Q and R factors in debug mode. - if (debug) { - // Don't print the matrix if it's too big. 
- if (Q.extent(0) <= 30) { - cerr << endl << "-- Q factor:" << endl; - print_local_matrix (cerr, Q.extent(0), Q.extent(1), - Q.data(), Q.stride(1)); - cerr << endl << endl; - } - cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, numCols, numCols, R.data(), R.stride(1)); - cerr << endl; - } - - // Validate the factorization - std::vector results = - local_verify (numRows, numCols, A.data(), lda, - Q.data(), ldq, R.data(), ldr); - if (debug) - cerr << "-- Finished local_verify" << endl; - - // Print the results - if (humanReadable) { - cout << "KokkosNodeTsqr:" << endl - << "Scalar type: " << scalarTypeName << endl - << "# rows: " << numRows << endl - << "# columns: " << numCols << endl - << "# partitions: " << numPartitions << endl - << "cache size hint (revised) in bytes: " << actor.cache_size_hint() << endl - << "contiguous cache blocks? " << contiguousCacheBlocks << endl - << "Absolute residual $\\|A - Q*R\\|_2$: " - << results[0] << endl - << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: " - << results[1] << endl - << "Test matrix norm $\\| A \\|_F$: " - << results[2] << endl - << endl; - } - else { - if (printFieldNames) { - const char prefix[] = "%"; - cout << prefix - << "method" - << ",scalarType" - << ",numRows" - << ",numCols" - << ",numPartitions" - << ",cacheSizeHint" - << ",contiguousCacheBlocks" - << ",absFrobResid" - << ",absFrobOrthog" - << ",frobA" - << endl; - } - cout << "KokkosNodeTsqr" - << "," << scalarTypeName - << "," << numRows - << "," << numCols - << "," << numPartitions - << "," << actor.cache_size_hint() - << "," << contiguousCacheBlocks - << "," << results[0] - << "," << results[1] - << "," << results[2] - << endl; - } - } - - /// \fn benchmarkKokkosNodeTsqr - /// \brief Test performance of KokkosNodeTsqr's QR factorization. - /// - /// Compare the performance of KokkosNodeTsqr's QR factorization - /// to that of LAPACK's QR factorization. Print results to - /// stdout. 
- /// - /// \param numTrials [in] Number of times to run the benchmark; - /// the timing result is cumulative over all trials. Timing - /// over larger numbers of trials improves certainty of the - /// result. - /// \param numRows [in] Number of rows in the test matrix. - /// \param numCols [in] Number of columns in the test matrix. - /// \param numPartitions [in] Number of parallel partitions (must - /// be a positive integer). - /// \param cacheSizeHint [in] Cache size hint, in bytes. Zero - /// means pick a reasonable default. - /// \param contiguousCacheBlocks [in] Whether cache blocks in the - /// matrix to factor should be stored contiguously. - /// \param printFieldNames [in] If humanReadable is true, this is - /// ignored; otherwise, whether to print a line of field names - /// before the line of output. - /// \param humanReadable [in] Whether to print output that is easy - /// for humans to read, or instead to print output that is easy - /// for a script to parse. - template - void - benchmarkKokkosNodeTsqr (const int numTrials, - const Ordinal numRows, - const Ordinal numCols, - const int numPartitions, - const size_t cacheSizeHint, - const bool contiguousCacheBlocks, - const bool printFieldNames, - const bool humanReadable) - { - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::RCP; - using Teuchos::TypeNameTraits; - using std::cerr; - using std::cout; - using std::endl; - using node_tsqr_type = TSQR::KokkosNodeTsqr; - typedef typename node_tsqr_type::FactorOutput factor_output_type; - typedef Teuchos::Time timer_type; - typedef Matrix matrix_type; - - const std::string scalarTypeName = TypeNameTraits::name(); - - // Pseudorandom normal(0,1) generator. Default seed is OK, - // because this is a benchmark, not an accuracy test. - TSQR::Random::NormalGenerator gen; - - // Set up TSQR implementation. 
- RCP params = parameterList ("Intranode TSQR"); - params->set ("Cache Size Hint", cacheSizeHint); - params->set ("Num Tasks", numPartitions); - node_tsqr_type actor (params); - - // Allocate space for test problem. - matrix_type A (numRows, numCols); - matrix_type A_copy (numRows, numCols); - matrix_type Q (numRows, numCols); - matrix_type R (numCols, numCols); - - // Fill R with zeros, since the factorization may not overwrite - // the strict lower triangle of R. - deep_copy (R, Scalar {}); - - // Create a test problem - nodeTestProblem (gen, numRows, numCols, A.data(), A.stride(1), false); - - // Copy A into A_copy, since TSQR overwrites the input. If - // specified, rearrange the data in A_copy so that the data in - // each cache block is contiguously stored. - if (contiguousCacheBlocks) { - actor.cache_block (numRows, numCols, A_copy.data(), A.data(), A.stride(1)); - } else { - deep_copy (A_copy, A); - } - - // Do a few timing runs and throw away the results, just to warm - // up any libraries that do autotuning. - const int numWarmupRuns = 5; - for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - // Factor the matrix in-place in A_copy, and extract the - // resulting R factor into R. - factor_output_type factor_output = - actor.factor (numRows, numCols, A_copy.data(), A_copy.stride(1), - R.data(), R.stride(1), contiguousCacheBlocks); - // Compute the explicit Q factor (which was stored - // implicitly in A_copy and factor_output) and store in Q. - // We don't need to un-cache-block the output, because we - // aren't verifying it here. - actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.stride(1), - factor_output, numCols, Q.data(), Q.stride(1), - contiguousCacheBlocks); - } - - // Benchmark intranode TSQR for numTrials trials. - // - // Name of timer doesn't matter here; we only need the timing. 
- timer_type timer("KokkosNodeTsqr"); - timer.start(); - for (int trialNum = 0; trialNum < numTrials; ++trialNum) { - // Factor the matrix in-place in A_copy, and extract the - // resulting R factor into R. - factor_output_type factor_output = - actor.factor (numRows, numCols, A_copy.data(), A_copy.stride(1), - R.data(), R.stride(1), contiguousCacheBlocks); - // Compute the explicit Q factor (which was stored - // implicitly in A_copy and factor_output) and store in Q. - // We don't need to un-cache-block the output, because we - // aren't verifying it here. - actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.stride(1), - factor_output, numCols, Q.data(), Q.stride(1), - contiguousCacheBlocks); - } - const double timing = timer.stop(); - - // Print the results - if (humanReadable) { - cout << "KokkosNodeTsqr cumulative timings:" << endl - << "Scalar type: " << scalarTypeName << endl - << "# rows = " << numRows << endl - << "# columns = " << numCols << endl - << "# partitions: " << numPartitions << endl - << "Cache size hint (in bytes) = " << actor.cache_size_hint() << endl - << "Contiguous cache blocks? " << contiguousCacheBlocks << endl - << "# trials = " << numTrials << endl - << "Total time (s) = " << timing << endl; - } - else { - if (printFieldNames) { - const char prefix[] = "%"; - cout << prefix - << "method" - << ",scalarType" - << ",numRows" - << ",numCols" - << ",numPartitions" - << ",cacheSizeHint" - << ",contiguousCacheBlocks" - << ",numTrials" - << ",timing" - << endl; - } - - // We don't include {min,max}_seq_apply_timing() here, because - // those times don't benefit from the accuracy of benchmarking - // for numTrials > 1. Thus, it's misleading to include them - // with tbb_tsqr_timing, the total time over numTrials trials. 
- cout << "KokkosNodeTsqr" - << "," << scalarTypeName - << "," << numRows - << "," << numCols - << "," << numPartitions - << "," << actor.cache_size_hint() - << "," << contiguousCacheBlocks - << "," << numTrials - << "," << timing - << endl; - } - } - } // namespace Test -} // namespace TSQR - -#endif // __TSQR_Test_KokkosNodeTsqrTest_hpp diff --git a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp index 2b3b8ddecd5d..46423863d970 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp @@ -37,15 +37,10 @@ // ************************************************************************ //@HEADER -#ifndef __TSQR_Tsqr_MatView_hpp -#define __TSQR_Tsqr_MatView_hpp +#ifndef TSQR_MATVIEW_HPP +#define TSQR_MATVIEW_HPP -// Define for bounds checking and other safety features, undefine for speed. -// #define TSQR_MATVIEW_DEBUG 1 - -#ifdef TSQR_MATVIEW_DEBUG -# include -#endif // TSQR_MATVIEW_DEBUG +#include "Teuchos_TestForException.hpp" #include #include #include @@ -91,50 +86,6 @@ namespace TSQR { return true; } -#ifdef TSQR_MATVIEW_DEBUG - template - class MatViewVerify { - public: - static void - verify (const Ordinal num_rows, - const Ordinal num_cols, - const Scalar* const A, - const Ordinal leading_dim) - { - using std::endl; - - bool good = true; - std::ostringstream os; - if (! std::numeric_limits::is_integer) { - good = false; - os << "Error: Ordinal type must be an integer."; - } - if (std::numeric_limits::is_signed) { - if (num_rows < 0) { - good = false; - os << "Error: num_rows (= " << num_rows << ") < 0."; - } - if (num_cols < 0) { - good = false; - os << "Error: num_cols (= " << num_cols << ") < 0."; - } - if (leading_dim < 0) { - good = false; - os << "Error: leading_dim (= " << leading_dim << ") < 0."; - } - } - if (leading_dim < num_rows) { - good = false; - os << "Error: leading_dim (= " << leading_dim << ") < num_rows (= " - << num_rows << ")."; - } - if (! 
good) { - throw std::invalid_argument (os.str ()); - } - } - }; -#endif // TSQR_MATVIEW_DEBUG - // Forward declaration template class Matrix; @@ -163,12 +114,7 @@ namespace TSQR { ncols_(num_cols), lda_(leading_dim), A_(A) - { -#ifdef TSQR_MATVIEW_DEBUG - MatViewVerify:: - verify (num_rows, num_cols, A, leading_dim); -#endif // TSQR_MATVIEW_DEBUG - } + {} MatView (const MatView& view) = default; MatView& operator= (const MatView& view) = default; @@ -197,155 +143,11 @@ namespace TSQR { operator() (const ordinal_type i, const ordinal_type j) const { -#ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits::is_signed) { - if (i < 0 || i >= extent(0)) { - throw std::invalid_argument("Row range invalid"); - } - else if (j < 0 || j >= extent(1)) { - throw std::invalid_argument("Column range invalid"); - } - } - else { - if (i >= extent(0)) { - throw std::invalid_argument("Row range invalid"); - } - else if (j >= extent(1)) { - throw std::invalid_argument("Column range invalid"); - } - } - if (A_ == nullptr) { - throw std::logic_error("Attempt to reference NULL data"); - } -#endif // TSQR_MATVIEW_DEBUG return A_[i + j * this->stride(1)]; } pointer data() const { return A_; } - bool empty() const { return extent(0) == 0 || extent(1) == 0; } - - /// Return a "row block" (submatrix of consecutive rows in the - /// inclusive range [firstRow,lastRow]). - MatView row_block (const ordinal_type firstRow, - const ordinal_type lastRow) - { -#ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits::is_signed) { - if (firstRow < 0 || firstRow > lastRow || lastRow >= extent(0)) { - throw std::invalid_argument ("Row range invalid"); - } - } - else { - if (firstRow > lastRow || lastRow >= extent(0)) { - throw std::invalid_argument ("Row range invalid"); - } - } -#endif // TSQR_MATVIEW_DEBUG - return MatView (lastRow - firstRow + 1, extent(1), data() + firstRow, stride(1)); - } - - /// Split off and return the top cache block of nrows_top rows. - /// Modify *this to be the "rest" of the matrix. 
- /// - /// \note Only use this method to split off a single cache block. - /// It breaks if you try to use it otherwise. - /// - /// \param nrows_top [in] Number of rows in the top block (which - /// this method returns) - /// - /// \param b_contiguous_blocks [in] Whether or not the entries of - /// the top block are stored contiguously in *this. The default - /// is no (false). - /// - /// \return The top block of nrows_top rows. Data is a shallow - /// copy of the data in *this. - MatView - split_top (const ordinal_type nrows_top, - const bool b_contiguous_blocks = false) - { -#ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits::is_signed && nrows_top < 0) { - std::ostringstream os; - os << "nrows_top (= " << nrows_top << ") < 0"; - throw std::invalid_argument (os.str()); - } - else if (nrows_top > extent(0)) { - std::ostringstream os; - os << "nrows_top (= " << nrows_top << ") > nrows (= " << extent(0) << ")"; - throw std::invalid_argument (os.str()); - } -#endif // TSQR_MATVIEW_DEBUG - - pointer const A_top_ptr = data(); - pointer A_rest_ptr; - const ordinal_type nrows_rest = extent(0) - nrows_top; - ordinal_type lda_top, lda_rest; - if (b_contiguous_blocks) { - lda_top = nrows_top; - lda_rest = nrows_rest; - A_rest_ptr = A_top_ptr + nrows_top * extent(1); - } - else { - lda_top = stride(1); - lda_rest = stride(1); - A_rest_ptr = A_top_ptr + nrows_top; - } - MatView A_top (nrows_top, extent(1), data(), lda_top); - A_ = A_rest_ptr; - nrows_ = nrows_rest; - lda_ = lda_rest; - - return A_top; - } - - /// Split off and return the bottom block. Modify *this to be the - /// "rest" of the matrix. 
- MatView - split_bottom (const ordinal_type nrows_bottom, - const bool b_contiguous_blocks = false) - { -#ifdef TSQR_MATVIEW_DEBUG - if (std::numeric_limits::is_signed && nrows_bottom < 0) { - throw std::invalid_argument ("nrows_bottom < 0"); - } - if (nrows_bottom > extent(0)) { - throw std::invalid_argument ("nrows_bottom > nrows"); - } -#endif // TSQR_MATVIEW_DEBUG - - pointer const A_rest_ptr = data(); - pointer A_bottom_ptr; - const ordinal_type nrows_rest = extent(0) - nrows_bottom; - ordinal_type lda_bottom, lda_rest; - if (b_contiguous_blocks) { - lda_bottom = nrows_bottom; - lda_rest = extent(0) - nrows_bottom; - A_bottom_ptr = A_rest_ptr + nrows_rest * extent(1); - } - else { - lda_bottom = stride(1); - lda_rest = stride(1); - A_bottom_ptr = A_rest_ptr + nrows_rest; - } - MatView A_bottom (nrows_bottom, extent(1), A_bottom_ptr, lda_bottom); - A_ = A_rest_ptr; - nrows_ = nrows_rest; - lda_ = lda_rest; - - return A_bottom; - } - - bool operator== (const MatView& rhs) const { - return extent(0) == rhs.extent(0) && extent(1) == rhs.extent(1) && - stride(1) == rhs.stride(1) && data() == rhs.data(); - } - - bool operator!= (const MatView& rhs) const { - return extent(0) != rhs.extent(0) || extent(1) != rhs.extent(1) || - stride(1) != rhs.stride(1) || data() != rhs.data(); - } - private: ordinal_type nrows_ = 0; ordinal_type ncols_ = 0; @@ -378,22 +180,25 @@ namespace TSQR { { const ptrdiff_t tgt_nrows (tgt.extent (0)); const ptrdiff_t tgt_ncols (tgt.extent (1)); - if (tgt_nrows != ptrdiff_t (src.extent (0)) || - tgt_ncols != ptrdiff_t (src.extent (1))) { - std::ostringstream os; - os << "TSQR::deep_copy: dimensions of tgt (output matrix) and " - "src (input matrix) are not compatible. 
tgt is " - << tgt.extent (0) << " x " << tgt.extent (1) << ", but src " - "is " << src.extent (0) << " x " << src.extent (1) << "."; - throw std::invalid_argument (os.str ()); - } - for (ptrdiff_t j = 0; j < tgt_ncols; ++j) { - auto* const tgt_j = &tgt(0,j); - const auto* const src_j = &src(0,j); - for (ptrdiff_t i = 0; i < tgt_nrows; ++i) { - tgt_j[i] = src_j[i]; + // Copy only when BOTH dimensions agree; any mismatch must reach the throw below. + if (tgt_nrows == ptrdiff_t (src.extent (0)) && + tgt_ncols == ptrdiff_t (src.extent (1))) { + for (ptrdiff_t j = 0; j < tgt_ncols; ++j) { + auto* const tgt_j = &tgt(0,j); + const auto* const src_j = &src(0,j); + for (ptrdiff_t i = 0; i < tgt_nrows; ++i) { + tgt_j[i] = src_j[i]; + } } } + else { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::invalid_argument, "TSQR::deep_copy: dimensions " + "of tgt (output matrix) and src (input matrix) are not " + "compatible. tgt is " << tgt.extent (0) << " x " << + tgt.extent (1) << ", but src is " << src.extent (0) << " x " + << src.extent (1) << "."); + } } template @@ -427,7 +232,130 @@ namespace TSQR { return {A_top, A_bot}; } -} // namespace TSQR + template + std::pair + partition_1x2 (const MatViewType& A, + const typename MatViewType::ordinal_type ncols_left) + { + using ordinal_type = typename MatViewType::ordinal_type; + using pointer = typename MatViewType::pointer; + + const ordinal_type nrows = A.extent(0); + const ordinal_type ncols = A.extent(1); + const ordinal_type ncols_right = ncols - ncols_left; + // Column-major storage: the right block begins ncols_left columns past A.data(). + const auto right_offset = A.stride(1) * ncols_left; + + pointer A_left_ptr = A.data(); + pointer A_right_ptr = A.data() + right_offset; + + MatViewType A_left (nrows, ncols_left, A_left_ptr, A.stride(1)); + MatViewType A_right (nrows, ncols_right, A_right_ptr, A.stride(1)); + return {A_left, A_right}; + } + + /// \brief Split off and return the top block of nrows_top rows. + /// Modify A in place to be the "rest" of the matrix. + /// + /// \param A [in] On input: The whole matrix view. 
On output: A + /// view of the "rest" of the matrix, that is, the part "below" + /// the returned matrix view. + /// + /// \param nrows_top [in] Number of rows in the top block (which + /// this method returns). + /// + /// \param contiguousCacheBlocks [in] Whether or not the entries of + /// the top block are stored contiguously in A. The default is no + /// (false). + /// + /// \return A view of the top block of nrows_top rows. + template + MatView + split_top (MatView& A, + const LO nrows_top, + const bool contiguousCacheBlocks = false) + { + using pointer = typename MatView::pointer; + pointer A_top_ptr = A.data(); + pointer A_rest_ptr {}; + const LO nrows_rest = A.extent(0) - nrows_top; + const LO ncols = A.extent(1); + + LO lda_top, lda_rest; + if (contiguousCacheBlocks) { + lda_top = nrows_top; + lda_rest = nrows_rest; + A_rest_ptr = A_top_ptr + nrows_top * ncols; + } + else { + lda_top = A.stride(1); + lda_rest = A.stride(1); + A_rest_ptr = A_top_ptr + nrows_top; + } + MatView A_top (nrows_top, ncols, A_top_ptr, lda_top); + A = MatView (nrows_rest, ncols, A_rest_ptr, lda_rest); + return A_top; + } + + /// \brief Split off and return the bottom block. Modify A to be + /// the "rest" of the matrix. 
+ template + MatView + split_bottom (MatView& A, + const LO nrows_bottom, + const bool contiguousCacheBlocks = false) + { + using pointer = typename MatView::pointer; + + pointer A_rest_ptr = A.data(); + pointer A_bottom_ptr {}; + const LO nrows_rest = A.extent(0) - nrows_bottom; + const LO ncols = A.extent(1); + + LO lda_bottom, lda_rest; + if (contiguousCacheBlocks) { + lda_bottom = nrows_bottom; + lda_rest = A.extent(0) - nrows_bottom; + A_bottom_ptr = A_rest_ptr + nrows_rest * ncols; + } + else { + lda_bottom = A.stride(1); + lda_rest = A.stride(1); + A_bottom_ptr = A_rest_ptr + nrows_rest; + } + MatView A_bottom (nrows_bottom, ncols, A_bottom_ptr, lda_bottom); + A = MatView (nrows_rest, ncols, A_rest_ptr, lda_rest); + return A_bottom; + } + template + bool empty (const MatView& A) { + return A.extent(0) == 0 || A.extent(1) == 0; + } + + template + void + copy_upper_triangle (const MatView& R_out, + const MatView& R_in) + { + const LO nrows = R_out.extent (0); + const LO ncols = R_out.extent (1); + + if (nrows >= ncols) { + for (LO j = 0; j < ncols; ++j) { + for (LO i = 0; i <= j; ++i) { + R_out(i,j) = R_in(i,j); + } + } + } + else { + auto R_out_lr = partition_1x2 (R_out, nrows); + auto R_in_lr = partition_1x2 (R_in, nrows); + copy_upper_triangle (R_out_lr.first, R_in_lr.first); + deep_copy (R_out_lr.second, R_in_lr.second); + } + } + +} // namespace TSQR -#endif // __TSQR_Tsqr_MatView_hpp +#endif // TSQR_MATVIEW_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp index 2bb78584016e..24f7fac61afd 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp @@ -77,98 +77,12 @@ namespace TSQR { using mat_view_type = MatView; using const_mat_view_type = MatView; - private: - static bool - fits_in_size_t (const ordinal_type& ord) - { - const ordinal_type result = ordinal_type (size_t (ord)); - return (ord == result); - } - - /// Check whether num_rows*num_cols makes sense as 
an amount of - /// storage (for the num_rows by num_cols dense matrix). Not - /// making sense includes negative values for either parameter (if - /// they are signed types), or overflow when computing their - /// product. Throw an exception of the appropriate type for any - /// of these cases. Otherwise, return num_rows*num_cols as a - /// size_t. - /// - /// \param num_rows [in] Number of rows in the matrix - /// \param num_cols [in] Number of columns in the matrix - /// \return num_rows*num_cols - size_t - verified_alloc_size (const ordinal_type num_rows, - const ordinal_type num_cols) const - { - static_assert (std::numeric_limits::is_integer, - "ordinal_type must be an integer type."); - // Quick exit also checks for zero num_cols (which prevents - // division by zero in the tests below). - if (num_rows == 0 || num_cols == 0) { - return size_t(0); - } - - // If ordinal_type is signed, make sure that num_rows and num_cols - // are nonnegative. - if (std::numeric_limits::is_signed) { - if (num_rows < 0) { - std::ostringstream os; - os << "# rows (= " << num_rows << ") < 0"; - throw std::logic_error (os.str()); - } - else if (num_cols < 0) { - std::ostringstream os; - os << "# columns (= " << num_cols << ") < 0"; - throw std::logic_error (os.str()); - } - } - - // If ordinal_type is bigger than a size_t, do special range - // checking. The compiler warns (comparison of signed and - // unsigned) if ordinal_type is a signed type and we try to do - // "numeric_limits::max() < - // std::numeric_limits::max()", so instead we cast each - // of num_rows and num_cols to size_t and back to ordinal_type again, - // and see if we get the same result. If not, then we - // definitely can't return a size_t product of num_rows and - // num_cols. - if (! fits_in_size_t (num_rows)) { - std::ostringstream os; - os << "# rows (= " << num_rows << ") > max size_t value (= " - << std::numeric_limits::max() << ")"; - throw std::range_error (os.str()); - } - else if (! 
fits_in_size_t (num_cols)) { - std::ostringstream os; - os << "# columns (= " << num_cols << ") > max size_t value (= " - << std::numeric_limits::max() << ")"; - throw std::range_error (os.str()); - } - - // Both num_rows and num_cols fit in a size_t, and are - // nonnegative. Now check whether their product also fits in a - // size_t. - // - // Note: This may throw a SIGFPE (floating-point exception) if - // num_cols is zero. Be sure to check first (above). - if (size_t (num_rows) > - std::numeric_limits::max() / size_t (num_cols)) { - std::ostringstream os; - os << "num_rows (= " << num_rows << ") * num_cols (= " - << num_cols << ") > max size_t value (= " - << std::numeric_limits::max() << ")"; - throw std::range_error (os.str()); - } - return size_t (num_rows) * size_t (num_cols); - } - - public: //! Constructor with dimensions. Matrix (const ordinal_type num_rows, const ordinal_type num_cols) : nrows_ (num_rows), ncols_ (num_cols), - A_ (verified_alloc_size (num_rows, num_cols)) + A_ (size_t (num_rows) * size_t (num_cols)) {} //! Constructor with dimensions and fill datum. @@ -177,7 +91,7 @@ namespace TSQR { const non_const_value_type& value) : nrows_ (num_rows), ncols_ (num_cols), - A_ (verified_alloc_size (num_rows, num_cols), value) + A_ (size_t (num_rows) * size_t (num_cols), value) {} /// \brief Copy constructor. @@ -188,21 +102,17 @@ namespace TSQR { Matrix (const Matrix& in) : nrows_ (in.extent(0)), ncols_ (in.extent(1)), - A_ (verified_alloc_size (in.extent(0), in.extent(1))) + A_ (size_t (in.extent(0)) * size_t (in.extent(1))) { - if (! in.empty()) { - MatView this_view - (extent(0), extent(1), data(), stride(1)); - MatView in_view - (in.extent(0), in.extent(1), in.data(), in.stride(1)); - deep_copy (this_view, in_view); - } + MatView in_view + (in.extent(0), in.extent(1), in.data(), in.stride(1)); + deep_copy (*this, in_view); } //! Default constructor (constructs an empty matrix). 
Matrix () = default; - /// \brief "Copy constructor" from a matrix view type. + /// \brief "Copy constructor" from a Matrix or MatrixView. /// /// This constructor allocates a new matrix and copies the /// elements of the input view into the resulting new matrix. @@ -212,7 +122,7 @@ namespace TSQR { Matrix (const MatrixViewType& in) : nrows_ (in.extent(0)), ncols_ (in.extent(1)), - A_ (verified_alloc_size (in.extent(0), in.extent(1))) + A_ (size_t (in.extent(0)) * size_t (in.extent(1))) { if (A_.size() != 0) { MatView this_view @@ -246,18 +156,6 @@ namespace TSQR { return A_[i]; } - //! Equality: ONLY compares dimensions and pointers (shallow). - template - bool operator== (const MatrixViewType& B) const - { - if (data() != B.data() || extent(0) != B.extent(0) || - extent(1) != B.extent(1) || stride(1) != B.stride(1)) { - return false; - } else { - return true; - } - } - constexpr ordinal_type extent (const int r) const noexcept { return r == 0 ? nrows_ : (r == 1 ? ncols_ : ordinal_type(0)); } @@ -266,9 +164,6 @@ namespace TSQR { return r == 0 ? ordinal_type(1) : (r == 1 ? nrows_ : ordinal_type(0)); } - //! Whether the matrix is empty (has either zero rows or zero columns). - bool empty() const { return extent(0) == 0 || extent(1) == 0; } - //! A non-const pointer to the matrix data. 
pointer data() { @@ -308,7 +203,7 @@ namespace TSQR { if (num_rows == extent(0) && num_cols == extent(1)) return; // no need to reallocate or do anything else - const size_t alloc_size = verified_alloc_size (num_rows, num_cols); + const size_t alloc_size = size_t (num_rows) * size_t (num_cols); nrows_ = num_rows; ncols_ = num_cols; A_.resize (alloc_size); @@ -327,6 +222,11 @@ namespace TSQR { std::vector A_; }; + template + bool empty (const Matrix& A) { + return A.extent(0) == 0 || A.extent(1) == 0; + } + template void deep_copy (Matrix& tgt, const SourceScalar& src) @@ -344,6 +244,23 @@ namespace TSQR { deep_copy (tgt.view(), src); } + template + void + copy_upper_triangle (Matrix& R_out, + const MatView& R_in) + { + copy_upper_triangle (R_out.view (), R_in); + } + + template + void + copy_upper_triangle (Matrix& R_out, + const Matrix& R_in) + { + auto R_out_view = R_out.view (); + copy_upper_triangle (R_out_view, R_in.const_view ()); + } + template std::pair, MatView> partition_2x1 (Matrix& A, diff --git a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp index 6bd5406e13eb..e870193352ca 100644 --- a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp @@ -42,9 +42,6 @@ #include "Tsqr_ConfigDefs.hpp" #include "Tsqr_Mgs.hpp" -#ifdef HAVE_KOKKOSTSQR_TBB -# include "TbbTsqr_TbbMgs.hpp" -#endif // HAVE_KOKKOSTSQR_TBB #include "Tsqr_TestSetup.hpp" #include "Tsqr_GlobalVerify.hpp" #include "Tsqr_printGlobalMatrix.hpp" @@ -68,13 +65,6 @@ namespace TSQR { if (which == "MpiSeqMGS") { return std::string ("MPI parallel / sequential MGS"); } - else if (which == "MpiTbbMGS") { -#ifdef HAVE_KOKKOSTSQR_TBB - return std::string ("MPI parallel / TBB parallel MGS"); -#else - throw std::logic_error("MGS not built with Intel TBB support"); -#endif // HAVE_KOKKOSTSQR_TBB - } else { throw std::logic_error("Unknown MGS implementation type \"" + which + "\""); } @@ -184,16 +174,7 @@ namespace TSQR { } } - if 
(which == "MpiTbbMGS") { -#ifdef HAVE_KOKKOSTSQR_TBB - typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type; - mgs_type mgser (scalarComm); - MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug); -#else - throw std::logic_error("MGS not built with Intel TBB support"); -#endif // HAVE_KOKKOSTSQR_TBB - } - else if (which == "MpiSeqMGS") { + if (which == "MpiSeqMGS") { typedef MGS mgs_type; mgs_type mgser (scalarComm); MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug); @@ -238,9 +219,6 @@ namespace TSQR { << "# rows = " << nrows_global << endl << "# columns = " << ncols << endl << "# MPI processes = " << nprocs << endl; - if (which == "MpiTbbTSQR") { - cout << "# cores per process = " << num_cores << endl; - } cout << "Absolute residual $\\|A - Q*R\\|_2: " << results[0] << endl << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: " @@ -253,11 +231,8 @@ namespace TSQR { cout << which << "," << nrows_global << "," << ncols - << "," << nprocs; - if (which == "MpiTbbTSQR") { - cout << "," << num_cores << endl; - } - cout << "," << results[0] + << "," << nprocs + << "," << results[0] << "," << results[1] << "," << results[2] << endl; @@ -384,17 +359,7 @@ namespace TSQR { // Set up MGS and run the benchmark. 
double mgs_timing; // Total run time in seconds of all ntrials trials - if (which == "MpiTbbMGS") { -#ifdef HAVE_KOKKOSTSQR_TBB - typedef TSQR::TBB::TbbMgs mgs_type; - mgs_type mgser (scalarComm); - mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R, - ntrials, human_readable); -#else - throw std::logic_error("MGS not built with Intel TBB support"); -#endif // HAVE_KOKKOSTSQR_TBB - } - else if (which == "MpiSeqMGS") { + if (which == "MpiSeqMGS") { typedef MGS mgs_type; mgs_type mgser (scalarComm); mgs_timing = do_mgs_benchmark (mgser, Q_local, R, @@ -428,11 +393,8 @@ namespace TSQR { cout << mgs_human_readable_name(which) << ":" << endl << "# rows = " << nrows_global << endl << "# columns = " << ncols << endl - << "# MPI processes = " << nprocs << endl; - if (which == "MpiTbbTSQR") { - cout << "# cores per process = " << num_cores << endl; - } - cout << "# trials = " << ntrials << endl + << "# MPI processes = " << nprocs << endl + << "# trials = " << ntrials << endl << "Min total time (s) over all MPI processes = " << min_mgs_timing << endl << "Max total time (s) over all MPI processes = " @@ -443,11 +405,8 @@ namespace TSQR { cout << which << "," << nrows_global << "," << ncols - << "," << nprocs; - if (which == "MpiTbbTSQR") { - cout << "," << num_cores << endl; - } - cout << "," << ntrials + << "," << nprocs + << "," << ntrials << "," << min_mgs_timing << "," << max_mgs_timing << endl; diff --git a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp index 155081ca8d38..06c9c6ee1484 100644 --- a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp @@ -48,11 +48,21 @@ #include "Teuchos_as.hpp" #include "Teuchos_Describable.hpp" #include "Tsqr_Impl_Lapack.hpp" +#include "Teuchos_ParameterList.hpp" +#include "Teuchos_RCP.hpp" #include "Teuchos_ScalarTraits.hpp" #include "Teuchos_TypeNameTraits.hpp" #include namespace TSQR { + namespace Impl { + template + class 
NodeFactorOutput { + public: + virtual ~NodeFactorOutput() = default; + }; + } // namespace Impl + /// \class NodeTsqr /// \brief Common interface and functionality for intranode TSQR. /// @@ -63,37 +73,16 @@ namespace TSQR { /// \tparam Ordinal The (local) Ordinal type; the type of indices /// into a matrix on a node /// \tparam Scalar Tthe type of elements stored in the matrix - /// \tparam FactorOutputType The type returned by factor(). - /// - /// We template on FactorOutputType for compile-time polymorphism. - /// This lets subclasses define the \c factor() method, without - /// constraining them to inherit their particular FactorOutputType - /// from a common abstract base class. FactorOutputType is meant to - /// be either just a simple composition of std::pair and - /// std::vector, or a simple struct. Its contents are specific to - /// each intranode TSQR implementation. and are not intended to be - /// polymorphic, so it would not make sense for all the different - /// FactorOutputType types to inherit from a common base class. - /// - /// Templating on FactorOutputType means that we can't use run-time - /// polymorphism to swap between NodeTsqr subclasses, since the - /// latter are really subclasses of different NodeTsqr - /// instantiations (i.e., different FactorOutputType types). - /// However, inheriting from different specializations of NodeTsqr - /// does enforce correct compile-time polymorphism in a syntactic - /// way. It also avoids repeated code for common functionality. - /// Full run-time polymorphism of different NodeTsqr subclasses - /// would not be useful. This is because ultimately each subclass - /// is bound to a Kokkos Node type, and those only use compile-time - /// polymorphism. 
- template + template class NodeTsqr : public Teuchos::Describable { public: - typedef Ordinal ordinal_type; - typedef Scalar scalar_type; - typedef FactorOutputType factor_output_type; - typedef MatView mat_view_type; - typedef MatView const_mat_view_type; + using ordinal_type = Ordinal; + using scalar_type = Scalar; + using magnitude_type = + typename Teuchos::ScalarTraits::magnitudeType; + using factor_output_type = Impl::NodeFactorOutput; + using mat_view_type = MatView; + using const_mat_view_type = MatView; //! Constructor NodeTsqr() = default; @@ -101,6 +90,17 @@ namespace TSQR { //! Virtual destructor, for memory safety of derived classes. virtual ~NodeTsqr() = default; + //! List of valid parameters for the NodeTsqr subclass. + virtual Teuchos::RCP + getValidParameters () const = 0; + + //! Validate and read in parameters. + virtual void + setParameterList (const Teuchos::RCP& paramList) = 0; + + //! Whether the subclass wants large arrays as GPU device memory. + virtual bool wants_device_memory () const { return false; } + /// \brief Whether this object is ready to perform computations. /// /// Some NodeTsqr subclasses require additional initialization @@ -162,7 +162,7 @@ namespace TSQR { /// /// \return Part of the implicit representation of the Q factor. /// The other part is the A matrix on output. - virtual factor_output_type + virtual Teuchos::RCP factor (const Ordinal nrows, const Ordinal ncols, Scalar A[], @@ -203,7 +203,7 @@ namespace TSQR { const Ordinal ncols_Q, const Scalar Q[], const Ordinal ldq, - const FactorOutputType& factorOutput, + const factor_output_type& factorOutput, const Ordinal ncols_C, Scalar C[], const Ordinal ldc, @@ -248,6 +248,47 @@ namespace TSQR { const Ordinal ldc, const bool contiguousCacheBlocks) const = 0; + /// \brief Force the diagonal entries of the R factor to be + /// nonnegative, and change the columns of Q (result of + /// explicit_Q) to match (if needed). 
+ virtual void + force_nonnegative_diagonal (const Ordinal nrows, + const Ordinal ncols, + Scalar Q[], + const Ordinal ldq, + Scalar R[], + const Ordinal ldr) const + { + mat_view_type Q_view (nrows, ncols, Q, ldq); + mat_view_type R_view (ncols, ncols, R, ldr); + + // The complex-arithmetic specialization does nothing, since + // _GEQR{2,F} for complex arithmetic returns an R factor with + // nonnegative diagonal already. However, we need the code to + // compile regardless. + using STS = Teuchos::ScalarTraits; + if (! STS::isComplex) { + using mag_type = typename STS::magnitudeType; + constexpr mag_type ZERO {}; + + for (Ordinal k = 0; k < ncols; ++k) { + if (STS::real (R_view(k,k)) < ZERO) { + // Scale column k of Q_view. + Scalar* const Q_k = &Q_view(0,k); + for (Ordinal i = 0; i < nrows; ++i) { + Q_k[i] = -Q_k[i]; + } + // Scale row k of R_view. R_view is upper triangular, + // so we only have to scale right of (and including) the + // diagonal entry. + for (Ordinal j = k; j < ncols; ++j) { + R_view(k,j) = -R_view(k,j); + } + } + } + } + } + /// \brief Cache block A_in into A_out. /// /// \param nrows [in] Number of rows in A_in and A_out. @@ -339,7 +380,9 @@ namespace TSQR { /// \endcode virtual const_mat_view_type const_top_block (const const_mat_view_type& C, - const bool contiguousCacheBlocks) const = 0; + const bool /* contiguousCacheBlocks */) const { + return C; + } public: /// \brief Return view of topmost cache block of C. @@ -387,6 +430,49 @@ namespace TSQR { C_top.stride(1)); } + /// \brief Copy from "native" NodeTsqr device storage, to a packed + /// host matrix. + virtual Matrix + copy_to_host (const MatView& C) const + { + // FIXME (mfh 17 Dec 2019) Need to reimplement in + // CuSolverNodeTsqr, since C is device memory there. + // + // The same concerns as in CuSolverNodeTsqr::extract_R, about + // Kokkos::deep_copy not wanting to copy from noncontiguous + // device memory to contiguous host memory, apply here. 
+ return Matrix (C); + } + + /// \brief Copy from a host matrix, to "native" NodeTsqr device + /// storage. + virtual void + copy_from_host (const MatView& C_device, + const MatView& C_host) const + { + // FIXME (mfh 17 Dec 2019) Need to reimplement in + // CuSolverNodeTsqr, since C_device is device memory there. + // + // The same concerns as in CuSolverNodeTsqr::extract_R, about + // Kokkos::deep_copy not wanting to copy between noncontiguous + // device memory and contiguous host memory, apply here. + deep_copy (C_device, C_host); + } + + //! Set the first C.extent(1) diagonal entries of C to 1.0. + virtual void + set_diagonal_entries_to_one + (const MatView& C) const + { + // NOTE (mfh 17 Dec 2019) Downstream classes must reimplement + // this if C is device memory for those classes. See + // wants_device_memory above. + const Ordinal ncols = C.extent (1); + for (Ordinal j = 0; j < ncols; ++j) { + C(j,j) = Scalar (1.0); + } + } + /// \brief Does factor() compute R with nonnegative diagonal? 
/// /// When using a QR factorization to orthogonalize a block of @@ -454,9 +540,9 @@ namespace TSQR { }; - template + template Ordinal - NodeTsqr:: + NodeTsqr:: reveal_R_rank (const Ordinal ncols, Scalar R[], const Ordinal ldr, @@ -467,7 +553,6 @@ namespace TSQR { using Teuchos::as; using Teuchos::TypeNameTraits; typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; typedef Teuchos::ScalarTraits STM; TEUCHOS_TEST_FOR_EXCEPTION(tol < 0, std::invalid_argument, @@ -612,9 +697,9 @@ namespace TSQR { return rank; } - template + template Ordinal - NodeTsqr:: + NodeTsqr:: reveal_rank (const Ordinal nrows, const Ordinal ncols, Scalar Q[], diff --git a/packages/tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp b/packages/tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp index 0d2d84b580a4..161c7e6cc377 100644 --- a/packages/tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp @@ -37,108 +37,146 @@ // ************************************************************************ //@HEADER -#ifndef __TSQR_NodeTsqrFactory_hpp -#define __TSQR_NodeTsqrFactory_hpp +/// \file Tsqr_NodeTsqrFactory.hpp +/// \brief Declaration and definition of a factory for creating an +/// instance of the right NodeTsqr subclass. 
-#include "Tsqr_ConfigDefs.hpp" -#include "Kokkos_DefaultNode.hpp" +#ifndef TSQR_NODETSQRFACTORY_HPP +#define TSQR_NODETSQRFACTORY_HPP -#ifdef HAVE_KOKKOSTSQR_TBB -# include "TbbTsqr.hpp" -#endif // HAVE_KOKKOSTSQR_TBB - -#include "Tsqr_KokkosNodeTsqr.hpp" #include "Tsqr_SequentialTsqr.hpp" - -#include "Teuchos_ParameterList.hpp" -#include "Teuchos_ParameterListExceptions.hpp" +#include "Tsqr_CombineNodeTsqr.hpp" +#include "Tsqr_CuSolverNodeTsqr.hpp" #include "Teuchos_RCP.hpp" -#include "Teuchos_ScalarTraits.hpp" -#include "Teuchos_TypeNameTraits.hpp" - -#include - +#include "Teuchos_TestForException.hpp" +#ifdef HAVE_TPETRATSQR_COMPLEX +# include "Kokkos_Complex.hpp" +#endif // HAVE_TPETRATSQR_COMPLEX +#include +#include namespace TSQR { - /// \class NodeTsqrFactory - /// \brief Factory for creating an instance of the right \c NodeTsqr subclass. + /// \brief Factory for creating an instance of the right NodeTsqr + /// subclass. /// \author Mark Hoemmen /// - /// \tparam Node The Kokkos Node type - /// \tparam Scalar The type of entries in the matrices to factor - /// \tparam LocalOrdinal The type of local indices in the matrices to factor + /// \tparam Scalar The type of entries in the matrices to factor. + /// \tparam LocalOrdinal The type of local indices in the matrices + /// to factor. + /// \tparam Device Kokkos::Device specialization used by the + /// matrices to factor. /// - /// This class maps from a particular Kokkos \c Node type, to the - /// corresponding \c NodeTsqr subclass. It lets you construct a - /// default ParameterList for that \c NodeTsqr subclass, as well as - /// an instance of the \c NodeTsqr subclass. It also provides - /// typedefs for template metaprogramming. + /// This class maps from (Scalar, LocalOrdinal, Device), to the + /// corresponding NodeTsqr subclass. It lets you construct a + /// default ParameterList for that NodeTsqr subclass, as well as an + /// instance of the NodeTsqr subclass. 
It also provides type + /// aliases for template metaprogramming. /// - /// The "right" \c NodeTsqr subclass is a function of the \c Node - /// template parameter, and possibly also of the other template - /// parameters. + /// The "right" NodeTsqr subclass is a function of Device, and + /// possibly also of the other template parameters. /// /// \note If this class does not have a partial - /// specialization for your \c Node type, it defaults to use + /// specialization for your Device type, it defaults to use /// SequentialTsqr. That class does not use threads, and /// only knows how to deal with host data; it cannot handle GPU /// device-resident data. Thus, it may perform poorly. - template + template class NodeTsqrFactory { public: - //! The Kokkos Node type. - typedef Node node_type; - //! Pointer (RCP) to node_type. - typedef Teuchos::RCP node_ptr; - - //! The NodeTsqr subclass corresponding to the Kokkos Node type. - typedef SequentialTsqr node_tsqr_type; + using node_tsqr_type = NodeTsqr; - /// \brief Default parameter list for intranode TSQR. + /// \brief Get the default implementation of NodeTsqr. /// - /// \note The default implementation returns an empty (not null) - /// parameter list. Each specialization for a specific Node - /// type redefines this method to return a parameter list - /// appropriate for that Node type's TSQR implementation. - static Teuchos::RCP - getDefaultParameters () + /// The default implementation is a function of the template + /// parameters, especialy Scalar and Device. + static Teuchos::RCP + getNodeTsqr () { - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::RCP; - - RCP params = parameterList ("NodeTsqr"); - // Create a temporary node_tsqr_type instance in order to get - // default parameters. The empty input parameter list will get - // filled in with default values of missing parameters. 
- node_tsqr_type nodeTsqr (params); - - return params; + using Teuchos::rcp; + +#if defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER) + using execution_space = typename Device::execution_space; + constexpr bool is_cuda = + std::is_same::value; + if (is_cuda) { + return rcp (new CuSolverNodeTsqr); + } + else { +#endif + + // NOTE (mfh 02 Dec 2019) SequentialTsqr does not currently + // give correct results for complex Scalar types, so we use + // CombineNodeTsqr in that case. +#ifdef HAVE_TPETRATSQR_COMPLEX + constexpr bool is_complex = + std::is_same>::value || + std::is_same>::value || + std::is_same>::value || + std::is_same>::value; +#else + constexpr bool is_complex = false; +#endif // HAVE_TPETRATSQR_COMPLEX + if (is_complex) { + return rcp (new CombineNodeTsqr); + } + else { + return rcp (new SequentialTsqr); + } + +#if defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER) + } +#endif } - /// \brief Return a pointer to the intranode TSQR implementation. + /// \brief Get a specific implementation of NodeTsqr. /// - /// \param node [in/out] Pointer to the Kokkos Node instance. - /// - /// \param plist [in/out] Parameter list for configuring the - /// NodeTsqr implementation. + /// \param name [in] Either "SequentialTsqr", "CombineNodeTsqr", + /// or "Default". "Default" means "return what the above + /// zero-argument overload of getNodeTsqr() returns." 
static Teuchos::RCP - makeNodeTsqr (const Teuchos::RCP& node, - const Teuchos::RCP& plist) + getNodeTsqr (const std::string& name) { - (void) node; - return rcp (new node_tsqr_type (plist)); + using Teuchos::rcp; + if (name == "SequentialTsqr" || name == "Sequential") { + return rcp (new SequentialTsqr); + } + else if (name == "CombineNodeTsqr" || name == "Combine") { + return rcp (new CombineNodeTsqr); + } +#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER) + else if (name == "CuSolverNodeTsqr" || name == "CuSolver") { + return rcp (new CuSolverNodeTsqr); + } +#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER + else if (name == "Default") { + return getNodeTsqr (); + } + else { + const char prefix[] = "TSQR::NodeTsqrFactory::getNodeTsqr: "; + const std::vector validNames + {{"SequentialTsqr", + "CombineNodeTsqr", +#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER) + "CuSolverNodeTsqr", +#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER + "Default"}}; + std::ostringstream os; + os << prefix << "Invalid NodeTsqr subclass name \"" << name + << "\". Valid names are: {"; + for (size_t k = 0; k < validNames.size (); ++k) { + os << "\"" << validNames[k] << "\""; + if (k + size_t (1) < validNames.size ()) { + os << ", "; + } + } + os << "}."; + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::invalid_argument, os.str ()); + } } - /// \brief Prepare the NodeTsqr instance for use. - /// - /// \pre ! 
nodeTsqr.is_null() - /// \post nodeTsqr->ready() - static void - prepareNodeTsqr (const Teuchos::RCP& /* nodeTsqr */) - {} }; } // namespace TSQR -#endif // __TSQR_NodeTsqrFactory_hpp +#endif // TSQR_NODETSQRFACTORY_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp b/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp deleted file mode 100644 index 530dba578814..000000000000 --- a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp +++ /dev/null @@ -1,781 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_Test_DistTest_hpp -#define __TSQR_Test_DistTest_hpp - -#include "Tsqr_ConfigDefs.hpp" -#include "Tsqr_Random_NormalGenerator.hpp" -#include "Tsqr_verifyTimerConcept.hpp" -#include "Tsqr_generateStack.hpp" -#include "Tsqr_DistTsqr.hpp" -#include "Tsqr_GlobalTimeStats.hpp" -#include "Tsqr_GlobalVerify.hpp" -#include "Tsqr_printGlobalMatrix.hpp" -#include -#include -#include -#include - -namespace TSQR { - namespace Test { - /// \class DistTsqrVerifier - /// \brief Generic version of \c DistTsqr accuracy test. - template - class DistTsqrVerifier { - TSQR::Random::NormalGenerator gen_; - Teuchos::RCP > const ordinalComm_; - Teuchos::RCP > const scalarComm_; - std::string scalarTypeName_; - std::ostream& out_; - std::ostream& err_; - const bool testFactorExplicit_, testFactorImplicit_; - const bool humanReadable_, printMatrices_, debug_; - - public: - typedef Ordinal ordinal_type; - typedef Scalar scalar_type; - typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; - typedef typename std::vector result_type; - typedef Matrix matrix_type; - - /// \brief Constructor, with custom seed value - /// - /// \param scalarComm [in/out] Communicator object over which to - /// test. 
- /// \param seed [in] 4-element vector; the random seed input of - /// TSQR::Random::NormalGenerator (which see, since there are - /// restrictions on the set of valid seeds) - /// \param scalarTypeName [in] Human-readable name of the Scalar - /// template type parameter - /// \param out [out] Output stream to which to write results - /// \param err [out] Output stream to which to write any - /// debugging outputs (if applicable) or errors - /// \param testFactorExplicit [in] Whether to test - /// DistTsqr::factorExplicit() - /// \param testFactorImplicit [in] Whether to test - /// DistTsqr::factor() and DistTsqr::explicit_Q() - /// \param humanReadable [in] Whether printed results should be - /// easy for humans to read (vs. easy for parsers to parse) - /// \param debug [in] Whether to write verbose debug output to - /// err - DistTsqrVerifier (const Teuchos::RCP >& ordinalComm, - const Teuchos::RCP >& scalarComm, - const std::vector& seed, - const std::string& scalarTypeName, - std::ostream& out, - std::ostream& err, - const bool testFactorExplicit, - const bool testFactorImplicit, - const bool humanReadable, - const bool printMatrices, - const bool debug) : - gen_ (seed), - ordinalComm_ (ordinalComm), - scalarComm_ (scalarComm), - scalarTypeName_ (scalarTypeName), - out_ (out), - err_ (err), - testFactorExplicit_ (testFactorExplicit), - testFactorImplicit_ (testFactorImplicit), - humanReadable_ (humanReadable), - printMatrices_ (printMatrices), - debug_ (debug) - {} - - /// \brief Constructor, with default seed value - /// - /// This constructor sets a default seed (for the pseudorandom - /// number generator), which is the same seed (0,0,0,1) each - /// time. - /// - /// \param scalarComm [in/out] Communicator object over which to - /// test. 
- /// \param scalarTypeName [in] Human-readable name of the Scalar - /// template type parameter - /// \param out [out] Output stream to which to write results - /// \param err [out] Output stream to which to write any - /// debugging outputs (if applicable) or errors - /// \param testFactorExplicit [in] Whether to test - /// DistTsqr::factorExplicit() - /// \param testFactorImplicit [in] Whether to test - /// DistTsqr::factor() and DistTsqr::explicit_Q() - /// \param humanReadable [in] Whether printed results should be - /// easy for humans to read (vs. easy for parsers to parse) - /// \param debug [in] Whether to write verbose debug output to - /// err - DistTsqrVerifier (const Teuchos::RCP >& ordinalComm, - const Teuchos::RCP >& scalarComm, - const std::string& scalarTypeName, - std::ostream& out, - std::ostream& err, - const bool testFactorExplicit, - const bool testFactorImplicit, - const bool humanReadable, - const bool printMatrices, - const bool debug) : - ordinalComm_ (ordinalComm), - scalarComm_ (scalarComm), - scalarTypeName_ (scalarTypeName), - out_ (out), - err_ (err), - testFactorExplicit_ (testFactorExplicit), - testFactorImplicit_ (testFactorImplicit), - humanReadable_ (humanReadable), - printMatrices_ (printMatrices), - debug_ (debug) - {} - - /// \brief Get seed vector for pseudorandom number generator - /// - /// Fill seed (changing size of vector as necessary) with the - /// seed vector used by the pseudorandom number generator. You - /// can use this to resume the pseudorandom number stream from - /// where you last were. - void - getSeed (std::vector& seed) const - { - gen_.getSeed (seed); - } - - /// \brief Run the DistTsqr accuracy test - /// - /// \param numCols [in] Number of columns in the matrix to test. - /// Number of rows := (# MPI processors) * ncols. 
- void - verify (const Ordinal numCols, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames) - { - using std::endl; - - const int myRank = scalarComm_->rank(); - if (debug_) - { - scalarComm_->barrier(); - if (myRank == 0) - err_ << "Verifying DistTsqr:" << endl; - scalarComm_->barrier(); - } - - // Generate test problem. - Matrix< Ordinal, Scalar > A_local, Q_local, R; - testProblem (A_local, Q_local, R, numCols); - if (debug_) - { - scalarComm_->barrier(); - if (myRank == 0) - err_ << "-- Generated test problem." << endl; - scalarComm_->barrier(); - } - - // Set up TSQR implementation. - DistTsqr par; - par.init (scalarComm_); - if (debug_) - { - scalarComm_->barrier(); - if (myRank == 0) - err_ << "-- DistTsqr object initialized" << endl << endl; - } - - // Whether we've printed field names (i.e., column headers) - // yet. Only matters for non-humanReadable output. - bool printedFieldNames = false; - - // Test DistTsqr::factor() and DistTsqr::explicit_Q(). 
- if (testFactorImplicit_) - { - // Factor the matrix A (copied into R, which will be - // overwritten on output) - typedef typename DistTsqr::FactorOutput - factor_output_type; - factor_output_type factorOutput = par.factor (R.view()); - if (debug_) - { - scalarComm_->barrier(); - if (myRank == 0) - err_ << "-- Finished DistTsqr::factor" << endl; - } - // Compute the explicit Q factor - par.explicit_Q (numCols, Q_local.data(), Q_local.stride(1), factorOutput); - if (debug_) { - scalarComm_->barrier(); - if (myRank == 0) { - err_ << "-- Finished DistTsqr::explicit_Q" << endl; - } - } - // Verify the factorization - result_type result = - global_verify (numCols, numCols, A_local.data(), A_local.stride(1), - Q_local.data(), Q_local.stride(1), R.data(), R.stride(1), - scalarComm_.get()); - if (debug_) { - scalarComm_->barrier(); - if (myRank == 0) { - err_ << "-- Finished global_verify" << endl; - } - } - reportResults ("DistTsqr", numCols, result, - additionalFieldNames, additionalData, - printFieldNames && (! printedFieldNames)); - if (printFieldNames && (! printedFieldNames)) - printedFieldNames = true; - } - - // Test DistTsqr::factorExplicit() - if (testFactorExplicit_) { - // Factor the matrix and compute the explicit Q factor, both - // in a single operation. 
- par.factorExplicit (R.view(), Q_local.view()); - if (debug_) { - scalarComm_->barrier(); - if (myRank == 0) { - err_ << "-- Finished DistTsqr::factorExplicit" << endl; - } - } - - if (printMatrices_) { - if (myRank == 0) { - err_ << std::endl << "Computed Q factor:" << std::endl; - } - printGlobalMatrix (err_, Q_local, scalarComm_.get(), ordinalComm_.get()); - if (myRank == 0) { - err_ << std::endl << "Computed R factor:" << std::endl; - print_local_matrix (err_, R.extent(0), R.extent(1), R.data(), R.stride(1)); - err_ << std::endl; - } - } - - // Verify the factorization - result_type result = - global_verify (numCols, numCols, A_local.data(), A_local.stride(1), - Q_local.data(), Q_local.stride(1), R.data(), R.stride(1), - scalarComm_.get()); - if (debug_) { - scalarComm_->barrier(); - if (myRank == 0) { - err_ << "-- Finished global_verify" << endl; - } - } - reportResults ("DistTsqrRB", numCols, result, - additionalFieldNames, additionalData, - printFieldNames && (! printedFieldNames)); - if (printFieldNames && (! printedFieldNames)) { - printedFieldNames = true; - } - } - } - - private: - /// Report verification results. Call on ALL MPI processes, not - /// just Rank 0. - /// - /// \param method [in] String to print before reporting results - /// \param numCols [in] Number of columns in the matrix tested. 
- /// \param result [in] (relative residual, orthogonality) - void - reportResults (const std::string& method, - const Ordinal numCols, - const result_type& result, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames) - { - using std::endl; - - const int numProcs = scalarComm_->size(); - const int myRank = scalarComm_->rank(); - - if (myRank == 0) - { - if (humanReadable_) - { - out_ << method << " accuracy results:" << endl - << "Scalar type = " << scalarTypeName_ << endl - << "Number of columns = " << numCols << endl - << "Number of (MPI) processes = " << numProcs << endl - << "Absolute residual $\\| A - Q R \\|_2: " - << result[0] << endl - << "Absolute orthogonality $\\| I - Q^* Q \\|_2$: " - << result[1] << endl - << "Test matrix norm $\\| A \\|_F$: " - << result[2] << endl; - } - else - { - // Use scientific notation for floating-point numbers - out_ << std::scientific; - - if (printFieldNames) - { - out_ << "%method,scalarType,numCols,numProcs" - ",absFrobResid,absFrobOrthog,frobA"; - if (! additionalFieldNames.empty()) - out_ << "," << additionalFieldNames; - out_ << endl; - } - - out_ << method - << "," << scalarTypeName_ - << "," << numCols - << "," << numProcs - << "," << result[0] - << "," << result[1] - << "," << result[2]; - if (! additionalData.empty()) - out_ << "," << additionalData; - out_ << endl; - } - } - } - - void - testProblem (Matrix< Ordinal, Scalar >& A_local, - Matrix< Ordinal, Scalar >& Q_local, - Matrix< Ordinal, Scalar >& R, - const Ordinal numCols) - { - const Ordinal numRowsLocal = numCols; - - // A_local: Space for the matrix A to factor -- local to each - // processor. - // - // A_global: Global matrix (only nonempty on Proc 0); only - // used temporarily. - Matrix< Ordinal, Scalar > A_global; - - // This modifies A_local on all procs, and A_global on Proc 0. 
- par_tsqr_test_problem (gen_, A_local, A_global, numCols, scalarComm_); - - if (printMatrices_) { - const int myRank = scalarComm_->rank(); - if (myRank == 0) { - err_ << "Input matrix A:" << std::endl; - } - printGlobalMatrix (err_, A_local, scalarComm_.get(), ordinalComm_.get()); - if (myRank == 0) { - err_ << std::endl; - } - } - - // Copy the test problem input into R, since the factorization - // will overwrite it in place with the final R factor. - R.reshape (numCols, numCols); - deep_copy (R, Scalar {}); - deep_copy (R, A_local); - - // Prepare space in which to construct the explicit Q factor - // (local component on this processor) - Q_local.reshape (numRowsLocal, numCols); - deep_copy (Q_local, Scalar {}); - } - }; - - - /// \class DistTsqrBenchmarker - /// \brief Generic version of \c DistTsqr performance test. - template< class Ordinal, class Scalar, class TimerType > - class DistTsqrBenchmarker { - TSQR::Random::NormalGenerator< Ordinal, Scalar > gen_; - Teuchos::RCP< MessengerBase< Scalar > > scalarComm_; - Teuchos::RCP< MessengerBase< double > > doubleComm_; - std::string scalarTypeName_; - - std::ostream& out_; - std::ostream& err_; - const bool testFactorExplicit_, testFactorImplicit_; - const bool humanReadable_, debug_; - - public: - typedef Ordinal ordinal_type; - typedef Scalar scalar_type; - typedef typename Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type; - typedef TimerType timer_type; - - /// \brief Constructor, with custom seed value - /// - /// \param scalarComm [in/out] Communicator object over which - /// to test. - /// \param doubleComm [in/out] Communicator object for doubles, - /// used for finding the min and max of timing results over - /// all the MPI processes. 
- /// \param seed [in] 4-element vector; the random seed input of - /// TSQR::Random::NormalGenerator (which see, since there are - /// restrictions on the set of valid seeds) - /// \param scalarTypeName [in] Human-readable name of the Scalar - /// template type parameter - /// \param out [out] Output stream to which to write results - /// \param err [out] Output stream to which to write any - /// debugging outputs (if applicable) or errors - /// \param testFactorExplicit [in] Whether to test - /// DistTsqr::factorExplicit() - /// \param testFactorImplicit [in] Whether to test - /// DistTsqr::factor() and DistTsqr::explicit_Q() - /// \param humanReadable [in] Whether printed results should be - /// easy for humans to read (vs. easy for parsers to parse) - /// \param debug [in] Whether to write verbose debug output to - /// err - DistTsqrBenchmarker (const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm, - const Teuchos::RCP< MessengerBase< double > >& doubleComm, - const std::vector& seed, - const std::string& scalarTypeName, - std::ostream& out, - std::ostream& err, - const bool testFactorExplicit, - const bool testFactorImplicit, - const bool humanReadable, - const bool debug) : - gen_ (seed), - scalarComm_ (scalarComm), - doubleComm_ (doubleComm), - scalarTypeName_ (scalarTypeName), - out_ (out), - err_ (err), - testFactorExplicit_ (testFactorExplicit), - testFactorImplicit_ (testFactorImplicit), - humanReadable_ (humanReadable), - debug_ (debug) - {} - - /// \brief Constructor, with default seed value - /// - /// This constructor sets a default seed (for the pseudorandom - /// number generator), which is the same seed (0,0,0,1) each - /// time. - /// - /// \param scalarComm [in/out] Communicator object over which - /// to test. - /// \param doubleComm [in/out] Communicator object for doubles, - /// used for finding the min and max of timing results over - /// all the MPI processes. 
- /// \param scalarTypeName [in] Human-readable name of the Scalar - /// template type parameter - /// \param out [out] Output stream to which to write results - /// \param err [out] Output stream to which to write any - /// debugging outputs (if applicable) or errors - /// \param testFactorExplicit [in] Whether to test - /// DistTsqr::factorExplicit() - /// \param testFactorImplicit [in] Whether to test - /// DistTsqr::factor() and DistTsqr::explicit_Q() - /// \param humanReadable [in] Whether printed results should be - /// easy for humans to read (vs. easy for parsers to parse) - /// \param debug [in] Whether to write verbose debug output to - /// err - DistTsqrBenchmarker (const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm, - const Teuchos::RCP< MessengerBase< double > >& doubleComm, - const std::string& scalarTypeName, - std::ostream& out, - std::ostream& err, - const bool testFactorExplicit, - const bool testFactorImplicit, - const bool humanReadable, - const bool debug) : - scalarComm_ (scalarComm), - doubleComm_ (doubleComm), - scalarTypeName_ (scalarTypeName), - out_ (out), - err_ (err), - testFactorExplicit_ (testFactorExplicit), - testFactorImplicit_ (testFactorImplicit), - humanReadable_ (humanReadable), - debug_ (debug) - {} - - /// \brief Get seed vector for pseudorandom number generator - /// - /// Fill seed (changing size of vector as necessary) with the - /// seed vector used by the pseudorandom number generator. You - /// can use this to resume the pseudorandom number stream from - /// where you last were. - void - getSeed (std::vector& seed) const - { - gen_.getSeed (seed); - } - - /// \brief Run the DistTsqr benchmark - /// - /// \param numTrials [in] Number of times to repeat the computation - /// in a single timing run - /// \param numCols [in] Number of columns in the matrix to test. 
- /// Number of rows := (# MPI processors) * ncols - void - benchmark (const int numTrials, - const Ordinal numCols, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames) - { - using std::endl; - - // Set up test problem. - Matrix< Ordinal, Scalar > A_local, Q_local, R; - testProblem (A_local, Q_local, R, numCols); - - // Set up TSQR implementation. - DistTsqr par; - par.init (scalarComm_); - - // Whether we've printed field names (i.e., column headers) - // yet. Only matters for non-humanReadable output. - bool printedFieldNames = false; - - if (testFactorImplicit_) - { - std::string timerName ("DistTsqr"); - typedef typename DistTsqr::FactorOutput - factor_output_type; - - // Throw away some number of runs, because some MPI libraries - // (recent versions of OpenMPI at least) do autotuning for the - // first few collectives calls. - const int numThrowAwayRuns = 5; - for (int runNum = 0; runNum < numThrowAwayRuns; ++runNum) - { - // Factor the matrix A (copied into R, which will be - // overwritten on output) - factor_output_type factorOutput = par.factor (R.view()); - // Compute the explicit Q factor - par.explicit_Q (numCols, Q_local.data(), Q_local.stride(1), factorOutput); - } - - // Now do the actual timing runs. Benchmark DistTsqr - // (factor() and explicit_Q()) for numTrials trials. - timer_type timer (timerName); - timer.start(); - for (int trialNum = 0; trialNum < numTrials; ++trialNum) - { - // Factor the matrix A (copied into R, which will be - // overwritten on output) - factor_output_type factorOutput = par.factor (R.view()); - // Compute the explicit Q factor - par.explicit_Q (numCols, Q_local.data(), Q_local.stride(1), factorOutput); - } - // Cumulative timing on this MPI process. - // "Cumulative" means the elapsed time of numTrials executions. 
- const double localCumulativeTiming = timer.stop(); - - // reportResults() must be called on all processes, since this - // figures out the min and max timings over all processes. - reportResults (timerName, numTrials, numCols, localCumulativeTiming, - additionalFieldNames, additionalData, - printFieldNames && (! printedFieldNames)); - if (printFieldNames && (! printedFieldNames)) - printedFieldNames = true; - } - - if (testFactorExplicit_) - { - std::string timerName ("DistTsqrRB"); - - // Throw away some number of runs, because some MPI libraries - // (recent versions of OpenMPI at least) do autotuning for the - // first few collectives calls. - const int numThrowAwayRuns = 5; - for (int runNum = 0; runNum < numThrowAwayRuns; ++runNum) - { - par.factorExplicit (R.view(), Q_local.view()); - } - - // Benchmark DistTsqr::factorExplicit() for numTrials trials. - timer_type timer (timerName); - timer.start(); - for (int trialNum = 0; trialNum < numTrials; ++trialNum) - { - par.factorExplicit (R.view(), Q_local.view()); - } - // Cumulative timing on this MPI process. - // "Cumulative" means the elapsed time of numTrials executions. - const double localCumulativeTiming = timer.stop(); - - // Report cumulative (not per-invocation) timing results - reportResults (timerName, numTrials, numCols, localCumulativeTiming, - additionalFieldNames, additionalData, - printFieldNames && (! printedFieldNames)); - if (printFieldNames && (! printedFieldNames)) - printedFieldNames = true; - - // Per-invocation timings (for factorExplicit() benchmark - // only). localTimings were computed on this MPI process; - // globalTimings are statistical summaries of those over - // all MPI processes. We only collect that data for - // factorExplicit(). 
- std::vector< TimeStats > localTimings; - std::vector< TimeStats > globalTimings; - par.getFactorExplicitTimings (localTimings); - for (std::vector< TimeStats >::size_type k = 0; k < localTimings.size(); ++k) - globalTimings.push_back (globalTimeStats (*doubleComm_, localTimings[k])); - std::vector< std::string > timingLabels; - par.getFactorExplicitTimingLabels (timingLabels); - - if (humanReadable_) - out_ << timerName << " per-invocation benchmark results:" << endl; - - const std::string labelLabel ("label,scalarType"); - for (std::vector< std::string >::size_type k = 0; k < timingLabels.size(); ++k) - { - // Only print column headers (i.e., field names) once, if at all. - const bool printHeaders = (k == 0) && printFieldNames; - globalTimings[k].print (out_, humanReadable_, - timingLabels[k] + "," + scalarTypeName_, - labelLabel, printHeaders); - } - } - } - - private: - /// Report timing results to the given output stream - /// - /// \param method [in] String to print before reporting results - /// \param numTrials [in] Number of times to repeat the computation - /// in a single timing run - /// \param numCols [in] Number of columns in the matrix to test. - /// Number of rows := (# MPI processors) * ncols - /// \param timing [in] Total benchmark time, as measured on this - /// MPI process. This may differ on each process; we report - /// the min and the max. - /// - /// \warning Call on ALL MPI processes, not just Rank 0! - void - reportResults (const std::string& method, - const int numTrials, - const ordinal_type numCols, - const double localTiming, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames) - { - using std::endl; - - // Find min and max timing over all MPI processes - TimeStats localStats; - localStats.update (localTiming); - TimeStats globalStats = globalTimeStats (*doubleComm_, localStats); - - // Only Rank 0 prints the final results. 
- const bool printResults = (doubleComm_->rank() == 0); - if (printResults) - { - const int numProcs = doubleComm_->size(); - if (humanReadable_) - { - out_ << method << " cumulative benchmark results (total time over all trials):" << endl - << "Scalar type = " << scalarTypeName_ << endl - << "Number of columns = " << numCols << endl - << "Number of (MPI) processes = " << numProcs << endl - << "Number of trials = " << numTrials << endl - << "Min timing (in seconds) = " << globalStats.min() << endl - << "Mean timing (in seconds) = " << globalStats.mean() << endl - << "Max timing (in seconds) = " << globalStats.max() << endl - << endl; - } - else - { - // Use scientific notation for floating-point numbers - out_ << std::scientific; - - if (printFieldNames) - { - out_ << "%method,scalarType,numCols,numProcs,numTrials" - << ",minTiming,meanTiming,maxTiming"; - if (! additionalFieldNames.empty()) - out_ << "," << additionalFieldNames; - out_ << endl; - } - - out_ << method - << "," << scalarTypeName_ - << "," << numCols - << "," << numProcs - << "," << numTrials - << "," << globalStats.min() - << "," << globalStats.mean() - << "," << globalStats.max(); - if (! additionalData.empty()) - out_ << "," << additionalData; - out_ << endl; - } - } - } - - void - testProblem (Matrix< Ordinal, Scalar >& A_local, - Matrix< Ordinal, Scalar >& Q_local, - Matrix< Ordinal, Scalar >& R, - const Ordinal numCols) - { - const Ordinal numRowsLocal = numCols; - - // A_local: Space for the matrix A to factor -- local to each - // processor. - // - // A_global: Global matrix (only nonempty on Proc 0); only - // used temporarily. - Matrix A_global; - - // This modifies A_local on all procs, and A_global on Proc 0. - par_tsqr_test_problem (gen_, A_local, A_global, numCols, scalarComm_); - - // Copy the test problem input into R, since the factorization - // will overwrite it in place with the final R factor. 
- R.reshape (numCols, numCols); - deep_copy (R, A_local); - - // Prepare space in which to construct the explicit Q factor - // (local component on this processor) - Q_local.reshape (numRowsLocal, numCols); - deep_copy (Q_local, Scalar {}); - } - - /// Make sure that timer_type satisfies the TimerType concept. - /// - static void - conceptChecks () - { - verifyTimerConcept(); - } - }; - - - } // namespace Test -} // namespace TSQR - -#endif // __TSQR_Test_DistTest_hpp diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp index 399f13fa8fde..c3e24ac02569 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp @@ -100,17 +100,12 @@ namespace TSQR { std::vector tau (std::min(nrows, ncols)); // Workspace query - Scalar _lwork1, _lwork2; - lapack.compute_QR (nrows, ncols, Q, ldq, tau.data(), &_lwork1, -1); - lapack.compute_explicit_Q (nrows, ncols, ncols, - Q, ldq, tau.data(), - &_lwork2, -1); - - // Allocate workspace. abs() returns a magnitude_type, and we - // can compare those using std::max. If Scalar is complex, - // you can't compare it using max. - const Ordinal lwork = checkedCast (std::max (STS::magnitude (_lwork1), - STS::magnitude (_lwork2))); + const int lwork1 = + lapack.compute_QR_lwork (nrows, ncols, Q, ldq); + const int lwork2 = + lapack.compute_explicit_Q_lwork (nrows, ncols, ncols, + Q, ldq, tau.data ()); + const Ordinal lwork = std::max (lwork1, lwork2); std::vector work (lwork); lapack.compute_QR (nrows, ncols, Q, ldq, tau.data(), @@ -140,19 +135,12 @@ namespace TSQR { // Fill Q with random numbers this->fill_random (nrows, ncols, Q, ldq); - // Get ready for QR factorization Impl::Lapack lapack; - - // Workspace query - Scalar _lwork1; - lapack.compute_QR (nrows, ncols, Q, ldq, tau, &_lwork1, -1); - - // Allocate workspace. 
- const Ordinal lwork = checkedCast (STS::magnitude (_lwork1)); + const int lwork = + lapack.compute_QR_lwork (nrows, ncols, Q, ldq); std::vector work (lwork); - lapack.compute_QR (nrows, ncols, Q, ldq, tau, - work.data(), lwork); + work.data (), lwork); } template< class MatrixViewType > @@ -192,25 +180,29 @@ namespace TSQR { implicit_Q (V, tau_V.data()); // Workspace query for ORMQR. - Scalar _lwork1, _lwork2; Impl::Lapack lapack; - lapack.apply_Q_factor ('L', 'N', nrows, ncols, ncols, - U.data(), U.stride(1), tau_U.data(), - A, lda, &_lwork1, -1); + const int lwork1 = + lapack.apply_Q_factor_lwork ('L', 'N', nrows, ncols, ncols, + U.data (), U.stride (1), + tau_U.data (), A, lda); + int lwork2 = 0; if (STS::isComplex) { - lapack.apply_Q_factor ('R', 'C', nrows, ncols, ncols, - V.data(), V.stride(1), tau_V.data(), - A, lda, &_lwork2, -1); + lwork2 = + lapack.apply_Q_factor_lwork ('R', 'C', + nrows, ncols, ncols, + V.data (), V.stride (1), + tau_V.data (), A, lda); } else { - lapack.apply_Q_factor ('R', 'T', nrows, ncols, ncols, - V.data(), V.stride(1), tau_V.data(), - A, lda, &_lwork2, -1); + lwork2 = + lapack.apply_Q_factor_lwork ('R', 'T', + nrows, ncols, ncols, + V.data (), V.stride (1), + tau_V.data (), A, lda); } // Allocate workspace. - Ordinal lwork = checkedCast (std::max (STS::magnitude (_lwork1), - STS::magnitude (_lwork2))); + Ordinal lwork (std::max (lwork1, lwork2)); std::vector work (lwork); // Apply U to the left side of A, and V^H to the right side of A. @@ -258,16 +250,13 @@ namespace TSQR { std::vector tau (n); // Workspace size query for QR factorization. - Scalar _lwork1; Impl::Lapack lapack; - lapack.compute_QR (n, n, R, ldr, tau.data(), &_lwork1, -1); - - // Allocate workspace - Ordinal lwork = checkedCast (STS::magnitude (_lwork1)); - std::vector work (lwork); + const int lwork = lapack.compute_QR_lwork (n, n, R, ldr); // Compute QR factorization (implicit representation in place). 
- lapack.compute_QR (n, n, R, ldr, tau.data(), work.data(), lwork); + std::vector work (lwork); + lapack.compute_QR (n, n, R, ldr, tau.data (), + work.data (), lwork); // Zero out the stuff below the diagonal of R, leaving just the R factor. for (Ordinal j = 0; j < n; ++j) { diff --git a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp b/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp deleted file mode 100644 index 727c50019482..000000000000 --- a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp +++ /dev/null @@ -1,1112 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#include "Tsqr_SeqTest.hpp" -#include "Tsqr_Random_NormalGenerator.hpp" -#include "Tsqr_nodeTestProblem.hpp" -#include "Tsqr_verifyTimerConcept.hpp" -#include "Tsqr_LocalVerify.hpp" -#include "Tsqr_Matrix.hpp" -#include "Tsqr_SequentialTsqr.hpp" -#include "Tsqr_Util.hpp" -#include "Tsqr_Impl_Lapack.hpp" -#include "Teuchos_Time.hpp" -#include -#include // size_t definition -#include -#include -#include -#include -#include -#include -#include - - -namespace TSQR { - namespace Test { - - template - static Ordinal - lworkQueryLapackQr (Impl::Lapack& lapack, - const Ordinal nrows, - const Ordinal ncols, - const Ordinal lda) - { - using std::ostringstream; - using std::endl; - using STS = Teuchos::ScalarTraits; - using mag_type = typename STS::magnitudeType; - - Scalar d_lwork_geqrf {}; - lapack.compute_QR (nrows, ncols, nullptr, lda, nullptr, - &d_lwork_geqrf, -1); - - Scalar d_lwork_orgqr {}; - // A workspace query appropriate for computing the explicit Q - // factor (nrows x ncols) in place, from the QR factorization of - // an nrows x ncols matrix with leading dimension lda. - lapack.compute_explicit_Q (nrows, ncols, ncols, nullptr, lda, - nullptr, &d_lwork_orgqr, -1); - - // LAPACK workspace queries do return their results as a - // double-precision floating-point value, but LAPACK promises - // that that value will fit in an int. 
Thus, we don't need to - // check for valid casts to int below. I include the checks - // just to be "bulletproof" and also to show how to do the - // checks for later reference. - const mag_type lwork_geqrf_test = - static_cast (static_cast (STS::magnitude (d_lwork_geqrf))); - if (lwork_geqrf_test != STS::magnitude (d_lwork_geqrf)) { - ostringstream os; - os << "LAPACK _GEQRF workspace query returned a result, " - << d_lwork_geqrf << ", bigger than the max Ordinal value, " - << std::numeric_limits::max (); - throw std::range_error (os.str ()); - } - const Scalar lwork_orgqr_test = - static_cast (static_cast (STS::magnitude ((d_lwork_orgqr)))); - if (lwork_orgqr_test != STS::magnitude (d_lwork_orgqr)) { - ostringstream os; - os << "LAPACK _UNGQR workspace query returned a result, " - << d_lwork_orgqr << ", bigger than the max Ordinal value, " - << std::numeric_limits::max(); - throw std::range_error (os.str()); - } - return std::max (static_cast (STS::magnitude (d_lwork_geqrf)), - static_cast (STS::magnitude (d_lwork_orgqr))); - } - - /// Test the accuracy of sequential TSQR on an nrows by ncols - /// matrix (using the given cache block size (in bytes)), and - /// print the results to stdout. 
- template< class Ordinal, class Scalar > - static void - verifySeqTsqrTemplate (std::ostream& out, - TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator, - const std::string& datatype, - const std::string& shortDatatype, - const Ordinal nrows, - const Ordinal ncols, - const size_t cache_size_hint, - const bool contiguous_cache_blocks, - const bool save_matrices, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames, - const bool human_readable, - const bool b_debug) - { - typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; - using std::cerr; - using std::endl; - using std::pair; - using std::string; - using std::vector; - - SequentialTsqr actor (cache_size_hint); - Ordinal numCacheBlocks; - - if (b_debug) { - cerr << "Sequential TSQR test problem:" << endl - << "* " << nrows << " x " << ncols << endl - << "* Cache size hint of " << actor.cache_size_hint() << " bytes" << endl; - if (contiguous_cache_blocks) { - cerr << "* Contiguous cache blocks" << endl; - } - } - - Matrix A (nrows, ncols); - Matrix A_copy (nrows, ncols); - Matrix Q (nrows, ncols); - Matrix R (ncols, ncols); - if (std::numeric_limits::has_quiet_NaN) { - deep_copy (A, std::numeric_limits< Scalar>::quiet_NaN()); - deep_copy (A_copy, std::numeric_limits::quiet_NaN()); - deep_copy (Q, std::numeric_limits::quiet_NaN()); - deep_copy (R, std::numeric_limits::quiet_NaN()); - } - const Ordinal lda = nrows; - const Ordinal ldq = nrows; - const Ordinal ldr = ncols; - - // Create a test problem - nodeTestProblem (generator, nrows, ncols, A.data(), A.stride(1), true); - - if (save_matrices) { - string filename = "A_" + shortDatatype + ".txt"; - if (b_debug) { - cerr << "-- Saving test problem to \"" << filename << "\"" << endl; - } - std::ofstream fileOut (filename.c_str()); - print_local_matrix (fileOut, nrows, ncols, A.data(), A.stride(1)); - fileOut.close(); - } - - if (b_debug) { - cerr << "-- Generated test 
problem" << endl; - } - - // Copy A into A_copy, since TSQR overwrites the input. If - // specified, rearrange the data in A_copy so that the data in - // each cache block is contiguously stored. - if (! contiguous_cache_blocks) { - deep_copy (A_copy, A); - if (b_debug) { - cerr << "-- Copied test problem from A into A_copy" << endl; - } - } - else { - actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.stride(1)); - if (b_debug) { - cerr << "-- Reorganized test matrix to have contiguous " - "cache blocks" << endl; - } - - // Verify cache blocking, when in debug mode. - if (b_debug) { - Matrix A2 (nrows, ncols); - if (std::numeric_limits::has_quiet_NaN) { - deep_copy (A2, std::numeric_limits::quiet_NaN ()); - } - actor.un_cache_block (nrows, ncols, A2.data (), A2.stride (1), - A_copy.data ()); - if (matrix_equal (A, A2)) { - if (b_debug) { - cerr << "-- Cache blocking test succeeded!" << endl; - } - } - else { - throw std::logic_error ("Cache blocking failed"); - } - } - } - - // Fill R with zeros, since the factorization may not overwrite - // the strict lower triangle of R. - deep_copy (R, Scalar {}); - - // Count the number of cache blocks that factor() will use. - // This is only for diagnostic purposes. - numCacheBlocks = - actor.factor_num_cache_blocks (nrows, ncols, A_copy.data(), - A_copy.stride(1), contiguous_cache_blocks); - // In debug mode, report how many cache blocks factor() will use. 
- if (b_debug) { - cerr << "-- Number of cache blocks factor() will use: " - << numCacheBlocks << endl << endl; - } - - // Factor the matrix and compute the explicit Q factor - typedef typename SequentialTsqr::FactorOutput - factor_output_type; - factor_output_type factorOutput = - actor.factor (nrows, ncols, A_copy.data(), A_copy.stride(1), - R.data(), R.stride(1), contiguous_cache_blocks); - if (b_debug) { - cerr << "-- Finished SequentialTsqr::factor" << endl; - } - if (save_matrices) { - string filename = "R_" + shortDatatype + ".txt"; - if (b_debug) { - cerr << "-- Saving R factor to \"" << filename << "\"" << endl; - } - std::ofstream fileOut (filename.c_str ()); - print_local_matrix (fileOut, ncols, ncols, R.data (), R.stride (1)); - fileOut.close (); - } - - actor.explicit_Q (nrows, ncols, A_copy.data(), lda, factorOutput, - ncols, Q.data(), Q.stride(1), contiguous_cache_blocks); - if (b_debug) { - cerr << "-- Finished SequentialTsqr::explicit_Q" << endl; - } - - // "Un"-cache-block the output, if contiguous cache blocks were - // used. This is only necessary because local_verify() doesn't - // currently support contiguous cache blocks. - if (contiguous_cache_blocks) { - // Use A_copy as temporary storage for un-cache-blocking Q. 
- actor.un_cache_block (nrows, ncols, A_copy.data(), A_copy.stride(1), Q.data()); - deep_copy (Q, A_copy); - if (b_debug) { - cerr << "-- Un-cache-blocked output Q factor" << endl; - } - } - - if (save_matrices) { - string filename = "Q_" + shortDatatype + ".txt"; - if (b_debug) { - cerr << "-- Saving Q factor to \"" << filename << "\"" << endl; - } - std::ofstream fileOut (filename.c_str()); - print_local_matrix (fileOut, nrows, ncols, Q.data(), Q.stride(1)); - fileOut.close(); - } - - // Print out the R factor - if (false && b_debug) { - cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1)); - cerr << endl; - } - - // Validate the factorization - vector< magnitude_type > results = - local_verify (nrows, ncols, A.data(), lda, Q.data(), ldq, R.data(), ldr); - if (b_debug) { - cerr << "-- Finished local_verify" << endl; - } - - // Print the results - if (human_readable) { - out << "Sequential cache-blocked TSQR:" << endl - << "Scalar type: " << datatype << endl - << "Matrix dimensions: " << nrows << " by " << ncols << endl - << "Cache size hint in bytes: " << actor.cache_size_hint() << endl - << "Number of cache blocks: " << numCacheBlocks << endl - << "Contiguous cache blocks? " << contiguous_cache_blocks << endl - << "Absolute residual $\\| A - QR \\|_F$: " << results[0] << endl - << "Absolute orthogonality $\\| I - Q^* Q \\|_F$: " << results[1] << endl - << "Test matrix norm $\\| A \\|_F$: " << results[2] << endl - << endl << endl; - } - else { - if (printFieldNames) { - const char prefix[] = "%"; - out << prefix - << "method" - << ",scalarType" - << ",numRows" - << ",numCols" - << ",cacheSizeHint" - << ",contiguousCacheBlocks" - << ",absFrobResid" - << ",absFrobOrthog" - << ",frobA"; - if (! 
additionalFieldNames.empty()) - out << "," << additionalFieldNames; - out << endl; - } - out << "SeqTSQR" - << "," << datatype - << "," << nrows - << "," << ncols - << "," << actor.cache_size_hint() - << "," << contiguous_cache_blocks - << "," << results[0] - << "," << results[1] - << "," << results[2]; - if (! additionalData.empty ()) { - out << "," << additionalData; - } - out << endl; - } - } - - - void - verifySeqTsqr (std::ostream& out, - const int nrows, - const int ncols, - const size_t cache_size_hint, - const bool test_complex_arithmetic, - const bool save_matrices, - const bool contiguous_cache_blocks, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames, - const bool human_readable, - const bool b_debug) - { - using TSQR::Random::NormalGenerator; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - using std::complex; -#endif // HAVE_KOKKOSTSQR_COMPLEX - using std::string; - using std::vector; - - // - // We do tests one after another, using the seed from the - // previous test in the current test, so that the pseudorandom - // streams used by the tests are independent. - // - - // On output: Seed for the next pseudorandom number generator. - vector< int > iseed(4); - string datatype; // name of the current datatype being tested - string shortDatatype; // one-letter version of datatype - - // First test. The PRNG seeds itself with a default value. - // This will be the same each time, so if you want - // nondeterministic behavior, you should pick the seed values - // yourself. Only print field names (if at all) for the first - // data type tested; field names are only printed if output is - // not human_readable. 
- NormalGenerator< int, float > normgenS; - datatype = "float"; - shortDatatype = "S"; - verifySeqTsqrTemplate (out, normgenS, datatype, shortDatatype, nrows, ncols, - cache_size_hint, contiguous_cache_blocks, - save_matrices, additionalFieldNames, additionalData, - printFieldNames, human_readable, b_debug); - // Fetch the pseudorandom seed from the previous test. - normgenS.getSeed (iseed); - NormalGenerator< int, double > normgenD (iseed); - // Next test. - datatype = "double"; - shortDatatype = "D"; - verifySeqTsqrTemplate (out, normgenD, datatype, shortDatatype, nrows, ncols, - cache_size_hint, contiguous_cache_blocks, - save_matrices, additionalFieldNames, additionalData, - printFieldNames, human_readable, b_debug); -#ifdef HAVE_KOKKOSTSQR_COMPLEX - if (test_complex_arithmetic) { - normgenD.getSeed (iseed); - NormalGenerator< int, complex > normgenC (iseed); - datatype = "complex"; - shortDatatype = "C"; - verifySeqTsqrTemplate (out, normgenC, datatype, shortDatatype, nrows, ncols, - cache_size_hint, contiguous_cache_blocks, - save_matrices, additionalFieldNames, additionalData, - printFieldNames, human_readable, b_debug); - normgenC.getSeed (iseed); - NormalGenerator< int, complex > normgenZ (iseed); - datatype = "complex"; - shortDatatype = "Z"; - verifySeqTsqrTemplate (out, normgenZ, datatype, shortDatatype, nrows, ncols, - cache_size_hint, contiguous_cache_blocks, - save_matrices, additionalFieldNames, additionalData, - printFieldNames, human_readable, b_debug); - } -#else // HAVE_KOKKOSTSQR_COMPLEX - if (test_complex_arithmetic) { - throw std::logic_error ("Trilinos was not built with " - "complex arithmetic support"); - } -#endif // HAVE_KOKKOSTSQR_COMPLEX - } - - - - template< class Ordinal, class Scalar > - static void - verifyLapackTemplate (std::ostream& out, - TSQR::Random::NormalGenerator& generator, - const std::string& datatype, - const Ordinal nrows, - const Ordinal ncols, - const std::string& additionalFieldNames, - const std::string& 
additionalData, - const bool printFieldNames, - const bool human_readable, - const bool b_debug) - { - typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; - using std::ostringstream; - using std::cerr; - using std::endl; - - Impl::Lapack lapack; - - if (b_debug) { - cerr << "LAPACK test problem:" << endl - << "* " << nrows << " x " << ncols << endl; - } - - Matrix A (nrows, ncols); - Matrix A_copy (nrows, ncols); - Matrix Q (nrows, ncols); - Matrix R (ncols, ncols); - if (std::numeric_limits::has_quiet_NaN) { - deep_copy (A, std::numeric_limits< Scalar>::quiet_NaN()); - deep_copy (A_copy, std::numeric_limits::quiet_NaN()); - deep_copy (Q, std::numeric_limits::quiet_NaN()); - deep_copy (R, std::numeric_limits::quiet_NaN()); - } - const Ordinal lda = nrows; - const Ordinal ldq = nrows; - const Ordinal ldr = ncols; - - // Create a test problem - nodeTestProblem (generator, nrows, ncols, - A.data (), A.stride (1), true); - if (b_debug) { - cerr << "-- Generated test problem" << endl; - } - - // Copy A into A_copy, since LAPACK QR overwrites the input. - deep_copy (A_copy, A); - if (b_debug) { - cerr << "-- Copied test problem from A into A_copy" << endl; - } - - // Now determine the required workspace for the factorization. - const Ordinal lwork = - lworkQueryLapackQr (lapack, nrows, ncols, A_copy.stride (1)); - std::vector work (lwork); - std::vector tau (ncols); - - // Fill R with zeros, since the factorization may not overwrite - // the strict lower triangle of R. - deep_copy (R, Scalar {}); - - lapack.compute_QR (nrows, ncols, A_copy.data(), A_copy.stride(1), - tau.data(), work.data(), lwork); - // Copy out the R factor from A_copy (where we computed the QR - // factorization in place) into R. 
- copy_upper_triangle (ncols, ncols, R.data(), ldr, A_copy.data(), lda); - - if (b_debug) { - cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1)); - cerr << endl; - } - - // The explicit Q factor will be computed in place, so copy the - // result of the factorization into Q. - deep_copy (Q, A_copy); - - lapack.compute_explicit_Q (nrows, ncols, ncols, Q.data(), ldq, - tau.data(), work.data(), lwork); - - // Validate the factorization - std::vector results = - local_verify (nrows, ncols, A.data(), lda, Q.data(), ldq, - R.data(), ldr); - - // Print the results - if (human_readable) { - out << "LAPACK QR (DGEQRF and DUNGQR):" << endl - << "Scalar type: " << datatype << endl - << "Absolute residual $\\| A - QR \\|_F$: " << results[0] << endl - << "Absolute orthogonality $\\| I - Q^* Q \\|_F$: " << results[1] << endl - << "Test matrix norm $\\| A \\|_F$: " << results[2] << endl - << endl << endl; - } - else { - if (printFieldNames) { - const char prefix[] = "%"; - out << prefix - << "method" - << ",scalarType" - << ",numRows" - << ",numCols" - << ",cacheSizeHint" - << ",contiguousCacheBlocks" - << ",absFrobResid" - << ",absFrobOrthog" - << ",frobA"; - if (! additionalFieldNames.empty ()) { - out << "," << additionalFieldNames; - } - out << endl; - } - out << "LAPACK" - << "," << datatype - << "," << nrows - << "," << ncols - << "," << size_t(0) // cache_size_hint - << "," << false // contiguous_cache_blocks - << "," << results[0] - << "," << results[1] - << "," << results[2]; - if (! 
additionalData.empty ()) { - out << "," << additionalData; - } - out << endl; - } - } - - - void - verifyLapack (std::ostream& out, - const int nrows, - const int ncols, - const bool test_complex_arithmetic, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames, - const bool human_readable, - const bool b_debug) - { - using TSQR::Random::NormalGenerator; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - using std::complex; -#endif // HAVE_KOKKOSTSQR_COMPLEX - using std::string; - using std::vector; - - // - // We do tests one after another, using the seed from the - // previous test in the current test, so that the pseudorandom - // streams used by the tests are independent. - // - - // On output: Seed for the next pseudorandom number generator. - vector< int > iseed(4); - string datatype; // name of the current datatype being tested - - // First test. The PRNG seeds itself with a default value. - // This will be the same each time, so if you want - // nondeterministic behavior, you should pick the seed values - // yourself. - NormalGenerator< int, float > normgenS; - datatype = "float"; - verifyLapackTemplate (out, normgenS, datatype, nrows, ncols, - additionalFieldNames, additionalData, - printFieldNames, human_readable, b_debug); - // Fetch the pseudorandom seed from the previous test. - normgenS.getSeed (iseed); - NormalGenerator< int, double > normgenD (iseed); - // Next test. 
- datatype = "double"; - verifyLapackTemplate (out, normgenD, datatype, nrows, ncols, - additionalFieldNames, additionalData, - false, human_readable, b_debug); -#ifdef HAVE_KOKKOSTSQR_COMPLEX - if (test_complex_arithmetic) { - normgenD.getSeed (iseed); - NormalGenerator< int, complex > normgenC (iseed); - datatype = "complex"; - verifyLapackTemplate (out, normgenC, datatype, nrows, ncols, - additionalFieldNames, additionalData, - false, human_readable, b_debug); - normgenC.getSeed (iseed); - NormalGenerator< int, complex > normgenZ (iseed); - datatype = "complex"; - verifyLapackTemplate (out, normgenZ, datatype, nrows, ncols, - additionalFieldNames, additionalData, - false, human_readable, b_debug); - } -#else // HAVE_KOKKOSTSQR_COMPLEX - if (test_complex_arithmetic) { - throw std::logic_error ("Trilinos was not built with " - "complex arithmetic support"); - } -#endif // HAVE_KOKKOSTSQR_COMPLEX - } - - /// \class LapackBenchmarker - /// \brief Template version of LAPACK QR benchmark - /// - /// LAPACK QR benchmark, templated on Ordinal, Scalar, and - /// TimerType. - template< class Ordinal, class Scalar, class TimerType > - class LapackBenchmarker { - public: - typedef Ordinal ordinal_type; - typedef Scalar scalar_type; - - /// \brief Constructor - /// - /// \param scalarTypeName [in] Human-readable name of the Scalar - /// type. - /// \param out [out] Reference to the output stream (e.g., - /// std::cout) to which to write benchmark results. - /// \param humanReadable [in] Whether to print results to out in - /// a verbose human-readable way, or in a way that is easy to - /// parse with a script. In either case, the results will be - /// printed in ASCII format. 
- LapackBenchmarker (const std::string& scalarTypeName, - std::ostream& out = std::cout, - const bool humanReadable = false) : - scalarTypeName_ (scalarTypeName), - out_ (out), - humanReadable_ (humanReadable) - { - TSQR::Test::verifyTimerConcept< TimerType >(); - } - - void - benchmark (const int numTrials, - const Ordinal numRows, - const Ordinal numCols, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames) - { - Matrix A (numRows, numCols); - Matrix Q (numRows, numCols); - Matrix R (numCols, numCols); - const Ordinal lda = numRows; - const Ordinal ldq = numRows; - const Ordinal ldr = numCols; - - // Create a test problem - nodeTestProblem (gen_, numRows, numCols, A.data(), lda, false); - - // Copy A into Q, since LAPACK QR overwrites the input. We only - // need Q because LAPACK's computation of the explicit Q factor - // occurs in place. This doesn't work with TSQR. To give - // LAPACK QR the fullest possible advantage over TSQR, we don't - // allocate an A_copy here (as we would when benchmarking TSQR). - deep_copy (Q, A); - - // Determine the required workspace for the factorization - const Ordinal lwork = lworkQueryLapackQr (lapack_, numRows, numCols, lda); - std::vector work (lwork); - std::vector tau (numCols); - - // Benchmark LAPACK's QR factorization for numTrials trials. - // - // Name of timer doesn't matter here; we only need the timing. - TimerType timer("LAPACK"); - timer.start(); - for (int trialNum = 0; trialNum < numTrials; ++trialNum) { - lapack_.compute_QR (numRows, numCols, - Q.data(), ldq, tau.data(), - work.data(), lwork); - // Extract the upper triangular factor R from Q (where it - // was computed in place by GEQRF), since UNGQR will - // overwrite all of Q with the explicit Q factor. 
- copy_upper_triangle (numRows, numCols, R.data(), ldr, - Q.data(), ldq); - lapack_.compute_explicit_Q (numRows, numCols, numCols, - Q.data(), ldq, tau.data(), - work.data(), lwork); - } - const double lapackTiming = timer.stop(); - reportResults (numTrials, numRows, numCols, lapackTiming, - additionalFieldNames, additionalData, printFieldNames); - } - - - private: - //! Wrapper around LAPACK routines. - Impl::Lapack lapack_; - - /// \brief Pseudorandom normal(0,1) generator. - /// - /// Default seed is OK, because this is a benchmark, not an - /// accuracy test. - TSQR::Random::NormalGenerator< ordinal_type, scalar_type > gen_; - - //! Human-readable string representation of the Scalar type. - std::string scalarTypeName_; - - //! Output stream to which to print benchmark results. - std::ostream& out_; - - /// \brief Whether results should be printed in a human-readable way, - /// - /// rather than a way easily parsed by a script. - bool humanReadable_; - - /// \brief Report benchmark results to out_ - void - reportResults (const int numTrials, - const Ordinal numRows, - const Ordinal numCols, - const double lapackTiming, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames) - { - using std::endl; - if (humanReadable_) { - out_ << "LAPACK\'s QR factorization (_GEQRF + _UNGQR):" << endl - << "Scalar type = " << scalarTypeName_ << endl - << "# rows = " << numRows << endl - << "# columns = " << numCols << endl - << "# trials = " << numTrials << endl - << "Total time (s) = " << lapackTiming << endl - << endl; - } - else { - if (printFieldNames) { - const char prefix[] = "%"; - out_ << prefix - << "method" - << ",scalarType" - << ",numRows" - << ",numCols" - << ",cacheSizeHint" - << ",contiguousCacheBlocks" - << ",numTrials" - << ",timing"; - if (! 
additionalFieldNames.empty ()) { - out_ << "," << additionalFieldNames; - } - out_ << endl; - } - // "0" refers to the cache size hint, which is not - // applicable in this case; we retain it for easy - // comparison of results with SequentialTsqr (so that the - // number of fields is the same in both cases). "false" - // (that follows 0) refers to whether or not contiguous - // cache blocks were used (see TSQR::SequentialTsqr); this - // is also not applicable in this case. - out_ << "LAPACK" - << "," << scalarTypeName_ - << "," << numRows - << "," << numCols - << "," << 0 - << "," << false - << "," << numTrials - << "," << lapackTiming; - if (! additionalData.empty ()) { - out_ << "," << additionalData; - } - out_ << endl; - } - } - }; - - - void - benchmarkLapack (std::ostream& out, - const int numRows, - const int numCols, - const int numTrials, - const bool testComplex, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames, - const bool humanReadable) - { - typedef Teuchos::Time timer_type; - const bool testReal = true; - using std::string; - - // Only print field names (if at all) for the first data type tested. - bool printedFieldNames = false; - - if (testReal) { - { // Scalar=float - typedef LapackBenchmarker< int, float, timer_type > benchmark_type; - string scalarTypeName ("float"); - benchmark_type widget (scalarTypeName, out, humanReadable); - widget.benchmark (numTrials, numRows, numCols, - additionalFieldNames, additionalData, - printFieldNames && ! printedFieldNames); - if (printFieldNames && ! printedFieldNames) { - printedFieldNames = true; - } - } - { // Scalar=double - typedef LapackBenchmarker< int, double, timer_type > benchmark_type; - string scalarTypeName ("double"); - benchmark_type widget (scalarTypeName, out, humanReadable); - widget.benchmark (numTrials, numRows, numCols, - additionalFieldNames, additionalData, - printFieldNames && ! printedFieldNames); - if (printFieldNames && ! 
printedFieldNames) { - printedFieldNames = true; - } - } - } - - if (testComplex) { -#ifdef HAVE_KOKKOSTSQR_COMPLEX - using std::complex; - { // Scalar=complex - typedef LapackBenchmarker< int, complex, timer_type > benchmark_type; - string scalarTypeName ("complex"); - benchmark_type widget (scalarTypeName, out, humanReadable); - widget.benchmark (numTrials, numRows, numCols, - additionalFieldNames, additionalData, - printFieldNames && ! printedFieldNames); - if (printFieldNames && ! printedFieldNames) { - printedFieldNames = true; - } - } - { // Scalar=complex - typedef LapackBenchmarker, timer_type> benchmark_type; - string scalarTypeName ("complex"); - benchmark_type widget (scalarTypeName, out, humanReadable); - widget.benchmark (numTrials, numRows, numCols, - additionalFieldNames, additionalData, - printFieldNames && ! printedFieldNames); - if (printFieldNames && ! printedFieldNames) { - printedFieldNames = true; - } - } -#else // Don't HAVE_KOKKOSTSQR_COMPLEX - throw std::logic_error ("Trilinos was not built with " - "complex arithmetic support"); -#endif // HAVE_KOKKOSTSQR_COMPLEX - } - } - - - - /// \class SeqTsqrBenchmarker - /// \brief Template version of SequentialTsqr benchmark. - /// - /// SequentialTsqr benchmark, templated on Ordinal, Scalar, and - /// TimerType. - template - class SeqTsqrBenchmarker { - public: - typedef Ordinal ordinal_type; - typedef Scalar scalar_type; - - /// \brief Constructor - /// - /// \param scalarTypeName [in] Human-readable name of the Scalar - /// type. - /// \param out [out] Reference to the output stream (e.g., - /// std::cout) to which to write benchmark results. - /// \param humanReadable [in] Whether to print results to out in - /// a verbose human-readable way, or in a way that is easy to - /// parse with a script. In either case, the results will be - /// printed in ASCII format. 
- SeqTsqrBenchmarker (const std::string& scalarTypeName, - std::ostream& out = std::cout, - const bool humanReadable = false) : - scalarTypeName_ (scalarTypeName), - out_ (out), - humanReadable_ (humanReadable) - { - // Make sure that TimerType satisfies the required interface. - TSQR::Test::verifyTimerConcept(); - } - - void - benchmark (const int numTrials, - const Ordinal numRows, - const Ordinal numCols, - const size_t cacheSizeHint, - const bool contiguousCacheBlocks, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames) - { - SequentialTsqr actor (cacheSizeHint); - - Matrix A (numRows, numCols); - Matrix A_copy (numRows, numCols); - Matrix Q (numRows, numCols); - Matrix R (numCols, numCols); - const Ordinal lda = numRows; - const Ordinal ldq = numRows; - - // Create a test problem - nodeTestProblem (gen_, numRows, numCols, A.data(), lda, false); - - // Copy A into A_copy, since TSQR overwrites the input - deep_copy (A_copy, A); - - // Benchmark sequential TSQR for numTrials trials. - // - // Name of timer doesn't matter here; we only need the timing. - TimerType timer("SeqTSQR"); - timer.start(); - for (int trialNum = 0; trialNum < numTrials; ++trialNum) { - // Factor the matrix and extract the resulting R factor - auto factorOutput = - actor.factor (numRows, numCols, A_copy.data(), lda, - R.data(), R.stride(1), contiguousCacheBlocks); - // Compute the explicit Q factor. Unlike with LAPACK QR, - // this doesn't happen in place: the implicit Q factor is - // stored in A_copy, and the explicit Q factor is written to - // Q. 
- actor.explicit_Q (numRows, numCols, A_copy.data(), lda, factorOutput, - numCols, Q.data(), ldq, contiguousCacheBlocks); - } - const double seqTsqrTiming = timer.stop(); - reportResults (numTrials, numRows, numCols, actor.cache_size_hint(), - contiguousCacheBlocks, seqTsqrTiming, - additionalFieldNames, additionalData, printFieldNames); - } - - - private: - /// \brief Pseudorandom normal(0,1) generator. - /// - /// Default seed is OK, because this is a benchmark, not an - /// accuracy test. - TSQR::Random::NormalGenerator gen_; - - //! Human-readable string representation of the Scalar type. - std::string scalarTypeName_; - - //! Output stream to which to print benchmark results. - std::ostream& out_; - - /// \brief Whether results should be printed in a human-readable way, - /// - /// as opposed to a way easily parsed by a script. - bool humanReadable_; - - //! Report benchmark results to out_ - void - reportResults (const int numTrials, - const Ordinal numRows, - const Ordinal numCols, - const size_t actualCacheSizeHint, - const bool contiguousCacheBlocks, - const double seqTsqrTiming, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames) - { - using std::endl; - if (humanReadable_) { - out_ << "Sequential (cache-blocked) TSQR:" << endl - << "Scalar type = " << scalarTypeName_ << endl - << "# rows = " << numRows << endl - << "# columns = " << numCols << endl - << "cache size hint in bytes = " << actualCacheSizeHint << endl - << "contiguous cache blocks? " << contiguousCacheBlocks << endl - << "# trials = " << numTrials << endl - << "Total time (s) = " << seqTsqrTiming << endl - << endl; - } - else { - if (printFieldNames) { - const char prefix[] = "%"; - out_ << prefix - << "method" - << ",scalarType" - << ",numRows" - << ",numCols" - << ",cacheSizeHint" - << ",contiguousCacheBlocks" - << ",numTrials" - << ",timing"; - if (! 
additionalFieldNames.empty ()) { - out_ << "," << additionalFieldNames; - } - out_ << endl; - } - out_ << "SeqTSQR" - << "," << scalarTypeName_ - << "," << numRows - << "," << numCols - << "," << actualCacheSizeHint - << "," << contiguousCacheBlocks - << "," << numTrials - << "," << seqTsqrTiming; - if (! additionalData.empty ()) { - out_ << "," << additionalData; - } - out_ << endl; - } - } - }; - - - void - benchmarkSeqTsqr (std::ostream& out, - const int numRows, - const int numCols, - const int numTrials, - const size_t cacheSizeHint, - const bool contiguousCacheBlocks, - const bool testComplex, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames, - const bool humanReadable) - { - typedef Teuchos::Time timer_type; - const bool testReal = true; - using std::string; - - // Only print field names (if at all) for the first data type tested. - bool printedFieldNames = false; - - if (testReal) { - { // Scalar=float - typedef SeqTsqrBenchmarker benchmark_type; - string scalarTypeName ("float"); - benchmark_type widget (scalarTypeName, out, humanReadable); - widget.benchmark (numTrials, numRows, numCols, cacheSizeHint, - contiguousCacheBlocks, - additionalFieldNames, additionalData, - printFieldNames && ! printedFieldNames); - if (printFieldNames && ! printedFieldNames) { - printedFieldNames = true; - } - } - { // Scalar=double - typedef SeqTsqrBenchmarker< int, double, timer_type > benchmark_type; - string scalarTypeName ("double"); - benchmark_type widget (scalarTypeName, out, humanReadable); - widget.benchmark (numTrials, numRows, numCols, cacheSizeHint, - contiguousCacheBlocks, - additionalFieldNames, additionalData, - printFieldNames && ! printedFieldNames); - if (printFieldNames && ! 
printedFieldNames) { - printedFieldNames = true; - } - } - } - - if (testComplex) { -#ifdef HAVE_KOKKOSTSQR_COMPLEX - using std::complex; - { // Scalar=complex - typedef SeqTsqrBenchmarker< int, complex, timer_type > benchmark_type; - string scalarTypeName ("complex"); - benchmark_type widget (scalarTypeName, out, humanReadable); - widget.benchmark (numTrials, numRows, numCols, cacheSizeHint, - contiguousCacheBlocks, - additionalFieldNames, additionalData, - printFieldNames && ! printedFieldNames); - if (printFieldNames && ! printedFieldNames) { - printedFieldNames = true; - } - } - { // Scalar=complex - typedef SeqTsqrBenchmarker< int, complex, timer_type > benchmark_type; - string scalarTypeName ("complex"); - benchmark_type widget (scalarTypeName, out, humanReadable); - widget.benchmark (numTrials, numRows, numCols, cacheSizeHint, - contiguousCacheBlocks, - additionalFieldNames, additionalData, - printFieldNames && ! printedFieldNames); - if (printFieldNames && ! printedFieldNames) { - printedFieldNames = true; - } - } -#else // Don't HAVE_KOKKOSTSQR_COMPLEX - throw std::logic_error ("Trilinos was not built with " - "complex arithmetic support"); -#endif // HAVE_KOKKOSTSQR_COMPLEX - } - } - - - - } // namespace Test -} // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_SeqTest.hpp b/packages/tpetra/tsqr/src/Tsqr_SeqTest.hpp deleted file mode 100644 index 9f290c2e9c53..000000000000 --- a/packages/tpetra/tsqr/src/Tsqr_SeqTest.hpp +++ /dev/null @@ -1,133 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. 
Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_Test_SeqTest_hpp -#define __TSQR_Test_SeqTest_hpp - -#include "Tsqr_ConfigDefs.hpp" -#include // size_t definition -#include -#include - -namespace TSQR { - namespace Test { - /// \brief Test accuracy of SequentialTsqr. - /// - /// Test the accuracy of our sequential TSQR implementation - /// (SequentialTsqr), on an nrows by ncols matrix, using the given - /// cache size hint (in bytes). Print the results to the given - /// output stream out. 
- void - verifySeqTsqr (std::ostream& out, - const int nrows, - const int ncols, - const size_t cache_size_hint, - const bool test_complex_arithmetic, - const bool save_matrices, - const bool contiguous_cache_blocks, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames, - const bool human_readable = false, - const bool b_debug = false); - - /// \brief Test accuracy of LAPACK's QR factorization. - /// - /// Test the accuracy of LAPACK's QR factorization (_GEQRF + - /// _ORGQR) on an nrows by ncols matrix, and print the results to - /// the given output stream out. - void - verifyLapack (std::ostream& out, - const int nrows, - const int ncols, - const bool test_complex_arithmetic, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames, - const bool human_readable, - const bool b_debug = false); - - /// \brief Test performance of SequentialTsqr. - /// - /// Test the run time over ntrials trials of sequential TSQR, on - /// an nrows by ncols matrix (using the given cache block size (in - /// bytes)), and print the results to the given output stream out. - /// - /// \param human_readable [in] If true, print the benchmark - /// results to stdout in human-readable format. Otherwise, - /// print them as two rows of comma-delimited ASCII, in an - /// abbreviated format suitable for automatic processing. - void - benchmarkSeqTsqr (std::ostream& out, - const int numRows, - const int numCols, - const int numTrials, - const size_t cacheSizeHint, - const bool contiguousCacheBlocks, - const bool testComplex, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames, - const bool humanReadable); - - /// \brief Test performance of LAPACK's QR factorization. 
- /// - /// Test the run time over numTrials trials of LAPACK QR (_GEQRF + - /// _ORGQR), on a numRows by numCols matrix, and print the results - /// to the given output stream out. - /// - /// \param humanReadable [in] If true, print the benchmark results - /// to out in human-readable format. Otherwise, print them as - /// two rows of comma-delimited ASCII, in an abbreviated format - /// suitable for automatic processing. - void - benchmarkLapack (std::ostream& out, - const int numRows, - const int numCols, - const int numTrials, - const bool testComplex, - const std::string& additionalFieldNames, - const std::string& additionalData, - const bool printFieldNames, - const bool humanReadable); - - } // namespace Test -} // namespace TSQR - -#endif // __TSQR_Test_SeqTest_hpp diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp index f768fe5ae898..aa305064776b 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp @@ -147,7 +147,7 @@ namespace TSQR { Scalar (1), A_cur.data (), A_cur.stride (1), A_cur.data (), A_cur.stride (1), Scalar (0), ATA.data (), ATA.stride (1)); // Process the remaining cache blocks in order. - while (! A_rest.empty ()) { + while (! empty (A_rest)) { A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); // ATA := ATA + A_cur^T * A_cur // @@ -178,7 +178,7 @@ namespace TSQR { { mat_view_type R_out (ncols, ncols, R, ldr); deep_copy (R_out, Scalar {}); - copy_upper_triangle (ncols, ncols, R, ldr, ATA.data(), ATA.stride(1)); + copy_upper_triangle (R, ATA); } // Compute A := A * R^{-1}. We do this in place in A, using @@ -202,7 +202,7 @@ namespace TSQR { A_cur.data (), A_cur.stride (1)); // Process the remaining cache blocks in order. - while (! A_rest.empty ()) { + while (! 
empty (A_rest)) { A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); blas.TRSM (RIGHT_SIDE, UPPER_TRI, NO_TRANS, NON_UNIT_DIAG, A_cur.extent (0), ncols, @@ -225,25 +225,25 @@ namespace TSQR { const LocalOrdinal ncols_C, Scalar C[], const LocalOrdinal ldc, - const bool contiguous_cache_blocks = false) + const bool contiguousCacheBlocks = false) { if (ncols_Q != ncols_C) throw std::logic_error("SequentialCholeskyQR::explicit_Q() " "does not work if ncols_C != ncols_Q"); const LocalOrdinal ncols = ncols_Q; - if (contiguous_cache_blocks) { + if (contiguousCacheBlocks) { CacheBlocker blocker (nrows, ncols, strategy_); mat_view_type C_rest (nrows, ncols, C, ldc); const_mat_view_type Q_rest (nrows, ncols, Q, ldq); mat_view_type C_cur = - blocker.split_top_block (C_rest, contiguous_cache_blocks); + blocker.split_top_block (C_rest, contiguousCacheBlocks); const_mat_view_type Q_cur = - blocker.split_top_block (Q_rest, contiguous_cache_blocks); + blocker.split_top_block (Q_rest, contiguousCacheBlocks); - while (! C_rest.empty ()) { + while (! empty (C_rest)) { deep_copy (Q_cur, C_cur); } } @@ -253,7 +253,6 @@ namespace TSQR { } } - /// Cache-block the given A_in matrix, writing the results to A_out. void cache_block (const LocalOrdinal nrows, @@ -262,11 +261,10 @@ namespace TSQR { const Scalar A_in[], const LocalOrdinal lda_in) const { - CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); + CacheBlocker blocker (nrows, ncols, strategy_); blocker.cache_block (nrows, ncols, A_out, A_in, lda_in); } - /// "Un"-cache-block the given A_in matrix, writing the results to A_out. 
void un_cache_block (const LocalOrdinal nrows, diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp index 0390be6c05f5..78cd2e91a84f 100644 --- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp @@ -40,15 +40,14 @@ /// \file Tsqr_SequentialTsqr.hpp /// \brief Implementation of the sequential cache-blocked part of TSQR. -#ifndef __TSQR_Tsqr_SequentialTsqr_hpp -#define __TSQR_Tsqr_SequentialTsqr_hpp +#ifndef TSQR_SEQUENTIALTSQR_HPP +#define TSQR_SEQUENTIALTSQR_HPP #include "Tsqr_ApplyType.hpp" #include "Tsqr_Matrix.hpp" #include "Tsqr_CacheBlockingStrategy.hpp" #include "Tsqr_CacheBlocker.hpp" -#include "Tsqr_Combine.hpp" -#include "Tsqr_LocalVerify.hpp" +#include "Tsqr_Impl_CombineUser.hpp" #include "Tsqr_NodeTsqr.hpp" #include "Tsqr_Util.hpp" #include "Tsqr_Impl_SystemBlas.hpp" @@ -60,10 +59,34 @@ #include #include #include -#include // std::pair #include namespace TSQR { + namespace Impl { + template + class SequentialTsqrFactorOutput : + public NodeFactorOutput + { + private: + using my_data_type = std::vector>; + public: + SequentialTsqrFactorOutput () = default; + ~SequentialTsqrFactorOutput () override = default; + + void add_and_consume (std::vector&& tau) { + data_.emplace_back (tau); + } + typename my_data_type::const_iterator begin() const { + return data_.begin(); + } + typename my_data_type::const_reverse_iterator rbegin() const { + return data_.rbegin(); + } + private: + my_data_type data_; + }; + } // namespace Impl + /// \class SequentialTsqr /// \brief Sequential cache-blocked TSQR factorization. /// \author Mark Hoemmen @@ -91,11 +114,12 @@ namespace TSQR { /// may be different on different architectures. /// /// SequentialTsqr is designed to be used as the "intranode TSQR" - /// part of the full TSQR implementation in \c Tsqr. The \c Tsqr - /// class can use any of various intranode TSQR implementations. 
+ /// part of the full TSQR implementation in Tsqr. The Tsqr class + /// can use any of various intranode TSQR implementations. /// SequentialTsqr is an appropriate choice when running in MPI-only - /// mode. Other intranode TSQR implementations, such as \c TbbTsqr, - /// are appropriate for hybrid parallelism (MPI + threads). + /// mode. Other intranode TSQR implementations, such as TbbTsqr + /// (which has been removed temporarily) are appropriate for hybrid + /// parallelism (MPI + threads). /// /// SequentialTsqr is unlikely to benefit from a multithreaded BLAS /// implementation. In fact, implementations of LAPACK's QR @@ -103,28 +127,48 @@ namespace TSQR { /// multithreading when factoring tall skinny matrices. (See our /// Supercomputing 2009 paper and my IPDPS 2011 paper.) This is why /// we built other intranode TSQR factorizations that do effectively - /// exploit thread-level parallelism, such as \c TbbTsqr. + /// exploit thread-level parallelism, such as TbbTsqr. /// - /// \note To implementers: SequentialTsqr cannot currently be a \c + /// \note To implementers: SequentialTsqr cannot currently be a /// Teuchos::ParameterListAcceptorDefaultBase, because the latter /// uses RCP, and RCPs (more specifically, their reference counts) - /// are not currently thread safe. \c TbbTsqr uses SequentialTsqr - /// in parallel to implement each thread's cache-blocked TSQR. - /// This can be fixed as soon as RCPs are made thread safe. + /// are not currently thread safe. TbbTsqr uses SequentialTsqr in + /// parallel to implement each thread's cache-blocked TSQR. This + /// can be fixed as soon as RCPs are made thread safe. 
template class SequentialTsqr : - public NodeTsqr>> + public NodeTsqr, + private Impl::CombineUser { + private: + using base_type = NodeTsqr; + using my_factor_output_type = + Impl::SequentialTsqrFactorOutput; + public: - using ordinal_type = LocalOrdinal; - using scalar_type = Scalar; - using mat_view_type = MatView; - using const_mat_view_type = MatView; - using magnitude_type = typename Teuchos::ScalarTraits::magnitudeType; - using FactorOutput = typename NodeTsqr>>::factor_output_type; + using ordinal_type = typename base_type::ordinal_type; + using scalar_type = typename base_type::scalar_type; + using mat_view_type = typename base_type::mat_view_type; + using const_mat_view_type = + typename base_type::const_mat_view_type; + using magnitude_type = typename base_type::magnitude_type; + using factor_output_type = typename base_type::factor_output_type; private: + Combine& + getMyCombine (const ordinal_type /* maxNumCols */) const + { + // FIXME (mfh 20 Dec 2019) If SequentialTsqr has more than one + // cache block, it only passes tests if you use CombineNative. + // This likely explains why it fails with complex Scalar types, + // since CombineNative just uses CombineDefault in that case. I + // tried making SequentialTsqr's implementation of + // QR_produces_R_factor_with_nonnegative_diagonal always return + // false, but that didn't help, so the issue likely is + // CombineDefault. + return this->getCombine ("CombineNative"); + } + /// \brief Factor the first cache block of the matrix. /// /// Compute the QR factorization of the first cache block A_top. @@ -154,59 +198,14 @@ namespace TSQR { /// R factor. 
mat_view_type factor_first_block (Combine& combine, - mat_view_type& A_top, + const mat_view_type& A_top, std::vector& tau, - std::vector& work) const - { - const LocalOrdinal ncols = A_top.extent(1); - combine.factor_first (A_top, tau.data(), work.data()); - return mat_view_type(ncols, ncols, A_top.data(), A_top.stride(1)); - } - - /// Apply the Q factor of the first (topmost) cache blocks, as - /// computed by factor_first_block() and stored implicitly in - /// Q_first and tau, to the first (topmost) block C_first of the - /// matrix C. - void - apply_first_block (Combine& combine, - const ApplyType& applyType, - const const_mat_view_type& Q_first, - const std::vector& tau, - mat_view_type& C_first, - std::vector& work) const + Scalar work[], + const LocalOrdinal lwork) const { - combine.apply_first (applyType, Q_first, tau.data(), - C_first, work.data()); - } - - void - combine_apply (Combine& combine, - const ApplyType& apply_type, - const const_mat_view_type& Q_cur, - const std::vector& tau, - mat_view_type& C_top, - mat_view_type& C_cur, - std::vector& work) const - { - const LocalOrdinal nrows_local = Q_cur.extent(0); - const LocalOrdinal ncols_Q = Q_cur.extent(1); - const LocalOrdinal ncols_C = C_cur.extent(1); - - combine.apply_inner (apply_type, - nrows_local, ncols_C, ncols_Q, - Q_cur.data(), C_cur.stride(1), tau.data(), - C_top.data(), C_top.stride(1), - C_cur.data(), C_cur.stride(1), work.data()); - } - - void - combine_factor (Combine& combine, - mat_view_type& R, - mat_view_type& A_cur, - std::vector& tau, - std::vector& work) const - { - combine.factor_inner (R, A_cur, tau.data(), work.data()); + combine.factor_first (A_top, tau.data (), work, lwork); + const LocalOrdinal ncols = A_top.extent (1); + return partition_2x1 (A_top, ncols).first; } public: @@ -276,14 +275,14 @@ namespace TSQR { setParameterList (params); } - /// \brief Valid default parameters for SequentialTsqr. + /// \brief List of valid parameters for SequentialTsqr. 
/// /// \note This object has to create a new parameter list each /// time, since it cannot cache an RCP (due to thread safety -- /// TbbTsqr invokes multiple instances of SequentialTsqr in /// parallel). Teuchos::RCP - getValidParameters () const + getValidParameters () const override { using Teuchos::ParameterList; using Teuchos::parameterList; @@ -315,7 +314,7 @@ namespace TSQR { /// For a list of currently understood parameters, see the /// parameter list returned by \c getValidParameters(). void - setParameterList (const Teuchos::RCP& plist) + setParameterList (const Teuchos::RCP& plist) override { using Teuchos::Exceptions::InvalidParameter; using Teuchos::ParameterList; @@ -360,7 +359,7 @@ namespace TSQR { /// This implements Teuchos::Describable::description(). For now, /// SequentialTsqr uses the default implementation of /// Teuchos::Describable::describe(). - std::string description () const { + std::string description () const override { std::ostringstream os; os << "Intranode Tall Skinny QR (TSQR): sequential cache-blocked " "implementation with cache size hint " << this->cache_size_hint() @@ -369,16 +368,20 @@ namespace TSQR { } //! Whether this object is ready to perform computations. - bool ready() const { + bool ready() const override { return true; } - /// \brief Does factor() compute R with nonnegative diagonal? - /// - /// See the \c NodeTsqr documentation for details. - bool QR_produces_R_factor_with_nonnegative_diagonal () const { - using combine_type = Combine; - return combine_type::QR_produces_R_factor_with_nonnegative_diagonal(); + //! Whether factor() promises to compute R with a nonnegative diagonal. + bool + QR_produces_R_factor_with_nonnegative_diagonal () const override + { + // FIXME (19 Dec 2019) If the combine type is dynamic, we can't + // answer this question without knowing the number of columns. + // Just guess for now. 
+ constexpr LocalOrdinal fakeNumCols = 10; + auto& c = this->getMyCombine (fakeNumCols); + return c.QR_produces_R_factor_with_nonnegative_diagonal (); } /// \brief Cache size hint (in bytes) used for the factorization. @@ -386,74 +389,10 @@ namespace TSQR { /// This may be different than the cache size hint argument /// specified in the constructor. SequentialTsqr treats that as a /// hint, not a command. - size_t cache_size_hint () const { + size_t cache_size_hint () const override { return strategy_.cache_size_hint(); } - /// \brief Compute QR factorization (implicitly stored Q factor) of A. - /// - /// Compute the QR factorization in place of the nrows by ncols - /// matrix A, with nrows >= ncols. The matrix A is stored either - /// in column-major order (the default) or with contiguous - /// column-major cache blocks, with leading dimension lda >= - /// nrows. Write the resulting R factor to the top block of A (in - /// place). (You can get a view of this via the top_block() - /// method.) Everything below the upper triangle of A is - /// overwritten with part of the implicit representation of the Q - /// factor. The other part of that representation is returned. - /// - /// \param nrows [in] Number of rows in the matrix A. - /// \param ncols [in] Number of columns in the matrix A. - /// \param A [in/out] On input: the nrows by ncols matrix to - /// factor. On output: part of the representation of the - /// implicitly stored Q factor. - /// \param lda [in] Leading dimension of A, if A is stored in - /// column-major order. Otherwise its value doesn't matter. - /// \param contiguous_cache_blocks [in] Whether the matrix A is - /// stored in a contiguously cache-blocked format. - /// - /// \return Part of the representation of the implicitly stored Q - /// factor. The complete representation includes A (on output). - /// The FactorOutput and A go together. 
- FactorOutput - factor (const LocalOrdinal nrows, - const LocalOrdinal ncols, - Scalar A[], - const LocalOrdinal lda, - const bool contiguous_cache_blocks) const - { - CacheBlocker blocker (nrows, ncols, strategy_); - Combine combine; - std::vector work (ncols); - FactorOutput tau_arrays; - - // We say "A_rest" because it points to the remaining part of - // the matrix left to factor; at the beginning, the "remaining" - // part is the whole matrix, but that will change as the - // algorithm progresses. - // - // Note: if the cache blocks are stored contiguously, lda won't - // be the correct leading dimension of A, but it won't matter: - // we only ever operate on A_cur here, and A_cur's leading - // dimension is set correctly by A_rest.split_top(). - mat_view_type A_rest (nrows, ncols, A, lda); - // This call modifies A_rest. - mat_view_type A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); - - // Factor the topmost block of A. - std::vector tau_first (ncols); - mat_view_type R_view = factor_first_block (combine, A_cur, tau_first, work); - tau_arrays.push_back (tau_first); - - while (! A_rest.empty()) { - A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); - std::vector tau (ncols); - combine_factor (combine, R_view, A_cur, tau, work); - tau_arrays.push_back (tau); - } - return tau_arrays; - } - /// \brief Extract R factor from \c factor() results. /// /// The five-argument version of \c factor() leaves the R factor @@ -480,7 +419,7 @@ namespace TSQR { deep_copy (R_view, Scalar {}); // Copy out the upper triangle of the R factor from A into R. - copy_upper_triangle (ncols, ncols, R, ldr, A_top.data(), A_top.stride(1)); + copy_upper_triangle (R, A_top); } /// \brief Compute the QR factorization of the matrix A. @@ -490,20 +429,23 @@ namespace TSQR { /// when using SequentialTsqr as the intranode TSQR implementation /// in \c Tsqr. 
The five-argument version is more useful when /// using SequentialTsqr inside of another intranode TSQR - /// implementation, such as \c TbbTsqr. - FactorOutput + /// implementation, such as TbbTsqr. + Teuchos::RCP factor (const LocalOrdinal nrows, const LocalOrdinal ncols, Scalar A[], const LocalOrdinal lda, Scalar R[], const LocalOrdinal ldr, - const bool contiguous_cache_blocks) const + const bool contigCacheBlocks) const override { - CacheBlocker blocker (nrows, ncols, strategy_); - Combine combine; - std::vector work (ncols); - FactorOutput tau_arrays; + using LO = LocalOrdinal; + CacheBlocker blocker (nrows, ncols, strategy_); + auto& combine = this->getMyCombine (ncols); + const LO lwork = combine.work_size (nrows, ncols, ncols); + std::vector work (lwork); + Teuchos::RCP tau_arrays + (new my_factor_output_type); // We say "A_rest" because it points to the remaining part of // the matrix left to factor; at the beginning, the "remaining" @@ -513,21 +455,25 @@ namespace TSQR { // Note: if the cache blocks are stored contiguously, lda won't // be the correct leading dimension of A, but it won't matter: // we only ever operate on A_cur here, and A_cur's leading - // dimension is set correctly by A_rest.split_top(). + // dimension is set correctly by split_top_block. mat_view_type A_rest (nrows, ncols, A, lda); // This call modifies A_rest. - mat_view_type A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); + mat_view_type A_cur = + blocker.split_top_block (A_rest, contigCacheBlocks); // Factor the topmost block of A. std::vector tau_first (ncols); - mat_view_type R_view = factor_first_block (combine, A_cur, tau_first, work); - tau_arrays.push_back (tau_first); + mat_view_type R_view = + factor_first_block (combine, A_cur, tau_first, + work.data (), lwork); + tau_arrays->add_and_consume (std::move (tau_first)); - while (! A_rest.empty()) { - A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); + while (! 
empty (A_rest)) { + A_cur = blocker.split_top_block (A_rest, contigCacheBlocks); std::vector tau (ncols); - combine_factor (combine, R_view, A_cur, tau, work); - tau_arrays.push_back (tau); + combine.factor_inner (R_view, A_cur, tau.data (), + work.data (), lwork); + tau_arrays->add_and_consume (std::move (tau)); } // Copy the R factor resulting from the factorization out of @@ -535,7 +481,7 @@ namespace TSQR { // output argument. mat_view_type R_out (ncols, ncols, R, ldr); deep_copy (R_out, Scalar {}); - copy_upper_triangle (ncols, ncols, R, ldr, R_view.data(), R_view.stride(1)); + copy_upper_triangle (R_out, R_view); return tau_arrays; } @@ -554,7 +500,7 @@ namespace TSQR { /// \param lda [in] If the matrix A is stored in column-major /// order: the leading dimension (a.k.a. stride) of A. /// Otherwise, the value of this parameter doesn't matter. - /// \param contiguous_cache_blocks [in] Whether the cache blocks + /// \param contigCacheBlocks [in] Whether the cache blocks /// in the matrix A are stored contiguously. /// /// \return Number of cache blocks in the matrix A: a positive integer. @@ -563,21 +509,22 @@ namespace TSQR { const LocalOrdinal ncols, const Scalar A[], const LocalOrdinal lda, - const bool contiguous_cache_blocks) const + const bool contigCacheBlocks) const { - CacheBlocker blocker (nrows, ncols, strategy_); - LocalOrdinal count = 0; + using LO = LocalOrdinal; + CacheBlocker blocker (nrows, ncols, strategy_); + LO count = 0; const_mat_view_type A_rest (nrows, ncols, A, lda); - if (A_rest.empty()) { + if (empty (A_rest)) { return count; } - - const_mat_view_type A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); + const_mat_view_type A_cur = + blocker.split_top_block (A_rest, contigCacheBlocks); ++count; // first factor step - while (! A_rest.empty()) { - A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks); + while (! 
empty (A_rest)) { + A_cur = blocker.split_top_block (A_rest, contigCacheBlocks); ++count; // next factor step } return count; @@ -592,37 +539,63 @@ namespace TSQR { const LocalOrdinal ncols_Q, const Scalar Q[], const LocalOrdinal ldq, - const FactorOutput& factor_output, + const factor_output_type& factor_output, const LocalOrdinal ncols_C, Scalar C[], const LocalOrdinal ldc, - const bool contiguous_cache_blocks) const + const bool contigCacheBlocks) const override { + using LO = LocalOrdinal; + const char prefix[] = "TSQR::SequentialTsqr::apply: "; + // Quick exit and error tests if (ncols_Q == 0 || ncols_C == 0 || nrows == 0) { return; } else if (ldc < nrows) { std::ostringstream os; - os << "SequentialTsqr::apply: ldc (= " << ldc << ") < nrows (= " << nrows << ")"; + os << prefix << "ldc (= " << ldc << ") < nrows (= " + << nrows << ")"; throw std::invalid_argument (os.str()); } else if (ldq < nrows) { std::ostringstream os; - os << "SequentialTsqr::apply: ldq (= " << ldq << ") < nrows (= " << nrows << ")"; + os << prefix << "ldq (= " << ldq << ") < nrows (= " + << nrows << ")"; throw std::invalid_argument (os.str()); } + const my_factor_output_type& tau_arrays = [&] () { + const my_factor_output_type* tau_arrays_ptr = + dynamic_cast (&factor_output); + if (tau_arrays_ptr == nullptr) { + using Teuchos::demangleName; + using Teuchos::TypeNameTraits; + using Teuchos::typeName; + std::ostringstream os; + os << prefix << "Input factor_output_type object was not " + "created by the same type of SequentialTsqr object as " + "this one. This object has type " << typeName (*this) << + " and its subclass of factor_output_type has type " << + TypeNameTraits::name () << ", but " + "the input factor_output_type object has dynamic type " + << demangleName (typeid (factor_output).name ()); + throw std::invalid_argument (os.str ()); + } + return *tau_arrays_ptr; + } (); + // If contiguous cache blocks are used, then we have to use the // same convention as we did for factor(). 
Otherwise, we are // free to choose the cache block dimensions as we wish in // apply(), independently of what we did in factor(). - CacheBlocker blocker (nrows, ncols_Q, strategy_); - Combine combine; + CacheBlocker blocker (nrows, ncols_Q, strategy_); + auto& combine = + this->getMyCombine (std::max (ncols_Q, ncols_C)); + const LO lwork = combine.work_size (nrows, ncols_Q, ncols_C); + std::vector work (lwork); - const bool transposed = apply_type.transposed(); - const FactorOutput& tau_arrays = factor_output; // rename for encapsulation - std::vector work (ncols_C); + const bool transposed = apply_type.transposed (); // We say "*_rest" because it points to the remaining part of // the matrix left to factor; at the beginning, the "remaining" @@ -640,37 +613,51 @@ namespace TSQR { // Identify the top ncols_C by ncols_C block of C. C_rest is // not modified. - mat_view_type C_top = blocker.top_block (C_rest, contiguous_cache_blocks); + mat_view_type C_top = + blocker.top_block (C_rest, contigCacheBlocks); if (transposed) { - const_mat_view_type Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks); - mat_view_type C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks); + const_mat_view_type Q_cur = + blocker.split_top_block (Q_rest, contigCacheBlocks); + mat_view_type C_cur = + blocker.split_top_block (C_rest, contigCacheBlocks); // Apply the topmost block of Q. auto tau_iter = tau_arrays.begin(); - const std::vector& tau = *tau_iter++; - apply_first_block (combine, apply_type, Q_cur, tau, C_cur, work); - - while (! Q_rest.empty()) { - Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks); - C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks); - combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work); + const std::vector& tau_first = *tau_iter++; + combine.apply_first (apply_type, Q_cur, tau_first.data (), + C_cur, work.data (), lwork); + while (! 
empty (Q_rest)) { + Q_cur = blocker.split_top_block (Q_rest, contigCacheBlocks); + C_cur = blocker.split_top_block (C_rest, contigCacheBlocks); + const Scalar* tau = tau_iter->data (); + combine.apply_inner (apply_type, Q_cur, tau, C_top, C_cur, + work.data (), lwork); + tau_iter++; } } else { - // Start with the last local Q factor and work backwards up the matrix. - auto tau_iter = tau_arrays.rbegin(); - - const_mat_view_type Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks); - mat_view_type C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks); - - while (! Q_rest.empty()) { - combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work); - Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks); - C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks); + // Start with the last local Q factor and work backwards up + // the matrix. + auto tau_iter = tau_arrays.rbegin (); + const_mat_view_type Q_cur = + blocker.split_bottom_block (Q_rest, contigCacheBlocks); + mat_view_type C_cur = + blocker.split_bottom_block (C_rest, contigCacheBlocks); + while (! empty (Q_rest)) { + const Scalar* tau = tau_iter->data (); + combine.apply_inner (apply_type, Q_cur, tau, C_top, C_cur, + work.data (), lwork); + tau_iter++; + Q_cur = + blocker.split_bottom_block (Q_rest, contigCacheBlocks); + C_cur = + blocker.split_bottom_block (C_rest, contigCacheBlocks); } // Apply to last (topmost) cache block. 
- apply_first_block (combine, apply_type, Q_cur, *tau_iter++, C_cur, work); + const std::vector& tau_first = *tau_iter++; + combine.apply_first (apply_type, Q_cur, tau_first.data (), + C_cur, work.data (), lwork); } } @@ -682,38 +669,27 @@ namespace TSQR { const LocalOrdinal ncols_Q, const Scalar Q[], const LocalOrdinal ldq, - const FactorOutput& factor_output, + const factor_output_type& factor_output, const LocalOrdinal ncols_C, Scalar C[], const LocalOrdinal ldc, - const bool contiguous_cache_blocks) const + const bool contigCacheBlocks) const override { - // Identify top ncols_C by ncols_C block of C. C_view is not - // modified. top_block() will set C_top to have the correct - // leading dimension, whether or not cache blocks are stored - // contiguously. mat_view_type C_view (nrows, ncols_C, C, ldc); - mat_view_type C_top = this->top_block (C_view, contiguous_cache_blocks); - - // Fill C with zeros, and then fill the topmost block of C with - // the first ncols_C columns of the identity matrix, so that C - // itself contains the first ncols_C columns of the identity - // matrix. - fill_with_zeros (nrows, ncols_C, C, ldc, contiguous_cache_blocks); - for (LocalOrdinal j = 0; j < ncols_C; ++j) { - C_top(j, j) = Scalar(1.0); - } - - // Apply the Q factor to C, to extract the first ncols_C columns - // of Q in explicit form. + deep_copy (C_view, Scalar {}); + // Don't just call set_diagonal_entries_to_one(C_view), because + // that doesn't respect contigCacheBlocks. + auto C_top = this->top_block (C_view, contigCacheBlocks); + deep_copy (C_top, Scalar {}); + this->set_diagonal_entries_to_one (C_top); apply (ApplyType::NoTranspose, nrows, ncols_Q, Q, ldq, factor_output, - ncols_C, C, ldc, contiguous_cache_blocks); + ncols_C, C, ldc, contigCacheBlocks); } /// \brief Compute Q := Q*B. /// - /// See the \c NodeTsqr documentation for details. + /// See the NodeTsqr documentation for details. 
void Q_times_B (const LocalOrdinal nrows, const LocalOrdinal ncols, @@ -721,12 +697,10 @@ namespace TSQR { const LocalOrdinal ldq, const Scalar B[], const LocalOrdinal ldb, - const bool contiguous_cache_blocks) const + const bool contigCacheBlocks) const override { using Teuchos::NO_TRANS; - - // We don't do any other error checking here (e.g., matrix - // dimensions), though it would be a good idea to do so. + using LO = LocalOrdinal; // Take the easy exit if available. if (ncols == 0 || nrows == 0) { @@ -739,14 +713,13 @@ namespace TSQR { // computation is completely independent of the others; a slight // restructuring of this code would parallelize nicely using // OpenMP. - CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_); + CacheBlocker blocker (nrows, ncols, strategy_); Impl::SystemBlas blas; mat_view_type Q_rest (nrows, ncols, Q, ldq); - Matrix - Q_cur_copy (LocalOrdinal(0), LocalOrdinal(0)); // will be resized - while (! Q_rest.empty ()) { + Matrix Q_cur_copy (0, 0); // will be resized + while (! empty (Q_rest)) { mat_view_type Q_cur = - blocker.split_top_block (Q_rest, contiguous_cache_blocks); + blocker.split_top_block (Q_rest, contigCacheBlocks); // GEMM doesn't like aliased arguments, so we use a copy. // We only copy the current cache block, rather than all of @@ -754,9 +727,13 @@ namespace TSQR { Q_cur_copy.reshape (Q_cur.extent (0), ncols); deep_copy (Q_cur_copy, Q_cur); // Q_cur := Q_cur_copy * B. 
- blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.extent (0), ncols, ncols, - Scalar (1.0), Q_cur_copy.data (), Q_cur_copy.stride (1), - B, ldb, Scalar {}, Q_cur.data (), Q_cur.stride (1)); + constexpr Scalar ZERO {}; + constexpr Scalar ONE (1.0); + blas.GEMM (NO_TRANS, NO_TRANS, + Q_cur.extent (0), ncols, ncols, + ONE, Q_cur_copy.data (), Q_cur_copy.stride (1), + B, ldb, + ZERO, Q_cur.data (), Q_cur.stride (1)); } } @@ -775,9 +752,10 @@ namespace TSQR { const LocalOrdinal ncols, Scalar A_out[], const Scalar A_in[], - const LocalOrdinal lda_in) const + const LocalOrdinal lda_in) const override { - CacheBlocker blocker (nrows, ncols, strategy_); + CacheBlocker blocker + (nrows, ncols, strategy_); blocker.cache_block (nrows, ncols, A_out, A_in, lda_in); } @@ -802,9 +780,10 @@ namespace TSQR { const LocalOrdinal ncols, Scalar A_out[], const LocalOrdinal lda_out, - const Scalar A_in[]) const + const Scalar A_in[]) const override { - CacheBlocker blocker (nrows, ncols, strategy_); + CacheBlocker blocker + (nrows, ncols, strategy_); blocker.un_cache_block (nrows, ncols, A_out, lda_out, A_in); } @@ -818,17 +797,19 @@ namespace TSQR { /// \param A [out] nrows by ncols column-major-order dense matrix /// with leading dimension lda /// \param lda [in] Leading dimension of A: lda >= nrows - /// \param contiguous_cache_blocks [in] Whether the cache blocks + /// \param contigCacheBlocks [in] Whether the cache blocks /// in A are stored contiguously. 
void fill_with_zeros (const LocalOrdinal nrows, const LocalOrdinal ncols, Scalar A[], const LocalOrdinal lda, - const bool contiguous_cache_blocks) const + const bool contigCacheBlocks) const override { - CacheBlocker blocker (nrows, ncols, strategy_); - blocker.fill_with_zeros (nrows, ncols, A, lda, contiguous_cache_blocks); + CacheBlocker blocker + (nrows, ncols, strategy_); + blocker.fill_with_zeros (nrows, ncols, A, lda, + contigCacheBlocks); } protected: @@ -840,29 +821,27 @@ namespace TSQR { /// /// \param C [in] View of a matrix, with at least as many rows as /// columns. - /// \param contiguous_cache_blocks [in] Whether the cache blocks - /// of C are stored contiguously. + /// \param contigCacheBlocks [in] Whether the cache blocks of C + /// are stored contiguously. /// /// \return View of the topmost cache block of the matrix C. const_mat_view_type const_top_block (const const_mat_view_type& C, - const bool contiguous_cache_blocks) const + const bool contigCacheBlocks) const override { // The CacheBlocker object knows how to construct a view of the // top cache block of C. This is complicated because cache // blocks (in C) may or may not be stored contiguously. If they // are stored contiguously, the CacheBlocker knows the right // layout, based on the cache blocking strategy. - typedef CacheBlocker blocker_type; - blocker_type blocker (C.extent(0), C.extent(1), strategy_); - - // C_top_block is a view of the topmost cache block of C. - // C_top_block should have >= ncols rows, otherwise either cache - // blocking is broken or the input matrix C itself had fewer - // rows than columns. - const_mat_view_type C_top_block = - blocker.top_block (C, contiguous_cache_blocks); - return C_top_block; + using blocker_type = CacheBlocker; + blocker_type blocker (C.extent (0), C.extent (1), strategy_); + + // This is a view of the topmost cache block of C. 
C_top_block + // should have >= ncols rows, otherwise either cache blocking is + // broken or the input matrix C itself had fewer rows than + // columns. + return blocker.top_block (C, contigCacheBlocks); } private: @@ -872,4 +851,4 @@ namespace TSQR { } // namespace TSQR -#endif // __TSQR_Tsqr_SequentialTsqr_hpp +#endif // TSQR_SEQUENTIALTSQR_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp deleted file mode 100644 index ad86d8c3d206..000000000000 --- a/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp +++ /dev/null @@ -1,423 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_Test_TbbTest_hpp -#define __TSQR_Test_TbbTest_hpp - -#include "Tsqr_nodeTestProblem.hpp" -#include "Tsqr_verifyTimerConcept.hpp" -#include "Tsqr_Random_NormalGenerator.hpp" - -#include "Tsqr_LocalVerify.hpp" -#include "Tsqr_Matrix.hpp" -#include "Tsqr_Util.hpp" -#include "TbbTsqr.hpp" - -#include "Teuchos_LAPACK.hpp" -#include "Teuchos_Time.hpp" - -#include -#include // size_t definition -#include -#include -#include -#include - -using std::make_pair; -using std::pair; -using std::vector; - -using std::cerr; -using std::cout; -using std::endl; - -namespace TSQR { - namespace Test { - /// Test the accuracy of Intel TBB TSQR on an nrows by ncols - /// matrix (using the given number of cores and the given cache - /// block size (in bytes)), and print the results to stdout. 
- template - void - verifyTbbTsqr (const std::string& scalarTypeName, - TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator, - const Ordinal nrows, - const Ordinal ncols, - const int num_cores, - const size_t cache_size_hint, - const bool contiguous_cache_blocks, - const bool printFieldNames, - const bool human_readable, - const bool b_debug = false) - { - typedef Teuchos::Time timer_type; - typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar, timer_type > node_tsqr_type; - typedef typename node_tsqr_type::FactorOutput factor_output_type; - typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; - using std::cerr; - using std::cout; - using std::endl; - - node_tsqr_type actor (num_cores, cache_size_hint); - - if (b_debug) { - cerr << "Intel TBB TSQR test problem:" << endl - << "* " << nrows << " x " << ncols << endl - << "* # cores: " << num_cores << endl - << "* Cache size hint in bytes: " << actor.cache_size_hint() << endl; - if (contiguous_cache_blocks) { - cerr << "* Contiguous cache blocks" << endl; - } - } - - Matrix< Ordinal, Scalar > A (nrows, ncols); - Matrix< Ordinal, Scalar > A_copy (nrows, ncols); - Matrix< Ordinal, Scalar > Q (nrows, ncols); - Matrix< Ordinal, Scalar > R (ncols, ncols); - if (std::numeric_limits< Scalar >::has_quiet_NaN) { - deep_copy (A, std::numeric_limits< Scalar>::quiet_NaN()); - deep_copy (A_copy, std::numeric_limits< Scalar >::quiet_NaN()); - deep_copy (Q, std::numeric_limits< Scalar >::quiet_NaN()); - deep_copy (R, std::numeric_limits< Scalar >::quiet_NaN()); - } - const Ordinal lda = nrows; - const Ordinal ldq = nrows; - const Ordinal ldr = ncols; - - // Create a test problem - nodeTestProblem (generator, nrows, ncols, A.data(), A.stride(1), true); - - if (b_debug) { - cerr << "-- Generated test problem" << endl; - } - - // Copy A into A_copy, since TSQR overwrites the input. If - // specified, rearrange the data in A_copy so that the data in - // each cache block is contiguously stored. - if (! 
contiguous_cache_blocks) { - deep_copy (A_copy, A); - if (b_debug) { - cerr << "-- Copied test problem from A into A_copy" << endl; - } - } - else { - actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.stride(1)); - if (b_debug) { - cerr << "-- Reorganized test matrix to have contiguous " - "cache blocks" << endl; - } - // Verify cache blocking, when in debug mode. - if (b_debug) { - Matrix< Ordinal, Scalar > A2 (nrows, ncols); - if (std::numeric_limits< Scalar >::has_quiet_NaN) { - deep_copy (A2, std::numeric_limits< Scalar >::quiet_NaN()); - } - actor.un_cache_block (nrows, ncols, A2.data(), A2.stride(1), A_copy.data()); - if (matrix_equal (A, A2)) { - if (b_debug) { - cerr << "-- Cache blocking test succeeded!" << endl; - } - } - else { - throw std::logic_error ("Cache blocking failed"); - } - } - } - - // Fill R with zeros, since the factorization may not overwrite - // the strict lower triangle of R. - deep_copy (R, Scalar {}); - - // Factor the matrix and compute the explicit Q factor - factor_output_type factor_output = - actor.factor (nrows, ncols, A_copy.data(), A_copy.stride(1), R.data(), - R.stride(1), contiguous_cache_blocks); - if (b_debug) { - cerr << "-- Finished TbbTsqr::factor" << endl; - } - actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.stride(1), factor_output, - ncols, Q.data(), Q.stride(1), contiguous_cache_blocks); - if (b_debug) { - cerr << "-- Finished TbbTsqr::explicit_Q" << endl; - } - - // "Un"-cache-block the output Q (the explicit Q factor), if - // contiguous cache blocks were used. This is only necessary - // because local_verify() doesn't currently support contiguous - // cache blocks. - if (contiguous_cache_blocks) { - // Use A_copy as temporary storage for un-cache-blocking Q. 
- actor.un_cache_block (nrows, ncols, A_copy.data(), A_copy.stride(1), Q.data()); - deep_copy (Q, A_copy); - if (b_debug) { - cerr << "-- Un-cache-blocked output Q factor" << endl; - } - } - - // Print out the R factor - if (b_debug) { - cerr << endl << "-- R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1)); - cerr << endl; - } - - // Validate the factorization - std::vector< magnitude_type > results = - local_verify (nrows, ncols, A.data(), lda, Q.data(), ldq, R.data(), ldr); - if (b_debug) { - cerr << "-- Finished local_verify" << endl; - } - - // Print the results - if (human_readable) { - cout << "Parallel (via Intel\'s Threading Building Blocks) / cache-blocked) TSQR:" << endl - << "Scalar type: " << scalarTypeName << endl - << "# rows: " << nrows << endl - << "# columns: " << ncols << endl - << "# cores: " << num_cores << endl - << "Cache size hint in bytes: " << actor.cache_size_hint() << endl - << "Contiguous cache blocks? " << contiguous_cache_blocks << endl - << "Absolute residual $\\|A - Q*R\\|_2$: " - << results[0] << endl - << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: " - << results[1] << endl - << "Test matrix norm $\\| A \\|_F$: " - << results[2] << endl - << endl; - } - else { - if (printFieldNames) { - const char prefix[] = "%"; - cout << prefix - << "method" - << ",scalarType" - << ",numRows" - << ",numCols" - << ",numThreads" - << ",cacheSizeHint" - << ",contiguousCacheBlocks" - << ",absFrobResid" - << ",absFrobOrthog" - << ",frobA" - << endl; - } - cout << "TbbTsqr" - << "," << scalarTypeName - << "," << nrows - << "," << ncols - << "," << num_cores - << "," << actor.cache_size_hint() - << "," << contiguous_cache_blocks - << "," << results[0] - << "," << results[1] - << "," << results[2] - << endl; - } - } - - /// \brief Benchmark Intel TBB TSQR vs. LAPACK's QR, and print the - /// results to stdout. 
- /// - /// \note c++0x support is need in order to have a default - /// template parameter argument for a template function, otherwise - /// we would have templated this function on TimerType and made - /// Teuchos::Time the default. - template< class Ordinal, class Scalar > - void - benchmarkTbbTsqr (const std::string& scalarTypeName, - const int ntrials, - const Ordinal nrows, - const Ordinal ncols, - const int num_cores, - const size_t cache_size_hint, - const bool contiguous_cache_blocks, - const bool printFieldNames, - const bool human_readable) - { - using TSQR::TBB::TbbTsqr; - using std::cerr; - using std::cout; - using std::endl; - - typedef Teuchos::Time timer_type; - typedef Ordinal ordinal_type; - typedef Scalar scalar_type; - typedef Matrix< ordinal_type, scalar_type > matrix_type; - typedef TbbTsqr< ordinal_type, scalar_type, timer_type > node_tsqr_type; - - // Pseudorandom normal(0,1) generator. Default seed is OK, - // because this is a benchmark, not an accuracy test. - TSQR::Random::NormalGenerator< ordinal_type, scalar_type > generator; - - // Set up TSQR implementation. - node_tsqr_type actor (num_cores, cache_size_hint); - - matrix_type A (nrows, ncols); - matrix_type A_copy (nrows, ncols); - matrix_type Q (nrows, ncols); - matrix_type R (ncols, ncols, scalar_type(0)); - - // Fill R with zeros, since the factorization may not overwrite - // the strict lower triangle of R. - deep_copy (R, scalar_type {}); - - // Create a test problem - nodeTestProblem (generator, nrows, ncols, A.data(), A.stride(1), false); - - // Copy A into A_copy, since TSQR overwrites the input. If - // specified, rearrange the data in A_copy so that the data in - // each cache block is contiguously stored. - if (contiguous_cache_blocks) { - actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.stride(1)); - } - else { - deep_copy (A_copy, A); - } - - // Do a few timing runs and throw away the results, just to warm - // up any libraries that do autotuning. 
- const int numWarmupRuns = 5; - for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) { - // Factor the matrix in-place in A_copy, and extract the - // resulting R factor into R. - typedef typename node_tsqr_type::FactorOutput factor_output_type; - factor_output_type factor_output = - actor.factor (nrows, ncols, A_copy.data(), A_copy.stride(1), - R.data(), R.stride(1), contiguous_cache_blocks); - // Compute the explicit Q factor (which was stored - // implicitly in A_copy and factor_output) and store in Q. - // We don't need to un-cache-block the output, because we - // aren't verifying it here. - actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.stride(1), - factor_output, ncols, Q.data(), Q.stride(1), - contiguous_cache_blocks); - } - - // Benchmark TBB-based TSQR for ntrials trials. - // - // Name of timer doesn't matter here; we only need the timing. - timer_type timer("TbbTsqr"); - timer.start(); - for (int trial_num = 0; trial_num < ntrials; ++trial_num) { - // Factor the matrix in-place in A_copy, and extract the - // resulting R factor into R. - typedef typename node_tsqr_type::FactorOutput factor_output_type; - factor_output_type factor_output = - actor.factor (nrows, ncols, A_copy.data(), A_copy.stride(1), - R.data(), R.stride(1), contiguous_cache_blocks); - // Compute the explicit Q factor (which was stored - // implicitly in A_copy and factor_output) and store in Q. - // We don't need to un-cache-block the output, because we - // aren't verifying it here. 
- actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.stride(1), - factor_output, ncols, Q.data(), Q.stride(1), - contiguous_cache_blocks); - } - const double tbb_tsqr_timing = timer.stop(); - - // Print the results - if (human_readable) { - cout << "(Intel TBB / cache-blocked) TSQR cumulative timings:" << endl - << "Scalar type: " << scalarTypeName << endl - << "# rows: " << nrows << endl - << "# columns: " << ncols << endl - << "# cores: " << num_cores << endl - << "Cache size hint in bytes: " << actor.cache_size_hint() << endl - << "Contiguous cache blocks? " << contiguous_cache_blocks << endl - << "# trials: " << ntrials << endl - << "Total time (s) = " << tbb_tsqr_timing << endl - << "Total time (s) in factor() (min over all tasks): " - << (ntrials * actor.min_seq_factor_timing()) << endl - << "Total time (s) in factor() (max over all tasks): " - << (ntrials * actor.max_seq_factor_timing()) << endl - << "Total time (s) in apply() (min over all tasks): " - << (ntrials * actor.min_seq_apply_timing()) << endl - << "Total time (s) in apply() (max over all tasks): " - << (ntrials * actor.max_seq_apply_timing()) << endl - << endl << endl; - cout << "(Intel TBB / cache-blocked) TSQR per-invocation timings:" << endl; - - std::vector stats; - actor.getStats (stats); - std::vector labels; - actor.getStatsLabels (labels); - - const std::string labelLabel ("label"); - for (std::vector::size_type k = 0; k < labels.size(); ++k) { - const bool printHeaders = (k == 0); - if (stats[k].count() > 0) - stats[k].print (cout, human_readable, labels[k], labelLabel, printHeaders); - } - } - else { - if (printFieldNames) { - const char prefix[] = "%"; - cout << prefix - << "method" - << ",scalarType" - << ",numRows" - << ",numCols" - << ",numThreads" - << ",cacheSizeHint" - << ",contiguousCacheBlocks" - << ",numTrials" - << ",timing" - << endl; - } - - // We don't include {min,max}_seq_apply_timing() here, because - // those times don't benefit from the accuracy of benchmarking - // 
for ntrials > 1. Thus, it's misleading to include them - // with tbb_tsqr_timing, the total time over ntrials trials. - cout << "TbbTsqr" - << "," << scalarTypeName - << "," << nrows - << "," << ncols - << "," << num_cores - << "," << actor.cache_size_hint() - << "," << contiguous_cache_blocks - << "," << ntrials - << "," << tbb_tsqr_timing - << endl; - } - } - } // namespace Test -} // namespace TSQR - -#endif // __TSQR_Test_TbbTest_hpp diff --git a/packages/tpetra/tsqr/src/Tsqr_Test_MpiAndKokkosScope.cpp b/packages/tpetra/tsqr/src/Tsqr_Test_MpiAndKokkosScope.cpp new file mode 100644 index 000000000000..ba99ac49332a --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Test_MpiAndKokkosScope.cpp @@ -0,0 +1,84 @@ +#include "Tsqr_Test_MpiAndKokkosScope.hpp" +#include "Kokkos_Core.hpp" +#include "Teuchos_oblackholestream.hpp" +#include "Teuchos_CommHelpers.hpp" +#ifdef HAVE_MPI +# include "Teuchos_DefaultMpiComm.hpp" +# include "Teuchos_Assert.hpp" +#else +# include "Teuchos_DefaultSerialComm.hpp" +#endif // HAVE_MPI +#include +#include + +namespace TSQR { +namespace Test { + +#ifdef HAVE_MPI +MpiScope::MpiScope(int* argc, char*** argv) { + (void) MPI_Init(argc, argv); + + int rawSize = 0; + (void) MPI_Comm_size(MPI_COMM_WORLD, &rawSize); + + std::ostringstream os; + os << "MpiScope: Result of MPI_Comm_size on MPI_COMM_WORLD: " + << rawSize << std::endl; + std::cerr << os.str(); +} +MpiScope::~MpiScope() { + (void) MPI_Finalize(); +} +#else +MpiScope::MpiScope(int*, char***) { + std::cerr << "MpiScope: HAVE_MPI is NOT defined" << std::endl; +} +MpiScope::~MpiScope() {} +#endif // HAVE_MPI + +Teuchos::RCP> +MpiAndKokkosScope::getDefaultComm() +{ +#ifdef HAVE_MPI + int initialized = 0; + (void) MPI_Initialized(&initialized); + TEUCHOS_ASSERT( initialized == 1 ); + + using comm_type = Teuchos::MpiComm; + const auto comm = Teuchos::rcp(new comm_type(MPI_COMM_WORLD)); +#else + using comm_type = Teuchos::SerialComm; + const auto comm = Teuchos::rcp(new comm_type); +#endif // 
HAVE_MPI + + return comm; +} + +MpiAndKokkosScope:: +MpiAndKokkosScope(int* argc, char*** argv) : + mpiScope_(argc, argv), + blackHole_(new Teuchos::oblackholestream), + comm_(getDefaultComm()), + kokkosScope_(new Kokkos::ScopeGuard(*argc, *argv)) +{} + +Teuchos::RCP> +MpiAndKokkosScope::getComm() const { + return comm_; +} + +std::ostream& MpiAndKokkosScope::outStream() const { + // Only Process 0 gets to write to cout and cerr. The other MPI + // processes send their output to a "black hole" (something that + // acts like /dev/null). + return comm_->getRank() == 0 ? std::cout : + static_cast(*blackHole_); +} + +std::ostream& MpiAndKokkosScope::errStream() const { + return comm_->getRank() == 0 ? std::cerr : + static_cast(*blackHole_); +} + +} // namespace Test +} // namespace TSQR diff --git a/packages/tpetra/tsqr/src/Tsqr_Test_MpiAndKokkosScope.hpp b/packages/tpetra/tsqr/src/Tsqr_Test_MpiAndKokkosScope.hpp new file mode 100644 index 000000000000..fc317fbc9f55 --- /dev/null +++ b/packages/tpetra/tsqr/src/Tsqr_Test_MpiAndKokkosScope.hpp @@ -0,0 +1,50 @@ +#ifndef TSQR_TEST_MPIANDKOKKOSSCOPE_HPP +#define TSQR_TEST_MPIANDKOKKOSSCOPE_HPP + +#include "Teuchos_RCP.hpp" +#include +#include + +namespace Kokkos { +class ScopeGuard; +} // namespace Kokkos + +namespace Teuchos { +template class Comm; +} // namespace Teuchos + +namespace TSQR { +namespace Test { + +class MpiScope { +public: + MpiScope(int* argc, char*** argv); + ~MpiScope(); +}; + +// Scope guard for TSQR's tests, that automatically initializes and +// finalizes both MPI (if building with MPI enabled) and Kokkos. 
+class MpiAndKokkosScope { +public: + MpiAndKokkosScope(int* argc, char*** argv); + + Teuchos::RCP> getComm() const; + std::ostream& outStream() const; + std::ostream& errStream() const; + +private: + static Teuchos::RCP> getDefaultComm(); + + MpiScope mpiScope_; + std::unique_ptr blackHole_; + Teuchos::RCP> comm_; + // The only reason ever to handle a scope guard by pointer is for + // implementation hiding via the "pImpl" (pointer to implementation) + // idiom. + std::unique_ptr kokkosScope_; +}; + +} // namespace Test +} // namespace TSQR + +#endif // TSQR_TEST_MPIANDKOKKOSSCOPE_HPP diff --git a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp deleted file mode 100644 index dea7317ad040..000000000000 --- a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp +++ /dev/null @@ -1,801 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#ifndef __TSQR_Test_TsqrTest_hpp -#define __TSQR_Test_TsqrTest_hpp - -#include "Tsqr.hpp" -#ifdef HAVE_KOKKOSTSQR_TBB -# include "TbbTsqr.hpp" -#endif // HAVE_KOKKOSTSQR_TBB -#include "Tsqr_TestSetup.hpp" -#include "Tsqr_GlobalVerify.hpp" -#include "Tsqr_printGlobalMatrix.hpp" -#include "Tsqr_verifyTimerConcept.hpp" -#include "Teuchos_ScalarTraits.hpp" -#include // size_t -#include -#include -#include - -namespace TSQR { - namespace Test { - template - class TsqrVerifier { - public: - using tsqr_type = TsqrType; - using scalar_type = typename tsqr_type::scalar_type; - using ordinal_type = typename tsqr_type::ordinal_type; - using matrix_type = Matrix; - using factor_output_type = typename tsqr_type::FactorOutput; - using messenger_type = MessengerBase; - using messenger_ptr = Teuchos::RCP; - - static void - verify (tsqr_type& tsqr, - const messenger_ptr& scalarComm, - const matrix_type& A_local, - matrix_type& A_copy, - matrix_type& Q_local, - matrix_type& R, - const bool contiguousCacheBlocks, - const bool b_debug = false) - { - using std::cerr; - using std::endl; - - const ordinal_type nrows_local = 
A_local.extent(0); - const ordinal_type ncols = A_local.extent(1); - - // If specified, rearrange cache blocks in the copy. - if (contiguousCacheBlocks) { - tsqr.cache_block (nrows_local, ncols, A_copy.data(), - A_local.data(), A_local.stride(1)); - if (b_debug) { - scalarComm->barrier (); - if (scalarComm->rank () == 0) - cerr << "-- Cache-blocked input matrix to factor." << endl; - } - } - else { - deep_copy (A_copy, A_local); - } - - const bool testFactorExplicit = true; - if (testFactorExplicit) { - tsqr.factorExplicit (A_copy.view(), Q_local.view(), - R.view(), contiguousCacheBlocks); - if (b_debug) { - scalarComm->barrier (); - if (scalarComm->rank () == 0) { - cerr << "-- Finished Tsqr::factorExplicit" << endl; - } - } - } - else { - // Factor the (copy of the) matrix. - factor_output_type factorOutput = - tsqr.factor (nrows_local, ncols, - A_copy.data(), A_copy.stride(1), - R.data(), R.stride(1), - contiguousCacheBlocks); - if (b_debug) { - scalarComm->barrier (); - if (scalarComm->rank () == 0) { - cerr << "-- Finished Tsqr::factor" << endl; - } - } - - // Compute the explicit Q factor in Q_local - tsqr.explicit_Q (nrows_local, - ncols, A_copy.data(), A_copy.stride(1), - factorOutput, - ncols, Q_local.data(), Q_local.stride(1), - contiguousCacheBlocks); - if (b_debug) { - scalarComm->barrier (); - if (scalarComm->rank () == 0) { - cerr << "-- Finished Tsqr::explicit_Q" << endl; - } - } - } - - // "Un"-cache-block the output, if contiguous cache blocks were - // used. This is only necessary because global_verify() doesn't - // currently support contiguous cache blocks. - if (contiguousCacheBlocks) { - // We can use A_copy as scratch space for un-cache-blocking - // Q_local, since we're done using A_copy for other things. - tsqr.un_cache_block (nrows_local, ncols, A_copy.data(), - A_copy.stride(1), Q_local.data()); - // Overwrite Q_local with the un-cache-blocked Q factor. 
- deep_copy (Q_local, A_copy); - - if (b_debug) { - scalarComm->barrier (); - if (scalarComm->rank () == 0) { - cerr << "-- Un-cache-blocked output Q factor" << endl; - } - } - } - } - }; - - /// \function verifyTsqr - /// \brief Test and print to stdout the accuracy of parallel TSQR - /// - /// \param which [in] Valid values: "MpiTbbTSQR" (for TBB-parallel - /// node-level TSQR underneath MPI-parallel TSQR), "MpiSeqTSQR" - /// (for cache-blocked sequential node-level TSQR underneath - /// MPI-parallel TSQR) - /// - /// \param scalarTypeName [in] Name of the Scalar type - /// - /// \param generator [in/out] Normal(0,1) (pseudo)random number - /// generator. Only touched on MPI process 0. Used to generate - /// random test matrices for the factorization. - /// - /// \param nrows_global [in] Number of rows in the entire test - /// matrix (over all processes) to generate. The matrix will be - /// divided up in blocks of contiguous rows among the processes. - /// - /// \param ncols [in] Number of columns in the test matrix to - /// generate. - /// - /// \param ordinalComm [in/out] Object for communicating Ordinal - /// (integer index) objects among the processes - /// - /// \param scalarComm [in/out] Object for communicating Scalar - /// (matrix data) objects among the processes - /// - /// \param num_cores [in] Number of cores to use per MPI process - /// for Intel TBB parallelism within that process - /// - /// \param cache_size_hint [in] Cache size hint (per core) in - /// bytes. If zero, a sensible default is used. 
- /// - /// \param contiguousCacheBlocks [in] Whether cache blocks - /// should be stored contiguously - /// - /// \param printFieldNames [in] Whether to print field names (only - /// appliable if not human_readable) - /// - /// \param human_readable [in] Whether output should be human - /// readable, or machine parseable - /// - /// \param b_debug [in] Whether to print debug output - /// - template - void - verifyTsqr (const std::string& which, - const std::string& scalarTypeName, - Generator& generator, - const Ordinal nrows_global, - const Ordinal ncols, - const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm, - const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm, - const int num_cores = 1, - const size_t cache_size_hint = 0, - const bool contiguousCacheBlocks, - const bool printFieldNames, - const bool human_readable = false, - const bool b_debug = false) - { - typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; - using std::cerr; - using std::cout; - using std::endl; - - const bool b_extra_debug = false; - const int nprocs = scalarComm->size(); - const int my_rank = scalarComm->rank(); - if (b_debug) { - scalarComm->barrier (); - if (my_rank == 0) { - cerr << "tsqr_verify:" << endl; - } - scalarComm->barrier (); - } - const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs); - - // Set up storage for the test problem. - Matrix< Ordinal, Scalar > A_local (nrows_local, ncols); - Matrix< Ordinal, Scalar > Q_local (nrows_local, ncols); - if (std::numeric_limits::has_quiet_NaN) { - deep_copy (A_local, std::numeric_limits::quiet_NaN ()); - deep_copy (Q_local, std::numeric_limits::quiet_NaN ()); - } - Matrix R (ncols, ncols, Scalar(0)); - - // Generate the test problem. - distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get()); - if (b_debug) { - scalarComm->barrier (); - if (my_rank == 0) { - cerr << "-- Generated test problem." 
<< endl; - } - } - - // Make sure that the test problem (the matrix to factor) was - // distributed correctly. - if (b_extra_debug && b_debug) { - if (my_rank == 0) { - cerr << "Test matrix A:" << endl; - } - scalarComm->barrier (); - printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get()); - scalarComm->barrier (); - } - - // Factoring the matrix stored in A_local overwrites it, so we - // make a copy of A_local. Initialize with NaNs to make sure - // that cache blocking works correctly (if applicable). - Matrix A_copy (nrows_local, ncols); - if (std::numeric_limits::has_quiet_NaN) { - deep_copy (A_copy, std::numeric_limits< Scalar >::quiet_NaN ()); - } - - // actual_cache_size_hint: "cache_size_hint" is just a - // suggestion. TSQR determines the cache size hint itself; - // this remembers it so we can print it out later. - size_t actual_cache_size_hint; - - if (which == "MpiTbbTSQR") { -#ifdef HAVE_KOKKOSTSQR_TBB - using Teuchos::RCP; - typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type; - typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type; - typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type; - - RCP< node_tsqr_type > node_tsqr (new node_tsqr_type (num_cores, cache_size_hint)); - RCP< dist_tsqr_type > dist_tsqr (new dist_tsqr_type (scalarComm)); - tsqr_type tsqr (node_tsqr, dist_tsqr); - - // Compute the factorization and explicit Q factor. 
- TsqrVerifier< tsqr_type >::verify (tsqr, scalarComm, A_local, A_copy, - Q_local, R, contiguousCacheBlocks, - b_debug); - // Save the "actual" cache block size - actual_cache_size_hint = tsqr.cache_size_hint(); -#else - throw std::logic_error("TSQR not built with Intel TBB support"); -#endif // HAVE_KOKKOSTSQR_TBB - } - else if (which == "MpiSeqTSQR") { - using Teuchos::RCP; - typedef SequentialTsqr< Ordinal, Scalar > node_tsqr_type; - typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type; - typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type; - - RCP< node_tsqr_type > node_tsqr (new node_tsqr_type (cache_size_hint)); - RCP< dist_tsqr_type > dist_tsqr (new dist_tsqr_type (scalarComm)); - tsqr_type tsqr (node_tsqr, dist_tsqr); - - // Compute the factorization and explicit Q factor. - TsqrVerifier< tsqr_type >::verify (tsqr, scalarComm, A_local, A_copy, - Q_local, R, contiguousCacheBlocks, - b_debug); - // Save the "actual" cache block size - actual_cache_size_hint = tsqr.cache_size_hint(); - } - else { - throw std::logic_error("Unknown TSQR implementation type \"" + which + "\""); - } - - // Print out the Q and R factors - if (b_extra_debug && b_debug) { - if (my_rank == 0) { - cerr << endl << "Q factor:" << endl; - } - scalarComm->barrier (); - printGlobalMatrix (cerr, Q_local, scalarComm.get (), ordinalComm.get ()); - scalarComm->barrier (); - if (my_rank == 0) { - cerr << endl << "R factor:" << endl; - print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1)); - cerr << endl; - } - scalarComm->barrier (); - } - - // Test accuracy of the resulting factorization - std::vector< magnitude_type > results = - global_verify (nrows_local, ncols, A_local.data(), A_local.stride(1), - Q_local.data(), Q_local.stride(1), R.data(), R.stride(1), - scalarComm.get()); - if (b_debug) { - scalarComm->barrier (); - if (my_rank == 0) { - cerr << "-- Finished global_verify" << endl; - } - } - - // Print the results on Proc 0. 
- if (my_rank == 0) { - if (human_readable) { - std::string human_readable_name; - - if (which == "MpiSeqTSQR") { - human_readable_name = "MPI parallel / cache-blocked TSQR"; - } - else if (which == "MpiTbbTSQR") { -#ifdef HAVE_KOKKOSTSQR_TBB - human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR"; -#else - throw std::logic_error("TSQR not built with Intel TBB support"); -#endif // HAVE_KOKKOSTSQR_TBB - } - else { - throw std::logic_error("Unknown TSQR implementation type \"" + which + "\""); - } - - cout << human_readable_name << ":" << endl - << "Scalar type: " << scalarTypeName << endl - << "# rows: " << nrows_global << endl - << "# columns: " << ncols << endl - << "# MPI processes: " << nprocs << endl; -#ifdef HAVE_KOKKOSTSQR_TBB - if (which == "MpiTbbTSQR") - cout << "# cores per process = " << num_cores << endl; -#endif // HAVE_KOKKOSTSQR_TBB - cout << "Cache size hint in bytes: " << actual_cache_size_hint << endl - << "Contiguous cache blocks? " << contiguousCacheBlocks << endl - << "Absolute residual $\\| A - Q R \\|_2: " - << results[0] << endl - << "Absolute orthogonality $\\| I - Q^* Q \\|_2$: " - << results[1] << endl - << "Test matrix norm $\\| A \\|_F$: " - << results[2] << endl - << endl; - } - else { - if (printFieldNames) { - cout << "%" - << "method" - << ",scalarType" - << ",globalNumRows" - << ",numCols" - << ",numProcs" - << ",numCores" - << ",cacheSizeHint" - << ",contiguousCacheBlocks" - << ",absFrobResid" - << ",absFrobOrthog" - << ",frobA" << endl; - } - - cout << which - << "," << scalarTypeName - << "," << nrows_global - << "," << ncols - << "," << nprocs; -#ifdef HAVE_KOKKOSTSQR_TBB - if (which == "MpiTbbTSQR") { - cout << "," << num_cores; - } else { - cout << ",1"; - } -#else - cout << ",1" << endl; -#endif // HAVE_KOKKOSTSQR_TBB - cout << "," << actual_cache_size_hint - << "," << contiguousCacheBlocks - << "," << results[0] - << "," << results[1] - << "," << results[2] - << endl; - } - } - } - - - template - double 
- do_tsqr_benchmark (const std::string& which, - TsqrBase& tsqr, - const Teuchos::RCP>& messenger, - const Matrix& A_local, - Matrix& A_copy, - Matrix& Q_local, - Matrix& R, - const int ntrials, - const bool contiguousCacheBlocks, - const bool human_readable, - const bool b_debug = false) - { - typedef typename TsqrBase::FactorOutput factor_output_type; - typedef typename TsqrBase::ordinal_type ordinal_type; - using std::cerr; - using std::cout; - using std::endl; - - const ordinal_type nrows_local = A_local.extent(0); - const ordinal_type ncols = A_local.extent(1); - - if (contiguousCacheBlocks) { - tsqr.cache_block (nrows_local, ncols, A_copy.data(), - A_local.data(), A_local.stride(1)); - if (b_debug) { - messenger->barrier (); - if (messenger->rank () == 0) { - cerr << "-- Cache-blocked input matrix to factor." << endl; - } - } - } - else { - deep_copy (A_copy, A_local); - } - - if (b_debug) { - messenger->barrier (); - if (messenger->rank () == 0) { - cerr << "-- Starting timing loop" << endl; - } - } - - // Benchmark TSQR for ntrials trials. The answer (the numerical - // results of the factorization) is only valid if ntrials == 1, - // but this is a benchmark and not a verification routine. Call - // tsqr_verify() if you want to determine whether TSQR computes - // the right answer. - // - // Name of timer doesn't matter here; we only need the timing. - TSQR::Test::verifyTimerConcept< TimerType >(); - TimerType timer (which); - - - const bool testFactorExplicit = true; - double tsqr_timing; - if (testFactorExplicit) { - timer.start(); - for (int trial_num = 0; trial_num < ntrials; ++trial_num) - tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(), - contiguousCacheBlocks); - tsqr_timing = timer.stop(); - } - else { - timer.start(); - for (int trial_num = 0; trial_num < ntrials; ++trial_num) { - // Factor the matrix and compute the explicit Q factor. 
- // Don't worry about the fact that we're overwriting the - // input; this is a benchmark, not a numerical verification - // test. (We have the latter implemented as tsqr_verify() - // in this file.) For the same reason, don't worry about - // un-cache-blocking the output (when cache blocks are - // stored contiguously). - factor_output_type factor_output = - tsqr.factor (nrows_local, ncols, A_copy.data(), A_copy.stride(1), - R.data(), R.stride(1), contiguousCacheBlocks); - tsqr.explicit_Q (nrows_local, - ncols, A_copy.data(), A_copy.stride(1), factor_output, - ncols, Q_local.data(), Q_local.stride(1), - contiguousCacheBlocks); - // Timings in debug mode likely won't make sense, because - // Proc 0 is outputting the debug messages to cerr. - // Nevertheless, we don't put any "if(b_debug)" calls in the - // timing loop. - } - // Compute the resulting total time (in seconds) to execute - // ntrials runs of Tsqr::factor() and Tsqr::explicit_Q(). The - // time may differ on different MPI processes. - tsqr_timing = timer.stop(); - } - - if (b_debug) { - messenger->barrier(); - if (messenger->rank() == 0) - cerr << "-- Finished timing loop" << endl; - } - return tsqr_timing; - } - - /// \function benchmarkTsqr - /// \brief Benchmark parallel TSQR and report timings to stdout - /// - /// Benchmark the MPI-parallel TSQR implementation specified by - /// the "which" parameter (either with cache-blocked TSQR or - /// TBB-parallel cache-blocked TSQR as the node-level - /// implementation), for "ntrials" trials. Print the stdout the - /// cumulative run time (in seconds) for all ntrials trials. - /// - /// \param which [in] Valid values: "MpiTbbTSQR" (for TBB-parallel - /// node-level TSQR underneath MPI-parallel TSQR), "MpiSeqTSQR" - /// (for cache-blocked sequential node-level TSQR underneath - /// MPI-parallel TSQR) - /// - /// \param scalarTypeName [in] Name of the Scalar type - /// - /// \param generator [in/out] Normal(0,1) (pseudo)random number - /// generator. 
Only touched on MPI process 0. Used to generate - /// random test matrices for the factorization. - /// - /// \param ntrials [in] Number of trials to use in the benchmark. - /// Reported timings are cumulative over all trials. - /// - /// \param nrows_global [in] Number of rows in the entire test - /// matrix (over all processes) to generate. The matrix will be - /// divided up in blocks of contiguous rows among the processes. - /// - /// \param ncols [in] Number of columns in the test matrix to - /// generate. - /// - /// \param ordinalComm [in/out] Object for communicating Ordinal - /// (integer index) objects among the processes - /// - /// \param scalarComm [in/out] Object for communicating Scalar - /// (matrix data) objects among the processes - /// - /// \param num_cores [in] Number of cores to use per MPI process - /// for Intel TBB parallelism within that process - /// - /// \param cache_size_hint [in] Cache block size (per core) in - /// bytes. If zero, a sensible default is used. 
- /// - /// \param contiguousCacheBlocks [in] Whether cache blocks - /// should be stored contiguously - /// - /// \param printFieldNames [in] Whether to print field names (only - /// appliable if not human_readable) - /// - /// \param human_readable [in] Whether output should be human - /// readable, or machine parseable - /// - /// \param b_debug [in] Whether to print debug output - /// - template - void - benchmarkTsqr (const std::string& which, - const std::string& scalarTypeName, - Generator& generator, - const int ntrials, - const Ordinal nrows_global, - const Ordinal ncols, - const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm, - const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm, - const Ordinal num_cores, - const size_t cache_size_hint, - const bool contiguousCacheBlocks, - const bool printFieldNames, - const bool human_readable, - const bool b_debug) - { - using std::cerr; - using std::cout; - using std::endl; - - TSQR::Test::verifyTimerConcept< TimerType >(); - const bool b_extra_debug = false; - const int nprocs = scalarComm->size(); - const int my_rank = scalarComm->rank(); - if (b_debug) - { - scalarComm->barrier(); - if (my_rank == 0) - cerr << "tsqr_benchmark:" << endl; - scalarComm->barrier(); - } - const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs); - - // Set up storage for the test problem. - Matrix A_local (nrows_local, ncols); - Matrix Q_local (nrows_local, ncols); - if (std::numeric_limits::has_quiet_NaN) { - deep_copy (A_local, std::numeric_limits::quiet_NaN()); - deep_copy (Q_local, std::numeric_limits::quiet_NaN()); - } - Matrix R (ncols, ncols, Scalar {}); - - // Generate the test problem. - distributedTestProblem (generator, A_local, ordinalComm.get(), - scalarComm.get()); - if (b_debug) { - scalarComm->barrier(); - if (my_rank == 0) { - cerr << "-- Generated test problem." << endl; - } - } - - // Make sure that the test problem (the matrix to factor) was - // distributed correctly. 
- if (b_extra_debug && b_debug) { - if (my_rank == 0) { - cerr << "Test matrix A:" << endl; - } - scalarComm->barrier (); - printGlobalMatrix (cerr, A_local, scalarComm.get(), - ordinalComm.get()); - scalarComm->barrier (); - } - - // Factoring the matrix stored in A_local overwrites it, so we - // make a copy of A_local. If specified, rearrange cache blocks - // in the copy. Initialize with NaNs to make sure that cache - // blocking worked correctly. - Matrix A_copy (nrows_local, ncols); - if (std::numeric_limits::has_quiet_NaN) { - deep_copy (A_copy, std::numeric_limits< Scalar >::quiet_NaN()); - } - - // actual_cache_size_hint: "cache_size_hint" is just a - // suggestion. TSQR determines the cache block size itself; - // this remembers it so we can print it out later. - size_t actual_cache_size_hint; - // Run time (in seconds, as a double-precision floating-point - // value) for TSQR on this MPI node. - double tsqr_timing; - - if (which == "MpiTbbTSQR") { -#ifdef HAVE_KOKKOSTSQR_TBB - using Teuchos::RCP; - typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type; - typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type; - typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type; - - RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (num_cores, cache_size_hint)); - RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm)); - tsqr_type tsqr (nodeTsqr, distTsqr); - - // Run the benchmark. 
- tsqr_timing = - do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local, - A_copy, Q_local, R, ntrials, - contiguousCacheBlocks, - human_readable, b_debug); - - // Save the "actual" cache block size - actual_cache_size_hint = tsqr.cache_size_hint(); -#else - throw std::logic_error("TSQR not built with Intel TBB support"); -#endif // HAVE_KOKKOSTSQR_TBB - } - else if (which == "MpiSeqTSQR") { - using Teuchos::RCP; - using node_tsqr_type = SequentialTsqr; - using dist_tsqr_type = TSQR::DistTsqr; - using tsqr_type = typedef Tsqr; - - // Set up TSQR. - RCP nodeTsqr (new node_tsqr_type (cache_size_hint)); - RCP distTsqr (new dist_tsqr_type (scalarComm)); - tsqr_type tsqr (nodeTsqr, distTsqr); - - // Run the benchmark. - tsqr_timing = - do_tsqr_benchmark (which, tsqr, scalarComm, A_local, - A_copy, Q_local, R, ntrials, - contiguousCacheBlocks, - human_readable, b_debug); - // Save the "actual" cache block size - actual_cache_size_hint = tsqr.cache_size_hint(); - } - else { - throw std::logic_error("Unknown TSQR implementation type \"" + which + "\""); - } - - // Find the min and max TSQR timing on all processors. - const double min_tsqr_timing = scalarComm->globalMin (tsqr_timing); - const double max_tsqr_timing = scalarComm->globalMax (tsqr_timing); - - // Print the results on Proc 0. 
- if (my_rank == 0) { - if (human_readable) { - std::string human_readable_name; - - if (which == "MpiSeqTSQR") { - human_readable_name = "MPI parallel / cache-blocked TSQR"; - } - else if (which == "MpiTbbTSQR") { -#ifdef HAVE_KOKKOSTSQR_TBB - human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR"; -#else - throw std::logic_error("TSQR not built with Intel TBB support"); -#endif // HAVE_KOKKOSTSQR_TBB - } - else { - throw std::logic_error("Unknown TSQR implementation type \"" + which + "\""); - } - - cout << human_readable_name << ":" << endl - << "Scalar type: " << scalarTypeName << endl - << "# rows: " << nrows_global << endl - << "# columns: " << ncols << endl - << "# MPI processes: " << nprocs << endl; - -#ifdef HAVE_KOKKOSTSQR_TBB - if (which == "MpiTbbTSQR") - cout << "# cores per process: " << num_cores << endl; -#endif // HAVE_KOKKOSTSQR_TBB - - cout << "Cache size hint in bytes: " << actual_cache_size_hint << endl - << "contiguous cache blocks? " << contiguousCacheBlocks << endl - << "# trials: " << ntrials << endl - << "Min total time (s) over all MPI processes: " - << min_tsqr_timing << endl - << "Max total time (s) over all MPI processes: " - << max_tsqr_timing << endl - << endl; - } - else { - if (printFieldNames) { - cout << "%" - << "method" - << ",scalarType" - << ",globalNumRows" - << ",numCols" - << ",numProcs" - << ",numCores" - << ",cacheSizeHint" - << ",contiguousCacheBlocks" - << ",numTrials" - << ",minTiming" - << ",maxTiming" - << endl; - } - cout << which - << "," << scalarTypeName - << "," << nrows_global - << "," << ncols - << "," << nprocs; -#ifdef HAVE_KOKKOSTSQR_TBB - if (which == "MpiTbbTSQR") { - cout << "," << num_cores; - } - else { - cout << ",1"; - } -#else - cout << ",1"; -#endif // HAVE_KOKKOSTSQR_TBB - cout << "," << actual_cache_size_hint - << "," << contiguousCacheBlocks - << "," << ntrials - << "," << min_tsqr_timing - << "," << max_tsqr_timing - << endl; - } - } - } - } // namespace Test -} // namespace 
TSQR - -#endif // __TSQR_Test_TsqrTest_hpp diff --git a/packages/tpetra/tsqr/src/Tsqr_Util.hpp b/packages/tpetra/tsqr/src/Tsqr_Util.hpp index ddbe59f4f062..9cd657594977 100644 --- a/packages/tpetra/tsqr/src/Tsqr_Util.hpp +++ b/packages/tpetra/tsqr/src/Tsqr_Util.hpp @@ -40,14 +40,15 @@ /// \file Tsqr_Util.hpp /// \brief Utilities for TSQR (the Tall Skinny QR factorization) -#ifndef __TSQR_Tsqr_Util_hpp -#define __TSQR_Tsqr_Util_hpp +#ifndef TSQR_UTIL_HPP +#define TSQR_UTIL_HPP #include "Teuchos_ScalarTraits.hpp" +#include "Tsqr_MatView.hpp" -#ifdef HAVE_KOKKOSTSQR_COMPLEX +#ifdef HAVE_TPETRATSQR_COMPLEX # include -#endif // HAVE_KOKKOSTSQR_COMPLEX +#endif // HAVE_TPETRATSQR_COMPLEX #include #include @@ -143,36 +144,6 @@ namespace TSQR { } } - template< class Ordinal, class Scalar > - void - copy_upper_triangle (const Ordinal nrows, - const Ordinal ncols, - Scalar* const R_out, - const Ordinal ldr_out, - const Scalar* const R_in, - const Ordinal ldr_in) - { - if (nrows >= ncols) { - for (Ordinal j = 0; j < ncols; ++j) { - Scalar* const A_j = &R_out[j*ldr_out]; - const Scalar* const B_j = &R_in[j*ldr_in]; - for (Ordinal i = 0; i <= j; ++i) { - A_j[i] = B_j[i]; - } - } - } - else { - copy_upper_triangle (nrows, nrows, R_out, ldr_out, R_in, ldr_in); - for (Ordinal j = nrows; j < ncols; j++) { - Scalar* const A_j = &R_out[j*ldr_out]; - const Scalar* const B_j = &R_in[j*ldr_in]; - for (Ordinal i = 0; i < nrows; i++) - A_j[i] = B_j[i]; - } - } - } - - template< class Scalar > class SumSquare { public: @@ -181,7 +152,7 @@ namespace TSQR { } }; -#ifdef HAVE_KOKKOSTSQR_COMPLEX +#ifdef HAVE_TPETRATSQR_COMPLEX // Specialization for complex numbers template class SumSquare > { @@ -192,58 +163,8 @@ namespace TSQR { return result + absval * absval; } }; -#endif // HAVE_KOKKOSTSQR_COMPLEX - - template - void - pack_R_factor (const Ordinal nrows, - const Ordinal ncols, - const Scalar R_in[], - const Ordinal ldr_in, - Scalar buffer[]) - { - Ordinal count = 0; // current position in 
output buffer - if (nrows >= ncols) { - for (Ordinal j = 0; j < ncols; ++j) { - for (Ordinal i = 0; i <= j; ++i) { - buffer[count++] = R_in[i + j*ldr_in]; - } - } - } - else { - for (Ordinal j = 0; j < nrows; ++j) { - for (Ordinal i = 0; i <= j; ++i) { - buffer[count++] = R_in[i + j*ldr_in]; - } - } - } - } - - template< class Ordinal, class Scalar > - void - unpack_R_factor (const Ordinal nrows, - const Ordinal ncols, - Scalar R_out[], - const Ordinal ldr_out, - const Scalar buffer[]) - { - Ordinal count = 0; // current position in input buffer - if (nrows >= ncols) { - for (Ordinal j = 0; j < ncols; ++j) { - for (Ordinal i = 0; i <= j; ++i) { - R_out[i + j*ldr_out] = buffer[count++]; - } - } - } - else { - for (Ordinal j = 0; j < nrows; ++j) { - for (Ordinal i = 0; i <= j; ++i) { - R_out[i + j*ldr_out] = buffer[count++]; - } - } - } - } +#endif // HAVE_TPETRATSQR_COMPLEX } // namespace TSQR -#endif // __TSQR_Tsqr_Util_hpp +#endif // TSQR_UTIL_HPP diff --git a/packages/tpetra/tsqr/test/CMakeLists.txt b/packages/tpetra/tsqr/test/CMakeLists.txt index 26bc2e6a0cb6..5bcdb5a21905 100644 --- a/packages/tpetra/tsqr/test/CMakeLists.txt +++ b/packages/tpetra/tsqr/test/CMakeLists.txt @@ -1,104 +1,135 @@ -# It's not necessary to run the first five tests in an MPI build -# ("COMM mpi"), since none of them need to run on more than one MPI -# process. However, it's useful to have the tests around in an MPI -# build, so we also build the tests there. In an MPI build, only -# Process 0 in MPI_COMM_WORLD runs the tests; the other ranks are -# quieted. +# It's not necessary to run most of the tests below in an MPI build +# ("COMM mpi"), since only two of them (DistTsqr and FullTsqr) need to +# run on more than one MPI process. However, it's useful to have the +# tests around in an MPI build, so we also build the tests there. In +# an MPI build, only Process 0 in MPI_COMM_WORLD runs the tests; the +# other ranks are quieted. 
+ +ASSERT_DEFINED(TPL_ENABLE_CUDA) +ASSERT_DEFINED(Kokkos_ENABLE_Cuda) +ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUSOLVER) + +IF (TPL_ENABLE_CUDA AND Kokkos_ENABLE_Cuda AND ${PACKAGE_NAME}_ENABLE_CUBLAS AND ${PACKAGE_NAME}_ENABLE_CUSOLVER) + SET (TpetraTSQR_ENABLE_CUDA_TESTS ON) +ELSE () + SET (TpetraTSQR_ENABLE_CUDA_TESTS OFF) +ENDIF () + +IF (TpetraTSQR_ENABLE_CUDA_TESTS) +TRIBITS_ADD_EXECUTABLE_AND_TEST( + CuSolver + SOURCES CuSolver.cpp + COMM serial mpi + ARGS "" + STANDARD_PASS_OUTPUT + NUM_MPI_PROCS 1 +) +ENDIF (TpetraTSQR_ENABLE_CUDA_TESTS) # Performance and accuracy test suite for TSQR::Combine (which factors # cache blocks and combines triangular factors). -TRIBITS_ADD_EXECUTABLE_AND_TEST( + +TRIBITS_ADD_EXECUTABLE( Combine SOURCES Tsqr_TestCombine.cpp COMM serial mpi - ARGS "--verify --testReal" + ) + +TRIBITS_ADD_TEST( + Combine + NAME Combine_100rows_5cols + COMM serial mpi + ARGS "--verify --numRows=100 --numCols=5" STANDARD_PASS_OUTPUT NUM_MPI_PROCS 1 ) -# Test TSQR::SequentialTsqr (sequential cache-blocked TSQR). -TRIBITS_ADD_EXECUTABLE( - SequentialTsqr - SOURCES Tsqr_TestSeqTsqr.cpp +TRIBITS_ADD_TEST( + Combine + NAME Combine_100rows_50cols COMM serial mpi + ARGS "--verify --numRows=100 --numCols=50" + STANDARD_PASS_OUTPUT + NUM_MPI_PROCS 1 ) TRIBITS_ADD_TEST( - SequentialTsqr - NAME SequentialTsqr_contiguousCacheBlocks + Combine + NAME Combine_10000rows_11cols COMM serial mpi - ARGS "--verify --nrows=100000 --ncols=10 --cache-block-size=5000 --contiguous-cache-blocks" + ARGS "--verify --numRows=10000 --numCols=11" STANDARD_PASS_OUTPUT NUM_MPI_PROCS 1 ) +# This executable can test any NodeTsqr subclass that +# TSQR::NodeTsqrFactory can create. It can check accuracy (--verify) +# and/or timing (--benchmark). For both of these, it can compare with +# LAPACK. Thus, this can serve as a check for your LAPACK +# implementation as well. Run the executable with --help to see all +# the options. 
It builds with or without MPI, but only runs with one +# MPI process. + +TRIBITS_ADD_EXECUTABLE( + NodeTsqr + SOURCES Tsqr_TestNodeTsqr.cpp + COMM serial mpi + ) + +SET(TSQR_SEQUENTIALTSQR_COMPLEX_BROKEN ON) +SET(TSQR_SEQUENTIALTSQR_BASE_ARGS "--verify --NodeTsqr=SequentialTsqr") +IF(TSQR_SEQUENTIALTSQR_COMPLEX_BROKEN) + SET(TSQR_SEQUENTIALTSQR_BASE_ARGS "${TSQR_SEQUENTIALTSQR_BASE_ARGS} --noTestComplex") +ELSE() + SET(TSQR_SEQUENTIALTSQR_BASE_ARGS "${TSQR_SEQUENTIALTSQR_BASE_ARGS} --testComplex") +ENDIF() + TRIBITS_ADD_TEST( - SequentialTsqr - NAME SequentialTsqr_noncontiguousCacheBlocks + NodeTsqr + NAME SequentialTsqr_contiguousCacheBlocks COMM serial mpi - ARGS "--verify --nrows=100000 --ncols=10 --cache-block-size=5000" + ARGS "${TSQR_SEQUENTIALTSQR_BASE_ARGS} --numRows=100000 --numCols=10 --cacheBlockSize=5000 --contiguousCacheBlocks" STANDARD_PASS_OUTPUT NUM_MPI_PROCS 1 ) -# Performance and accuracy test suite for TSQR::KokkosNodeTsqr -TRIBITS_ADD_EXECUTABLE_AND_TEST( - KokkosHostTsqr - SOURCES Tsqr_TestKokkosNodeTsqr.cpp +TRIBITS_ADD_TEST( + NodeTsqr + NAME SequentialTsqr_noncontiguousCacheBlocks COMM serial mpi - ARGS "--verify --numRows=100000 --numCols=10" + ARGS "${TSQR_SEQUENTIALTSQR_BASE_ARGS} --numRows=100000 --numCols=10 --cacheBlockSize=5000" STANDARD_PASS_OUTPUT NUM_MPI_PROCS 1 ) -# This test uses LAPACK's QR factorization to get a reference for -# performance and accuracy. It doesn't run any parts of the TSQR -# algorithm, but it does depend on some TSQR test code (for generating -# the test matrix and measuring accuracy). 
-TRIBITS_ADD_EXECUTABLE_AND_TEST( - Lapack - SOURCES Tsqr_TestLapack.cpp +TRIBITS_ADD_TEST( + NodeTsqr + NAME CombineNodeTsqr COMM serial mpi - ARGS "--verify --nrows=1000 --ncols=10 --ntrials=10" + ARGS "--verify --NodeTsqr=CombineNodeTsqr --numRows=1000 --numCols=15" STANDARD_PASS_OUTPUT NUM_MPI_PROCS 1 ) -# Performance and accuracy test suite for TSQR::TBB::TbbTsqr -# (shared-memory parallel cache-blocked TSQR, parallelized via Intel's -# Threading Building Blocks library). -# -# Only build TBB-enabled TSQR if (surprise!) TBB is enabled. -IF (KokkosTSQR_ENABLE_TBB) - TRIBITS_ADD_EXECUTABLE_AND_TEST( - TbbTsqr - SOURCES Tsqr_TestTbbTsqr.cpp +IF (TpetraTSQR_ENABLE_CUDA_TESTS) + TRIBITS_ADD_TEST( + NodeTsqr + NAME CuSolverNodeTsqr_11_5 COMM serial mpi - ARGS "--verify --nrows=100000 --ncols=10 --cache-block-size=50000 --contiguous-cache-blocks" + ARGS "--verify --NodeTsqr=CuSolverNodeTsqr --numRows=11 --numCols=5" STANDARD_PASS_OUTPUT NUM_MPI_PROCS 1 ) -ENDIF() -# mfh 22 Dec 2014: Disable this test, since KokkosNodeTsqr no longer -# works with the new Kokkos Node types. -# -# Performance and accuracy test suite for TSQR::KokkosNodeTsqr -# ("generic" intranode parallel TSQR). We pick an odd number of -# partitions to ensure correct results in that case, not just for -# powers of two (which everybody tests first). The number of -# partitions is the maximum parallelism available in the algorithm, -# but it's up to the Kokkos Node implementation to decide what -# hardware resources to use (e.g., how many CPU cores, how many -# threads, ...). 
-#TRIBITS_ADD_EXECUTABLE_AND_TEST( -# KokkosNodeTsqr -# SOURCES Tsqr_TestKokkosNodeTsqr.cpp -# COMM serial mpi -# ARGS "--verify --numRows=100000 --numCols=10 --numPartitions=7 --cacheSizeHint=50000 --contiguousCacheBlocks" -# STANDARD_PASS_OUTPUT -# NUM_MPI_PROCS 1 -# ) + TRIBITS_ADD_TEST( + NodeTsqr + NAME CuSolverNodeTsqr_5000_20 + COMM serial mpi + ARGS "--verify --NodeTsqr=CuSolverNodeTsqr --numRows=5000 --numCols=20" + STANDARD_PASS_OUTPUT + NUM_MPI_PROCS 1 + ) +ENDIF () # # Tests for the distributed-memory (MPI) part of TSQR. @@ -106,21 +137,99 @@ ENDIF() # Performance and accuracy test suite for TSQR::DistTsqr (which # combines triangular factors from different MPI processes). -TRIBITS_ADD_EXECUTABLE_AND_TEST( - DistTsqr_Accuracy + +# Build the TSQR::DistTsqr test executable once; the tests below run it on 1 and 4 MPI processes. +TRIBITS_ADD_EXECUTABLE( + DistTsqr SOURCES Tsqr_TestDistTsqr.cpp - COMM mpi + COMM serial mpi + ) + +TRIBITS_ADD_TEST( + DistTsqr + NAME DistTsqr_1_proc + COMM serial mpi ARGS "--verify --ncols=5 --explicit --implicit --real" STANDARD_PASS_OUTPUT NUM_MPI_PROCS 1 ) +TRIBITS_ADD_TEST( + DistTsqr + NAME DistTsqr_4_proc + COMM mpi + ARGS "--verify --ncols=5 --explicit --implicit --real" + STANDARD_PASS_OUTPUT + NUM_MPI_PROCS 4 +) + # Accuracy test for TSQR::Tsqr (the full TSQR implementation).
-TRIBITS_ADD_EXECUTABLE_AND_TEST( - FullTsqr_Accuracy +TRIBITS_ADD_EXECUTABLE( + FullTsqr SOURCES Tsqr_TestFullTsqr.cpp COMM mpi - ARGS "--numRowsLocal=100 --numCols=5 --testFactorExplicit --testReal" + ) + +SET(TSQR_FULL_BASE_ARGS "--testFactorExplicit") + +TRIBITS_ADD_TEST( + FullTsqr + NAME FullTsqr_Accuracy_100rows_5cols + COMM mpi + ARGS "--numRowsLocal=100 --numCols=5 ${TSQR_FULL_BASE_ARGS}" + STANDARD_PASS_OUTPUT + NUM_MPI_PROCS 4 +) + +TRIBITS_ADD_TEST( + FullTsqr + NAME FullTsqr_Accuracy_100rows_20cols + COMM mpi + ARGS "--numRowsLocal=100 --numCols=20 ${TSQR_FULL_BASE_ARGS}" + STANDARD_PASS_OUTPUT + NUM_MPI_PROCS 4 +) + +TRIBITS_ADD_TEST( + FullTsqr + NAME FullTsqr_Accuracy_10000rows_5cols + COMM mpi + ARGS "--numRowsLocal=10000 --numCols=5 ${TSQR_FULL_BASE_ARGS}" + STANDARD_PASS_OUTPUT + NUM_MPI_PROCS 4 +) + +TRIBITS_ADD_TEST( + FullTsqr + NAME FullTsqr_Accuracy_10000rows_20cols + COMM mpi + ARGS "--numRowsLocal=10000 --numCols=20 ${TSQR_FULL_BASE_ARGS}" + STANDARD_PASS_OUTPUT + NUM_MPI_PROCS 4 +) + +IF (TpetraTSQR_ENABLE_CUDA_TESTS) + TRIBITS_ADD_TEST( + FullTsqr + NAME FullTsqr_Accuracy_1000rows_15cols_CuSolver + COMM mpi + ARGS "--numRowsLocal=1000 --numCols=15 --NodeTsqr=CuSolverNodeTsqr ${TSQR_FULL_BASE_ARGS}" + STANDARD_PASS_OUTPUT + NUM_MPI_PROCS 4 + ) +ENDIF () + +IF(TSQR_SEQUENTIALTSQR_COMPLEX_BROKEN) + SET(TSQR_FULL_BASE_ARGS_SEQ "--noTestComplex") +ELSE() + SET(TSQR_FULL_BASE_ARGS_SEQ "--testComplex") +ENDIF() + +TRIBITS_ADD_TEST( + FullTsqr + NAME FullTsqr_Accuracy_5000rows_100cols_Sequential + COMM mpi + ARGS "--numRowsLocal=5000 --numCols=100 --NodeTsqr=SequentialTsqr ${TSQR_FULL_BASE_ARGS_SEQ}" STANDARD_PASS_OUTPUT NUM_MPI_PROCS 4 ) diff --git a/packages/tpetra/tsqr/test/CuSolver.cpp b/packages/tpetra/tsqr/test/CuSolver.cpp new file mode 100644 index 000000000000..c9e801e393ec --- /dev/null +++ b/packages/tpetra/tsqr/test/CuSolver.cpp @@ -0,0 +1,161 @@ +//@HEADER +// ************************************************************************ 
+// +// Kokkos: Node API and Parallel Node Kernels +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// ************************************************************************ +//@HEADER + +#include "Tsqr_Impl_CuBlasHandle.hpp" +#include "Tsqr_Impl_CuSolverHandle.hpp" +#include "Tsqr_Impl_CuBlas.hpp" +#include "Tsqr_Impl_CuSolver.hpp" +#include "Tsqr_Impl_CuTypes.hpp" +#include "Teuchos_StandardCatchMacros.hpp" +#include "Teuchos_UnitTestHarness.hpp" +#include "Kokkos_Core.hpp" +#include +#include + +namespace { // (anonymous) + +template +void +verifyReal (std::ostream& out, bool& success) +{ + using TSQR::Impl::CuSolver; + using TSQR::Impl::CuSolverHandle; + using TSQR::Impl::CudaValue; + using std::endl; + + CuSolverHandle s = CuSolverHandle::getSingleton (); + TEST_ASSERT( s.getHandle () != nullptr ); + + Kokkos::View info ("info"); + CuSolver solver (s, info.data ()); + + using IST = typename CudaValue::type; + static_assert (std::is_same::value, + "CudaValue::type is wrong."); + const RealType x (666.0); + out << "Original x: " << x << ": Converted x: " + << CudaValue::makeValue (x) << endl; + + using TSQR::Impl::CuBlas; + using TSQR::Impl::CuBlasHandle; + CuBlasHandle b = CuBlasHandle::getSingleton (); + TEST_ASSERT( b.getHandle () != nullptr ); + + CuBlas blas (b); +} + +#ifdef HAVE_TPETRATSQR_COMPLEX +template +void +verifyComplex (std::ostream& out, bool& success) +{ + using TSQR::Impl::CuSolver; + using TSQR::Impl::CuSolverHandle; + using TSQR::Impl::CudaValue; + using std::endl; + + CuSolverHandle s = CuSolverHandle::getSingleton (); + TEST_ASSERT( s.getHandle () != nullptr ); + + Kokkos::View info ("info"); + CuSolver solver (s, info.data ()); + + using IST = typename CudaValue::type; + + using expected_z_IST = cuDoubleComplex; + using expected_c_IST = cuFloatComplex; + constexpr bool is_z = + std::is_same>::value; + using expected_IST = typename std::conditional< + is_z, + expected_z_IST, + expected_c_IST>::type; + static_assert (std::is_same::value, + "CudaValue::type is wrong."); + const ComplexType x (666.0, 418.0); + const IST x_out = 
CudaValue::makeValue (x); + out << "Original x: " << x << ": Converted x: (" + << x_out.x << "," << x_out.y << ")" << endl; + + using TSQR::Impl::CuBlas; + using TSQR::Impl::CuBlasHandle; + CuBlasHandle b = CuBlasHandle::getSingleton (); + TEST_ASSERT( b.getHandle () != nullptr ); + + CuBlas blas (b); +} +#endif // HAVE_TPETRATSQR_COMPLEX + +void +verify (std::ostream& out, bool& success) +{ + verifyReal (out, success); + verifyReal (out, success); + +#ifdef HAVE_TPETRATSQR_COMPLEX + verifyComplex> (out, success); + verifyComplex> (out, success); +#endif // HAVE_TPETRATSQR_COMPLEX +} + +} // namespace (anonymous) + +int +main (int argc, char *argv[]) +{ + using std::cout; + using std::endl; + + cout << "Test cuBLAS and cuSOLVER handle creation" << endl; + + bool success = true; + try { + Kokkos::ScopeGuard kokkosScope (argc, argv); + verify (cout, success); + // The Trilinos test framework expects a message like this. + if (success) { + cout << "\nEnd Result: TEST PASSED" << endl; + } + else { + cout << "\nEnd Result: TEST FAILED" << endl; + } + } + TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success); + return ( success ? 
EXIT_SUCCESS : EXIT_FAILURE ); +} diff --git a/packages/tpetra/tsqr/test/Tsqr_TestCombine.cpp b/packages/tpetra/tsqr/test/Tsqr_TestCombine.cpp index 9e1344065d38..eab1f261cf03 100644 --- a/packages/tpetra/tsqr/test/Tsqr_TestCombine.cpp +++ b/packages/tpetra/tsqr/test/Tsqr_TestCombine.cpp @@ -37,30 +37,21 @@ // ************************************************************************ //@HEADER -#include "Tsqr_ConfigDefs.hpp" -#include "Teuchos_ConfigDefs.hpp" // HAVE_MPI -#include "Teuchos_Tuple.hpp" -#ifdef HAVE_MPI -# include "Teuchos_GlobalMPISession.hpp" -# include "Teuchos_oblackholestream.hpp" -#endif // HAVE_MPI #include "Teuchos_CommandLineProcessor.hpp" -#include "Teuchos_DefaultComm.hpp" -#include "Teuchos_Time.hpp" #include "Teuchos_StandardCatchMacros.hpp" +#include "Teuchos_Time.hpp" #include "Tsqr_CombineBenchmark.hpp" #include "Tsqr_CombineTest.hpp" -#ifdef HAVE_KOKKOSTSQR_COMPLEX +#ifdef HAVE_TPETRATSQR_COMPLEX # include -#endif // HAVE_KOKKOSTSQR_COMPLEX +#endif // HAVE_TPETRATSQR_COMPLEX -#include +#include "Kokkos_Core.hpp" #include #include #include - namespace { using Teuchos::RCP; @@ -76,67 +67,52 @@ namespace { // parameters. // struct TestParameters { - TestParameters () : - verify (false), - benchmark (false), - numRows (100), - numCols (5), - numTrials (3), - calibrate (false), - averageTimings (true), - testReal (true), -#ifdef HAVE_KOKKOSTSQR_COMPLEX - testComplex (true), -#endif // HAVE_KOKKOSTSQR_COMPLEX - printFieldNames (true), - printTrilinosTestStuff (true), - strictPerfTests (false), - allowance (1.2), - verbose (true), - debug (false) - {} - // Whether to run the accuracy test. - bool verify; + bool verify = true; // Whether to run the performance test. - bool benchmark; + bool benchmark = false; // Number of rows in the test matrix. - int numRows; + int numRows = 100; // Number of columns in the test matrix. - int numCols; + int numCols = 5; // Number of trials (benchmark only). 
- int numTrials; + int numTrials = 3; // Whether to pick the number of trials automatically, using an // iterative calibration process (benchmark only). - bool calibrate; - // Whether to print averaged timings over all trials (true), or the cumulative timing over all trials (false). - bool averageTimings; + bool calibrate = false; + // Whether to print averaged timings over all trials (true), or + // the cumulative timing over all trials (false). + bool averageTimings = true; // Whether to test real-arithmetic routines. - bool testReal; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - // Whether to test complex-arithmetic routines. We don't let this - // option exist unless TSQR was built with complex arithmetic - // support. - bool testComplex; -#endif // HAVE_KOKKOSTSQR_COMPLEX + bool testReal = true; + // Whether to test complex-arithmetic routines. If TSQR was not + // built with complex arithmetic support, then this must always be + // false. +#ifdef HAVE_TPETRATSQR_COMPLEX + bool testComplex = true; +#else + bool testComplex = false; +#endif // HAVE_TPETRATSQR_COMPLEX // Whether to print column (field) names. - bool printFieldNames; + bool printFieldNames = true; // Whether to print output that the Trilinos test framework // expects, in order to judge a test as passed or failed. - bool printTrilinosTestStuff; + bool printTrilinosTestStuff = true; // Whether the benchmark should fail if performance of - // TSQR::CombineNative (and TSQR::CombineFortran, if applicable) - // relative to that of TSQR::CombineDefault is not good enough. - bool strictPerfTests; + // TSQR::CombineNative relative to that of TSQR::CombineDefault is + // not good enough. + bool strictPerfTests = false; // If strictPerfTests is true: how much slower CombineNative (and // CombineFortran, if applicable) is allowed to be, relative to // CombineDefault. - double allowance; + double allowance = 1.2; // Whether to print verbose status output. 
- bool verbose; + bool verbose = true; // Whether to print debugging output to stderr. - bool debug; - std::string additionalFieldNames, additionalData; + bool debug = false; + + std::string additionalFieldNames; + std::string additionalData; }; // Benchmark TSQR::Combine. @@ -148,76 +124,60 @@ namespace { // the following fields: numRows, numCols, numTrials, // testReal, testComplex. // - // Warning: Call only on (MPI) rank 0. Otherwise, you'll run the - // test routine on every MPI rank simultaneously, but only report - // results on rank 0. + // Warning: Call only on (MPI) Process 0. Otherwise, you'll run the + // test routine on every MPI process simultaneously, but only + // report results on Process 0. void - benchmark (std::ostream& out, - const TestParameters& params) - { - std::vector seed(4); - const bool useSeedValues = false; // Fill in seed with defaults. - - using TSQR::Test::benchmarkCombine; - typedef Teuchos::Time timer_type; - - TSQR::Test::CombineBenchmarkParameters testParams; - testParams.numRows = params.numRows; - testParams.numCols = params.numCols; - testParams.testReal = params.testReal; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - testParams.testComplex = params.testComplex; -#else - testParams.testComplex = false; -#endif // HAVE_KOKKOSTSQR_COMPLEX - testParams.numTrials = params.numTrials; - testParams.calibrate = params.calibrate; - testParams.averageTimings = params.averageTimings; - testParams.strictPerfTests = params.strictPerfTests; - testParams.allowance = params.allowance; - testParams.seed = seed; - testParams.useSeedValues = useSeedValues; - testParams.additionalFieldNames = params.additionalFieldNames; - testParams.additionalData = params.additionalData; - testParams.printFieldNames = params.printFieldNames; - testParams.debug = params.debug; - - benchmarkCombine (out, testParams); - } + benchmark(std::ostream& out, + const TestParameters& params) + { + std::vector seed(4); + const bool useSeedValues = false; // Fill in seed with 
defaults. + + TSQR::Test::CombineBenchmarkParameters testParams; + testParams.numRows = params.numRows; + testParams.numCols = params.numCols; + testParams.testReal = params.testReal; + testParams.testComplex = params.testComplex; + testParams.numTrials = params.numTrials; + testParams.calibrate = params.calibrate; + testParams.averageTimings = params.averageTimings; + testParams.strictPerfTests = params.strictPerfTests; + testParams.allowance = params.allowance; + testParams.seed = seed; + testParams.useSeedValues = useSeedValues; + testParams.additionalFieldNames = params.additionalFieldNames; + testParams.additionalData = params.additionalData; + testParams.printFieldNames = params.printFieldNames; + testParams.debug = params.debug; + + using timer_type = Teuchos::Time; + TSQR::Test::benchmarkCombine(out, testParams); + } // Test accuracy of TSQR::Combine. // - // out [out] output stream for benchmark results. - // It will only be used on rank 0. + // out [out] output stream for benchmark results. It will only be + // used on Process 0. // - // params [in] test parameter struct. This method reads - // the following fields: numRows, numCols, numTrials, - // testReal, testComplex. + // params [in] test parameter struct. This method reads the + // following fields: numRows, numCols, numTrials, testReal, + // testComplex. // - // Warning: Call only on (MPI) rank 0. Otherwise, you'll run - // the test routine on every MPI rank simultaneously, but - // only report results on rank 0. + // Warning: Call only on (MPI) Process 0. Otherwise, you'll run the + // test routine on every MPI process simultaneously, but only + // report results on Process 0. 
void - verify (std::ostream& out, - const TestParameters& params) - { - typedef int ordinal_type; - - const ordinal_type numRows = params.numRows; - const ordinal_type numCols = params.numCols; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - const bool testComplex = params.testComplex; -#else - const bool testComplex = false; -#endif // HAVE_KOKKOSTSQR_COMPLEX - const bool printFieldNames = params.printFieldNames; - const bool simulateSequentialTsqr = false; - const bool debug = false; - - using TSQR::Test::verifyCombine; - verifyCombine (numRows, numCols, params.testReal, testComplex, - printFieldNames, simulateSequentialTsqr, debug); - } + verify(std::ostream& out, const TestParameters& params) + { + constexpr bool simulateSequentialTsqr = false; + constexpr bool debug = false; + + using TSQR::Test::verifyCombine; + verifyCombine(params.numRows, params.numCols, params.testReal, + params.testComplex, params.printFieldNames, + simulateSequentialTsqr, debug); + } // \brief Parse command-line options for this test // @@ -232,197 +192,178 @@ namespace { // // Return: Encapsulation of command-line options. TestParameters - parseOptions (int argc, - char* argv[], - const bool allowedToPrint, - bool& printedHelp) - { - using std::cerr; - using std::endl; - - printedHelp = false; - - // Command-line parameters, set to their default values. 
- TestParameters params; - try { - using Teuchos::CommandLineProcessor; - - CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true, - /* recognizeAllOptions=*/ true); - cmdLineProc.setDocString (docString); - cmdLineProc.setOption ("verify", - "noverify", - ¶ms.verify, - "Test accuracy of TSQR::Combine implementations."); - cmdLineProc.setOption ("benchmark", - "nobenchmark", - ¶ms.benchmark, - "Test performance of TSQR::Combine implementations."); - cmdLineProc.setOption ("debug", - "nodebug", - ¶ms.debug, - "Print copious debugging information to stderr."); - cmdLineProc.setOption ("numRows", - ¶ms.numRows, - "Number of rows in the cache block test."); - cmdLineProc.setOption ("numCols", - ¶ms.numCols, - "Number of columns in the cache block test, and " - "number of rows and columns in each upper triangular " - "matrix in the pair test."); - cmdLineProc.setOption ("numTrials", - ¶ms.numTrials, - "For benchmarks: Number of trials. " - "Ignored if --calibrate option is set."); - cmdLineProc.setOption ("calibrate", - "noCalibrate", - ¶ms.calibrate, - "For benchmarks: ignore numTrials, and calibrate " - "the number of trials based on computed timer " - "resolution and problem size (numRows and " - "numCols)."); - cmdLineProc.setOption ("meanTimings", - "sumTimings", - ¶ms.averageTimings, - "For benchmarks: whether timings should be " - "computed as an arithmetic mean (true) or as a " - "sum (false) over all trials."); - cmdLineProc.setOption ("testReal", - "noTestReal", - ¶ms.testReal, - "Test real-arithmetic routines."); -#ifdef HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("testComplex", - "noTestComplex", - ¶ms.testComplex, - "Test complex-arithmetic routines. 
This option " - "may only be set if Trilinos was built with " - "complex arithmetic support."); -#endif // HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("strictPerfTests", - "noStrictPerfTests", - ¶ms.strictPerfTests, - "For benchmarks: whether the test should fail if " - "run time of TSQR::CombineNative / run time of " - "TSQR::CombineDefault (both for the cache block " - "benchmark) is greater than the given slowdown " - "allowance. Ditto for TSQR::CombineFortran, if " - "TSQR was built with Fortran support."); - cmdLineProc.setOption ("allowance", - ¶ms.allowance, - "For benchmarks: if strictPerfTests is true: " - "allowed slowdown factor. If exceeded, the test " - "fails."); - cmdLineProc.setOption ("additionalFieldNames", - ¶ms.additionalFieldNames, - "Any additional field name(s) (comma-delimited " - "string) to add to the benchmark output. Empty " - "by default. Good for things known when invoking " - "the benchmark executable, but not (easily) known " - "inside the benchmark -- e.g., environment " - "variables."); - cmdLineProc.setOption ("additionalData", - ¶ms.additionalData, - "Any additional data to add to the output, " - "corresponding to the above field name(s). 
" - "Empty by default."); - cmdLineProc.setOption ("printFieldNames", - "noPrintFieldNames", - ¶ms.printFieldNames, - "Print field names for benchmark output (including " - "any arguments to --fieldNames)."); - cmdLineProc.setOption ("printTrilinosTestStuff", - "noPrintTrilinosTestStuff", - ¶ms.printTrilinosTestStuff, - "Print output that makes the Trilinos test " - "framework happy (but makes benchmark results " - "parsing scripts unhappy)"); - cmdLineProc.parse (argc, argv); - } - catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) { - if (allowedToPrint) - cerr << "Unrecognized command-line option: " << e.what() << endl; - throw e; - } - catch (Teuchos::CommandLineProcessor::HelpPrinted& e) { - printedHelp = true; - return params; // Don't verify parameters in this case - } - - // Validate. TODO (mfh 08 Jul 2010) Figure out how to do this with - // ParameterList validators. - if (params.numRows <= 0) - throw std::invalid_argument ("Number of rows must be positive"); - else if (params.numCols <= 0) - throw std::invalid_argument ("Number of columns must be positive"); - else if (params.numRows < params.numCols) - throw std::invalid_argument ("Number of rows must be >= number of columns"); - else if (params.benchmark && params.numTrials < 1) - throw std::invalid_argument ("Benchmark requires numTrials >= 1"); - - return params; + parseOptions(int argc, + char* argv[], + std::ostream& err, + bool& printedHelp) + { + using std::endl; + + printedHelp = false; + + // Command-line parameters, set to their default values. 
+ TestParameters params {}; + try { + constexpr bool throwExceptions = true; + constexpr bool recognizeAllOptions = true; + using CLP = Teuchos::CommandLineProcessor; + CLP cmdLineProc(throwExceptions, recognizeAllOptions); + cmdLineProc.setDocString(docString); + cmdLineProc.setOption("verify", + "noverify", + ¶ms.verify, + "Test accuracy of TSQR::Combine implementations."); + cmdLineProc.setOption("benchmark", + "nobenchmark", + ¶ms.benchmark, + "Test performance of TSQR::Combine implementations."); + cmdLineProc.setOption("debug", + "nodebug", + ¶ms.debug, + "Print copious debugging information to stderr."); + cmdLineProc.setOption("numRows", + ¶ms.numRows, + "Number of rows in the cache block test."); + cmdLineProc.setOption("numCols", + ¶ms.numCols, + "Number of columns in the cache block test, and " + "number of rows and columns in each upper triangular " + "matrix in the pair test."); + cmdLineProc.setOption("numTrials", + ¶ms.numTrials, + "For benchmarks: Number of trials. " + "Ignored if --calibrate option is set."); + cmdLineProc.setOption("calibrate", + "noCalibrate", + ¶ms.calibrate, + "For benchmarks: ignore numTrials, and calibrate " + "the number of trials based on computed timer " + "resolution and problem size (numRows and " + "numCols)."); + cmdLineProc.setOption("meanTimings", + "sumTimings", + ¶ms.averageTimings, + "For benchmarks: whether timings should be " + "computed as an arithmetic mean (true) or as a " + "sum (false) over all trials."); + cmdLineProc.setOption("testReal", + "noTestReal", + ¶ms.testReal, + "Test real-arithmetic routines."); + cmdLineProc.setOption("testComplex", + "noTestComplex", + ¶ms.testComplex, + "Test complex-arithmetic routines. 
This option " + "may only be true if Trilinos was built with " + "complex arithmetic support."); + cmdLineProc.setOption("strictPerfTests", + "noStrictPerfTests", + ¶ms.strictPerfTests, + "For benchmarks: whether the test should fail if " + "run time of TSQR::CombineNative / run time of " + "TSQR::CombineDefault (both for the cache block " + "benchmark) is greater than the given slowdown " + "allowance. Ditto for TSQR::CombineFortran, if " + "TSQR was built with Fortran support."); + cmdLineProc.setOption("allowance", + ¶ms.allowance, + "For benchmarks: if strictPerfTests is true: " + "allowed slowdown factor. If exceeded, the test " + "fails."); + cmdLineProc.setOption("additionalFieldNames", + ¶ms.additionalFieldNames, + "Any additional field name(s) (comma-delimited " + "string) to add to the benchmark output. Empty " + "by default. Good for things known when invoking " + "the benchmark executable, but not (easily) known " + "inside the benchmark -- e.g., environment " + "variables."); + cmdLineProc.setOption("additionalData", + ¶ms.additionalData, + "Any additional data to add to the output, " + "corresponding to the above field name(s). 
" + "Empty by default."); + cmdLineProc.setOption("printFieldNames", + "noPrintFieldNames", + ¶ms.printFieldNames, + "Print field names for benchmark output (including " + "any arguments to --fieldNames)."); + cmdLineProc.setOption("printTrilinosTestStuff", + "noPrintTrilinosTestStuff", + ¶ms.printTrilinosTestStuff, + "Print output that makes the Trilinos test " + "framework happy (but makes benchmark results " + "parsing scripts unhappy)"); + cmdLineProc.parse(argc, argv); + } + catch(Teuchos::CommandLineProcessor::UnrecognizedOption& e) { + err << "Unrecognized command-line option: " << e.what() << endl; + throw e; + } + catch(Teuchos::CommandLineProcessor::HelpPrinted& e) { + printedHelp = true; + return params; // Don't verify parameters in this case } -} // namespace (anonymous) - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// + TEUCHOS_TEST_FOR_EXCEPTION + (params.numRows <= 0, std::invalid_argument, "Number of " + "rows must be positive, but you set --numRows=" << + params.numRows << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (params.numCols <= 0, std::invalid_argument, "Number of " + "columns must be positive, but you set --numCols=" << + params.numCols << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (params.numRows < params.numCols, std::invalid_argument, + "Number of rows must be >= number of columns, but " + "--numRows=" << params.numRows << " and --numCols=" << + params.numCols << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (params.benchmark && params.numTrials < 1, + std::invalid_argument, "If you set --benchmark, then the " + "number of trials must be positive, but you set --numTrials=" + << params.numTrials << "."); +#ifndef HAVE_TPETRATSQR_COMPLEX + TEUCHOS_TEST_FOR_EXCEPTION + (params.testComplex, std::invalid_argument, "Complex " + "arithmetic support was not enabled at configure time, " + "but you set --testComplex."); +#endif // 
HAVE_TPETRATSQR_COMPLEX + return params; + } +} // namespace (anonymous) - int -main (int argc, char *argv[]) +int +main(int argc, char *argv[]) { - using Teuchos::RCP; - -#ifdef HAVE_MPI - typedef RCP< const Teuchos::Comm > comm_ptr; - - Teuchos::oblackholestream blackhole; - Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole); - comm_ptr comm = Teuchos::DefaultComm::getComm(); - const int myRank = comm->getRank(); - // Only Rank 0 gets to write to stdout. The other MPI process ranks - // send their output to something that looks like /dev/null (and - // likely is, on Unix-y operating systems). - std::ostream& out = (myRank == 0) ? std::cout : blackhole; - // Only Rank 0 performs the tests. - const bool performingTests = (myRank == 0); - const bool allowedToPrint = (myRank == 0); - -#else // Don't HAVE_MPI: single-node test - - const bool performingTests = true; - const bool allowedToPrint = true; - std::ostream& out = std::cout; -#endif // HAVE_MPI + using std::cout; + using std::cerr; + using std::endl; // Fetch command-line parameters. bool printedHelp = false; - TestParameters params = - parseOptions (argc, argv, allowedToPrint, printedHelp); - if (printedHelp) - return 0; - + auto params = parseOptions(argc, argv, cerr, printedHelp); + if(printedHelp) { + return EXIT_SUCCESS; + } bool success = false; - bool verbose = false; + constexpr bool actually_print_caught_exceptions = true; try { - if (performingTests) - { - using std::endl; - - if (params.benchmark) - benchmark (out, params); - - // We allow the same run to do both benchmark and verify. - if (params.verify) - verify (out, params); - - success = true; - - if (params.printTrilinosTestStuff) - // The Trilinos test framework expects a message like this. - out << "\nEnd Result: TEST PASSED" << endl; + Kokkos::ScopeGuard kokkosScope(argc, argv); + if(params.benchmark) { + benchmark(cout, params); + } + // We allow the same run to do both benchmark and verify. 
+ if(params.verify) { + verify(cout, params); + } + success = true; + if(params.printTrilinosTestStuff) { + // The Trilinos test framework expects a message like this. + cout << "\nEnd Result: TEST PASSED" << endl; } } - TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); + TEUCHOS_STANDARD_CATCH_STATEMENTS + (actually_print_caught_exceptions, cerr, success); return ( success ? EXIT_SUCCESS : EXIT_FAILURE ); } diff --git a/packages/tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp index 33210c6c81f4..a02891745b3f 100644 --- a/packages/tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp +++ b/packages/tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp @@ -37,163 +37,854 @@ // ************************************************************************ //@HEADER -#include "Tsqr_ConfigDefs.hpp" - -#ifdef HAVE_MPI -# include "Teuchos_GlobalMPISession.hpp" -# include "Teuchos_oblackholestream.hpp" -#endif // HAVE_MPI +#include "Tsqr_Random_NormalGenerator.hpp" +#include "Tsqr_generateStack.hpp" +#include "Tsqr_DistTsqr.hpp" +#include "Tsqr_GlobalTimeStats.hpp" +#include "Tsqr_GlobalVerify.hpp" +#include "Tsqr_printGlobalMatrix.hpp" +#include "Tsqr_Test_MpiAndKokkosScope.cpp" +#include "Tsqr_TeuchosMessenger.hpp" #include "Teuchos_CommandLineProcessor.hpp" -#include "Teuchos_DefaultComm.hpp" -#include "Teuchos_RCP.hpp" #include "Teuchos_Time.hpp" #include "Teuchos_StandardCatchMacros.hpp" -#include "Tsqr_ParTest.hpp" -#include "Tsqr_TeuchosMessenger.hpp" - -#ifdef HAVE_KOKKOSTSQR_COMPLEX +#include +#ifdef HAVE_TPETRATSQR_COMPLEX # include -#endif // HAVE_KOKKOSTSQR_COMPLEX - +#endif // HAVE_TPETRATSQR_COMPLEX +#include +#include #include #include #include -using TSQR::MessengerBase; -using TSQR::TeuchosMessenger; -using TSQR::Test::DistTsqrVerifier; -using TSQR::Test::DistTsqrBenchmarker; +namespace TSQR { + namespace Test { + /// \class DistTsqrVerifier + /// \brief Generic version of \c DistTsqr accuracy test. 
+ template + class DistTsqrVerifier { + TSQR::Random::NormalGenerator gen_; + Teuchos::RCP > const ordinalComm_; + Teuchos::RCP > const scalarComm_; + std::string scalarTypeName_; + std::ostream& out_; + std::ostream& err_; + const bool testFactorExplicit_, testFactorImplicit_; + const bool humanReadable_, printMatrices_, debug_; -using Teuchos::RCP; -using Teuchos::rcp; -using Teuchos::rcp_implicit_cast; -using Teuchos::Tuple; + public: + using ordinal_type = Ordinal; + using scalar_type = Scalar; + using mag_type = + typename Teuchos::ScalarTraits::magnitudeType; + using result_type = std::vector; + /// \brief Constructor, with custom seed value + /// + /// \param scalarComm [in/out] Communicator object over which to + /// test. + /// \param seed [in] 4-element vector; the random seed input of + /// TSQR::Random::NormalGenerator (which see, since there are + /// restrictions on the set of valid seeds) + /// \param scalarTypeName [in] Human-readable name of the Scalar + /// template type parameter + /// \param out [out] Output stream to which to write results + /// \param err [out] Output stream to which to write any + /// debugging outputs (if applicable) or errors + /// \param testFactorExplicit [in] Whether to test + /// DistTsqr::factorExplicit() + /// \param testFactorImplicit [in] Whether to test + /// DistTsqr::factor() and DistTsqr::explicit_Q() + /// \param humanReadable [in] Whether printed results should be + /// easy for humans to read (vs. 
easy for parsers to parse) + /// \param debug [in] Whether to write verbose debug output to + /// err + DistTsqrVerifier(const Teuchos::RCP >& ordinalComm, + const Teuchos::RCP >& scalarComm, + const std::vector& seed, + const std::string& scalarTypeName, + std::ostream& out, + std::ostream& err, + const bool testFactorExplicit, + const bool testFactorImplicit, + const bool humanReadable, + const bool printMatrices, + const bool debug) : + gen_(seed), + ordinalComm_(ordinalComm), + scalarComm_(scalarComm), + scalarTypeName_(scalarTypeName), + out_(out), + err_(err), + testFactorExplicit_(testFactorExplicit), + testFactorImplicit_(testFactorImplicit), + humanReadable_(humanReadable), + printMatrices_(printMatrices), + debug_(debug) + {} -template< class Ordinal, class Scalar > -class MessengerPairMaker { - public: - typedef int ordinal_type; - typedef Scalar scalar_type; + /// \brief Constructor, with default seed value + /// + /// This constructor sets a default seed (for the pseudorandom + /// number generator), which is the same seed (0,0,0,1) each + /// time. + /// + /// \param scalarComm [in/out] Communicator object over which to + /// test. + /// \param scalarTypeName [in] Human-readable name of the Scalar + /// template type parameter + /// \param out [out] Output stream to which to write results + /// \param err [out] Output stream to which to write any + /// debugging outputs (if applicable) or errors + /// \param testFactorExplicit [in] Whether to test + /// DistTsqr::factorExplicit() + /// \param testFactorImplicit [in] Whether to test + /// DistTsqr::factor() and DistTsqr::explicit_Q() + /// \param humanReadable [in] Whether printed results should be + /// easy for humans to read (vs. 
easy for parsers to parse) + /// \param debug [in] Whether to write verbose debug output to + /// err + DistTsqrVerifier(const Teuchos::RCP >& ordinalComm, + const Teuchos::RCP >& scalarComm, + const std::string& scalarTypeName, + std::ostream& out, + std::ostream& err, + const bool testFactorExplicit, + const bool testFactorImplicit, + const bool humanReadable, + const bool printMatrices, + const bool debug) : + ordinalComm_(ordinalComm), + scalarComm_(scalarComm), + scalarTypeName_(scalarTypeName), + out_(out), + err_(err), + testFactorExplicit_(testFactorExplicit), + testFactorImplicit_(testFactorImplicit), + humanReadable_(humanReadable), + printMatrices_(printMatrices), + debug_(debug) + {} - typedef std::pair >, RCP > > pair_type; + /// \brief Get seed vector for pseudorandom number generator + /// + /// Fill seed (changing size of vector as necessary) with the + /// seed vector used by the pseudorandom number generator. You + /// can use this to resume the pseudorandom number stream from + /// where you last were. + void + getSeed(std::vector& seed) const + { + gen_.getSeed(seed); + } - static pair_type - makePair (const RCP< const Teuchos::Comm >& comm) + /// \brief Run the DistTsqr accuracy test + /// + /// \param numCols [in] Number of columns in the matrix to test. + /// Number of rows := (# MPI processors) * ncols. 
+ void + verify(const Ordinal numCols, + const std::string& additionalFieldNames, + const std::string& additionalData, + const bool printFieldNames) { - RCP > derivedOrdinalComm = - rcp (new TeuchosMessenger (comm)); - RCP > ordinalComm = - rcp_implicit_cast > (derivedOrdinalComm); - RCP > derivedScalarComm = - rcp (new TeuchosMessenger (comm)); - RCP > scalarComm = - rcp_implicit_cast > (derivedScalarComm); - - return std::make_pair (ordinalComm, scalarComm); + using std::endl; + + const int myRank = scalarComm_->rank(); + if(debug_) { + scalarComm_->barrier(); + if(myRank == 0) { + err_ << "Verifying DistTsqr:" << endl; + } + scalarComm_->barrier(); + } + + // Generate test problem. + Matrix A_local, Q_local, R; + testProblem(A_local, Q_local, R, numCols); + if(debug_) { + scalarComm_->barrier(); + if(myRank == 0) { + err_ << "-- Generated test problem." << endl; + } + scalarComm_->barrier(); + } + + // Set up TSQR implementation. + DistTsqr par; + par.init (scalarComm_); + if(debug_) { + scalarComm_->barrier(); + if(myRank == 0) { + err_ << "-- DistTsqr object initialized" << endl << endl; + } + } + + // Whether we've printed field names (i.e., column headers) + // yet. Only matters for non-humanReadable output. + bool printedFieldNames = false; + + // Test DistTsqr::factor() and DistTsqr::explicit_Q(). 
+ if(testFactorImplicit_) { + // Factor the matrix A (copied into R, which will be + // overwritten on output) + typedef typename DistTsqr::FactorOutput + factor_output_type; + factor_output_type factorOutput = par.factor (R.view()); + if(debug_) { + scalarComm_->barrier(); + if(myRank == 0) { + err_ << "-- Finished DistTsqr::factor" << endl; + } + } + // Compute the explicit Q factor + par.explicit_Q(numCols, Q_local.data(), Q_local.stride(1), + factorOutput); + if(debug_) { + scalarComm_->barrier(); + if(myRank == 0) { + err_ << "-- Finished DistTsqr::explicit_Q" << endl; + } + } + // Verify the factorization + auto result = + global_verify(numCols, numCols, A_local.data(), + A_local.stride(1), Q_local.data(), + Q_local.stride(1), R.data(), R.stride(1), + scalarComm_.get()); + if(debug_) { + scalarComm_->barrier(); + if(myRank == 0) { + err_ << "-- Finished global_verify" << endl; + } + } + reportResults("DistTsqr", numCols, result, + additionalFieldNames, additionalData, + printFieldNames && (! printedFieldNames)); + if(printFieldNames && (! printedFieldNames)) { + printedFieldNames = true; + } + } + + // Test DistTsqr::factorExplicit() + if(testFactorExplicit_) { + // Factor the matrix and compute the explicit Q factor, both + // in a single operation. 
+ par.factorExplicit(R.view(), Q_local.view()); + if(debug_) { + scalarComm_->barrier(); + if(myRank == 0) { + err_ << "-- Finished DistTsqr::factorExplicit" << endl; + } + } + + if(printMatrices_) { + if(myRank == 0) { + err_ << std::endl << "Computed Q factor:" << std::endl; + } + printGlobalMatrix(err_, Q_local, scalarComm_.get(), + ordinalComm_.get()); + if(myRank == 0) { + err_ << std::endl << "Computed R factor:" << std::endl; + print_local_matrix (err_, R.extent(0), R.extent(1), + R.data(), R.stride(1)); + err_ << std::endl; + } + } + + // Verify the factorization + result_type result = + global_verify(numCols, numCols, A_local.data(), + A_local.stride(1), Q_local.data(), + Q_local.stride(1), R.data(), R.stride(1), + scalarComm_.get()); + if(debug_) { + scalarComm_->barrier(); + if(myRank == 0) { + err_ << "-- Finished global_verify" << endl; + } + } + reportResults("DistTsqrRB", numCols, result, + additionalFieldNames, additionalData, + printFieldNames && (! printedFieldNames)); + if(printFieldNames && (! printedFieldNames)) { + printedFieldNames = true; + } + } } -}; + private: + /// Report verification results. Call on ALL MPI processes, not + /// just Process 0. + /// + /// \param method [in] String to print before reporting results + /// \param numCols [in] Number of columns in the matrix tested. 
+ /// \param result [in] (relative residual, orthogonality) + void + reportResults (const std::string& method, + const Ordinal numCols, + const result_type& result, + const std::string& additionalFieldNames, + const std::string& additionalData, + const bool printFieldNames) + { + using std::endl; + + const int numProcs = scalarComm_->size(); + const int myRank = scalarComm_->rank(); + + if(myRank == 0) { + if(humanReadable_) { + out_ << method << " accuracy results:" << endl + << "Scalar: " << scalarTypeName_ << endl + << "numCols: " << numCols << endl + << "Number of (MPI) processes: " << numProcs << endl + << "Absolute residual $\\| A - Q R \\|_2: " + << result[0] << endl + << "Absolute orthogonality $\\| I - Q^* Q \\|_2$: " + << result[1] << endl + << "Test matrix norm $\\| A \\|_F$: " + << result[2] << endl; + } + else { + // Use scientific notation for floating-point numbers + out_ << std::scientific; + + if(printFieldNames) { + out_ << "%method,scalarType,numCols,numProcs" + ",absFrobResid,absFrobOrthog,frobA"; + if(! additionalFieldNames.empty()) + out_ << "," << additionalFieldNames; + out_ << endl; + } + + out_ << method + << "," << scalarTypeName_ + << "," << numCols + << "," << numProcs + << "," << result[0] + << "," << result[1] + << "," << result[2]; + if(! additionalData.empty()) { + out_ << "," << additionalData; + } + out_ << endl; + } + } + } + + void + testProblem(Matrix& A_local, + Matrix& Q_local, + Matrix& R, + const Ordinal numCols) + { + const Ordinal numRowsLocal = numCols; + + // A_local: Space for the matrix A to factor -- local to each + // processor. + // + // A_global: Global matrix (only nonempty on Proc 0); only + // used temporarily. + Matrix A_global; + + // This modifies A_local on all procs, and A_global on Proc 0. 
+ par_tsqr_test_problem(gen_, A_local, A_global, numCols, scalarComm_); + + if(printMatrices_) { + const int myRank = scalarComm_->rank(); + if(myRank == 0) { + err_ << "Input matrix A:" << std::endl; + } + printGlobalMatrix(err_, A_local, scalarComm_.get(), + ordinalComm_.get()); + if(myRank == 0) { + err_ << std::endl; + } + } + + // Copy the test problem input into R, since the factorization + // will overwrite it in place with the final R factor. + R.reshape(numCols, numCols); + deep_copy(R, Scalar{}); + deep_copy(R, A_local); + + // Prepare space in which to construct the explicit Q factor + // (local component on this processor) + Q_local.reshape(numRowsLocal, numCols); + deep_copy(Q_local, Scalar {}); + } + }; + + /// \class DistTsqrBenchmarker + /// \brief Generic version of DistTsqr performance test. + template< class Ordinal, class Scalar> + class DistTsqrBenchmarker { + TSQR::Random::NormalGenerator gen_; + Teuchos::RCP> scalarComm_; + Teuchos::RCP> doubleComm_; + std::string scalarTypeName_; + + std::ostream& out_; + std::ostream& err_; + const bool testFactorExplicit_; + const bool testFactorImplicit_; + const bool humanReadable_; + const bool debug_; + + public: + using ordinal_type = Ordinal; + using scalar_type = Scalar; + using timer_type = Teuchos::Time; + + /// \brief Constructor, with custom seed value + /// + /// \param scalarComm [in/out] Communicator object over which + /// to test. + /// \param doubleComm [in/out] Communicator object for doubles, + /// used for finding the min and max of timing results over + /// all the MPI processes. 
+ /// \param seed [in] 4-element vector; the random seed input of + /// TSQR::Random::NormalGenerator (which see, since there are + /// restrictions on the set of valid seeds) + /// \param scalarTypeName [in] Human-readable name of the Scalar + /// template type parameter + /// \param out [out] Output stream to which to write results + /// \param err [out] Output stream to which to write any + /// debugging outputs (if applicable) or errors + /// \param testFactorExplicit [in] Whether to test + /// DistTsqr::factorExplicit() + /// \param testFactorImplicit [in] Whether to test + /// DistTsqr::factor() and DistTsqr::explicit_Q() + /// \param humanReadable [in] Whether printed results should be + /// easy for humans to read (vs. easy for parsers to parse) + /// \param debug [in] Whether to write verbose debug output to + /// err + DistTsqrBenchmarker(const Teuchos::RCP>& scalarComm, + const Teuchos::RCP>& doubleComm, + const std::vector& seed, + const std::string& scalarTypeName, + std::ostream& out, + std::ostream& err, + const bool testFactorExplicit, + const bool testFactorImplicit, + const bool humanReadable, + const bool debug) : + gen_(seed), + scalarComm_(scalarComm), + doubleComm_(doubleComm), + scalarTypeName_(scalarTypeName), + out_(out), + err_(err), + testFactorExplicit_(testFactorExplicit), + testFactorImplicit_(testFactorImplicit), + humanReadable_(humanReadable), + debug_(debug) + {} + + /// \brief Constructor, with default seed value + /// + /// This constructor sets a default seed (for the pseudorandom + /// number generator), which is the same seed (0,0,0,1) each + /// time. + /// + /// \param scalarComm [in/out] Communicator object over which + /// to test. + /// \param doubleComm [in/out] Communicator object for doubles, + /// used for finding the min and max of timing results over + /// all the MPI processes. 
+ /// \param scalarTypeName [in] Human-readable name of the Scalar + /// template type parameter + /// \param out [out] Output stream to which to write results + /// \param err [out] Output stream to which to write any + /// debugging outputs (if applicable) or errors + /// \param testFactorExplicit [in] Whether to test + /// DistTsqr::factorExplicit() + /// \param testFactorImplicit [in] Whether to test + /// DistTsqr::factor() and DistTsqr::explicit_Q() + /// \param humanReadable [in] Whether printed results should be + /// easy for humans to read (vs. easy for parsers to parse) + /// \param debug [in] Whether to write verbose debug output to + /// err + DistTsqrBenchmarker(const Teuchos::RCP>& scalarComm, + const Teuchos::RCP>& doubleComm, + const std::string& scalarTypeName, + std::ostream& out, + std::ostream& err, + const bool testFactorExplicit, + const bool testFactorImplicit, + const bool humanReadable, + const bool debug) : + scalarComm_(scalarComm), + doubleComm_(doubleComm), + scalarTypeName_(scalarTypeName), + out_(out), + err_(err), + testFactorExplicit_(testFactorExplicit), + testFactorImplicit_(testFactorImplicit), + humanReadable_(humanReadable), + debug_(debug) + {} + + /// \brief Get seed vector for pseudorandom number generator + /// + /// Fill seed (changing size of vector as necessary) with the + /// seed vector used by the pseudorandom number generator. You + /// can use this to resume the pseudorandom number stream from + /// where you last were. + void + getSeed(std::vector& seed) const + { + gen_.getSeed(seed); + } + + /// \brief Run the DistTsqr benchmark + /// + /// \param numTrials [in] Number of times to repeat the computation + /// in a single timing run + /// \param numCols [in] Number of columns in the matrix to test. 
+ /// Number of rows := (# MPI processors) * ncols + void + benchmark(const int numTrials, + const Ordinal numCols, + const std::string& additionalFieldNames, + const std::string& additionalData, + const bool printFieldNames) + { + using std::endl; + + // Set up test problem. + Matrix A_local, Q_local, R; + testProblem(A_local, Q_local, R, numCols); + + // Set up TSQR implementation. + DistTsqr par; + par.init(scalarComm_); + + // Whether we've printed field names (i.e., column headers) + // yet. Only matters for non-humanReadable output. + bool printedFieldNames = false; + + if(testFactorImplicit_) { + std::string timerName("DistTsqr"); + + // Throw away some number of runs, because some MPI libraries + // (recent versions of OpenMPI at least) do autotuning for the + // first few collectives calls. + const int numThrowAwayRuns = 5; + for(int runNum = 0; runNum < numThrowAwayRuns; ++runNum) { + auto factorOutput = par.factor(R.view()); + par.explicit_Q(numCols, Q_local.data(), + Q_local.stride(1), factorOutput); + } + + // Now do the actual timing runs. Benchmark DistTsqr + // (factor() and explicit_Q()) for numTrials trials. + timer_type timer (timerName); + timer.start(); + for(int trialNum = 0; trialNum < numTrials; ++trialNum) { + auto factorOutput = par.factor(R.view()); + par.explicit_Q(numCols, Q_local.data(), + Q_local.stride(1), factorOutput); + } + // Cumulative timing on this MPI process. "Cumulative" + // means the elapsed time of numTrials executions. + const double localCumulativeTiming = timer.stop(); + + // reportResults() must be called on all processes, since this + // figures out the min and max timings over all processes. + reportResults(timerName, numTrials, numCols, + localCumulativeTiming, additionalFieldNames, + additionalData, + printFieldNames && (! printedFieldNames)); + if(printFieldNames && (! 
printedFieldNames)) { + printedFieldNames = true; + } + } + + if(testFactorExplicit_) { + std::string timerName ("DistTsqrRB"); + + // Throw away some number of runs, because some MPI libraries + // (recent versions of OpenMPI at least) do autotuning for the + // first few collectives calls. + const int numThrowAwayRuns = 5; + for(int runNum = 0; runNum < numThrowAwayRuns; ++runNum) { + par.factorExplicit(R.view(), Q_local.view()); + } + + // Benchmark DistTsqr::factorExplicit() for numTrials trials. + timer_type timer(timerName); + timer.start(); + for(int trialNum = 0; trialNum < numTrials; ++trialNum) { + par.factorExplicit(R.view(), Q_local.view()); + } + // Cumulative timing on this MPI process. + // "Cumulative" means the elapsed time of numTrials executions. + const double localCumulativeTiming = timer.stop(); + + // Report cumulative (not per-invocation) timing results + reportResults(timerName, numTrials, numCols, localCumulativeTiming, + additionalFieldNames, additionalData, + printFieldNames && (! printedFieldNames)); + if(printFieldNames && (! printedFieldNames)) { + printedFieldNames = true; + } + + // Per-invocation timings (for factorExplicit() benchmark + // only). localTimings were computed on this MPI process; + // globalTimings are statistical summaries of those over + // all MPI processes. We only collect that data for + // factorExplicit(). + std::vector localTimings; + std::vector globalTimings; + par.getFactorExplicitTimings(localTimings); + for(size_t k = 0; k < localTimings.size(); ++k) { + globalTimings.push_back + (globalTimeStats(*doubleComm_, localTimings[k])); + } + std::vector timingLabels; + par.getFactorExplicitTimingLabels(timingLabels); + + if(humanReadable_) { + out_ << timerName << " per-invocation benchmark results:" << endl; + } + const std::string labelLabel("label,scalarType"); + for (size_t k = 0; k < timingLabels.size(); ++k) { + // Only print column headers (i.e., field names) once, if at all. 
+ const bool printHeaders = (k == 0) && printFieldNames; + globalTimings[k].print (out_, humanReadable_, + timingLabels[k] + "," + scalarTypeName_, + labelLabel, printHeaders); + } + } + } + + private: + /// Report timing results to the given output stream + /// + /// \param method [in] String to print before reporting results + /// \param numTrials [in] Number of times to repeat the computation + /// in a single timing run + /// \param numCols [in] Number of columns in the matrix to test. + /// Number of rows := (# MPI processors) * ncols + /// \param timing [in] Total benchmark time, as measured on this + /// MPI process. This may differ on each process; we report + /// the min and the max. + /// + /// \warning Call on ALL MPI processes, not just Rank 0! + void + reportResults(const std::string& method, + const int numTrials, + const ordinal_type numCols, + const double localTiming, + const std::string& additionalFieldNames, + const std::string& additionalData, + const bool printFieldNames) + { + using std::endl; + + // Find min and max timing over all MPI processes + TimeStats localStats; + localStats.update (localTiming); + TimeStats globalStats = globalTimeStats (*doubleComm_, localStats); + + // Only Rank 0 prints the final results. 
+ const bool printResults = (doubleComm_->rank() == 0); + if(printResults) { + const int numProcs = doubleComm_->size(); + if(humanReadable_) { + out_ << method << " cumulative benchmark results " + << "(total time over all trials):" << endl + << "Scalar: " << scalarTypeName_ << endl + << "numCols: " << numCols << endl + << "MPI comm size: " << numProcs << endl + << "numTrials: " << numTrials << endl + << "Min timing (s): " << globalStats.min() << endl + << "Mean timing (s): " << globalStats.mean() << endl + << "Max timing (s): " << globalStats.max() << endl + << endl; + } + else { + // Use scientific notation for floating-point numbers + out_ << std::scientific; + + if(printFieldNames) { + out_ << "%method,scalarType,numCols,numProcs,numTrials" + << ",minTiming,meanTiming,maxTiming"; + if(! additionalFieldNames.empty()) { + out_ << "," << additionalFieldNames; + } + out_ << endl; + } + + out_ << method + << "," << scalarTypeName_ + << "," << numCols + << "," << numProcs + << "," << numTrials + << "," << globalStats.min() + << "," << globalStats.mean() + << "," << globalStats.max(); + if(! additionalData.empty()) { + out_ << "," << additionalData; + } + out_ << endl; + } + } + } + + void + testProblem(Matrix& A_local, + Matrix& Q_local, + Matrix& R, + const Ordinal numCols) + { + const Ordinal numRowsLocal = numCols; + + // A_local: Space for the matrix A to factor -- local to each + // (MPI) process. + // + // A_global: Global matrix (only nonempty on Proc 0); only + // used temporarily. + Matrix A_global; + + // This modifies A_local on all procs, and A_global on Proc 0. + par_tsqr_test_problem(gen_, A_local, A_global, numCols, + scalarComm_); + + // Copy the test problem input into R, since the factorization + // will overwrite it in place with the final R factor. 
+ R.reshape(numCols, numCols); + deep_copy(R, A_local); + + // Prepare space in which to construct the explicit Q factor + // (local component on this processor) + Q_local.reshape(numRowsLocal, numCols); + deep_copy(Q_local, Scalar {}); + } + }; + } // namespace Test +} // namespace TSQR + +template +class MessengerPairMaker { +public: + using ordinal_type = Ordinal; + using scalar_type = Scalar; + + using pair_type = std::pair< + Teuchos::RCP>, + Teuchos::RCP> + >; + + static pair_type + makePair(const Teuchos::RCP>& comm) + { + using Teuchos::RCP; + using Teuchos::rcp; + using Teuchos::rcp_implicit_cast; + using TSQR::MessengerBase; + using TSQR::TeuchosMessenger; + + auto derivedOrdinalComm = + rcp(new TeuchosMessenger(comm)); + auto ordinalComm = + rcp_implicit_cast>(derivedOrdinalComm); + auto derivedScalarComm = + rcp (new TeuchosMessenger(comm)); + auto scalarComm = + rcp_implicit_cast>(derivedScalarComm); + + return {ordinalComm, scalarComm}; + } +}; #define TSQR_TEST_DIST_TSQR( ScalarType, typeString ) \ do { \ - typedef int ordinal_type; \ - typedef ScalarType scalar_type; \ - typedef MessengerPairMaker::pair_type pair_type; \ - typedef DistTsqrVerifier verifier_type; \ - \ + using TSQR::Test::DistTsqrVerifier; \ + using LO = int; \ + using SC = ScalarType; \ + using verifier_type = DistTsqrVerifier; \ + \ std::string scalarTypeName (typeString); \ - pair_type messPair = MessengerPairMaker< ordinal_type, scalar_type >::makePair (comm); \ + auto messPair = MessengerPairMaker::makePair (comm); \ verifier_type verifier (messPair.first, messPair.second, seed, \ - scalarTypeName, out, err, \ - testFactorExplicit, testFactorImplicit, \ - humanReadable, printMatrices, debug); \ + scalarTypeName, out, err, \ + testFactorExplicit, testFactorImplicit, \ + humanReadable, printMatrices, debug); \ verifier.verify (numCols, params.additionalFieldNames, \ - params.additionalData, params.printFieldNames); \ + params.additionalData, params.printFieldNames); \ 
verifier.getSeed (seed); \ - } while(false) + } while (false) #define TSQR_BENCHMARK_DIST_TSQR( theType, typeString ) \ do { \ - typedef theType scalar_type; \ - typedef MessengerBase< scalar_type > base_messenger_type; \ - typedef RCP< base_messenger_type > base_messenger_ptr; \ - typedef TeuchosMessenger< scalar_type > derived_messenger_type; \ - typedef RCP< derived_messenger_type > derived_messenger_ptr; \ - typedef DistTsqrBenchmarker \ - benchmarker_type; \ - \ + using TSQR::Test::DistTsqrBenchmarker; \ + using Teuchos::RCP; \ + using SC = theType; \ + using base_messenger_type = TSQR::MessengerBase; \ + using base_messenger_ptr = RCP; \ + using derived_messenger_type = TSQR::TeuchosMessenger; \ + using derived_messenger_ptr = RCP; \ + using benchmarker_type = DistTsqrBenchmarker; \ + \ std::string scalarTypeName (typeString); \ - derived_messenger_ptr scalarCommDerived (new derived_messenger_type (comm)); \ - base_messenger_ptr scalarComm = \ - rcp_implicit_cast< base_messenger_type > (scalarCommDerived); \ + derived_messenger_ptr scalarCommDerived \ + (new derived_messenger_type (comm)); \ + auto scalarComm = \ + rcp_implicit_cast (scalarCommDerived); \ benchmarker_type benchmarker (scalarComm, doubleComm, seed, \ - scalarTypeName, out, err, \ - testFactorExplicit, testFactorImplicit, \ - humanReadable, debug); \ + scalarTypeName, out, err, \ + testFactorExplicit, \ + testFactorImplicit, \ + humanReadable, debug); \ benchmarker.benchmark (numTrials, numCols, \ - params.additionalFieldNames, \ - params.additionalData, \ - params.printFieldNames); \ + params.additionalFieldNames, \ + params.additionalData, \ + params.printFieldNames); \ benchmarker.getSeed (seed); \ - } while(false) - - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// + } while (false) /// \class DistTsqrTestParameters /// \brief Encapsulates values of command-line parameters 
-/// struct DistTsqrTestParameters { - DistTsqrTestParameters () : - numCols (10), - numTrials (10), - verify (false), - benchmark (false), - testReal (true), -#ifdef HAVE_KOKKOSTSQR_COMPLEX - testComplex (true), -#endif // HAVE_KOKKOSTSQR_COMPLEX - testFactorExplicit (true), - testFactorImplicit (true), - printFieldNames (true), - printTrilinosTestStuff (true), - humanReadable (false), - printMatrices (false), - debug (false) - {} - - std::string additionalFieldNames, additionalData; - int numCols, numTrials; - bool verify, benchmark; - bool testReal; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - bool testComplex; -#endif // HAVE_KOKKOSTSQR_COMPLEX - bool testFactorExplicit, testFactorImplicit; - bool printFieldNames, printTrilinosTestStuff; - bool humanReadable, printMatrices, debug; + int numCols = 10; + int numTrials = 10; + bool verify = true; + bool benchmark = false; + bool testReal = true; +#ifdef HAVE_TPETRATSQR_COMPLEX + bool testComplex = true; +#else + bool testComplex = false; +#endif // HAVE_TPETRATSQR_COMPLEX + bool testFactorExplicit = true; + bool testFactorImplicit = true; + bool printFieldNames = true; + bool printTrilinosTestStuff = true; + bool humanReadable = false; + bool printMatrices = false; + bool debug = false; + + std::string additionalFieldNames; + std::string additionalData; }; - static void -verify (RCP< const Teuchos::Comm > comm, - const DistTsqrTestParameters& params, - std::ostream& out, - std::ostream& err, - std::vector& seed, - const bool useSeed) +static void +verify(Teuchos::RCP> comm, + const DistTsqrTestParameters& params, + std::ostream& out, + std::ostream& err, + std::vector& seed, + const bool useSeed) { const bool testReal = params.testReal; -#ifdef HAVE_KOKKOSTSQR_COMPLEX const bool testComplex = params.testComplex; -#else // Don't HAVE_KOKKOSTSQR_COMPLEX - const bool testComplex = false; -#endif // HAVE_KOKKOSTSQR_COMPLEX - const int numCols = params.numCols; const bool testFactorExplicit = params.testFactorExplicit; const bool 
testFactorImplicit = params.testFactorImplicit; @@ -201,52 +892,44 @@ verify (RCP< const Teuchos::Comm > comm, const bool printMatrices = params.printMatrices; const bool debug = params.debug; - if (! useSeed) - { - seed.resize (4); + if(! useSeed) { + seed.resize(4); seed[0] = 0; seed[1] = 0; seed[2] = 0; seed[3] = 1; } - if (testReal) - { + if(testReal) { TSQR_TEST_DIST_TSQR( float, "float" ); TSQR_TEST_DIST_TSQR( double, "double" ); } - if (testComplex) - { -#ifdef HAVE_KOKKOSTSQR_COMPLEX + if(testComplex) { +#ifdef HAVE_TPETRATSQR_COMPLEX using std::complex; TSQR_TEST_DIST_TSQR( complex, "complex" ); TSQR_TEST_DIST_TSQR( complex, "complex" ); -#else // Don't HAVE_KOKKOSTSQR_COMPLEX +#else // Don't HAVE_TPETRATSQR_COMPLEX throw std::logic_error("TSQR was not built with complex " - "arithmetic support"); -#endif // HAVE_KOKKOSTSQR_COMPLEX + "arithmetic support"); +#endif // HAVE_TPETRATSQR_COMPLEX } } - static void -benchmark (RCP< const Teuchos::Comm > comm, - const DistTsqrTestParameters& params, - std::ostream& out, - std::ostream& err, - std::vector& seed, - const bool useSeed) +static void +benchmark(Teuchos::RCP> comm, + const DistTsqrTestParameters& params, + std::ostream& out, + std::ostream& err, + std::vector& seed, + const bool useSeed) { - typedef Teuchos::Time timer_type; + using timer_type = Teuchos::Time; const bool testReal = params.testReal; -#ifdef HAVE_KOKKOSTSQR_COMPLEX const bool testComplex = params.testComplex; -#else // Don't HAVE_KOKKOSTSQR_COMPLEX - const bool testComplex = false; -#endif // HAVE_KOKKOSTSQR_COMPLEX - const int numCols = params.numCols; const int numTrials = params.numTrials; const bool testFactorExplicit = params.testFactorExplicit; @@ -254,34 +937,36 @@ benchmark (RCP< const Teuchos::Comm > comm, const bool humanReadable = params.humanReadable; const bool debug = params.debug; - if (! useSeed) - { - seed.resize (4); + if(! 
useSeed) { + seed.resize(4); seed[0] = 0; seed[1] = 0; seed[2] = 0; seed[3] = 1; } - RCP< MessengerBase< double > > doubleComm = - rcp_implicit_cast< MessengerBase< double > > (RCP< TeuchosMessenger< double > > (new TeuchosMessenger< double > (comm))); + using Teuchos::rcp; + auto doubleCommSub = + rcp(new TSQR::TeuchosMessenger(comm)); + using TSQR::MessengerBase; + using Teuchos::rcp_implicit_cast; + auto doubleComm = + rcp_implicit_cast>(doubleCommSub); - if (testReal) - { + if(testReal) { TSQR_BENCHMARK_DIST_TSQR( float, "float" ); TSQR_BENCHMARK_DIST_TSQR( double, "double" ); } - if (testComplex) - { -#ifdef HAVE_KOKKOSTSQR_COMPLEX + if(testComplex) { +#ifdef HAVE_TPETRATSQR_COMPLEX using std::complex; TSQR_BENCHMARK_DIST_TSQR( complex, "complex" ); TSQR_BENCHMARK_DIST_TSQR( complex, "complex" ); -#else // Don't HAVE_KOKKOSTSQR_COMPLEX +#else // Don't HAVE_TPETRATSQR_COMPLEX throw std::logic_error("TSQR was not built with complex " - "arithmetic support"); -#endif // HAVE_KOKKOSTSQR_COMPLEX + "arithmetic support"); +#endif // HAVE_TPETRATSQR_COMPLEX } } @@ -296,44 +981,44 @@ benchmark (RCP< const Teuchos::Comm > comm, /// "help" display (summary of command-line options) /// /// \return Encapsulation of command-line options - static DistTsqrTestParameters -parseOptions (int argc, - char* argv[], - const bool allowedToPrint, - bool& printedHelp) +static DistTsqrTestParameters +parseOptions(int argc, + char* argv[], + std::ostream& err, + bool& printedHelp) { - using std::cerr; using std::endl; - printedHelp = false; // Command-line parameters, set to their default values. 
- DistTsqrTestParameters params; + DistTsqrTestParameters params {}; try { - Teuchos::CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true, - /* recognizeAllOptions=*/ true); + constexpr bool throwExceptions = true; + constexpr bool recognizeAllOptions = true; + using CLP = Teuchos::CommandLineProcessor; + CLP cmdLineProc(throwExceptions, recognizeAllOptions); const char docString[] = "This program tests TSQR::DistTsqr, which " "implements the internode-parallel part of TSQR (TSQR::Tsqr). " "Accuracy and performance tests are included."; - cmdLineProc.setDocString (docString); - cmdLineProc.setOption ("verify", + cmdLineProc.setDocString(docString); + cmdLineProc.setOption("verify", "noverify", ¶ms.verify, "Test accuracy"); - cmdLineProc.setOption ("benchmark", + cmdLineProc.setOption("benchmark", "nobenchmark", ¶ms.benchmark, "Test performance"); - cmdLineProc.setOption ("implicit", + cmdLineProc.setOption("implicit", "noimplicit", ¶ms.testFactorImplicit, "Test DistTsqr\'s factor() and explicit_Q()"); - cmdLineProc.setOption ("explicit", + cmdLineProc.setOption("explicit", "noexplicit", ¶ms.testFactorExplicit, "Test DistTsqr\'s factorExplicit()"); - cmdLineProc.setOption ("field-names", + cmdLineProc.setOption("field-names", ¶ms.additionalFieldNames, "Any additional field name(s) (comma-delimited " "string) to add to the benchmark output. Empty " @@ -341,55 +1026,54 @@ parseOptions (int argc, "the benchmark executable, but not (easily) known " "inside the benchmark -- e.g., environment " "variables."); - cmdLineProc.setOption ("output-data", + cmdLineProc.setOption("output-data", ¶ms.additionalData, "Any additional data to add to the output, " "corresponding to the above field name(s). 
" "Empty by default."); - cmdLineProc.setOption ("print-field-names", + cmdLineProc.setOption("print-field-names", "no-print-field-names", ¶ms.printFieldNames, "Print field names (for machine-readable output only)"); - cmdLineProc.setOption ("print-trilinos-test-stuff", + cmdLineProc.setOption("print-trilinos-test-stuff", "no-print-trilinos-test-stuff", ¶ms.printTrilinosTestStuff, "Print output that makes the Trilinos test " "framework happy (but makes benchmark results " "parsing scripts unhappy)"); - cmdLineProc.setOption ("print-matrices", + cmdLineProc.setOption("print-matrices", "no-print-matrices", ¶ms.printMatrices, "Print global test matrices and computed results to stderr"); - cmdLineProc.setOption ("debug", + cmdLineProc.setOption("debug", "nodebug", ¶ms.debug, "Print debugging information"); - cmdLineProc.setOption ("human-readable", + cmdLineProc.setOption("human-readable", "machine-readable", ¶ms.humanReadable, "If set, make output easy to read by humans " "(but hard to parse)"); - cmdLineProc.setOption ("ncols", + cmdLineProc.setOption("ncols", ¶ms.numCols, "Number of columns in the test matrix"); - cmdLineProc.setOption ("ntrials", + cmdLineProc.setOption("ntrials", ¶ms.numTrials, "Number of trials (only used when \"--benchmark\""); - cmdLineProc.setOption ("real", + cmdLineProc.setOption("real", "noreal", ¶ms.testReal, "Test real arithmetic routines"); -#ifdef HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("complex", + cmdLineProc.setOption("complex", "nocomplex", ¶ms.testComplex, - "Test complex arithmetic routines"); -#endif // HAVE_KOKKOSTSQR_COMPLEX + "Test complex arithmetic routines (only set to true if " + "complex arithmetic support was enabled at configure " + "time)"); cmdLineProc.parse (argc, argv); } catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) { - if (allowedToPrint) - cerr << "Unrecognized command-line option: " << e.what() << endl; + err << "Unrecognized command-line option: " << e.what() << endl; throw e; } catch 
(Teuchos::CommandLineProcessor::HelpPrinted& e) { @@ -398,73 +1082,60 @@ parseOptions (int argc, // Validate command-line options. We provide default values // for unset options, so we don't have to validate those. - if (params.numCols <= 0) - throw std::invalid_argument ("Number of columns must be positive"); - else if (params.benchmark && params.numTrials < 1) - throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1"); - + TEUCHOS_TEST_FOR_EXCEPTION + (params.numCols <= 0, std::invalid_argument, + "You set --numCols=" << params.numCols << ". The number of " + "columns in the matrix to test must be positive."); + TEUCHOS_TEST_FOR_EXCEPTION + (params.benchmark && params.numTrials < 1, std::invalid_argument, + "\"--benchmark\" option requires positive --numTrials, but you " + "set --numTrials=" << params.numTrials << "."); +#ifndef HAVE_TPETRATSQR_COMPLEX + TEUCHOS_TEST_FOR_EXCEPTION + (params.testComplex, std::invalid_argument, "Complex " + "arithmetic support was not enabled at configure time, " + "but you set --testComplex."); +#endif // HAVE_TPETRATSQR_COMPLEX return params; } -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - - int -main (int argc, char *argv[]) +int +main(int argc, char *argv[]) { -#ifdef HAVE_MPI - typedef RCP< const Teuchos::Comm > comm_ptr; - - Teuchos::oblackholestream blackhole; - Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole); - comm_ptr comm = Teuchos::DefaultComm::getComm(); - const int myRank = comm->getRank(); - // Only Rank 0 gets to write to cout and cerr. The other MPI - // process ranks send their output to a "black hole" (something that - // acts like /dev/null, and may be /dev/null). - const bool allowedToPrint = (myRank == 0); - std::ostream& out = allowedToPrint ? std::cout : blackhole; - std::ostream& err = allowedToPrint ? 
std::cerr : blackhole; - -#else // Don't HAVE_MPI: single-node test - - const bool allowedToPrint = true; - std::ostream& out = std::cout; - std::ostream& err = std::cerr; -#endif // HAVE_MPI + TSQR::Test::MpiAndKokkosScope testScope(&argc, &argv); + auto comm = testScope.getComm(); + std::ostream& out = testScope.outStream(); + std::ostream& err = testScope.errStream(); // Fetch command-line parameters. bool printedHelp = false; - DistTsqrTestParameters params = - parseOptions (argc, argv, allowedToPrint, printedHelp); - if (printedHelp) - return 0; - + auto params = parseOptions(argc, argv, err, printedHelp); + if(printedHelp) { + return EXIT_SUCCESS; + } bool success = false; - bool verbose = false; + constexpr bool actually_print_caught_exceptions = true; try { - if (params.verify) - { + if(params.verify) { std::vector seed(4); const bool useSeed = false; - verify (comm, params, out, err, seed, useSeed); + verify(comm, params, out, err, seed, useSeed); } - if (params.benchmark) - { + if(params.benchmark) { std::vector seed(4); const bool useSeed = false; - benchmark (comm, params, out, err, seed, useSeed); + benchmark(comm, params, out, err, seed, useSeed); } success = true; - if (allowedToPrint && params.printTrilinosTestStuff) + if(params.printTrilinosTestStuff) { // The Trilinos test framework expects a message like this. out << "\nEnd Result: TEST PASSED" << std::endl; + } } - TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); - return ( success ? EXIT_SUCCESS : EXIT_FAILURE ); + TEUCHOS_STANDARD_CATCH_STATEMENTS + (actually_print_caught_exceptions, err, success); + return success ? 
EXIT_SUCCESS : EXIT_FAILURE; } - - diff --git a/packages/tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp index c60d652fc651..6b14b977b01f 100644 --- a/packages/tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp +++ b/packages/tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp @@ -38,65 +38,65 @@ //@HEADER #include "Tsqr_FullTsqrTest.hpp" - -#ifdef HAVE_MPI -# include "Teuchos_GlobalMPISession.hpp" -# include "Teuchos_oblackholestream.hpp" -#endif // HAVE_MPI +#include "Tsqr_Test_MpiAndKokkosScope.cpp" #include "Teuchos_CommandLineProcessor.hpp" -#include "Teuchos_DefaultComm.hpp" #include "Teuchos_StandardCatchMacros.hpp" -#ifdef HAVE_KOKKOSTSQR_COMPLEX +#ifdef HAVE_TPETRATSQR_COMPLEX # include -#endif // HAVE_KOKKOSTSQR_COMPLEX +#endif // HAVE_TPETRATSQR_COMPLEX namespace { - // - // Documentation string to print out if --help is a command-line argument. - // - const char docString[] = "This program tests correctness and accuracy of " - "TSQR::Tsqr, which is the full implementation of TSQR."; + using Teuchos::parameterList; + + // Documentation string to print out if --help is a command-line + // argument. + const char docString[] = "This program tests correctness and " + "accuracy of TSQR::Tsqr, which is the full implementation of " + "TSQR."; - // // Encapsulation of all command-line parameters. - // struct CmdLineOptions { - // - // Given a default valid parameter list from FullTsqrVerifierCaller, - // fill in the command-line options with their default values. 
- // - CmdLineOptions (const Teuchos::RCP& testParams) : - cacheSizeHint (testParams->get ("cacheSizeHint")), - numRowsLocal (testParams->get ("numRowsLocal")), - numCols (testParams->get ("numCols")), - contiguousCacheBlocks (testParams->get ("contiguousCacheBlocks")), - testFactorExplicit (testParams->get ("testFactorExplicit")), - testRankRevealing (testParams->get ("testRankRevealing")), - printFieldNames (testParams->get ("printFieldNames")), - printResults (testParams->get ("printResults")), - failIfInaccurate (testParams->get ("failIfInaccurate")), - debug (testParams->get ("debug")), -#ifdef HAVE_KOKKOSTSQR_COMPLEX - testComplex (false), -#endif // HAVE_KOKKOSTSQR_COMPLEX - testReal (false) // default is not to test _anything_ - {} + // Given a default valid parameter list from + // FullTsqrVerifierCaller, fill in the command-line options with + // their default values. + CmdLineOptions(const Teuchos::RCP& testParams) : + cacheSizeHint(testParams->get("Cache Size Hint")), + numRowsLocal(testParams->get("numRowsLocal")), + numCols(testParams->get("numCols")), + contiguousCacheBlocks(testParams->get("contiguousCacheBlocks")), + testFactorExplicit(testParams->get("testFactorExplicit")), + testRankRevealing(testParams->get("testRankRevealing")), + printFieldNames(testParams->get("printFieldNames")), + printResults(testParams->get("printResults")), + failIfInaccurate(testParams->get("failIfInaccurate")), + nodeTsqr(testParams->get("NodeTsqr")), +#ifdef HAVE_TPETRATSQR_COMPLEX + testComplex(true), +#else + testComplex(false), +#endif // HAVE_TPETRATSQR_COMPLEX + testReal(true), + verbose(testParams->get("verbose")) + {} - size_t cacheSizeHint; - int numRowsLocal; - int numCols; - bool contiguousCacheBlocks; - bool testFactorExplicit; - bool testRankRevealing; - bool printFieldNames; - bool printResults; - bool failIfInaccurate; - bool debug; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - bool testComplex; -#endif // HAVE_KOKKOSTSQR_COMPLEX - bool testReal; + size_t 
cacheSizeHint = 0; + int numRowsLocal = 10000; + int numCols = 5; + bool contiguousCacheBlocks = false; + bool testFactorExplicit = true; + bool testRankRevealing = true; + bool printFieldNames = true; + bool printResults = true; + bool failIfInaccurate = true; + std::string nodeTsqr {"Default"}; +#ifdef HAVE_TPETRATSQR_COMPLEX + bool testComplex = true; +#else + bool testComplex = false; +#endif // HAVE_TPETRATSQR_COMPLEX + bool testReal = true; + bool verbose = false; // \brief Read command-line options. // @@ -108,100 +108,116 @@ namespace { // // \param argv [in] As usual in C(++). // - // \param allowedToPrint [in] Whether this (MPI) process is allowed - // to print to stdout/stderr. Different per (MPI) process. - // - // \param printedHelp [out] Whether this (MPI) process printed the - // "help" display (summary of command-line options) - // // \param testParams [in] List of test parameters for the // FullTsqrVerifierCaller. // + // \param err [out] Output stream to which to print error + // messages. Different per (MPI) process. + // // \return Whether help was printed. bool - read (int argc, - char* argv[], - const Teuchos::RCP& defaultParams, - const bool allowedToPrint) - { - using std::cerr; - using std::endl; - - try { - Teuchos::CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true, - /* recognizeAllOptions=*/ true); - cmdLineProc.setDocString (docString); - cmdLineProc.setOption ("testReal", - "noTestReal", - &testReal, - "Test real Scalar types"); -#ifdef HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("testComplex", - "noTestComplex", - &testComplex, - "Test complex Scalar types"); -#endif // HAVE_KOKKOSTSQR_COMPLEX - // CommandLineProcessor takes int arguments, but not size_t - // arguments, so we have to read in the argument as an int and - // convert back to size_t later. 
- int cacheSizeHintAsInt = cacheSizeHint; - cmdLineProc.setOption ("cacheSizeHint", - &cacheSizeHintAsInt, - defaultParams->getEntry("cacheSizeHint").docString().c_str()); - cmdLineProc.setOption ("numRowsLocal", - &numRowsLocal, - defaultParams->getEntry("numRowsLocal").docString().c_str()); - cmdLineProc.setOption ("numCols", - &numCols, - defaultParams->getEntry("numCols").docString().c_str()); - cmdLineProc.setOption ("contiguousCacheBlocks", - "noContiguousCacheBlocks", - &contiguousCacheBlocks, - defaultParams->getEntry("contiguousCacheBlocks").docString().c_str()); - cmdLineProc.setOption ("testFactorExplicit", - "noTestFactorExplicit", - &testFactorExplicit, - defaultParams->getEntry("testFactorExplicit").docString().c_str()); - cmdLineProc.setOption ("testRankRevealing", - "noTestRankRevealing", - &testRankRevealing, - defaultParams->getEntry("testRankRevealing").docString().c_str()); - cmdLineProc.setOption ("printFieldNames", - "noPrintFieldNames", - &printFieldNames, - defaultParams->getEntry("printFieldNames").docString().c_str()); - cmdLineProc.setOption ("printResults", - "noPrintResults", - &printResults, - defaultParams->getEntry("printResults").docString().c_str()); - cmdLineProc.setOption ("failIfInaccurate", - "noFailIfInaccurate", - &failIfInaccurate, - defaultParams->getEntry("failIfInaccurate").docString().c_str()); - cmdLineProc.setOption ("debug", - "nodebug", - &debug, - defaultParams->getEntry("debug").docString().c_str()); - cmdLineProc.parse (argc, argv); - cacheSizeHint = static_cast (cacheSizeHintAsInt); - } - catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) { - if (allowedToPrint) - cerr << "Unrecognized command-line option: " << e.what() << endl; - throw e; - } - catch (Teuchos::CommandLineProcessor::HelpPrinted& e) { - return true; - } + read(int argc, + char* argv[], + const Teuchos::RCP& defaultParams, + std::ostream& err) + { + using Teuchos::CommandLineProcessor; + using std::endl; - // Validate command-line 
options. We provide default values - // for unset options, so we don't have to validate those. - TEUCHOS_TEST_FOR_EXCEPTION(numRowsLocal <= 0, std::invalid_argument, - "Number of rows per process must be positive."); - TEUCHOS_TEST_FOR_EXCEPTION(numCols <= 0, std::invalid_argument, - "Number of columns must be positive."); - return false; // Did not print help + try { + const bool throwExceptions = true; + const bool recognizeAllOptions = true; + CommandLineProcessor cmdLineProc(throwExceptions, + recognizeAllOptions); + cmdLineProc.setDocString(docString); + cmdLineProc.setOption("testReal", + "noTestReal", + &testReal, + "Test real Scalar types"); + cmdLineProc.setOption("testComplex", + "noTestComplex", + &testComplex, + "Test complex Scalar types; must be " + "false if complex Scalar types were " + "disabled at configure (pre-build) " + "time"); + // CommandLineProcessor takes int arguments, but not size_t + // arguments, so we have to read in the argument as an int and + // convert back to size_t later. 
+ int cacheSizeHintAsInt = cacheSizeHint; + cmdLineProc.setOption("cacheSizeHint", + &cacheSizeHintAsInt, + defaultParams->getEntry + ("Cache Size Hint").docString().c_str()); + cmdLineProc.setOption("numRowsLocal", + &numRowsLocal, + defaultParams->getEntry + ("numRowsLocal").docString().c_str()); + cmdLineProc.setOption("numCols", + &numCols, + defaultParams->getEntry + ("numCols").docString().c_str()); + cmdLineProc.setOption("contiguousCacheBlocks", + "noContiguousCacheBlocks", + &contiguousCacheBlocks, + defaultParams->getEntry + ("contiguousCacheBlocks").docString().c_str()); + cmdLineProc.setOption("testFactorExplicit", + "noTestFactorExplicit", + &testFactorExplicit, + defaultParams->getEntry + ("testFactorExplicit").docString().c_str()); + cmdLineProc.setOption("testRankRevealing", + "noTestRankRevealing", + &testRankRevealing, + defaultParams->getEntry + ("testRankRevealing").docString().c_str()); + cmdLineProc.setOption("printFieldNames", + "noPrintFieldNames", + &printFieldNames, + defaultParams->getEntry + ("printFieldNames").docString().c_str()); + cmdLineProc.setOption("printResults", + "noPrintResults", + &printResults, + defaultParams->getEntry + ("printResults").docString().c_str()); + cmdLineProc.setOption("failIfInaccurate", + "noFailIfInaccurate", + &failIfInaccurate, + defaultParams->getEntry + ("failIfInaccurate").docString().c_str()); + cmdLineProc.setOption("NodeTsqr", + &nodeTsqr, + defaultParams->getEntry + ("NodeTsqr").docString().c_str()); + cmdLineProc.setOption("verbose", + "quiet", + &verbose, + defaultParams->getEntry + ("verbose").docString().c_str()); + cmdLineProc.parse(argc, argv); + cacheSizeHint = size_t(cacheSizeHintAsInt); + } + catch(Teuchos::CommandLineProcessor::UnrecognizedOption& e) { + err << "Unrecognized command-line option: " << e.what() + << endl; + throw e; } + catch(Teuchos::CommandLineProcessor::HelpPrinted& e) { + return true; + } + + // Validate command-line options. 
We provide default values + // for unset options, so we don't have to validate those. + TEUCHOS_TEST_FOR_EXCEPTION + (numRowsLocal <= 0, std::invalid_argument, + "Number of rows per process must be positive."); + TEUCHOS_TEST_FOR_EXCEPTION + (numCols <= 0, std::invalid_argument, + "Number of columns must be positive."); + return false; // Did not print help + } }; // @@ -210,37 +226,34 @@ namespace { // the command line), return a parameter list describing the test. // Teuchos::RCP - testParameters (const Teuchos::RCP& validParams, - const CmdLineOptions& options) - { - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::RCP; - - RCP testParams = parameterList ("FullTsqrVerifier"); - testParams->set ("cacheSizeHint", options.cacheSizeHint); - testParams->set ("numRowsLocal", options.numRowsLocal); - testParams->set ("numCols", options.numCols); - testParams->set ("testFactorExplicit", options.testFactorExplicit); - testParams->set ("testRankRevealing", options.testRankRevealing); - testParams->set ("contiguousCacheBlocks", options.contiguousCacheBlocks); - testParams->set ("printFieldNames", options.printFieldNames); - testParams->set ("printResults", options.printResults); - testParams->set ("failIfInaccurate", options.failIfInaccurate); - testParams->set ("debug", options.debug); + testParameters(const Teuchos::RCP& validParams, + const CmdLineOptions& options) + { + auto testParams = parameterList ("FullTsqrVerifier"); + testParams->set("Cache Size Hint", options.cacheSizeHint); + testParams->set("numRowsLocal", options.numRowsLocal); + testParams->set("numCols", options.numCols); + testParams->set("testFactorExplicit", + options.testFactorExplicit); + testParams->set("testRankRevealing", options.testRankRevealing); + testParams->set("contiguousCacheBlocks", + options.contiguousCacheBlocks); + testParams->set("printFieldNames", options.printFieldNames); + testParams->set("printResults", options.printResults); + 
testParams->set("failIfInaccurate", options.failIfInaccurate); + testParams->set("NodeTsqr", options.nodeTsqr); + testParams->set("verbose", options.verbose); - testParams->validateParametersAndSetDefaults (*validParams); - return testParams; - } + testParams->validateParametersAndSetDefaults(*validParams); + return testParams; + } - // // Return true if all tests were successful, else false. - // bool - test (int argc, - char* argv[], - const Teuchos::RCP >& comm, - const bool allowedToPrint) + test(int argc, + char* argv[], + const Teuchos::RCP >& comm, + std::ostream& err) { using TSQR::Test::NullCons; using TSQR::Test::Cons; @@ -249,41 +262,35 @@ namespace { using Teuchos::parameterList; using Teuchos::RCP; using Teuchos::rcp; - // - // Get a default random seed, and set up the Caller (that iterates - // the test over all Scalar types of interest). - // - typedef TSQR::Test::FullTsqrVerifierCaller caller_type; - std::vector randomSeed = caller_type::defaultRandomSeed (); - caller_type caller (comm, randomSeed); - // + // The Caller iterates the test over all Scalar types. + using caller_type = TSQR::Test::FullTsqrVerifierCaller; + caller_type caller(comm, caller_type::defaultRandomSeed ()); + // Read command-line options - // - RCP defaultParams = caller.getValidParameterList(); - CmdLineOptions cmdLineOpts (defaultParams); - const bool printedHelp = cmdLineOpts.read (argc, argv, defaultParams, allowedToPrint); + auto defaultParams = caller.getValidParameterList(); + CmdLineOptions cmdLineOpts(defaultParams); + const bool printedHelp = + cmdLineOpts.read(argc, argv, defaultParams, err); // Don't run the tests (and do succeed) if help was printed. - if (printedHelp) + if(printedHelp) { return true; + } // // Use read-in command-line options to set up test parameters. 
// - RCP testParams = testParameters (defaultParams, cmdLineOpts); + auto testParams = testParameters(defaultParams, cmdLineOpts); defaultParams = null; // save a little space - // // Define lists of Scalar types to test. We keep separate lists // for real and complex types, since callers can control whether // each of these is tested independently on the command line. - // - typedef Cons > real_type_list; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - typedef Cons, Cons, NullCons> > complex_type_list; -#endif // HAVE_KOKKOSTSQR_COMPLEX + using real_type_list = Cons>; +#ifdef HAVE_TPETRATSQR_COMPLEX + using complex_type_list = Cons, Cons, NullCons>>; +#endif // HAVE_TPETRATSQR_COMPLEX - // // Run the tests. If the tests are set up to fail on // insufficiently inaccurate results, run() will throw an // exception in that case. Otherwise, the tests return nothing, @@ -292,15 +299,18 @@ namespace { // The testReal and testComplex options are read in at the command // line, but since they do not apply to all Scalar types, they // don't belong in testParams. - // - if (cmdLineOpts.testReal) - caller.run (testParams); -#ifdef HAVE_KOKKOSTSQR_COMPLEX - if (cmdLineOpts.testComplex) - caller.run (testParams); -#endif // HAVE_KOKKOSTSQR_COMPLEX + const bool realResult = cmdLineOpts.testReal ? + caller.run(testParams) : + true; +#ifdef HAVE_TPETRATSQR_COMPLEX + const bool complexResult = cmdLineOpts.testComplex ? 
+ caller.run(testParams) : + true; +#else + const bool complexResult = true; +#endif // HAVE_TPETRATSQR_COMPLEX - return true; // for success + return realResult && complexResult; } } // namespace (anonymous) @@ -308,47 +318,22 @@ namespace { int main (int argc, char* argv[]) { - using TSQR::Test::NullCons; - using TSQR::Test::Cons; - using Teuchos::null; - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::RCP; - using Teuchos::rcp; using std::endl; + TSQR::Test::MpiAndKokkosScope testScope(&argc, &argv); + auto comm = testScope.getComm(); + std::ostream& out = testScope.outStream(); + std::ostream& err = testScope.errStream(); -#ifdef HAVE_MPI - typedef RCP > comm_ptr; - - Teuchos::oblackholestream blackhole; - Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole); - comm_ptr comm = Teuchos::DefaultComm::getComm(); - const int myRank = comm->getRank(); - // Only Rank 0 gets to write to cout and cerr. The other MPI - // process ranks send their output to a "black hole" (something that - // acts like /dev/null, and may be /dev/null). - const bool allowedToPrint = (myRank == 0); - std::ostream& out = allowedToPrint ? std::cout : blackhole; - std::ostream& err = allowedToPrint ? std::cerr : blackhole; - // Make sure that err gets "used" - (void) err; - -#else // Don't HAVE_MPI: single-process test - - const bool allowedToPrint = true; - std::ostream& out = std::cout; - std::ostream& err = std::cerr; -#endif // HAVE_MPI - - bool success = false; - bool verbose = false; + constexpr bool actually_print_caught_exceptions = true; + bool success = false; // hopefully this will be true later try { - success = test (argc, argv, comm, allowedToPrint); - if (allowedToPrint && success) { + success = test(argc, argv, comm, err); + if(success) { // The Trilinos test framework expects a message like this. 
out << "\nEnd Result: TEST PASSED" << endl; } } - TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); + TEUCHOS_STANDARD_CATCH_STATEMENTS + (actually_print_caught_exceptions, err, success); return ( success ? EXIT_SUCCESS : EXIT_FAILURE ); } diff --git a/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp deleted file mode 100644 index d47000f68846..000000000000 --- a/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp +++ /dev/null @@ -1,373 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#include "Teuchos_CommandLineProcessor.hpp" -#include "Teuchos_DefaultComm.hpp" -#include "Teuchos_StandardCatchMacros.hpp" -#include "Tsqr_KokkosNodeTsqrTest.hpp" -#include "Kokkos_Core.hpp" - -#ifdef HAVE_KOKKOSTSQR_COMPLEX -# include -#endif // HAVE_KOKKOSTSQR_COMPLEX - -namespace { - // - // The documentation string for this test executable to print out at - // the command line on request. - // - const char docString[] = "This program tests TSQR::KokkosNodeTsqr, " - "which implements an intranode parallel version of TSQR for " - "Kokkos::DefaultHostExecutionSpace. Accuracy and performance " - "tests are included."; - - // - // TestParameters encapsulates values of command-line parameters, as - // well as state that may change from one benchmark / verify - // invocation to the next. 
- // - class TestParameters { - public: - TestParameters () = default; - TestParameters (const std::vector /* theSeed */); - - bool verify = true; - bool benchmark = false; - int numRows = 100000; - int numCols = 10; - int numTrials = 1; - bool testReal = true; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - bool testComplex = true; -#endif // HAVE_KOKKOSTSQR_COMPLEX - int numPartitions = 16; - int cacheSizeHint = 0; - bool contiguousCacheBlocks = false; - bool printFieldNames = true; - bool humanReadable = true; - bool debug = false; - }; - - // Run the test(s) for a particular Scalar type T. - // Used by Cons, which in turn is used by runTests(). - template - class Dispatcher { - public: - typedef T dispatch_type; - - static void - benchmark (std::vector&, - const TestParameters& params, - bool& printFieldNames) - { - using TSQR::Test::benchmarkKokkosNodeTsqr; - benchmarkKokkosNodeTsqr (params.numTrials, - params.numRows, - params.numCols, - params.numPartitions, - params.cacheSizeHint, - params.contiguousCacheBlocks, - printFieldNames, - params.humanReadable); - printFieldNames = false; - } - - static void - verify (std::vector& seed, - const TestParameters& params, - bool& printFieldNames) - { - TSQR::Random::NormalGenerator gen (seed); - using TSQR::Test::verifyKokkosNodeTsqr; - verifyKokkosNodeTsqr (gen, - params.numRows, - params.numCols, - params.numPartitions, - params.cacheSizeHint, - params.contiguousCacheBlocks, - printFieldNames, - params.humanReadable, - params.debug); - printFieldNames = false; - // Save the seed for next time, since we can't use the same - // NormalGenerator for a different Scalar type T. - gen.getSeed (seed); - } - }; - - // - // Class for executing a template function over a compile-time - // fixed-length list of types. See runTests() for an example. 
- // - template - class Cons { - public: - static void - verify (std::vector& seed, - const TestParameters& params, - bool& printFieldNames) - { - Dispatcher::verify (seed, params, printFieldNames); - CdrType::verify (seed, params, printFieldNames); - } - - static void - benchmark (std::vector& seed, - const TestParameters& params, - bool& printFieldNames) - { - Dispatcher::benchmark (seed, params, printFieldNames); - CdrType::benchmark (seed, params, printFieldNames); - } - }; - - // Base case for Cons template recursion. - class NullCons { - public: - static void - verify (std::vector&, - const TestParameters&, - bool& printFieldNames) {} - - static void - benchmark (std::vector&, - const TestParameters&, - bool& printFieldNames) {} - }; - - // Run the tests for all types of interest. - void - runTests (const TestParameters& params) - { - using real_tests = Cons>; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - using complex_tests = - Cons, Cons, NullCons>>; -#endif // HAVE_KOKKOSTSQR_COMPLEX - - // Length-4 seed for the pseudorandom number generator. The last - // entry must be an odd number. There are other restrictions on - // these values; see the LAPACK documentation for details. (0, 0, - // 0, 1) is a typical initial seed if you want reproducible - // results, but don't actually care much about randomness. - std::vector seed {{0, 0, 0, 1}}; - - bool printFieldNames = params.printFieldNames; - if (params.verify) { - if (params.testReal) { - real_tests::verify (seed, params, printFieldNames); - } -#ifdef HAVE_KOKKOSTSQR_COMPLEX - if (params.testComplex) { - complex_tests::verify (seed, params, printFieldNames); - } -#endif // HAVE_KOKKOSTSQR_COMPLEX - } - // Reset this, since the first call of verify() sets it to false. 
- printFieldNames = params.printFieldNames; - if (params.benchmark) { - if (params.testReal) { - real_tests::benchmark (seed, params, printFieldNames); - } -#ifdef HAVE_KOKKOSTSQR_COMPLEX - if (params.testComplex) { - complex_tests::benchmark (seed, params, printFieldNames); - } -#endif // HAVE_KOKKOSTSQR_COMPLEX - } - } - - // Parse command-line options for this test. - // - // argc [in] As usual in C(++) - // - // argv [in] As usual in C(++) - // - // allowedToPrint [in] Whether this (MPI) process is allowed - // to print to stdout/stderr. Different per (MPI) process. - // - // printedHelp [out] Whether this (MPI) process printed the - // "help" display (summary of command-line options). - // - // Return an encapsulation of the command-line options. - TestParameters - parseOptions (int argc, - char* argv[], - const bool allowedToPrint, - bool& printedHelp) - { - using std::cerr; - using std::endl; - - printedHelp = false; - - // Command-line parameters, set to their default values. - TestParameters params; - /// We really want the cache size hint as a size_t, but - /// Teuchos::CommandLineProcessor doesn't offer that option. So - /// we read it in as an int, which means negative inputs are - /// possible. We check for those below in the input validation - /// phase. - // - // Fetch default value of cacheSizeHint. 
- int cacheSizeHint = params.cacheSizeHint; - try { - using Teuchos::CommandLineProcessor; - - CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true, - /* recognizeAllOptions=*/ true); - cmdLineProc.setDocString (docString); - cmdLineProc.setOption ("verify", - "noverify", - &params.verify, - "Test accuracy"); - cmdLineProc.setOption ("benchmark", - "nobenchmark", - &params.benchmark, - "Test performance"); - cmdLineProc.setOption ("numRows", - &params.numRows, - "Number of rows in the test matrix"); - cmdLineProc.setOption ("numCols", - &params.numCols, - "Number of columns in the test matrix"); - cmdLineProc.setOption ("numTrials", - &params.numTrials, - "Number of trials (only used when \"--benchmark\""); - cmdLineProc.setOption ("testReal", - "noTestReal", - &params.testReal, - "Test real arithmetic"); -#ifdef HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("testComplex", - "noTestComplex", - &params.testComplex, - "Test complex arithmetic"); -#endif // HAVE_KOKKOSTSQR_COMPLEX - params.numPartitions = Kokkos::DefaultHostExecutionSpace::concurrency(); - cmdLineProc.setOption ("numPartitions", - &params.numPartitions, - "Number of partitions to use (max available parallelism)"); - cmdLineProc.setOption ("cacheSizeHint", - &cacheSizeHint, - "Cache size hint in bytes (0 means pick a reasonable default)"); - cmdLineProc.setOption ("contiguousCacheBlocks", - "noncontiguousCacheBlocks", - &params.contiguousCacheBlocks, - "Whether cache blocks should be stored contiguously"); - cmdLineProc.setOption ("printFieldNames", - "noPrintFieldNames", - &params.printFieldNames, - "Print field names (for machine-readable output only)"); - cmdLineProc.setOption ("humanReadable", - "machineReadable", - &params.humanReadable, - "If set, make output easy to read by humans " - "(but hard to parse)"); - cmdLineProc.setOption ("debug", - "noDebug", - &params.debug, - "Print debugging information"); - cmdLineProc.parse (argc, argv); - } - catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) { - if (allowedToPrint) - 
cerr << "Unrecognized command-line option: " << e.what() << endl; - throw e; - } - catch (Teuchos::CommandLineProcessor::HelpPrinted& e) { - printedHelp = true; - return params; // Don't verify parameters in this case - } - - // Validate command-line options. We provide default values - // for unset options, so we don't have to validate those. - if (params.numRows <= 0) { - throw std::invalid_argument ("Number of rows must be positive"); - } else if (params.numCols <= 0) { - throw std::invalid_argument ("Number of columns must be positive"); - } else if (params.numRows < params.numCols) { - throw std::invalid_argument ("Number of rows must be >= number of columns"); - } else if (params.benchmark && params.numTrials < 1) { - throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1"); - } else if (params.numPartitions < 1) { - throw std::invalid_argument ("\"--numPartitions\" option must be >= 1"); - } else if (params.cacheSizeHint < 0) { - throw std::invalid_argument ("Cache size hint must be nonnegative"); - } - return params; - } -} // namespace (anonymous) - -// -// The "main" test driver. -// -int -main (int argc, char *argv[]) -{ - using Teuchos::ParameterList; - using Teuchos::RCP; - using Teuchos::rcp; - - bool performingTests = true; - const bool allowedToPrint = true; - std::ostream& out = std::cout; - - // Fetch command-line parameters. - bool printedHelp = false; - TestParameters params = - parseOptions (argc, argv, allowedToPrint, printedHelp); - if (printedHelp) { - return EXIT_SUCCESS; - } - - bool success = false; - bool verbose = false; - try { - if (performingTests) { - Kokkos::ScopeGuard kokkosScope (argc, argv); - runTests (params); - success = true; - // The Trilinos test framework expects a message like this. - out << "\nEnd Result: TEST PASSED" << std::endl; - } - } - TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); - return success ? 
EXIT_SUCCESS : EXIT_FAILURE; -} diff --git a/packages/tpetra/tsqr/test/Tsqr_TestLapack.cpp b/packages/tpetra/tsqr/test/Tsqr_TestLapack.cpp deleted file mode 100644 index 3c4da413287b..000000000000 --- a/packages/tpetra/tsqr/test/Tsqr_TestLapack.cpp +++ /dev/null @@ -1,320 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#include "Tsqr_ConfigDefs.hpp" -#include "Teuchos_ConfigDefs.hpp" // HAVE_MPI -#include "Teuchos_Tuple.hpp" -#ifdef HAVE_MPI -# include "Teuchos_GlobalMPISession.hpp" -# include "Teuchos_oblackholestream.hpp" -#endif // HAVE_MPI -#include "Teuchos_CommandLineProcessor.hpp" -#include "Teuchos_DefaultComm.hpp" -#include "Teuchos_StandardCatchMacros.hpp" -#include "Tsqr_SeqTest.hpp" - -#ifdef HAVE_KOKKOSTSQR_COMPLEX -# include -#endif // HAVE_KOKKOSTSQR_COMPLEX - -#include -#include -#include - - -namespace TSQR { - namespace Trilinos { - namespace Test { - - const char docString[] = "This program compares LAPACK\'s QR factorization" - " (with TSQR). 
Accuracy and performance tests are included."; - - using Teuchos::RCP; - using Teuchos::Tuple; - - /// \class LapackTestParameters - /// \brief Encapsulates values of command-line parameters - /// - struct LapackTestParameters { - LapackTestParameters () : - verify (false), - benchmark (false), - numRows (1000), - numCols (10), - numTrials (10), -#ifdef HAVE_KOKKOSTSQR_COMPLEX - testComplex (true), -#endif // HAVE_KOKKOSTSQR_COMPLEX - printFieldNames (true), - printTrilinosTestStuff (true), - humanReadable (false), - debug (false) - {} - - bool verify, benchmark; - int numRows, numCols, numTrials; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - bool testComplex; -#endif // HAVE_KOKKOSTSQR_COMPLEX - std::string additionalFieldNames, additionalData; - bool printFieldNames, printTrilinosTestStuff, humanReadable, debug; - }; - - static void - benchmark (std::ostream& out, - const LapackTestParameters& params) - { -#ifdef HAVE_KOKKOSTSQR_COMPLEX - const bool testComplex = params.testComplex; -#else - const bool testComplex = false; -#endif // HAVE_KOKKOSTSQR_COMPLEX - - using TSQR::Test::benchmarkLapack; - benchmarkLapack (out, - params.numRows, - params.numCols, - params.numTrials, - testComplex, - params.additionalFieldNames, - params.additionalData, - params.printFieldNames, - params.humanReadable); - } - - static void - verify (std::ostream& out, - const LapackTestParameters& params) - { -#ifdef HAVE_KOKKOSTSQR_COMPLEX - const bool testComplex = params.testComplex; -#else - const bool testComplex = false; -#endif // HAVE_KOKKOSTSQR_COMPLEX - - using TSQR::Test::verifyLapack; - verifyLapack (out, - params.numRows, - params.numCols, - testComplex, - params.additionalFieldNames, - params.additionalData, - params.printFieldNames, - params.humanReadable, - params.debug); - } - - /// \brief Parse command-line options for this test - /// - /// \param argc [in] As usual in C(++) - /// \param argv [in] As usual in C(++) - /// \param allowedToPrint [in] Whether this (MPI) process is 
allowed - /// to print to stdout/stderr. Different per (MPI) process. - /// \param printedHelp [out] Whether this (MPI) process printed the - /// "help" display (summary of command-line options) - /// - /// \return Encapsulation of command-line options - static LapackTestParameters - parseOptions (int argc, - char* argv[], - const bool allowedToPrint, - bool& printedHelp) - { - using std::cerr; - using std::endl; - - printedHelp = false; - - // Command-line parameters, set to their default values. - LapackTestParameters params; - - try { - using Teuchos::CommandLineProcessor; - - CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true, - /* recognizeAllOptions=*/ true); - cmdLineProc.setDocString (docString); - cmdLineProc.setOption ("verify", - "noverify", - ¶ms.verify, - "Test accuracy"); - cmdLineProc.setOption ("benchmark", - "nobenchmark", - ¶ms.benchmark, - "Test performance"); - cmdLineProc.setOption ("nrows", - ¶ms.numRows, - "Number of rows in the test matrix"); - cmdLineProc.setOption ("ncols", - ¶ms.numCols, - "Number of columns in the test matrix"); - cmdLineProc.setOption ("ntrials", - ¶ms.numTrials, - "Number of trials (only used when \"--benchmark\""); -#ifdef HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("complex", - "nocomplex", - ¶ms.testComplex, - "Test complex arithmetic, as well as real"); -#endif // HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("field-names", - ¶ms.additionalFieldNames, - "Any additional field name(s) (comma-delimited " - "string) to add to the benchmark output. Empty " - "by default. Good for things known when invoking " - "the benchmark executable, but not (easily) known " - "inside the benchmark -- e.g., environment " - "variables."); - cmdLineProc.setOption ("output-data", - ¶ms.additionalData, - "Any additional data to add to the output, " - "corresponding to the above field name(s). 
" - "Empty by default."); - cmdLineProc.setOption ("print-field-names", - "no-print-field-names", - ¶ms.printFieldNames, - "Print field names for benchmark output (including " - "any arguments to --field-names)."); - cmdLineProc.setOption ("print-trilinos-test-stuff", - "no-print-trilinos-test-stuff", - ¶ms.printTrilinosTestStuff, - "Print output that makes the Trilinos test " - "framework happy (but makes benchmark results " - "parsing scripts unhappy)"); - cmdLineProc.setOption ("human-readable", - "machine-readable", - ¶ms.humanReadable, - "If set, make output easy to read by humans " - "(but hard to parse)"); - cmdLineProc.setOption ("debug", - "nodebug", - ¶ms.debug, - "Print debugging information"); - cmdLineProc.parse (argc, argv); - } - catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) { - if (allowedToPrint) - cerr << "Unrecognized command-line option: " << e.what() << endl; - throw e; - } - catch (Teuchos::CommandLineProcessor::HelpPrinted& e) { - printedHelp = true; - return params; // Don't verify parameters in this case - } - - // Validate command-line options. We provide default values - // for unset options, so we don't have to validate those. 
- if (params.numRows <= 0) - throw std::invalid_argument ("Number of rows must be positive"); - else if (params.numCols <= 0) - throw std::invalid_argument ("Number of columns must be positive"); - else if (params.numRows < params.numCols) - throw std::invalid_argument ("Number of rows must be >= number of columns"); - else if (params.benchmark && params.numTrials < 1) - throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1"); - return params; - } - - } // namespace Test - } // namespace Trilinos -} // namespace TSQR - - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - - int -main (int argc, char *argv[]) -{ - using Teuchos::RCP; - using TSQR::Trilinos::Test::LapackTestParameters; - using TSQR::Trilinos::Test::parseOptions; - using std::endl; - -#ifdef HAVE_MPI - typedef RCP< const Teuchos::Comm > comm_ptr; - - Teuchos::oblackholestream blackhole; - Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole); - comm_ptr comm = Teuchos::DefaultComm::getComm(); - const int myRank = comm->getRank(); - // Only Rank 0 gets to write to stdout. The other MPI process ranks - // send their output to something that looks like /dev/null (and - // likely is, on Unix-y operating systems). - std::ostream& out = (myRank == 0) ? std::cout : blackhole; - // Only Rank 0 performs the tests. - const bool performingTests = (myRank == 0); - const bool allowedToPrint = (myRank == 0); - -#else // Don't HAVE_MPI: single-node test - - const bool performingTests = true; - const bool allowedToPrint = true; - std::ostream& out = std::cout; -#endif // HAVE_MPI - - // Fetch command-line parameters. 
- bool printedHelp = false; - LapackTestParameters params = - parseOptions (argc, argv, allowedToPrint, printedHelp); - if (printedHelp) - return 0; - - bool success = false; - bool verbose = false; - try { - if (performingTests) - { - if (params.benchmark) - TSQR::Trilinos::Test::benchmark (out, params); - - // We allow the same run to do both benchmark and verify. - if (params.verify) - TSQR::Trilinos::Test::verify (out, params); - - success = true; - - if (params.printTrilinosTestStuff) - // The Trilinos test framework expects a message like this. - out << "\nEnd Result: TEST PASSED" << endl; - } - } - TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); - return ( success ? EXIT_SUCCESS : EXIT_FAILURE ); -} diff --git a/packages/tpetra/tsqr/test/Tsqr_TestNodeTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestNodeTsqr.cpp new file mode 100644 index 000000000000..85a96725c507 --- /dev/null +++ b/packages/tpetra/tsqr/test/Tsqr_TestNodeTsqr.cpp @@ -0,0 +1,1532 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos: Node API and Parallel Node Kernels +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// ************************************************************************ +//@HEADER + +#include "Teuchos_CommandLineProcessor.hpp" +#include "Teuchos_StandardCatchMacros.hpp" +#include "Teuchos_Time.hpp" + +#include "Tsqr_Impl_Lapack.hpp" +#include "Tsqr_Random_NormalGenerator.hpp" +#include "Tsqr_LocalVerify.hpp" +#include "Tsqr_Matrix.hpp" +#include "Tsqr_NodeTsqrFactory.hpp" +#include "Tsqr_nodeTestProblem.hpp" +#include "Tsqr_Util.hpp" + +#include +#include +#include // size_t definition +#include +#include +#include +#include +#include +#include +#include + +namespace TSQR { + namespace Test { + + using execution_space = Kokkos::DefaultExecutionSpace; + using memory_space = execution_space::memory_space; + using device_type = + Kokkos::Device; + + // Command-line arguments and other test parameters. 
+ struct NodeTestParameters { + NodeTestParameters() = default; + + std::string nodeTsqrType {"Default"}; + bool verify = true; + bool benchmark = false; + int numRows = 10000; + int numCols = 10; + int numTrials = 10; + bool testReal = true; +#ifdef HAVE_TPETRATSQR_COMPLEX + bool testComplex = true; +#else + bool testComplex = false; +#endif // HAVE_TPETRATSQR_COMPLEX + size_t cacheSizeHint = 0; + bool contiguousCacheBlocks = false; + bool printFieldNames = true; + bool printTrilinosTestStuff = true; + bool humanReadable = false; + bool verbose = false; + bool saveMatrices = false; + }; + + void + printNodeTestParameters(std::ostream& out, + const NodeTestParameters& p, + const std::string& prefix) + { + using std::endl; + out << prefix << "NodeTsqr: " << p.nodeTsqrType << endl + << prefix << "numRows: " << p.numRows << endl + << prefix << "numCols: " << p.numCols << endl + << prefix << "numTrials: " << p.numTrials << endl + << prefix << "testReal: " + << (p.testReal ? "true" : "false") << endl + << prefix << "testComplex: " + << (p.testComplex ? "true" : "false") << endl + << prefix << "cacheSizeHint: " << p.cacheSizeHint << endl + << prefix << "contiguousCacheBlocks: " + << (p.contiguousCacheBlocks ? "true" : "false") << endl + << prefix << "printFieldNames: " + << (p.printFieldNames ? "true" : "false") << endl + << prefix << "printTrilinosTestStuff: " + << (p.printTrilinosTestStuff ? "true" : "false") << endl + << prefix << "humanReadable: " + << (p.humanReadable ? "true" : "false") << endl + << prefix << "verbose: " + << (p.verbose ? "true" : "false") << endl + << prefix << "saveMatrices: " + << (p.saveMatrices ? 
"true" : "false") << endl; + } + + void + setBoolCmdLineOpt(Teuchos::CommandLineProcessor& cmdLineProc, + bool* variable, + const char trueString[], + const char falseString[], + const char docString[]) + { + cmdLineProc.setOption(trueString, falseString, variable, + docString); + } + + // \brief Parse command-line options for this test + // + // \param argc [in] As usual in C(++). + // \param argv [in] As usual in C(++). + // \param printedHelp [out] Whether this function printed the + // "help" display (summary of command-line options). + // + // \return Encapsulation of command-line options + static NodeTestParameters + parseOptions(int argc, + char* argv[], + bool& printedHelp) + { + using std::cerr; + using std::endl; + + printedHelp = false; + + // Command-line parameters, set to their default values. + NodeTestParameters params; + /// We really want the cache block size as a size_t, but + /// Teuchos::CommandLineProcessor doesn't offer that option. + /// So we read it in as an int, which means negative inputs + /// are possible. We check for those below in the input + /// validation phase. + // + // Fetch default value of cacheSizeHint. + int cacheSizeHintAsInt = static_cast(params.cacheSizeHint); + try { + const bool throwExceptions = true; + const bool recognizeAllOptions = false; + using Teuchos::CommandLineProcessor; + CommandLineProcessor cmdLineProc(throwExceptions, + recognizeAllOptions); + const char docString[] = "This program tests TSQR::NodeTsqr, " + "which implements the intraprocess part of TSQR. 
" + "Accuracy and performance tests are included."; + cmdLineProc.setDocString(docString); + + setBoolCmdLineOpt(cmdLineProc, ¶ms.verify, + "verify", + "noverify", + "Test accuracy"); + setBoolCmdLineOpt(cmdLineProc, ¶ms.benchmark, + "benchmark", + "nobenchmark", + "Test performance"); + cmdLineProc.setOption("numRows", + ¶ms.numRows, + "Number of rows in the test matrix"); + cmdLineProc.setOption("numCols", + ¶ms.numCols, + "Number of columns in the test matrix"); + cmdLineProc.setOption("numTrials", + ¶ms.numTrials, + "Number of trials (only used when " + "\"--benchmark\""); + setBoolCmdLineOpt(cmdLineProc, ¶ms.testReal, + "testReal", + "noTestReal", + "Test real arithmetic"); + setBoolCmdLineOpt(cmdLineProc, ¶ms.testComplex, + "testComplex", + "noTestComplex", + "Test complex arithmetic"); + cmdLineProc.setOption("cacheBlockSize", + &cacheSizeHintAsInt, + "Cache size hint in bytes (0 means " + "pick a reasonable default)"); + setBoolCmdLineOpt(cmdLineProc, + ¶ms.contiguousCacheBlocks, + "contiguousCacheBlocks", + "noncontiguousCacheBlocks", + "Whether cache blocks should be stored contiguously"); + setBoolCmdLineOpt(cmdLineProc, ¶ms.printFieldNames, + "printFieldNames", + "noPrintFieldNames", + "Print field names (for machine-readable output only)"); + setBoolCmdLineOpt(cmdLineProc, ¶ms.printTrilinosTestStuff, + "printTrilinosTestStuff", + "noPrintTrilinosTestStuff", + "Print output that makes the Trilinos test " + "framework happy, but may make benchmark " + "results' parsing scripts unhappy."); + setBoolCmdLineOpt(cmdLineProc, ¶ms.humanReadable, + "humanReadable", + "machineReadable", + "If set, make output easy to read by " + "humans, but harder to parse."); + setBoolCmdLineOpt(cmdLineProc, ¶ms.verbose, + "verbose", + "quiet", + "Print verbose debugging information"); + setBoolCmdLineOpt(cmdLineProc, ¶ms.saveMatrices, + "saveMatrices", + "noSaveMatrices", + "If set, dump matrices to files."); + cmdLineProc.setOption("NodeTsqr", + ¶ms.nodeTsqrType, + "NodeTsqr 
subclass type"); + cmdLineProc.parse(argc, argv); + } + catch(Teuchos::CommandLineProcessor::UnrecognizedOption& e) { + cerr << "Unrecognized command-line option: " << e.what() + << endl; + throw e; + } + catch(Teuchos::CommandLineProcessor::HelpPrinted& e) { + printedHelp = true; + return params; // Don't verify parameters in this case + } + + // Validate command-line options. We provide default values + // for unset options, so we don't have to validate those. + TEUCHOS_TEST_FOR_EXCEPTION + (params.numRows <= 0, std::invalid_argument, "Number of " + "rows must be positive, but you set --numRows=" << + params.numRows << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (params.numCols <= 0, std::invalid_argument, "Number of " + "columns must be positive, but you set --numCols=" << + params.numCols << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (params.numRows < params.numCols, std::invalid_argument, + "Number of rows must be >= number of columns, but you set " + "--numRows=" << params.numRows << " and --numCols=" << + params.numCols << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (params.benchmark && params.numTrials < 1, + std::invalid_argument, "Since you set --benchmark, the " + "number of trials must be positive, but you set --numTrials=" + << params.numTrials << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (cacheSizeHintAsInt < 0, std::invalid_argument, "Cache size " + "hint must be nonnegative, but you set --cacheBlockSize=" << + cacheSizeHintAsInt << "."); + params.cacheSizeHint = size_t(cacheSizeHintAsInt); + return params; + } + + template + using kokkos_value_type = typename std::conditional< + std::is_const::value, + const typename Kokkos::ArithTraits< + typename std::remove_const::type>::val_type, + typename Kokkos::ArithTraits::val_type + >::type; + + template + Kokkos::View**, + Kokkos::LayoutLeft, Kokkos::HostSpace, + Kokkos::MemoryTraits> + getHostMatrixView(const MatView& A) + { + using Kokkos::ALL; + using Kokkos::subview; + using IST = kokkos_value_type; + using host_mat_view_type 
= + Kokkos::View>; + + const size_t nrows(A.extent(0)); + const size_t ncols(A.extent(1)); + const size_t lda(A.stride(1)); + IST* A_raw = reinterpret_cast(A.data()); + host_mat_view_type A_full(A_raw, lda, ncols); + const std::pair rowRange(0, nrows); + return Kokkos::subview(A_full, rowRange, Kokkos::ALL()); + } + + template + Kokkos::View::val_type**, + Kokkos::LayoutLeft> + getDeviceMatrixCopy(const MatView& A, + const std::string& label) + { + using Kokkos::view_alloc; + using Kokkos::WithoutInitializing; + using IST = typename Kokkos::ArithTraits::val_type; + using device_matrix_type = + Kokkos::View; + + const size_t nrows(A.extent(0)); + const size_t ncols(A.extent(1)); + device_matrix_type A_dev + (view_alloc(label, WithoutInitializing), nrows, ncols); + auto A_host = getHostMatrixView(A); + Kokkos::deep_copy(A_dev, A_host); + return A_dev; + } + + + template class LapackType, class Scalar> + static int + lworkQueryLapackQr(LapackType& lapack, + const int nrows, + const int ncols, + const int lda) + { + const int lwork_geqrf = + lapack.compute_QR_lwork(nrows, ncols, nullptr, lda); + // A workspace query appropriate for computing the explicit Q + // factor (nrows x ncols) in place, from the QR factorization of + // an nrows x ncols matrix with leading dimension lda. + const int lwork_ungqr = + lapack.compute_explicit_Q_lwork(nrows, ncols, ncols, + nullptr, lda, nullptr); + return std::max(lwork_geqrf, lwork_ungqr); + } + + template + Teuchos::RCP< + typename ::TSQR::NodeTsqrFactory::node_tsqr_type + > + getNodeTsqr(const NodeTestParameters& p, + const std::string& overrideNodeTsqrType = "") + { + const std::string nodeTsqrType = [&] () { + if(overrideNodeTsqrType == "") { + return p.nodeTsqrType; + } + else { + return overrideNodeTsqrType; + } + }(); + using fct_type = ::TSQR::NodeTsqrFactory; + auto nodeTsqr = fct_type::getNodeTsqr(nodeTsqrType); + TEUCHOS_ASSERT( ! 
nodeTsqr.is_null() ); + auto nodeTsqrParams = Teuchos::parameterList("NodeTsqr"); + nodeTsqrParams->set("Cache Size Hint", p.cacheSizeHint); + nodeTsqr->setParameterList(nodeTsqrParams); + return nodeTsqr; + } + + static void + printVerifyFieldNames(std::ostream& out) + { + const char prefix[] = "%"; + out << prefix << "method" + << ",scalarType" + << ",numRows" + << ",numCols" + << ",cacheSizeHint" + << ",contiguousCacheBlocks" + << ",frobA" + << ",absFrobResid" + << ",absFrobOrthog"; + out << std::endl; + } + + template + static std::string + getFileSuffix(const std::string& method) + { + std::string shortScalarType; + if(std::is_same::value) { + shortScalarType = "S"; + } + else if(std::is_same::value) { + shortScalarType = "D"; + } + else if(std::is_same>::value) { + shortScalarType = "C"; + } + else if(std::is_same>::value) { + shortScalarType = "Z"; + } + else { + shortScalarType = "U"; // unknown + } + const std::string sep("_"); + return sep + method + sep + shortScalarType + ".txt"; + } + + // Test the accuracy of a NodeTsqr implementation on an nrows by + // ncols matrix (using the given cache block size (in bytes)), + // and print the results to stdout. 
+ template + static bool + verifyNodeTsqrTmpl(std::ostream& out, + std::vector& iseed, + const NodeTestParameters& params) + { + using Teuchos::TypeNameTraits; + using std::cerr; + using std::endl; + using STS = Teuchos::ScalarTraits; + using mag_type = typename STS::magnitudeType; + using STM = Teuchos::ScalarTraits; + const bool verbose = params.verbose; + const std::string scalarType = TypeNameTraits::name(); + const std::string fileSuffix = + getFileSuffix(params.nodeTsqrType); + if(verbose) { + cerr << "Test NodeTsqr with Scalar=" << scalarType << endl; + } + + bool success = true; + + const int nrows = params.numRows; + const int ncols = params.numCols; + + Matrix A(nrows, ncols); + Matrix A_copy(nrows, ncols); + Matrix Q(nrows, ncols); + Matrix R(ncols, ncols); + if(std::numeric_limits::has_quiet_NaN) { + deep_copy(A, std::numeric_limits::quiet_NaN()); + deep_copy(A_copy, std::numeric_limits::quiet_NaN()); + deep_copy(Q, std::numeric_limits::quiet_NaN()); + deep_copy(R, std::numeric_limits::quiet_NaN()); + } + const int lda = nrows; + const int ldq = nrows; + const int ldr = ncols; + + if(verbose) { + cerr << "-- Create test problem" << endl; + } + { + TSQR::Random::NormalGenerator gen(iseed); + nodeTestProblem(gen, nrows, ncols, A.data(), A.stride(1), + true); + gen.getSeed(iseed); // fetch seed for the next test + } + + if(params.saveMatrices) { + std::string filename = std::string("A") + fileSuffix; + if(verbose) { + cerr << "-- Save A to \"" << filename << "\"" << endl; + } + std::ofstream fileOut(filename.c_str()); + print_local_matrix(fileOut, nrows, ncols, + A.data(), A.stride(1)); + fileOut.close(); + } + + auto nodeTsqrPtr = getNodeTsqr(params); + auto& actor = *nodeTsqrPtr; + if(verbose && actor.wants_device_memory()) { + cerr << "-- NodeTsqr claims to want device memory" << endl; + } + + using IST = typename Kokkos::ArithTraits::val_type; + using device_matrix_type = + Kokkos::View; + + auto A_h = getHostMatrixView(A.view()); + auto A_copy_h = 
getHostMatrixView(A_copy.view()); + auto Q_h = getHostMatrixView(Q.view()); + device_matrix_type A_d; + device_matrix_type A_copy_d; + device_matrix_type Q_d; + if(actor.wants_device_memory()) { + A_d = getDeviceMatrixCopy(A.view(), "A_d"); + // Don't copy A_copy yet; see below. + A_copy_d = device_matrix_type("A_copy_d", nrows, ncols); + Q_d = device_matrix_type("Q_d", nrows, ncols); + } + + if(! params.contiguousCacheBlocks) { + if(verbose) { + cerr << "-- Copy A into A_copy" << endl; + } + deep_copy(A_copy, A); + if(actor.wants_device_memory()) { + deep_copy(A_copy_d, A_d); + } + } + else { + if(verbose) { + cerr << "-- Copy A into A_copy via cache_block" << endl; + } + if(actor.wants_device_memory()) { + Scalar* A_copy_d_raw = + reinterpret_cast(A_copy_d.data()); + const Scalar* A_d_raw = + reinterpret_cast(A_d.data()); + actor.cache_block(nrows, ncols, A_copy_d_raw, + A_d_raw, A_d.stride(1)); + Kokkos::deep_copy(A_copy_h, A_copy_d); + } + else { + actor.cache_block(nrows, ncols, A_copy.data(), + A.data(), A.stride(1)); + } + if(verbose) { + cerr << "-- Verify cache_block result" << endl; + } + + Matrix A2(nrows, ncols); + if(std::numeric_limits::has_quiet_NaN) { + deep_copy(A2, std::numeric_limits::quiet_NaN()); + } + if(actor.wants_device_memory()) { + auto A2_h = getHostMatrixView(A2.view()); + auto A2_d = getDeviceMatrixCopy(A2.view(), "A2_d"); + Scalar* A2_d_raw = reinterpret_cast(A2_d.data()); + const Scalar* A_copy_d_raw = + reinterpret_cast(A_copy_d.data()); + actor.un_cache_block(nrows, ncols, A2_d_raw, + A2_d.stride(1), A_copy_d_raw); + Kokkos::deep_copy(A2_h, A2_d); + } + else { + actor.un_cache_block(nrows, ncols, A2.data(), + A2.stride(1), A_copy.data()); + } + const bool matrices_equal = matrix_equal(A, A2); + if(! matrices_equal) { + success = false; + if(verbose) { + cerr << "*** cache_block failed!" 
<< endl; + } + } + } + + if(verbose) { + cerr << "-- Fill R with zeros" << endl; + } + // We need to fill R with zeros, since the factorization may not + // overwrite the strict lower triangle of R. + deep_copy(R, Scalar {}); + + if(verbose) { + cerr << "-- Call NodeTsqr::factor" << endl; + } + // R is always in host memory, because that's what Belos wants. + auto factorOutput = [&]() { + if(actor.wants_device_memory()) { + Scalar* A_copy_d_raw = + reinterpret_cast(A_copy_d.data()); + TEUCHOS_ASSERT( nrows == 0 || ncols == 0 || + A_copy_d_raw != nullptr ); + TEUCHOS_ASSERT( size_t(A_copy_d.extent(0)) == + size_t(nrows) ); + TEUCHOS_ASSERT( size_t(A_copy_d.extent(1)) == + size_t(ncols) ); + auto result = + actor.factor(nrows, ncols, A_copy_d_raw, + A_copy_d.stride(1), + R.data(), R.stride(1), + params.contiguousCacheBlocks); + Kokkos::deep_copy(A_copy_h, A_copy_d); + return result; + } + else { + return actor.factor(nrows, ncols, A_copy.data(), + A_copy.stride(1), + R.data(), R.stride(1), + params.contiguousCacheBlocks); + } + }(); + + if(params.saveMatrices) { + std::string filename = std::string("R") + fileSuffix; + if(verbose) { + cerr << "-- Save R to \"" << filename << "\"" << endl; + } + std::ofstream fileOut(filename.c_str()); + print_local_matrix(fileOut, ncols, ncols, + R.data(), R.stride(1)); + fileOut.close(); + } + + if(verbose) { + cerr << "-- Call NodeTsqr::explicit_Q" << endl; + } + if(actor.wants_device_memory()) { + const Scalar* A_copy_d_raw = + reinterpret_cast(A_copy_d.data()); + Scalar* Q_d_raw = reinterpret_cast(Q_d.data()); + TEUCHOS_ASSERT( nrows == 0 || ncols == 0 || + Q_d_raw != nullptr ); + TEUCHOS_ASSERT( size_t(Q_d.extent(0)) == size_t(nrows) ); + TEUCHOS_ASSERT( size_t(Q_d.extent(1)) == size_t(ncols) ); + actor.explicit_Q(nrows, ncols, + A_copy_d_raw, A_copy_d.stride(1), + *factorOutput, ncols, + Q_d_raw, Q_d.stride(1), + params.contiguousCacheBlocks); + // We copy back to Q_h below, either with un_cache_block (if + // contiguous cache 
blocks) or directly (if not). + } + else { + actor.explicit_Q(nrows, ncols, + A_copy.data(), A_copy.stride(1), + *factorOutput, ncols, + Q.data(), Q.stride(1), + params.contiguousCacheBlocks); + } + + // "Un"-cache-block the output, if contiguous cache blocks were + // used. This is only necessary because local_verify() doesn't + // currently support contiguous cache blocks. + if(params.contiguousCacheBlocks) { + // Use A_copy as temporary storage for un-cache-blocking Q. + if(verbose) { + cerr << "-- Call NodeTsqr::un_cache_block" << endl; + } + if(actor.wants_device_memory()) { + Scalar* A_copy_d_raw = + reinterpret_cast(A_copy_d.data()); + const Scalar* Q_d_raw = + reinterpret_cast(Q_d.data()); + actor.un_cache_block(nrows, ncols, A_copy_d_raw, + A_copy_d.stride(1), Q_d_raw); + Kokkos::deep_copy(Q_h, A_copy_d); + } + else { + actor.un_cache_block(nrows, ncols, A_copy.data(), + A_copy.stride(1), Q.data()); + deep_copy(Q, A_copy); + } + } + else { + if(actor.wants_device_memory()) { + Kokkos::deep_copy(Q_h, Q_d); + } + } + + if(params.saveMatrices) { + std::string filename = std::string("Q") + fileSuffix; + if(verbose) { + cerr << "-- Save Q to \"" << filename << "\"" << endl; + } + std::ofstream fileOut(filename.c_str()); + print_local_matrix(fileOut, nrows, ncols, + Q.data(), Q.stride(1)); + fileOut.close(); + } + + if(verbose) { + cerr << "-- Call local_verify to validate the factorization" + << endl; + } + auto results = local_verify(nrows, ncols, A.data(), lda, + Q.data(), ldq, R.data(), ldr); + + if(verbose) { + cerr << "-- Compute accuracy bounds and check" << endl; + } + + // Accuracy relates to the number of floating-point operations, + // which in turn is a function of the matrix's dimensions. + // Avoid overflow of the local Ordinal type, by casting first to + // a floating-point type. 
+ const mag_type dimsProd = mag_type(nrows) * mag_type(ncols) * + mag_type(ncols); + const mag_type fudgeFactor(10.0); + // Relative residual error is ||A-Q*R|| / ||A||, or just + // ||A-Q*R|| if ||A|| == 0. (The result had better be zero in + // the latter case.) Square root of the matrix dimensions is an + // old heuristic from Wilkinson or perhaps even an earlier + // source. We include a "fudge factor" so that the test won't + // fail unless there is a really good reason. + const mag_type relResidBound = fudgeFactor * + STM::squareroot(dimsProd) * STS::eps(); + + // Relative residual error; avoid division by zero. + const mag_type relResidError = results[0] / + (results[2] == STM::zero() ? STM::one() : results[2]); + + if(relResidError > relResidBound) { + success = false; + if(verbose) { + const std::string relResStr + (results[2] == STM::zero() ? " / ||A||_F" : ""); + cerr << "*** For NodeTsqr=" << params.nodeTsqrType + << " with Scalar=" << scalarType << ": " + << "Residual ||A - QR||_F" << relResStr + << " = " << relResidError << " > bound " + << relResidBound << "." << endl; + } + } + + // Orthogonality of the matrix should not depend on the matrix + // dimensions, if we measure in the 2-norm. However, we are + // measuring in the Frobenius norm, so it's appropriate to + // multiply eps by the number of entries in the matrix for which + // we compute the Frobenius norm. We include a "fudge factor" + // for the same reason as mentioned above. + const mag_type orthoBound = fudgeFactor * + mag_type(ncols) * mag_type(ncols) * STS::eps(); + + const mag_type orthoError = results[1]; + if(orthoError > orthoBound) { + success = false; + if(verbose) { + cerr << "*** For NodeTsqr=" << params.nodeTsqrType + << " with Scalar=" << scalarType << ": " + << "Orthogonality ||I - Q^* Q||_F = " << orthoError + << " > bound " << orthoBound << "." 
<< endl; + } + } + + if(params.humanReadable) { + out << "NodeTsqr subclass: " << params.nodeTsqrType + << endl + << " - Scalar type: " << scalarType << endl + << " - Matrix dimensions: " << nrows << " by " << ncols + << endl + << " - Cache Size Hint: " << params.cacheSizeHint + << endl + << " - Contiguous cache blocks: " + << (params.contiguousCacheBlocks ? "true" : "false") + << endl + << " - Input matrix norm $\\| A \\|_F$: " << results[2] + << endl + << " - Residual $\\| A - QR \\|_F$: " << results[0] + << endl + << " - Orthogonality $\\| I - Q^* Q \\|_F$: " + << results[1] << endl + << endl; + } + else { + out << params.nodeTsqrType + << "," << scalarType + << "," << nrows + << "," << ncols + << "," << params.cacheSizeHint + << "," + << (params.contiguousCacheBlocks ? "true" : "false") + << "," << results[2] + << "," << results[0] + << "," << results[1]; + out << endl; + } + return success; + } + + bool + verifyNodeTsqr(std::ostream& out, + const NodeTestParameters& p) + { + // Seed for the next pseudorandom number generator. We do tests + // one after another, using the seed from the previous test in + // the current test, so that the pseudorandom streams used by + // the tests are independent. 
+ std::vector iseed{{0, 0, 0, 1}}; + + bool success = true; + if(p.testReal) { + const bool ok_S = verifyNodeTsqrTmpl(out, iseed, p); + const bool ok_D = verifyNodeTsqrTmpl(out, iseed, p); + success = success && ok_S && ok_D; + } + if(p.testComplex) { +#ifdef HAVE_TPETRATSQR_COMPLEX + const bool ok_C = + verifyNodeTsqrTmpl>(out, iseed, p); + const bool ok_Z = + verifyNodeTsqrTmpl>(out, iseed, p); + success = success && ok_C && ok_Z; +#else // HAVE_TPETRATSQR_COMPLEX + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::logic_error, "TSQR was not built with complex " + "arithmetic support."); +#endif // HAVE_TPETRATSQR_COMPLEX + } + return success; + } + + template class LapackType, class Scalar> + static void + verifyLapackTmpl(std::ostream& out, + std::vector& iseed, + LapackType& lapack, + const NodeTestParameters& params, + const std::string& lapackImplName) + { + using std::cerr; + using std::endl; + using STS = Teuchos::ScalarTraits; + using mag_type = typename STS::magnitudeType; + const bool verbose = params.verbose; + + const std::string scalarType = + Teuchos::TypeNameTraits::name(); + const std::string fileSuffix = getFileSuffix("Lapack"); + + if(verbose) { + cerr << "Test RawQR<" << scalarType << "> implementation " + << lapackImplName << " whose type is " + << Teuchos::typeName(lapack) << endl; + if(lapack.wants_device_memory()) { + cerr << "-- RawQR subclass claims to want device memory" + << endl; + } + } + const int nrows = params.numRows; + const int ncols = params.numCols; + + Matrix A(nrows, ncols); + Matrix A_copy(nrows, ncols); + Matrix Q(nrows, ncols); + Matrix R(ncols, ncols); + if(std::numeric_limits::has_quiet_NaN) { + deep_copy(A, std::numeric_limits< Scalar>::quiet_NaN()); + deep_copy(A_copy, std::numeric_limits::quiet_NaN()); + deep_copy(Q, std::numeric_limits::quiet_NaN()); + deep_copy(R, std::numeric_limits::quiet_NaN()); + } + const int lda = nrows; + const int ldq = nrows; + const int ldr = ncols; + + if(verbose) { + cerr << "-- Create test 
problem" << endl; + } + { + TSQR::Random::NormalGenerator gen(iseed); + nodeTestProblem(gen, nrows, ncols, A.data(), A.stride(1), + true); + gen.getSeed(iseed); // fetch seed for the next test + } + + if(params.saveMatrices) { + std::string filename = std::string("A") + fileSuffix; + if(verbose) { + cerr << "-- Save A to \"" << filename << "\"" << endl; + } + std::ofstream fileOut(filename.c_str()); + print_local_matrix(fileOut, nrows, ncols, + A.data(), A.stride(1)); + fileOut.close(); + } + + using IST = typename Kokkos::ArithTraits::val_type; + using device_matrix_type = + Kokkos::View; + + auto A_h = getHostMatrixView(A.view()); + auto A_copy_h = getHostMatrixView(A_copy.view()); + auto Q_h = getHostMatrixView(Q.view()); + device_matrix_type A_d; + device_matrix_type A_copy_d; + device_matrix_type Q_d; + if(lapack.wants_device_memory()) { + A_d = getDeviceMatrixCopy(A.view(), "A_d"); + // Don't copy A_copy yet; see below. + A_copy_d = device_matrix_type("A_copy_d", nrows, ncols); + Q_d = device_matrix_type("Q_d", nrows, ncols); + } + + if(verbose) { + cerr << "-- Copy A into A_copy" << endl; + } + deep_copy(A_copy, A); + if(lapack.wants_device_memory()) { + deep_copy(A_copy_d, A_d); + } + + if(verbose) { + cerr << "-- Fill R with zeros" << endl; + } + // We need to do this because the factorization may not + // overwrite the strict lower triangle of R. R is always in + // host memory. 
+ deep_copy(R, Scalar {}); + + if(verbose) { + cerr << "-- Do LAPACK lwork query" << endl; + } + const int lwork = [&]() { + if(lapack.wants_device_memory()) { + Scalar* A_copy_d_raw = + reinterpret_cast(A_copy_d.data()); + const int A_copy_d_lda(A_copy_d.stride(1)); + TEUCHOS_ASSERT( nrows == 0 || ncols == 0 || + A_copy_d_raw != nullptr ); + TEUCHOS_ASSERT( size_t(A_copy_d.extent(0)) == + size_t(nrows) ); + TEUCHOS_ASSERT( size_t(A_copy_d.extent(1)) == + size_t(ncols) ); + return lapack.compute_QR_lwork(nrows, ncols, A_copy_d_raw, + A_copy_d_lda); + } + else { + Scalar* A_copy_raw = A_copy.data(); + const int A_copy_lda(A_copy.stride(1)); + return lapack.compute_QR_lwork(nrows, ncols, A_copy_raw, + A_copy_lda); + } + }(); + if(verbose) { + cerr << "-- lwork=" << lwork << endl; + } + std::vector work(lwork); + std::vector tau(ncols); + + Kokkos::View work_d; + Kokkos::View tau_d; + if(lapack.wants_device_memory()) { + work_d = Kokkos::View("work_d", lwork); + tau_d = Kokkos::View("tau_d", ncols); + } + + if(verbose) { + cerr << "-- Call compute_QR" << endl; + } + + if(lapack.wants_device_memory()) { + Scalar* A_copy_d_raw = + reinterpret_cast(A_copy_d.data()); + Scalar* tau_d_raw = reinterpret_cast(tau_d.data()); + Scalar* work_d_raw = + reinterpret_cast(work_d.data()); + TEUCHOS_ASSERT( ncols == 0 || tau_d_raw != nullptr ); + TEUCHOS_ASSERT( size_t(tau_d.extent(0)) >= size_t(ncols) ); + TEUCHOS_ASSERT( lwork == 0 || work_d_raw != nullptr ); + TEUCHOS_ASSERT( size_t(work_d.extent(0)) >= size_t(lwork) ); + TEUCHOS_ASSERT( nrows == 0 || ncols == 0 || + A_copy_d_raw != nullptr ); + TEUCHOS_ASSERT( size_t(A_copy_d.extent(0)) == + size_t(nrows) ); + TEUCHOS_ASSERT( size_t(A_copy_d.extent(1)) == + size_t(ncols) ); + lapack.compute_QR(nrows, ncols, A_copy_d_raw, + A_copy_d.stride(1), tau_d_raw, + work_d_raw, lwork); + Kokkos::deep_copy(A_copy_h, A_copy_d); + } + else { + lapack.compute_QR(nrows, ncols, A_copy.data(), + A_copy.stride(1), tau.data(), + work.data(), lwork); 
+ } + + if(verbose) { + cerr << "-- Copy R out of in-place result" << endl; + } + copy_upper_triangle(R, A_copy); + if(params.saveMatrices) { + std::string filename = std::string("R") + fileSuffix; + if(verbose) { + cerr << "-- Save R to \"" << filename << "\"" << endl; + } + std::ofstream fileOut(filename.c_str()); + print_local_matrix(fileOut, ncols, ncols, + R.data(), R.stride(1)); + fileOut.close(); + } + + // The explicit Q factor will be computed in place, so copy the + // result of the factorization into Q. + deep_copy(Q, A_copy); + if(lapack.wants_device_memory()) { + deep_copy(Q_d, A_copy_d); + } + + if(verbose) { + cerr << "-- Call Lapack::compute_explicit_Q" << endl; + } + if(lapack.wants_device_memory()) { + Scalar* Q_d_raw = reinterpret_cast(Q_d.data()); + const Scalar* tau_d_raw = + reinterpret_cast(tau_d.data()); + Scalar* work_d_raw = + reinterpret_cast(work_d.data()); + lapack.compute_explicit_Q(nrows, ncols, ncols, + Q_d_raw, ldq, tau_d_raw, + work_d_raw, lwork); + deep_copy(Q_h, Q_d); + } + else { + lapack.compute_explicit_Q(nrows, ncols, ncols, + Q.data(), ldq, tau.data(), + work.data(), lwork); + } + + if(params.saveMatrices) { + std::string filename = std::string("Q") + fileSuffix; + if(verbose) { + cerr << "-- Save Q to \"" << filename << "\"" << endl; + } + std::ofstream fileOut(filename.c_str()); + print_local_matrix(fileOut, nrows, ncols, + Q.data(), Q.stride(1)); + fileOut.close(); + } + + if(verbose) { + cerr << "-- Call local_verify to validate the factorization" + << endl; + } + auto results = local_verify(nrows, ncols, A.data(), lda, + Q.data(), ldq, R.data(), ldr); + + if(params.humanReadable) { + out << lapackImplName << ":" << endl + << " - Scalar type: " << scalarType << endl + << " - Matrix dimensions: " << nrows << " by " << ncols + << endl + << " - Matrix norm $\\| A \\|_F$: " + << results[2] << endl + << " - Residual $\\| A - QR \\|_F$: " + << results[0] << endl + << " - Orthogonality $\\| I - Q^* Q \\|_F$: " + << results[1] 
<< endl + << endl; + } + else { + out << lapackImplName + << "," << scalarType + << "," << nrows + << "," << ncols + << ",0" // cacheSizeHint + << ",false" // contiguousCacheBlocks + << "," << results[2] + << "," << results[0] + << "," << results[1]; + out << endl; + } + } + + template + void + verifyLapackImplementations(std::ostream& out, + std::vector& iseed, + const NodeTestParameters& p) + { +#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER) + { + // Make sure that both Lapack and CuSolver get the same + // pseudorandom seed. + std::vector iseed_copy(iseed); + auto handle = Impl::CuSolverHandle::getSingleton(); + Kokkos::View info("info"); + Impl::CuSolver solver(handle, info.data()); + verifyLapackTmpl(out, iseed_copy, solver, p, "CUSOLVER"); + } +#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER + { + Impl::Lapack lapack; + verifyLapackTmpl(out, iseed, lapack, p, "LAPACK"); + } + } + + void + verifyLapack(std::ostream& out, + const NodeTestParameters& p) + { + // We do tests one after another, using the seed from the + // previous test in the current test, so that the pseudorandom + // streams used by the tests are independent. 
+ std::vector iseed {{0, 0, 0, 1}}; + if(p.testReal) { + verifyLapackImplementations(out, iseed, p); + verifyLapackImplementations(out, iseed, p); + } + if(p.testComplex) { +#ifdef HAVE_TPETRATSQR_COMPLEX + verifyLapackImplementations> + (out, iseed, p); + verifyLapackImplementations> + (out, iseed, p); +#else // HAVE_TPETRATSQR_COMPLEX + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::logic_error, "TSQR was not built with complex " + "arithmetic support."); +#endif // HAVE_TPETRATSQR_COMPLEX + } + } + + static void + printBenchmarkFieldNames(std::ostream& out) + { + const char prefix[] = "%"; + out << prefix << "method" + << ",scalarType" + << ",numRows" + << ",numCols" + << ",cacheSizeHint" + << ",contiguousCacheBlocks" + << ",numTrials" + << ",timing" << std::endl; + } + + template class LapackType, class Scalar> + void + benchmarkLapackTmpl(std::ostream& out, + std::vector& iseed, + LapackType& lapack, + const NodeTestParameters& params, + const std::string& lapackImplName) + { + using std::endl; + + const int numRows = params.numRows; + const int numCols = params.numCols; + const int numTrials = params.numTrials; + + Matrix A(numRows, numCols); + Matrix Q(numRows, numCols); + Matrix R(numCols, numCols); + const int lda = numRows; + const int ldq = numRows; + + { + using prng_type = TSQR::Random::NormalGenerator; + prng_type gen(iseed); + nodeTestProblem(gen, numRows, numCols, A.data(), lda, false); + gen.getSeed(iseed); + } + + using IST = typename Kokkos::ArithTraits::val_type; + using device_matrix_type = + Kokkos::View; + + auto A_h = getHostMatrixView(A.view()); + auto Q_h = getHostMatrixView(Q.view()); + device_matrix_type A_d; + device_matrix_type Q_d; + if(lapack.wants_device_memory()) { + A_d = getDeviceMatrixCopy(A.view(), "A_d"); + Q_d = device_matrix_type("Q_d", numRows, numCols); + } + + // Copy A into Q, since LAPACK QR overwrites the input. We only + // need Q because LAPACK's computation of the explicit Q factor + // occurs in place. 
This doesn't work with TSQR. To give + // LAPACK QR the fullest possible advantage over TSQR, we don't + // allocate an A_copy here (as we would when benchmarking TSQR). + deep_copy(Q, A); + if(lapack.wants_device_memory()) { + deep_copy(Q_d, A_d); + } + + // Determine the required workspace for the factorization + const int lwork = + lworkQueryLapackQr(lapack, numRows, numCols, lda); + std::vector work(lwork); + std::vector tau(numCols); + + Kokkos::View work_d; + Kokkos::View tau_d; + if(lapack.wants_device_memory()) { + work_d = Kokkos::View("work_d", lwork); + tau_d = Kokkos::View("tau_d", numCols); + } + + // Benchmark LAPACK's QR factorization for numTrials trials. + Teuchos::Time timer("LAPACK"); + timer.start(); + for(int trialNum = 0; trialNum < numTrials; ++trialNum) { + if(lapack.wants_device_memory()) { + Scalar* Q_raw = reinterpret_cast(Q_d.data()); + Scalar* tau_raw = reinterpret_cast(tau_d.data()); + Scalar* work_raw = + reinterpret_cast(work_d.data()); + lapack.compute_QR(numRows, numCols, + Q_raw, Q_d.stride(1), + tau_raw, work_raw, lwork); + } + else { + lapack.compute_QR(numRows, numCols, + Q.data(), ldq, + tau.data(), work.data(), lwork); + } + + if(lapack.wants_device_memory()) { + // FIXME (mfh 18 Dec 2019) We should actually extract the + // upper triangle here and copy it to host, to get a fair + // comparison with TSQR. + + Scalar* Q_raw = reinterpret_cast(Q_d.data()); + const Scalar* tau_raw = + reinterpret_cast(tau_d.data()); + Scalar* work_raw = + reinterpret_cast(work_d.data()); + lapack.compute_explicit_Q(numRows, numCols, numCols, + Q_raw, Q_d.stride(1), + tau_raw, work_raw, lwork); + } + else { + // Extract the upper triangular factor R from Q (where it was + // computed in place by GEQRF), since UNGQR will overwrite all + // of Q with the explicit Q factor. 
+ copy_upper_triangle(R, Q); + lapack.compute_explicit_Q(numRows, numCols, numCols, + Q.data(), ldq, tau.data(), + work.data(), lwork); + } + } + const double lapackTiming = timer.stop(); + + const std::string scalarType = + Teuchos::TypeNameTraits::name(); + + if(params.humanReadable) { + out << lapackImplName << ":" << endl + << " Scalar: " << scalarType << endl + << " numRows: " << numRows << endl + << " numCols: " << numCols << endl + << " numTrials: " << numTrials << endl + << "Total time (s) = " << lapackTiming << endl + << endl; + } + else { + // "0" refers to the cache size hint, which is not applicable + // in this case; we retain it for easy comparison of results + // with NodeTsqr (so that the number of fields is the same in + // both cases). "false" (that follows 0) refers to whether or + // not contiguous cache blocks were used (see TSQR::NodeTsqr); + // this is also not applicable here. + out << lapackImplName + << "," << scalarType + << "," << numRows + << "," << numCols + << ",0" + << ",false" + << "," << numTrials + << "," << lapackTiming << endl; + } + } + + template + void + benchmarkLapackImplementations(std::ostream& out, + std::vector& iseed, + const NodeTestParameters& p) + { +#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER) + { + // Make sure that both Lapack and CuSolver get the same + // pseudorandom seed. 
+ std::vector iseed_copy(iseed); + auto handle = Impl::CuSolverHandle::getSingleton(); + Kokkos::View info("info"); + Impl::CuSolver solver(handle, info.data()); + benchmarkLapackTmpl(out, iseed_copy, solver, p, "CUSOLVER"); + } +#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER + { + Impl::Lapack lapack; + benchmarkLapackTmpl(out, iseed, lapack, p, "LAPACK"); + } + } + + void + benchmarkLapack(std::ostream& out, + const NodeTestParameters& p) + { + std::vector iseed{{0, 0, 0, 1}}; + if(p.testReal) { + benchmarkLapackImplementations(out, iseed, p); + benchmarkLapackImplementations(out, iseed, p); + } + if(p.testComplex) { +#ifdef HAVE_TPETRATSQR_COMPLEX + benchmarkLapackImplementations>(out, iseed, p); + benchmarkLapackImplementations>(out, iseed, p); +#else // Don't HAVE_TPETRATSQR_COMPLEX + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::logic_error, + "TSQR was not built with complex arithmetic support."); +#endif // HAVE_TPETRATSQR_COMPLEX + } + } + + template + void + benchmarkNodeTsqrTmpl(std::ostream& out, + std::vector& iseed, + NodeTsqr& actor, + const NodeTestParameters& params, + const std::string& nodeTsqrType) + { + using std::endl; + + const int numRows = params.numRows; + const int numCols = params.numCols; + const int numTrials = params.numTrials; + const bool contiguousCacheBlocks = + params.contiguousCacheBlocks; + + Matrix A(numRows, numCols); + Matrix A_copy(numRows, numCols); + Matrix Q(numRows, numCols); + Matrix R(numCols, numCols); + + { + using prng_type = TSQR::Random::NormalGenerator; + prng_type gen(iseed); + nodeTestProblem(gen, numRows, numCols, + A.data(), A.stride(1), false); + gen.getSeed(iseed); + } + deep_copy(A_copy, A); // need copy since TSQR overwrites + + using IST = typename Kokkos::ArithTraits::val_type; + using device_matrix_type = + Kokkos::View; + + auto A_copy_h = getHostMatrixView(A_copy.view()); + auto Q_h = getHostMatrixView(Q.view()); + device_matrix_type A_copy_d; + device_matrix_type Q_d; + 
if(actor.wants_device_memory()) { + A_copy_d = getDeviceMatrixCopy(A_copy.view(), "A_copy_d"); + Q_d = device_matrix_type("Q_d", numRows, numCols); + } + + // Benchmark sequential TSQR for numTrials trials. + Teuchos::Time timer("NodeTsqr"); + timer.start(); + for(int trialNum = 0; trialNum < numTrials; ++trialNum) { + if(actor.wants_device_memory()) { + Scalar* A_raw = + reinterpret_cast(A_copy_d.data()); + auto factorOutput = + actor.factor(numRows, numCols, + A_raw, A_copy_d.stride(1), + R.data(), R.stride(1), + contiguousCacheBlocks); + // Unlike with LAPACK, this doesn't happen in place: the + // implicit Q factor is stored in A_copy_d, and the explicit + // Q factor is written to Q_d. + Scalar* Q_raw = reinterpret_cast(Q_d.data()); + actor.explicit_Q(numRows, numCols, + A_raw, A_copy_d.stride(1), + *factorOutput, numCols, + Q_raw, Q_d.stride(1), + contiguousCacheBlocks); + } + else { + Scalar* A_raw = A_copy.data(); + auto factorOutput = + actor.factor(numRows, numCols, + A_raw, A_copy.stride(1), + R.data(), R.stride(1), + contiguousCacheBlocks); + // Unlike with LAPACK, this doesn't happen in place: the + // implicit Q factor is stored in A_copy, and the explicit Q + // factor is written to Q. + Scalar* Q_raw = Q.data(); + actor.explicit_Q(numRows, numCols, + A_raw, A_copy.stride(1), + *factorOutput, numCols, + Q_raw, Q.stride(1), + contiguousCacheBlocks); + } + } + const double nodeTsqrTiming = timer.stop(); + + const std::string scalarType = + Teuchos::TypeNameTraits::name(); + + if(params.humanReadable) { + out << "NodeTsqr:" << endl + << " Implementation: " << nodeTsqrType << endl + << " Scalar: " << scalarType << endl + << " numRows: " << numRows << endl + << " numCols: " << numCols << endl + << " cache size hint (bytes): " + << params.cacheSizeHint << endl + << " contiguous cache blocks? " + << (contiguousCacheBlocks ? 
"true" : "false") << endl + << " # trials = " << numTrials << endl + << "Total time (s) = " << nodeTsqrTiming << endl + << endl; + } + else { + out << nodeTsqrType + << "," << scalarType + << "," << numRows + << "," << numCols + << "," << params.cacheSizeHint + << "," << (contiguousCacheBlocks ? "true" : "false") + << "," << numTrials + << "," << nodeTsqrTiming << endl; + } + } + + // If nodeTsqrType == "", use p.nodeTsqrType. + template + void + benchmarkNodeTsqrImplementation(std::ostream& out, + const std::vector& iseed, + const NodeTestParameters& p, + const std::string& nodeTsqrType = "") + { + // Make sure that all NodeTsqr implementations get the same + // pseudorandom seed. That way, if there are any data-dependent + // performance effects (e.g., subnorms), all implementations + // will see them. + std::vector iseed_copy(iseed); + auto nodeTsqrPtr = getNodeTsqr(p, nodeTsqrType); + // Report the name that was effectively requested. An empty + // nodeTsqrType argument means "use p.nodeTsqrType"; forwarding + // the empty string would leave the implementation field of the + // benchmark output blank. + const std::string implName + (nodeTsqrType.empty() ? p.nodeTsqrType : nodeTsqrType); + benchmarkNodeTsqrTmpl(out, iseed_copy, *nodeTsqrPtr, p, + implName); + } + + template + void + benchmarkNodeTsqrImplementations(std::ostream& out, + std::vector& iseed, + const NodeTestParameters& p) + { + + if(p.nodeTsqrType == "all" || p.nodeTsqrType == "ALL" || + p.nodeTsqrType == "All") { + const char* nodeTsqrImpls[] = + {"CombineNodeTsqr", +#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER) + "CuSolverNodeTsqr", +#endif + "SequentialTsqr"}; + for(auto&& nodeTsqrType : nodeTsqrImpls) { + benchmarkNodeTsqrImplementation(out, iseed, p, + nodeTsqrType); + } + } + else { + benchmarkNodeTsqrImplementation(out, iseed, p); + } + } + + void + benchmarkNodeTsqr(std::ostream& out, + const NodeTestParameters& p) + { + using Teuchos::TypeNameTraits; + using LO = int; + + std::vector iseed{{0, 0, 0, 1}}; + if(p.testReal) { + benchmarkNodeTsqrImplementations(out, iseed, p); + benchmarkNodeTsqrImplementations(out, iseed, p); + } + if(p.testComplex) { +#ifdef HAVE_TPETRATSQR_COMPLEX + benchmarkNodeTsqrImplementations> + (out, iseed, p); + 
benchmarkNodeTsqrImplementations> + (out, iseed, p); +#else // Don't HAVE_TPETRATSQR_COMPLEX + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::logic_error, + "TSQR was not built with complex arithmetic support."); +#endif // HAVE_TPETRATSQR_COMPLEX + } + } + } // namespace Test +} // namespace TSQR + +int +main(int argc, char *argv[]) +{ + using TSQR::Test::parseOptions; + using std::cerr; + using std::cout; + using std::endl; + + // Fetch command-line parameters. + bool printedHelp = false; + auto params = parseOptions(argc, argv, printedHelp); + if(printedHelp) { + return EXIT_SUCCESS; + } + + cout << "NodeTsqr verify/benchmark test options:" << endl; + printNodeTestParameters(cout, params, " - "); + + bool success = true; + try { + Kokkos::ScopeGuard kokkosScope(argc, argv); + + // We allow the same run to do both benchmark and verify. + if(params.verify) { + if(! params.humanReadable) { + TSQR::Test::printVerifyFieldNames(cout); + } + TSQR::Test::verifyLapack(cout, params); + success = TSQR::Test::verifyNodeTsqr(cout, params); + } + if(params.benchmark) { + if(! params.humanReadable) { + TSQR::Test::printBenchmarkFieldNames(cout); + } + TSQR::Test::benchmarkLapack(cout, params); + TSQR::Test::benchmarkNodeTsqr(cout, params); + } + + if(params.printTrilinosTestStuff) { + // The Trilinos test framework expects a message like this. + if(success) { + cout << "\nEnd Result: TEST PASSED" << endl; + } + else { + cout << "\nEnd Result: TEST FAILED" << endl; + } + } + } + TEUCHOS_STANDARD_CATCH_STATEMENTS(true, cerr, success); + return success ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/packages/tpetra/tsqr/test/Tsqr_TestSeqTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestSeqTsqr.cpp deleted file mode 100644 index 26c4222dea57..000000000000 --- a/packages/tpetra/tsqr/test/Tsqr_TestSeqTsqr.cpp +++ /dev/null @@ -1,352 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#include "Tsqr_ConfigDefs.hpp" -#include "Teuchos_ConfigDefs.hpp" // HAVE_MPI -#include "Teuchos_Tuple.hpp" -#ifdef HAVE_MPI -# include "Teuchos_GlobalMPISession.hpp" -# include "Teuchos_oblackholestream.hpp" -#endif // HAVE_MPI -#include "Teuchos_CommandLineProcessor.hpp" -#include "Teuchos_DefaultComm.hpp" -#include "Teuchos_StandardCatchMacros.hpp" -#include "Tsqr_SeqTest.hpp" - -#ifdef HAVE_KOKKOSTSQR_COMPLEX -# include -#endif // HAVE_KOKKOSTSQR_COMPLEX - -#include -#include -#include - - -namespace TSQR { - namespace Trilinos { - namespace Test { - - const char docString[] = "This program tests TSQR::SequentialTsqr, " - "which implements the sequential cache-blocked version of TSQR. 
" - "Accuracy and performance tests are included."; - - using Teuchos::RCP; - using Teuchos::Tuple; - - /// \class SeqTestParameters - /// \brief Encapsulates values of command-line parameters - /// - struct SeqTestParameters { - SeqTestParameters () : - verify (false), - benchmark (false), - numRows (1000), - numCols (10), - numTrials (10), -#ifdef HAVE_KOKKOSTSQR_COMPLEX - testComplex (true), -#endif // HAVE_KOKKOSTSQR_COMPLEX - cacheSizeHint (0), // choose a reasonable default - contiguousCacheBlocks (false), - printFieldNames (true), - printTrilinosTestStuff (true), - humanReadable (false), - debug (false) - {} - - bool verify, benchmark; - int numRows, numCols, numTrials; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - bool testComplex; -#endif // HAVE_KOKKOSTSQR_COMPLEX - size_t cacheSizeHint; - bool contiguousCacheBlocks; - std::string additionalFieldNames, additionalData; - bool printFieldNames, printTrilinosTestStuff, humanReadable, debug; - }; - - static void - benchmark (std::ostream& out, - const SeqTestParameters& params) - { -#ifdef HAVE_KOKKOSTSQR_COMPLEX - const bool testComplex = params.testComplex; -#else - const bool testComplex = false; -#endif // HAVE_KOKKOSTSQR_COMPLEX - - using TSQR::Test::benchmarkSeqTsqr; - benchmarkSeqTsqr (out, - params.numRows, - params.numCols, - params.numTrials, - params.cacheSizeHint, - params.contiguousCacheBlocks, - testComplex, - params.additionalFieldNames, - params.additionalData, - params.printFieldNames, - params.humanReadable); - } - - static void - verify (std::ostream& out, - const SeqTestParameters& params) - { -#ifdef HAVE_KOKKOSTSQR_COMPLEX - const bool testComplex = params.testComplex; -#else - const bool testComplex = false; -#endif // HAVE_KOKKOSTSQR_COMPLEX - const bool saveMatrices = false; - - using TSQR::Test::verifySeqTsqr; - verifySeqTsqr (out, - params.numRows, - params.numCols, - params.cacheSizeHint, - testComplex, - saveMatrices, - params.contiguousCacheBlocks, - params.additionalFieldNames, - 
params.additionalData, - params.printFieldNames, - params.humanReadable, - params.debug); - } - - /// \brief Parse command-line options for this test - /// - /// \param argc [in] As usual in C(++) - /// \param argv [in] As usual in C(++) - /// \param allowedToPrint [in] Whether this (MPI) process is allowed - /// to print to stdout/stderr. Different per (MPI) process. - /// \param printedHelp [out] Whether this (MPI) process printed the - /// "help" display (summary of command-line options) - /// - /// \return Encapsulation of command-line options - static SeqTestParameters - parseOptions (int argc, - char* argv[], - const bool allowedToPrint, - bool& printedHelp) - { - using std::cerr; - using std::endl; - - printedHelp = false; - - // Command-line parameters, set to their default values. - SeqTestParameters params; - /// We really want the cache block size as a size_t, but - /// Teuchos::CommandLineProcessor doesn't offer that option. - /// So we read it in as an int, which means negative inputs - /// are possible. We check for those below in the input - /// validation phase. - // - // Fetch default value of cacheSizeHint. 
- int cacheSizeHintAsInt = static_cast (params.cacheSizeHint); - try { - using Teuchos::CommandLineProcessor; - - CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true, - /* recognizeAllOptions=*/ true); - cmdLineProc.setDocString (docString); - cmdLineProc.setOption ("verify", - "noverify", - &params.verify, - "Test accuracy"); - cmdLineProc.setOption ("benchmark", - "nobenchmark", - &params.benchmark, - "Test performance"); - cmdLineProc.setOption ("nrows", - &params.numRows, - "Number of rows in the test matrix"); - cmdLineProc.setOption ("ncols", - &params.numCols, - "Number of columns in the test matrix"); - cmdLineProc.setOption ("ntrials", - &params.numTrials, - "Number of trials (only used when \"--benchmark\""); -#ifdef HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("complex", - "nocomplex", - &params.testComplex, - "Test complex arithmetic, as well as real"); -#endif // HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("cache-block-size", - &cacheSizeHintAsInt, - "Cache size hint in bytes (0 means pick a reasonable default)"); - cmdLineProc.setOption ("contiguous-cache-blocks", - "noncontiguous-cache-blocks", - &params.contiguousCacheBlocks, - "Whether cache blocks should be stored contiguously"); - cmdLineProc.setOption ("field-names", - &params.additionalFieldNames, - "Any additional field name(s) (comma-delimited " - "string) to add to the benchmark output. Empty " - "by default. Good for things known when invoking " - "the benchmark executable, but not (easily) known " - "inside the benchmark -- e.g., environment " - "variables."); - cmdLineProc.setOption ("output-data", - &params.additionalData, - "Any additional data to add to the output, " - "corresponding to the above field name(s). 
" - "Empty by default."); - cmdLineProc.setOption ("print-field-names", - "no-print-field-names", - &params.printFieldNames, - "Print field names (for machine-readable output only)"); - cmdLineProc.setOption ("print-trilinos-test-stuff", - "no-print-trilinos-test-stuff", - &params.printTrilinosTestStuff, - "Print output that makes the Trilinos test " - "framework happy (but makes benchmark results " - "parsing scripts unhappy)"); - cmdLineProc.setOption ("human-readable", - "machine-readable", - &params.humanReadable, - "If set, make output easy to read by humans " - "(but hard to parse)"); - cmdLineProc.setOption ("debug", - "nodebug", - &params.debug, - "Print debugging information"); - cmdLineProc.parse (argc, argv); - } - catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) { - if (allowedToPrint) - cerr << "Unrecognized command-line option: " << e.what() << endl; - throw e; - } - catch (Teuchos::CommandLineProcessor::HelpPrinted& e) { - printedHelp = true; - return params; // Don't verify parameters in this case - } - - // Validate command-line options. We provide default values - // for unset options, so we don't have to validate those. 
- if (params.numRows <= 0) - throw std::invalid_argument ("Number of rows must be positive"); - else if (params.numCols <= 0) - throw std::invalid_argument ("Number of columns must be positive"); - else if (params.numRows < params.numCols) - throw std::invalid_argument ("Number of rows must be >= number of columns"); - else if (params.benchmark && params.numTrials < 1) - throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1"); - else - { - if (cacheSizeHintAsInt < 0) - throw std::invalid_argument ("Cache size hint must be nonnegative"); - else - params.cacheSizeHint = static_cast< size_t > (cacheSizeHintAsInt); - } - return params; - } - - } // namespace Test - } // namespace Trilinos -} // namespace TSQR - - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - - int -main (int argc, char *argv[]) -{ - using Teuchos::RCP; - using TSQR::Trilinos::Test::SeqTestParameters; - using TSQR::Trilinos::Test::parseOptions; - -#ifdef HAVE_MPI - typedef RCP< const Teuchos::Comm > comm_ptr; - - Teuchos::oblackholestream blackhole; - Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole); - comm_ptr comm = Teuchos::DefaultComm::getComm(); - const int myRank = comm->getRank(); - // Only Rank 0 gets to write to stdout. The other MPI process ranks - // send their output to something that looks like /dev/null (and - // likely is, on Unix-y operating systems). - std::ostream& out = (myRank == 0) ? std::cout : blackhole; - // Only Rank 0 performs the tests. - const bool performingTests = (myRank == 0); - const bool allowedToPrint = (myRank == 0); - -#else // Don't HAVE_MPI: single-node test - - const bool performingTests = true; - const bool allowedToPrint = true; - std::ostream& out = std::cout; -#endif // HAVE_MPI - - // Fetch command-line parameters. 
- bool printedHelp = false; - SeqTestParameters params = - parseOptions (argc, argv, allowedToPrint, printedHelp); - if (printedHelp) - return 0; - - bool success = false; - bool verbose = false; - try { - if (performingTests) - { - using std::endl; - - if (params.benchmark) - TSQR::Trilinos::Test::benchmark (out, params); - - // We allow the same run to do both benchmark and verify. - if (params.verify) - TSQR::Trilinos::Test::verify (out, params); - - success = true; - - if (params.printTrilinosTestStuff) - // The Trilinos test framework expects a message like this. - out << "\nEnd Result: TEST PASSED" << endl; - } - } - TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); - return ( success ? EXIT_SUCCESS : EXIT_FAILURE ); -} diff --git a/packages/tpetra/tsqr/test/Tsqr_TestTbbTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestTbbTsqr.cpp deleted file mode 100644 index e70a8c1c3b3c..000000000000 --- a/packages/tpetra/tsqr/test/Tsqr_TestTbbTsqr.cpp +++ /dev/null @@ -1,473 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos: Node API and Parallel Node Kernels -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// ************************************************************************ -//@HEADER - -#include "Tsqr_ConfigDefs.hpp" -#include "Teuchos_ConfigDefs.hpp" // HAVE_MPI -#include "Teuchos_Tuple.hpp" -#ifdef HAVE_MPI -# include "Teuchos_GlobalMPISession.hpp" -# include "Teuchos_oblackholestream.hpp" -#endif // HAVE_MPI -#include "Teuchos_CommandLineProcessor.hpp" -#include "Teuchos_DefaultComm.hpp" -#include "Teuchos_Time.hpp" -#include "Teuchos_StandardCatchMacros.hpp" -#include "Tsqr_TbbTest.hpp" - -#ifdef HAVE_KOKKOSTSQR_COMPLEX -# include -#endif // HAVE_KOKKOSTSQR_COMPLEX - -#include -#include -#include - - -namespace TSQR { -namespace Trilinos { -namespace Test { - - const char docString[] = "This program tests TSQR::TbbTsqr, " - "which implements the Intel TBB intranode parallel version of TSQR. 
" - "Accuracy and performance tests are included."; - - using Teuchos::RCP; - using Teuchos::Tuple; - - /// \class TbbTestParameters - /// \brief Encapsulates values of command-line parameters - struct TbbTestParameters { - TbbTestParameters () : - verify (false), - benchmark (false), - numCores (1), - numRows (1000), - numCols (10), - numTrials (10), - testReal (true), -#ifdef HAVE_KOKKOSTSQR_COMPLEX - testComplex (false), -#endif // HAVE_KOKKOSTSQR_COMPLEX - cacheSizeHint (0), - contiguousCacheBlocks (false), - printFieldNames (true), - humanReadable (false), - debug (false) - {} - - bool verify, benchmark; - int numCores, numRows, numCols, numTrials; - bool testReal; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - bool testComplex; -#endif // HAVE_KOKKOSTSQR_COMPLEX - size_t cacheSizeHint; - bool contiguousCacheBlocks, printFieldNames, humanReadable, debug; - }; - - static void - benchmark (const TbbTestParameters& params) - { - using TSQR::Test::benchmarkTbbTsqr; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - using std::complex; -#endif // HAVE_KOKKOSTSQR_COMPLEX - - // Only print field names (if at all) for the first data type tested. - bool printedFieldNames = false; - - if (params.testReal) { - { - std::string scalarTypeName ("float"); - benchmarkTbbTsqr (scalarTypeName, - params.numTrials, - params.numRows, - params.numCols, - params.numCores, - params.cacheSizeHint, - params.contiguousCacheBlocks, - params.printFieldNames && ! printedFieldNames, - params.humanReadable); - if (params.printFieldNames && ! printedFieldNames) - printedFieldNames = true; - } - { - std::string scalarTypeName ("double"); - benchmarkTbbTsqr (scalarTypeName, - params.numTrials, - params.numRows, - params.numCols, - params.numCores, - params.cacheSizeHint, - params.contiguousCacheBlocks, - params.printFieldNames && ! printedFieldNames, - params.humanReadable); - if (params.printFieldNames && ! 
printedFieldNames) - printedFieldNames = true; - } - } -#ifdef HAVE_KOKKOSTSQR_COMPLEX - if (params.testComplex) { - { - std::string scalarTypeName ("complex"); - benchmarkTbbTsqr > (scalarTypeName, - params.numTrials, - params.numRows, - params.numCols, - params.numCores, - params.cacheSizeHint, - params.contiguousCacheBlocks, - params.printFieldNames && ! printedFieldNames, - params.humanReadable); - if (params.printFieldNames && ! printedFieldNames) - printedFieldNames = true; - } - { - std::string scalarTypeName ("complex"); - benchmarkTbbTsqr > (scalarTypeName, - params.numTrials, - params.numRows, - params.numCols, - params.numCores, - params.cacheSizeHint, - params.contiguousCacheBlocks, - params.printFieldNames && ! printedFieldNames, - params.humanReadable); - if (params.printFieldNames && ! printedFieldNames) - printedFieldNames = true; - } - } -#endif // HAVE_KOKKOSTSQR_COMPLEX - } - - static void - verify (const TbbTestParameters& params) - { - using TSQR::Test::verifyTbbTsqr; -#ifdef HAVE_KOKKOSTSQR_COMPLEX - using std::complex; -#endif // HAVE_KOKKOSTSQR_COMPLEX - - std::vector seed(4); - seed[0] = 0; - seed[1] = 0; - seed[2] = 0; - seed[3] = 1; - - // Only print field names (if at all) for the first data type tested. - bool printedFieldNames = false; - - if (params.testReal) { - { - TSQR::Random::NormalGenerator gen (seed); - std::string scalarTypeName ("float"); - verifyTbbTsqr (scalarTypeName, - gen, - params.numRows, - params.numCols, - params.numCores, - params.cacheSizeHint, - params.contiguousCacheBlocks, - params.printFieldNames && ! printedFieldNames, - params.humanReadable, - params.debug); - if (params.printFieldNames && ! 
printedFieldNames) { - printedFieldNames = true; - } - gen.getSeed (seed); - } - { - TSQR::Random::NormalGenerator gen (seed); - std::string scalarTypeName ("double"); - verifyTbbTsqr (scalarTypeName, - gen, - params.numRows, - params.numCols, - params.numCores, - params.cacheSizeHint, - params.contiguousCacheBlocks, - params.printFieldNames && ! printedFieldNames, - params.humanReadable, - params.debug); - if (params.printFieldNames && ! printedFieldNames) { - printedFieldNames = true; - } - gen.getSeed (seed); - } - } // if (params.testReal) -#ifdef HAVE_KOKKOSTSQR_COMPLEX - if (params.testComplex) { - { - TSQR::Random::NormalGenerator > gen (seed); - std::string scalarTypeName ("complex"); - verifyTbbTsqr > (scalarTypeName, - gen, - params.numRows, - params.numCols, - params.numCores, - params.cacheSizeHint, - params.contiguousCacheBlocks, - params.printFieldNames && ! printedFieldNames, - params.humanReadable, - params.debug); - if (params.printFieldNames && ! printedFieldNames) { - printedFieldNames = true; - } - gen.getSeed (seed); - } - { - TSQR::Random::NormalGenerator > gen (seed); - std::string scalarTypeName ("complex"); - verifyTbbTsqr > (scalarTypeName, - gen, - params.numRows, - params.numCols, - params.numCores, - params.cacheSizeHint, - params.contiguousCacheBlocks, - params.printFieldNames && ! printedFieldNames, - params.humanReadable, - params.debug); - if (params.printFieldNames && ! printedFieldNames) { - printedFieldNames = true; - } - gen.getSeed (seed); - } - } -#endif // HAVE_KOKKOSTSQR_COMPLEX - } - - /// \brief Parse command-line options for this test - /// - /// \param argc [in] As usual in C(++) - /// \param argv [in] As usual in C(++) - /// \param allowedToPrint [in] Whether this (MPI) process is allowed - /// to print to stdout/stderr. Different per (MPI) process. 
- /// \param printedHelp [out] Whether this (MPI) process printed the - /// "help" display (summary of command-line options) - /// - /// \return Encapsulation of command-line options - static TbbTestParameters - parseOptions (int argc, - char* argv[], - const bool allowedToPrint, - bool& printedHelp) - { - using std::cerr; - using std::endl; - - printedHelp = false; - - // Command-line parameters, set to their default values. - TbbTestParameters params; - /// We really want the cache block size as a size_t, but - /// Teuchos::CommandLineProcessor doesn't offer that option. - /// So we read it in as an int, which means negative inputs - /// are possible. We check for those below in the input - /// validation phase. - // - // Fetch default value of cacheSizeHint. - int cacheSizeHintAsInt = static_cast (params.cacheSizeHint); - try { - using Teuchos::CommandLineProcessor; - - CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true, - /* recognizeAllOptions=*/ true); - cmdLineProc.setDocString (docString); - cmdLineProc.setOption ("verify", - "noverify", - ¶ms.verify, - "Test accuracy"); - cmdLineProc.setOption ("benchmark", - "nobenchmark", - ¶ms.benchmark, - "Test performance"); - cmdLineProc.setOption ("nrows", - ¶ms.numRows, - "Number of rows in the test matrix"); - cmdLineProc.setOption ("ncols", - ¶ms.numCols, - "Number of columns in the test matrix"); - cmdLineProc.setOption ("ntrials", - ¶ms.numTrials, - "Number of trials (only used when \"--benchmark\""); - cmdLineProc.setOption ("real", - "noreal", - ¶ms.testReal, - "Test real arithmetic"); -#ifdef HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("complex", - "nocomplex", - ¶ms.testComplex, - "Test complex arithmetic"); -#endif // HAVE_KOKKOSTSQR_COMPLEX - cmdLineProc.setOption ("ncores", - ¶ms.numCores, - "Number of cores to use for Intel TBB"); - cmdLineProc.setOption ("cache-block-size", - &cacheSizeHintAsInt, - "Cache size hint in bytes (0 means pick a reasonable default)"); - cmdLineProc.setOption 
("contiguous-cache-blocks", - "noncontiguous-cache-blocks", - ¶ms.contiguousCacheBlocks, - "Whether cache blocks should be stored contiguously"); - cmdLineProc.setOption ("print-field-names", - "no-print-field-names", - ¶ms.printFieldNames, - "Print field names (for machine-readable output only)"); - cmdLineProc.setOption ("human-readable", - "machine-readable", - ¶ms.humanReadable, - "If set, make output easy to read by humans " - "(but hard to parse)"); - cmdLineProc.setOption ("debug", - "nodebug", - ¶ms.debug, - "Print debugging information"); - cmdLineProc.parse (argc, argv); - } - catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) { - if (allowedToPrint) { - cerr << "Unrecognized command-line option: " << e.what() << endl; - } - throw e; - } - catch (Teuchos::CommandLineProcessor::HelpPrinted& e) { - printedHelp = true; - return params; // Don't verify parameters in this case - } - - // Validate command-line options. We provide default values - // for unset options, so we don't have to validate those. 
- if (params.numRows <= 0) { - throw std::invalid_argument ("Number of rows must be positive"); - } - else if (params.numCols <= 0) { - throw std::invalid_argument ("Number of columns must be positive"); - } - else if (params.numRows < params.numCols) { - throw std::invalid_argument ("Number of rows must be >= number of columns"); - } - else if (params.benchmark && params.numTrials < 1) { - throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1"); - } - else if (params.numCores < 1) { - throw std::invalid_argument ("\"--ncores\" option must be >= 1"); - } - else { - if (cacheSizeHintAsInt < 0) { - throw std::invalid_argument ("Cache size hint must be nonnegative"); - } - else { - params.cacheSizeHint = static_cast (cacheSizeHintAsInt); - } - } - return params; - } - -} // namespace Test -} // namespace Trilinos -} // namespace TSQR - -int -main (int argc, char *argv[]) -{ - using Teuchos::RCP; - using TSQR::Trilinos::Test::TbbTestParameters; - using TSQR::Trilinos::Test::parseOptions; - -#ifdef HAVE_MPI - typedef RCP > comm_ptr; - - Teuchos::oblackholestream blackhole; - Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole); - comm_ptr comm = Teuchos::DefaultComm::getComm(); - const int myRank = comm->getRank(); - // Only Rank 0 gets to write to stdout. The other MPI process ranks - // send their output to something that looks like /dev/null (and - // likely is, on Unix-y operating systems). - std::ostream& out = (myRank == 0) ? std::cout : blackhole; - // Only Rank 0 performs the tests. - const bool performingTests = (myRank == 0); - const bool allowedToPrint = (myRank == 0); - -#else // Don't HAVE_MPI: single-node test - - const bool performingTests = true; - const bool allowedToPrint = true; - std::ostream& out = std::cout; -#endif // HAVE_MPI - - // Fetch command-line parameters. 
- bool printedHelp = false; - TbbTestParameters params = - parseOptions (argc, argv, allowedToPrint, printedHelp); - if (printedHelp) { - return 0; - } - - bool success = false; - bool verbose = false; - try { - if (performingTests) { - using std::endl; - - // The same run may both benchmark and verify, if that's what - // the user wants. - if (params.verify) { - TSQR::Trilinos::Test::verify (params); - } - if (params.benchmark) { - TSQR::Trilinos::Test::benchmark (params); - } - - success = true; - - // The Trilinos test framework expects a message like this. - // Obviously we haven't tested anything, but eventually we - // will include accuracy integration tests. - out << "\nEnd Result: TEST PASSED" << endl; - } - } - TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); - - return success ? EXIT_SUCCESS : EXIT_FAILURE; -} - -