diff --git a/TPLsList.cmake b/TPLsList.cmake
index 1bdb278b2dce..76ab8382b9f4 100644
--- a/TPLsList.cmake
+++ b/TPLsList.cmake
@@ -58,6 +58,8 @@ TRIBITS_REPOSITORY_DEFINE_TPLS(
   yaml-cpp        "cmake/TPLs/"    EX
   Peano           "cmake/TPLs/"    EX
   CUDA            "${${PROJECT_NAME}_TRIBITS_DIR}/core/std_tpls/"    PT
+  CUBLAS          "cmake/TPLs/"    PT
+  CUSOLVER        "cmake/TPLs/"    PT
   CUSPARSE        "cmake/TPLs/"    PT
   Thrust          "cmake/TPLs/"    ST
   Cusp            "cmake/TPLs/"    ST
diff --git a/cmake/TPLs/FindTPLCUBLAS.cmake b/cmake/TPLs/FindTPLCUBLAS.cmake
new file mode 100644
index 000000000000..8ce61e78e661
--- /dev/null
+++ b/cmake/TPLs/FindTPLCUBLAS.cmake
@@ -0,0 +1,70 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+# Uses CUDA_TOOLKIT_ROOT_DIR and TPL_CUDA_INCLUDE_DIRS, so CUDA must be enabled.
+IF (NOT TPL_ENABLE_CUDA)
+  MESSAGE(FATAL_ERROR "\nCUBLAS: This TPL requires CUDA")
+ELSE()
+  find_library(CUDA_cublas_LIBRARY cublas
+    HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+  # IF(NOT <var>) catches every false constant: "", NOTFOUND, and *-NOTFOUND.
+  IF(NOT CUDA_cublas_LIBRARY)
+    MESSAGE(FATAL_ERROR "\nCUBLAS: could not find cublas library.")
+  ENDIF()
+  GLOBAL_SET(TPL_CUBLAS_LIBRARY_DIRS)
+  GLOBAL_SET(TPL_CUBLAS_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
+  GLOBAL_SET(TPL_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY})
+ENDIF()
+
diff --git a/cmake/TPLs/FindTPLCUSOLVER.cmake b/cmake/TPLs/FindTPLCUSOLVER.cmake
new file mode 100644
index 000000000000..7725cc028cfc
--- /dev/null
+++ b/cmake/TPLs/FindTPLCUSOLVER.cmake
@@ -0,0 +1,70 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+# Uses CUDA_TOOLKIT_ROOT_DIR and TPL_CUDA_INCLUDE_DIRS, so CUDA must be enabled.
+IF (NOT TPL_ENABLE_CUDA)
+  MESSAGE(FATAL_ERROR "\nCUSOLVER: This TPL requires CUDA")
+ELSE()
+  find_library(CUDA_cusolver_LIBRARY cusolver
+    HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+  # IF(NOT <var>) catches every false constant: "", NOTFOUND, and *-NOTFOUND.
+  IF(NOT CUDA_cusolver_LIBRARY)
+    MESSAGE(FATAL_ERROR "\nCUSOLVER: could not find cusolver library.")
+  ENDIF()
+  GLOBAL_SET(TPL_CUSOLVER_LIBRARY_DIRS)
+  GLOBAL_SET(TPL_CUSOLVER_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
+  GLOBAL_SET(TPL_CUSOLVER_LIBRARIES ${CUDA_cusolver_LIBRARY})
+ENDIF()
+
diff --git a/packages/stokhos/src/sacado/kokkos/pce/tpetra/Tpetra_TsqrAdaptor_UQ_PCE.hpp b/packages/stokhos/src/sacado/kokkos/pce/tpetra/Tpetra_TsqrAdaptor_UQ_PCE.hpp
index e3c06fe85626..cbada90ed6d5 100644
--- a/packages/stokhos/src/sacado/kokkos/pce/tpetra/Tpetra_TsqrAdaptor_UQ_PCE.hpp
+++ b/packages/stokhos/src/sacado/kokkos/pce/tpetra/Tpetra_TsqrAdaptor_UQ_PCE.hpp
@@ -34,8 +34,6 @@
 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
-//
 // ************************************************************************
 // @HEADER
 
@@ -48,18 +46,18 @@
 
 #include "Stokhos_Sacado_Kokkos_UQ_PCE.hpp"
 
-#  include <Tsqr_NodeTsqrFactory.hpp> // create intranode TSQR object
-#  include <Tsqr.hpp> // full (internode + intranode) TSQR
-#  include <Tsqr_DistTsqr.hpp> // internode TSQR
+#  include "Tsqr_NodeTsqrFactory.hpp" // create intranode TSQR object
+#  include "Tsqr.hpp" // full (internode + intranode) TSQR
+#  include "Tsqr_DistTsqr.hpp" // internode TSQR
 // Subclass of TSQR::MessengerBase, implemented using Teuchos
 // communicator template helper functions
-#  include <Tsqr_TeuchosMessenger.hpp>
-#  include <Tpetra_MultiVector.hpp>
-#  include <Teuchos_ParameterListAcceptorDefaultBase.hpp>
+#  include "Tsqr_TeuchosMessenger.hpp"
+#  include "Tpetra_MultiVector.hpp"
+#  include "Teuchos_ParameterListAcceptorDefaultBase.hpp"
 #  include <stdexcept>
 
 // Base TsqrAdator template we will specialize
-#  include <Tpetra_TsqrAdaptor.hpp>
+#  include "Tpetra_TsqrAdaptor.hpp"
 
 namespace Tpetra {
 
@@ -81,16 +79,16 @@ namespace Tpetra {
     typedef typename mp_scalar_type::scalar_type scalar_type;
     typedef typename mp_scalar_type::ordinal_type mp_ordinal_type;
     typedef typename MV::local_ordinal_type ordinal_type;
-    typedef typename MV::node_type node_type;
     typedef Teuchos::SerialDenseMatrix<ordinal_type, scalar_type> dense_matrix_type;
     typedef typename Teuchos::ScalarTraits<scalar_type>::magnitudeType magnitude_type;
 
   private:
-    //typedef TSQR::MatView<ordinal_type, scalar_type> matview_type;
-    typedef TSQR::NodeTsqrFactory<node_type, scalar_type, ordinal_type> node_tsqr_factory_type;
-    typedef typename node_tsqr_factory_type::node_tsqr_type node_tsqr_type;
-    typedef TSQR::DistTsqr<ordinal_type, scalar_type> dist_tsqr_type;
-    typedef TSQR::Tsqr<ordinal_type, scalar_type, node_tsqr_type> tsqr_type;
+    using node_tsqr_factory_type =
+      TSQR::NodeTsqrFactory<scalar_type, ordinal_type,
+                            typename MV::device_type>;
+    using node_tsqr_type = TSQR::NodeTsqr<ordinal_type, scalar_type>;
+    using dist_tsqr_type = TSQR::DistTsqr<ordinal_type, scalar_type>;
+    using tsqr_type = TSQR::Tsqr<ordinal_type, scalar_type>;
 
   public:
     /// \brief Constructor (that accepts a parameter list).
@@ -100,7 +98,7 @@ namespace Tpetra {
     ///   implementation.  For details, call \c getValidParameters()
     ///   and examine the documentation embedded therein.
     TsqrAdaptor (const Teuchos::RCP<Teuchos::ParameterList>& plist) :
-      nodeTsqr_ (new node_tsqr_type),
+      nodeTsqr_ (node_tsqr_factory_type::getNodeTsqr ()),
       distTsqr_ (new dist_tsqr_type),
       tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)),
       ready_ (false)
@@ -110,7 +108,7 @@ namespace Tpetra {
 
     //! Constructor (that uses default parameters).
     TsqrAdaptor () :
-      nodeTsqr_ (new node_tsqr_type),
+      nodeTsqr_ (node_tsqr_factory_type::getNodeTsqr ()),
       distTsqr_ (new dist_tsqr_type),
       tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)),
       ready_ (false)
@@ -289,20 +287,10 @@ namespace Tpetra {
     {
       if (! ready_) {
         prepareDistTsqr (mv);
-        prepareNodeTsqr (mv);
         ready_ = true;
       }
     }
 
-    /// \brief Finish intraprocess TSQR initialization.
-    ///
-    /// \note It's OK to call this method more than once; it is idempotent.
-    void
-    prepareNodeTsqr (const MV& mv)
-    {
-      node_tsqr_factory_type::prepareNodeTsqr (nodeTsqr_);
-    }
-
     /// \brief Finish interprocess TSQR initialization.
     ///
     /// \param mv [in] A valid Tpetra::MultiVector instance whose
diff --git a/packages/stokhos/src/sacado/kokkos/vector/tpetra/Tpetra_TsqrAdaptor_MP_Vector.hpp b/packages/stokhos/src/sacado/kokkos/vector/tpetra/Tpetra_TsqrAdaptor_MP_Vector.hpp
index feab87b8d530..8409389c33fc 100644
--- a/packages/stokhos/src/sacado/kokkos/vector/tpetra/Tpetra_TsqrAdaptor_MP_Vector.hpp
+++ b/packages/stokhos/src/sacado/kokkos/vector/tpetra/Tpetra_TsqrAdaptor_MP_Vector.hpp
@@ -34,8 +34,6 @@
 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
-//
 // ************************************************************************
 // @HEADER
 
@@ -48,18 +46,18 @@
 
 #include "Stokhos_Sacado_Kokkos_MP_Vector.hpp"
 
-#  include <Tsqr_NodeTsqrFactory.hpp> // create intranode TSQR object
-#  include <Tsqr.hpp> // full (internode + intranode) TSQR
-#  include <Tsqr_DistTsqr.hpp> // internode TSQR
+#  include "Tsqr_NodeTsqrFactory.hpp" // create intranode TSQR object
+#  include "Tsqr.hpp" // full (internode + intranode) TSQR
+#  include "Tsqr_DistTsqr.hpp" // internode TSQR
 // Subclass of TSQR::MessengerBase, implemented using Teuchos
 // communicator template helper functions
-#  include <Tsqr_TeuchosMessenger.hpp>
-#  include <Tpetra_MultiVector.hpp>
-#  include <Teuchos_ParameterListAcceptorDefaultBase.hpp>
+#  include "Tsqr_TeuchosMessenger.hpp"
+#  include "Tpetra_MultiVector.hpp"
+#  include "Teuchos_ParameterListAcceptorDefaultBase.hpp"
 #  include <stdexcept>
 
 // Base TsqrAdator template we will specialize
-#  include <Tpetra_TsqrAdaptor.hpp>
+#  include "Tpetra_TsqrAdaptor.hpp"
 
 namespace Tpetra {
 
@@ -81,16 +79,16 @@ namespace Tpetra {
     typedef typename mp_scalar_type::scalar_type scalar_type;
     typedef typename mp_scalar_type::ordinal_type mp_ordinal_type;
     typedef typename MV::local_ordinal_type ordinal_type;
-    typedef typename MV::node_type node_type;
     typedef Teuchos::SerialDenseMatrix<ordinal_type, scalar_type> dense_matrix_type;
     typedef typename Teuchos::ScalarTraits<scalar_type>::magnitudeType magnitude_type;
 
   private:
-    //typedef TSQR::MatView<ordinal_type, scalar_type> matview_type;
-    typedef TSQR::NodeTsqrFactory<node_type, scalar_type, ordinal_type> node_tsqr_factory_type;
-    typedef typename node_tsqr_factory_type::node_tsqr_type node_tsqr_type;
-    typedef TSQR::DistTsqr<ordinal_type, scalar_type> dist_tsqr_type;
-    typedef TSQR::Tsqr<ordinal_type, scalar_type, node_tsqr_type> tsqr_type;
+    using node_tsqr_factory_type =
+      TSQR::NodeTsqrFactory<scalar_type, ordinal_type,
+                            typename MV::device_type>;
+    using node_tsqr_type = TSQR::NodeTsqr<ordinal_type, scalar_type>;
+    using dist_tsqr_type = TSQR::DistTsqr<ordinal_type, scalar_type>;
+    using tsqr_type = TSQR::Tsqr<ordinal_type, scalar_type>;
 
   public:
     /// \brief Constructor (that accepts a parameter list).
@@ -100,7 +98,7 @@ namespace Tpetra {
     ///   implementation.  For details, call \c getValidParameters()
     ///   and examine the documentation embedded therein.
     TsqrAdaptor (const Teuchos::RCP<Teuchos::ParameterList>& plist) :
-      nodeTsqr_ (new node_tsqr_type),
+      nodeTsqr_ (node_tsqr_factory_type::getNodeTsqr ()),
       distTsqr_ (new dist_tsqr_type),
       tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)),
       ready_ (false)
@@ -110,7 +108,7 @@ namespace Tpetra {
 
     //! Constructor (that uses default parameters).
     TsqrAdaptor () :
-      nodeTsqr_ (new node_tsqr_type),
+      nodeTsqr_ (node_tsqr_factory_type::getNodeTsqr ()),
       distTsqr_ (new dist_tsqr_type),
       tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)),
       ready_ (false)
@@ -289,20 +287,10 @@ namespace Tpetra {
     {
       if (! ready_) {
         prepareDistTsqr (mv);
-        prepareNodeTsqr (mv);
         ready_ = true;
       }
     }
 
-    /// \brief Finish intraprocess TSQR initialization.
-    ///
-    /// \note It's OK to call this method more than once; it is idempotent.
-    void
-    prepareNodeTsqr (const MV& mv)
-    {
-      node_tsqr_factory_type::prepareNodeTsqr (nodeTsqr_);
-    }
-
     /// \brief Finish interprocess TSQR initialization.
     ///
     /// \param mv [in] A valid Tpetra::MultiVector instance whose
diff --git a/packages/stratimikos/adapters/belos/src/Thyra_TsqrAdaptor.hpp b/packages/stratimikos/adapters/belos/src/Thyra_TsqrAdaptor.hpp
index 7c0d0344905e..22f31ac012b7 100644
--- a/packages/stratimikos/adapters/belos/src/Thyra_TsqrAdaptor.hpp
+++ b/packages/stratimikos/adapters/belos/src/Thyra_TsqrAdaptor.hpp
@@ -265,12 +265,6 @@ namespace Thyra {
 #endif // HAVE_MPI
     }
 
-    /// \brief Finish intraprocess TSQR initialization.
-    ///
-    /// \note It's OK to call this method more than once; it is idempotent.
-    void
-    prepareNodeTsqr (const MV& /* X */) {}
-
     /// \brief Finish interprocess TSQR initialization.
     ///
     /// Input X is a valid Thyra::MultiVectorBase instance whose
@@ -306,10 +300,7 @@ namespace Thyra {
     ///   All multivector objects used with this adapter must have the
     ///   same communicator and Kokkos Node instance (if applicable).
     void
-    prepareTsqr (const MV& /* X */)
-    {
-      throw std::logic_error ("Thyra adaptor for TSQR not implemented");
-    }
+    prepareTsqr (const MV& /* X */) {}
   };
 
 } // namespace Tpetra
diff --git a/packages/tpetra/core/src/Epetra_TsqrAdaptor.hpp b/packages/tpetra/core/src/Epetra_TsqrAdaptor.hpp
index 1cc8cce50e5e..f195e912a40b 100644
--- a/packages/tpetra/core/src/Epetra_TsqrAdaptor.hpp
+++ b/packages/tpetra/core/src/Epetra_TsqrAdaptor.hpp
@@ -40,7 +40,6 @@
 #ifndef EPETRA_TSQRADAPTOR_HPP
 #define EPETRA_TSQRADAPTOR_HPP
 
-///
 /// \file Epetra_TsqrAdaptor.hpp
 /// \brief Epetra_MultiVector to TSQR adaptor
 ///
@@ -52,25 +51,22 @@
 /// Trilinos to get the correct list of libraries against which to
 /// link, but we make this easy temporary fix now so they have time to
 /// fix their build systems later.
-///
 
-#include <Tpetra_ConfigDefs.hpp>
+#include "Tpetra_ConfigDefs.hpp"
 
 #if defined(HAVE_TPETRA_EPETRA) && defined(HAVE_TPETRA_TSQR)
 
-#include <Kokkos_DefaultNode.hpp> // Include minimal Kokkos Node types
-#include <Tsqr_NodeTsqrFactory.hpp> // create intranode TSQR object
-#include <Tsqr.hpp> // full (internode + intranode) TSQR
-#include <Tsqr_DistTsqr.hpp> // internode TSQR
-#include <Epetra_Comm.h>
+#include "Tsqr_NodeTsqrFactory.hpp" // create intranode TSQR object
+#include "Tsqr.hpp" // full (internode + intranode) TSQR
+#include "Tsqr_DistTsqr.hpp" // internode TSQR
+#include "Epetra_Comm.h"
 // Subclass of TSQR::MessengerBase, implemented using Teuchos
 // communicator template helper functions
-#include <Epetra_TsqrMessenger.hpp>
-#include <Epetra_MultiVector.h>
-#include <Teuchos_ParameterListAcceptorDefaultBase.hpp>
+#include "Epetra_TsqrMessenger.hpp"
+#include "Epetra_MultiVector.h"
+#include "Teuchos_ParameterListAcceptorDefaultBase.hpp"
 #include <stdexcept>
 
-
 namespace Epetra {
 
   /// \class TsqrAdaptor
@@ -117,11 +113,14 @@ namespace Epetra {
     /// both are int.
     typedef int ordinal_type;
 
-    /// \typedef node_type
+    /// \typedef device_type
     ///
-    /// TSQR depends on a Kokkos Node type.  We just use the default
-    /// Node type here.
-    typedef Tpetra::Details::DefaultTypes::node_type node_type;
+    /// TSQR depends on a Kokkos::Device type.  For Epetra, use a
+    /// host-only type.  Typical types are Kokkos::Serial or
+    /// Kokkos::OpenMP, depending on build settings.
+    using device_type =
+      Kokkos::Device<Kokkos::DefaultHostExecutionSpace,
+                     Kokkos::HostSpace>;
 
     /// \typedef dense_matrix_type
     ///
@@ -131,23 +130,25 @@ namespace Epetra {
     /// \note TSQR lives in the Kokkos package, which requires the
     ///   Teuchos package, so it's acceptable for us to require
     ///   Teuchos components.
-    typedef Teuchos::SerialDenseMatrix<ordinal_type, scalar_type> dense_matrix_type;
+    using dense_matrix_type =
+      Teuchos::SerialDenseMatrix<ordinal_type, scalar_type>;
 
     /// \typedef magnitude_type
     ///
     /// Epetra_MultiVector's "Scalar" type is real.  TSQR supports
     /// complex arithmetic as well, in which magnitude_type would
     /// differ from scalar_type.
-    typedef double magnitude_type;
+    using magnitude_type = double;
 
   private:
-    typedef TSQR::MatView<ordinal_type, scalar_type> matview_type;
-    typedef TSQR::NodeTsqrFactory<node_type, scalar_type, ordinal_type> node_tsqr_factory_type;
+    using matview_type = TSQR::MatView<ordinal_type, scalar_type>;
+    using node_tsqr_factory_type =
+      TSQR::NodeTsqrFactory<scalar_type, ordinal_type, device_type>;
     // Don't need a "typename" here, because there are no template
     // parameters involved in the type definition.
-    typedef node_tsqr_factory_type::node_tsqr_type node_tsqr_type;
-    typedef TSQR::DistTsqr<ordinal_type, scalar_type> dist_tsqr_type;
-    typedef TSQR::Tsqr<ordinal_type, scalar_type, node_tsqr_type> tsqr_type;
+    using node_tsqr_type = TSQR::NodeTsqr<ordinal_type, scalar_type>;
+    using dist_tsqr_type = TSQR::DistTsqr<ordinal_type, scalar_type>;
+    using tsqr_type = TSQR::Tsqr<ordinal_type, scalar_type>;
 
   public:
     /// \brief Constructor (that accepts a parameter list).
@@ -157,7 +158,7 @@ namespace Epetra {
     ///   implementation.  For details, call \c getValidParameters()
     ///   and examine the documentation embedded therein.
     TsqrAdaptor (const Teuchos::RCP<Teuchos::ParameterList>& plist) :
-      nodeTsqr_ (new node_tsqr_type),
+      nodeTsqr_ (node_tsqr_factory_type::getNodeTsqr ()),
       distTsqr_ (new dist_tsqr_type),
       tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)),
       ready_ (false)
@@ -167,7 +168,7 @@ namespace Epetra {
 
     //! Constructor (that uses default parameters).
     TsqrAdaptor () :
-      nodeTsqr_ (new node_tsqr_type),
+      nodeTsqr_ (node_tsqr_factory_type::getNodeTsqr ()),
       distTsqr_ (new dist_tsqr_type),
       tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)),
       ready_ (false)
@@ -339,20 +340,10 @@ namespace Epetra {
     {
       if (! ready_) {
         prepareDistTsqr (mv);
-        prepareNodeTsqr (mv);
         ready_ = true;
       }
     }
 
-    /// \brief Finish intraprocess TSQR initialization.
-    ///
-    /// \note It's OK to call this method more than once; it is idempotent.
-    void
-    prepareNodeTsqr (const MV& /* mv */)
-    {
-      node_tsqr_factory_type::prepareNodeTsqr (nodeTsqr_);
-    }
-
     /// \brief Finish interprocess TSQR initialization.
     ///
     /// \param mv [in] A multivector, from which to extract the
diff --git a/packages/tpetra/core/src/Tpetra_Details_DefaultTypes.hpp b/packages/tpetra/core/src/Tpetra_Details_DefaultTypes.hpp
index 017206501756..91721b8706ee 100644
--- a/packages/tpetra/core/src/Tpetra_Details_DefaultTypes.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_DefaultTypes.hpp
@@ -34,8 +34,6 @@
 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
-//
 // ************************************************************************
 // @HEADER
 
diff --git a/packages/tpetra/core/src/Tpetra_TsqrAdaptor.hpp b/packages/tpetra/core/src/Tpetra_TsqrAdaptor.hpp
index b48c9dafeb50..e926f53f1694 100644
--- a/packages/tpetra/core/src/Tpetra_TsqrAdaptor.hpp
+++ b/packages/tpetra/core/src/Tpetra_TsqrAdaptor.hpp
@@ -85,55 +85,89 @@ namespace Tpetra {
   public:
     using scalar_type = typename MV::scalar_type;
     using ordinal_type = typename MV::local_ordinal_type;
-    using dense_matrix_type = Teuchos::SerialDenseMatrix<ordinal_type, scalar_type>;
-    using magnitude_type = typename Teuchos::ScalarTraits<scalar_type>::magnitudeType;
+    using dense_matrix_type =
+      Teuchos::SerialDenseMatrix<ordinal_type, scalar_type>;
+    using magnitude_type =
+      typename Teuchos::ScalarTraits<scalar_type>::magnitudeType;
 
   private:
     using node_tsqr_factory_type =
-      TSQR::NodeTsqrFactory<typename MV::node_type, scalar_type, ordinal_type>;
-    using node_tsqr_type = typename node_tsqr_factory_type::node_tsqr_type;
+      TSQR::NodeTsqrFactory<scalar_type, ordinal_type,
+                            typename MV::device_type>;
+    using node_tsqr_type = TSQR::NodeTsqr<ordinal_type, scalar_type>;
     using dist_tsqr_type = TSQR::DistTsqr<ordinal_type, scalar_type>;
-    using tsqr_type = TSQR::Tsqr<ordinal_type, scalar_type, node_tsqr_type>;
+    using tsqr_type = TSQR::Tsqr<ordinal_type, scalar_type>;
+
+    /// \brief Build a TSQR::MatView from X's local dimensions, raw data
+    ///   pointer, and stride(1) of its local view.
+    ///
+    /// X is synced to, and marked modified on, the device if
+    /// tsqr_->wants_device_memory() is true, else the host.
+    TSQR::MatView<ordinal_type, scalar_type>
+    get_mat_view(MV& X)
+    {
+      TEUCHOS_ASSERT( ! tsqr_.is_null() );
+      // FIXME (mfh 18 Oct 2010, 22 Dec 2019) Check Teuchos::Comm<int>
+      // object in Q to make sure it is the same communicator as the
+      // one we are using in our dist_tsqr_type implementation.
+      using mat_view_type = TSQR::MatView<ordinal_type, scalar_type>;
+      const ordinal_type numLclRows(X.getLocalLength());
+      const ordinal_type numVecs(X.getNumVectors());
+      scalar_type* rawData = nullptr;
+      ordinal_type colStride = 1;
+      if(! tsqr_->wants_device_memory()) {
+        X.sync_host();
+        X.modify_host();
+        auto lclView = X.getLocalViewHost();
+        rawData = reinterpret_cast<scalar_type*>(lclView.data());
+        colStride = static_cast<ordinal_type>(lclView.stride(1));
+      }
+      else {
+        X.sync_device();
+        X.modify_device();
+        auto lclView = X.getLocalViewDevice();
+        rawData = reinterpret_cast<scalar_type*>(lclView.data());
+        colStride = static_cast<ordinal_type>(lclView.stride(1));
+      }
+      // LAPACK and BLAS functions require "LDA" >= 1, even if the
+      // corresponding matrix dimension is zero.
+      if(colStride == 0) {
+        colStride = ordinal_type(1);
+      }
+      return mat_view_type(numLclRows, numVecs, rawData, colStride);
+    }
 
   public:
-    /// \brief Constructor (that accepts a parameter list).
+    /// \brief Constructor that accepts a Teuchos::ParameterList.
     ///
     /// \param plist [in/out] List of parameters for configuring TSQR.
     ///   The specific parameter keys that are read depend on the TSQR
-    ///   implementation.  For details, call \c getValidParameters()
-    ///   and examine the documentation embedded therein.
-    TsqrAdaptor (const Teuchos::RCP<Teuchos::ParameterList>& plist) :
-      nodeTsqr_ (new node_tsqr_type),
-      distTsqr_ (new dist_tsqr_type),
-      tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)),
-      ready_ (false)
+    ///   implementation.  For details, call getValidParameters() and
+    ///   examine the documentation embedded therein.
+    TsqrAdaptor(const Teuchos::RCP<Teuchos::ParameterList>& plist) :
+      nodeTsqr_(node_tsqr_factory_type::getNodeTsqr()),
+      distTsqr_(new dist_tsqr_type),
+      tsqr_(new tsqr_type(nodeTsqr_, distTsqr_))
     {
-      setParameterList (plist);
+      setParameterList(plist);
     }
 
-    //! Constructor (that uses default parameters).
-    TsqrAdaptor () :
-      nodeTsqr_ (new node_tsqr_type),
-      distTsqr_ (new dist_tsqr_type),
-      tsqr_ (new tsqr_type (nodeTsqr_, distTsqr_)),
-      ready_ (false)
+    //! Constructor (that uses default parameters).
+    TsqrAdaptor() :
+      nodeTsqr_(node_tsqr_factory_type::getNodeTsqr()),
+      distTsqr_(new dist_tsqr_type),
+      tsqr_(new tsqr_type(nodeTsqr_, distTsqr_))
     {
-      setParameterList (Teuchos::null);
+      setParameterList(Teuchos::null);
     }
 
     //! Get all valid parameters (with default values) that TSQR understands.
     Teuchos::RCP<const Teuchos::ParameterList>
-    getValidParameters () const
+    getValidParameters() const
     {
-      using Teuchos::RCP;
-      using Teuchos::rcp;
-      using Teuchos::ParameterList;
-      using Teuchos::parameterList;
-
-      if (defaultParams_.is_null()) {
-        RCP<ParameterList> params = parameterList ("TSQR implementation");
-        params->set ("NodeTsqr", *(nodeTsqr_->getValidParameters ()));
-        params->set ("DistTsqr", *(distTsqr_->getValidParameters ()));
+      if(defaultParams_.is_null()) {
+        auto params = Teuchos::parameterList("TSQR implementation");
+        params->set("NodeTsqr", *(nodeTsqr_->getValidParameters()));
+        params->set("DistTsqr", *(distTsqr_->getValidParameters()));
         defaultParams_ = params;
       }
       return defaultParams_;
@@ -165,19 +199,15 @@ namespace Tpetra {
     /// long as it is not too large or too small.  The default value
     /// should be fine.
     void
-    setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist)
+    setParameterList(const Teuchos::RCP<Teuchos::ParameterList>& plist)
     {
-      using Teuchos::ParameterList;
-      using Teuchos::parameterList;
-      using Teuchos::RCP;
+      auto params = plist.is_null() ?
+        Teuchos::parameterList(*getValidParameters()) : plist;
       using Teuchos::sublist;
+      nodeTsqr_->setParameterList(sublist(params, "NodeTsqr"));
+      distTsqr_->setParameterList(sublist(params, "DistTsqr"));
 
-      RCP<ParameterList> params = plist.is_null() ?
-        parameterList (*getValidParameters ()) : plist;
-      nodeTsqr_->setParameterList (sublist (params, "NodeTsqr"));
-      distTsqr_->setParameterList (sublist (params, "DistTsqr"));
-
-      this->setMyParamList (params);
+      this->setMyParamList(params);
     }
 
     /// \brief Compute QR factorization [Q,R] = qr(A,0).
@@ -202,39 +232,30 @@ namespace Tpetra {
     ///   instance's constructor.  Otherwise, the result of this
     ///   method is undefined.
     void
-    factorExplicit (MV& A,
-                    MV& Q,
-                    dense_matrix_type& R,
-                    const bool forceNonnegativeDiagonal=false)
+    factorExplicit(MV& A,
+                   MV& Q,
+                   dense_matrix_type& R,
+                   const bool forceNonnegativeDiagonal=false)
     {
       TEUCHOS_TEST_FOR_EXCEPTION
-        (! A.isConstantStride (), std::invalid_argument, "TsqrAdaptor::"
+        (! A.isConstantStride(), std::invalid_argument, "TsqrAdaptor::"
          "factorExplicit: Input MultiVector A must have constant stride.");
       TEUCHOS_TEST_FOR_EXCEPTION
-        (! Q.isConstantStride (), std::invalid_argument, "TsqrAdaptor::"
+        (! Q.isConstantStride(), std::invalid_argument, "TsqrAdaptor::"
          "factorExplicit: Input MultiVector Q must have constant stride.");
-      prepareTsqr (Q); // Finish initializing TSQR.
+      prepareTsqr(Q); // Finish initializing TSQR.
+      TEUCHOS_ASSERT( ! tsqr_.is_null() );
 
-      // FIXME (mfh 16 Jan 2016) Currently, TSQR is a host-only
-      // implementation.
-      A.sync_host ();
-      A.modify_host ();
-      Q.sync_host ();
-      Q.modify_host ();
-      auto A_view = A.getLocalViewHost ();
-      auto Q_view = Q.getLocalViewHost ();
-      scalar_type* const A_ptr =
-        reinterpret_cast<scalar_type*> (A_view.data ());
-      scalar_type* const Q_ptr =
-        reinterpret_cast<scalar_type*> (Q_view.data ());
-      const bool contiguousCacheBlocks = false;
-      tsqr_->factorExplicitRaw (A_view.extent (0),
-                                A_view.extent (1),
-                                A_ptr, A.getStride (),
-                                Q_ptr, Q.getStride (),
-                                R.values (), R.stride (),
-                                contiguousCacheBlocks,
-                                forceNonnegativeDiagonal);
+      auto A_view = get_mat_view(A);
+      auto Q_view = get_mat_view(Q);
+      constexpr bool contiguousCacheBlocks = false;
+      tsqr_->factorExplicitRaw(A_view.extent(0),
+                               A_view.extent(1),
+                               A_view.data(), A_view.stride(1),
+                               Q_view.data(), Q_view.stride(1),
+                               R.values(), R.stride(),
+                               contiguousCacheBlocks,
+                               forceNonnegativeDiagonal);
     }
 
     /// \brief Rank-revealing decomposition
@@ -268,29 +289,22 @@ namespace Tpetra {
     ///
     /// \return Rank \f$r\f$ of R: \f$ 0 \leq r \leq N\f$.
     int
-    revealRank (MV& Q,
-                dense_matrix_type& R,
-                const magnitude_type& tol)
+    revealRank(MV& Q,
+               dense_matrix_type& R,
+               const magnitude_type& tol)
     {
       TEUCHOS_TEST_FOR_EXCEPTION
-        (! Q.isConstantStride (), std::invalid_argument, "TsqrAdaptor::"
+        (! Q.isConstantStride(), std::invalid_argument, "TsqrAdaptor::"
          "revealRank: Input MultiVector Q must have constant stride.");
-      prepareTsqr (Q); // Finish initializing TSQR.
-      // FIXME (mfh 18 Oct 2010) Check Teuchos::Comm<int> object in Q
-      // to make sure it is the same communicator as the one we are
-      // using in our dist_tsqr_type implementation.
+      prepareTsqr(Q); // Finish initializing TSQR (does nothing once ready_ is set).
 
-      Q.sync_host ();
-      Q.modify_host ();
-      auto Q_view = Q.getLocalViewHost ();
-      scalar_type* const Q_ptr =
-        reinterpret_cast<scalar_type*> (Q_view.data ());
-      const bool contiguousCacheBlocks = false;
-      return tsqr_->revealRankRaw (Q_view.extent (0),
-                                   Q_view.extent (1),
-                                   Q_ptr, Q.getStride (),
-                                   R.values (), R.stride (),
-                                   tol, contiguousCacheBlocks);
+      auto Q_view = get_mat_view(Q); // NOTE(review): get_mat_view is declared outside this chunk -- confirm what view it returns
+      constexpr bool contiguousCacheBlocks = false; // always passed as false from this adaptor method
+      return tsqr_->revealRankRaw(Q_view.extent(0), // NOTE(review): factorExplicit asserts tsqr_ is non-null first; no such check here -- confirm
+                                  Q_view.extent(1),
+                                  Q_view.data(), Q_view.stride(1), // raw pointer plus the view's stride along its second dimension
+                                  R.values(), R.stride(),
+                                  tol, contiguousCacheBlocks);
     }
 
   private:
@@ -307,7 +321,7 @@ namespace Tpetra {
     mutable Teuchos::RCP<const Teuchos::ParameterList> defaultParams_;
 
     //! Whether TSQR has been fully initialized.
-    bool ready_;
+    bool ready_ = false;
 
     /// \brief Finish TSQR initialization.
     ///
@@ -330,24 +344,14 @@ namespace Tpetra {
     ///   multivector objects used with this Adaptor instance must
     ///   have the same map, communicator, and Kokkos Node instance.
     void
-    prepareTsqr (const MV& mv)
+    prepareTsqr(const MV& mv)
     {
-      if (! ready_) {
-        prepareDistTsqr (mv);
-        prepareNodeTsqr (mv);
+      if(! ready_) { // run the setup only once; ready_ is set below, so later calls skip it
+        prepareDistTsqr(mv); // initializes distTsqr_ with a messenger built from mv's communicator
         ready_ = true;
       }
     }
 
-    /// \brief Finish intraprocess TSQR initialization.
-    ///
-    /// \note It's OK to call this method more than once; it is idempotent.
-    void
-    prepareNodeTsqr (const MV& mv)
-    {
-      node_tsqr_factory_type::prepareNodeTsqr (nodeTsqr_);
-    }
-
     /// \brief Finish interprocess TSQR initialization.
     ///
     /// \param mv [in] A valid Tpetra::MultiVector instance whose
@@ -355,17 +359,17 @@ namespace Tpetra {
     ///
     /// \note It's OK to call this method more than once; it is idempotent.
     void
-    prepareDistTsqr (const MV& mv)
+    prepareDistTsqr(const MV& mv)
     {
       using Teuchos::RCP;
       using Teuchos::rcp_implicit_cast;
-      typedef TSQR::TeuchosMessenger<scalar_type> mess_type;
-      typedef TSQR::MessengerBase<scalar_type> base_mess_type;
+      using mess_type = TSQR::TeuchosMessenger<scalar_type>; // concrete messenger constructed below
+      using base_mess_type = TSQR::MessengerBase<scalar_type>; // messenger type handed to distTsqr_->init
 
-      RCP<const Teuchos::Comm<int> > comm = mv.getMap()->getComm();
-      RCP<mess_type> mess (new mess_type (comm));
-      RCP<base_mess_type> messBase = rcp_implicit_cast<base_mess_type> (mess);
-      distTsqr_->init (messBase);
+      auto comm = mv.getMap()->getComm(); // communicator taken from mv's Map
+      RCP<mess_type> mess(new mess_type(comm)); // wrap the communicator in a TSQR messenger
+      auto messBase = rcp_implicit_cast<base_mess_type>(mess); // convert the handle to the base messenger type
+      distTsqr_->init(messBase); // give the interprocess TSQR object its messenger
     }
   };
 
@@ -374,4 +378,3 @@ namespace Tpetra {
 #endif // HAVE_TPETRA_TSQR
 
 #endif // TPETRA_TSQRADAPTOR_HPP
-
diff --git a/packages/tpetra/tsqr/CMakeLists.txt b/packages/tpetra/tsqr/CMakeLists.txt
index 4bf9f40aa773..71b30bf3916d 100644
--- a/packages/tpetra/tsqr/CMakeLists.txt
+++ b/packages/tpetra/tsqr/CMakeLists.txt
@@ -8,23 +8,38 @@ TRIBITS_SUBPACKAGE(TSQR)
 # Enabled by default (unless disabled explicitly at the command line)
 # if Teuchos is built with complex arithmetic support.
 TRIBITS_ADD_OPTION_AND_DEFINE(
-  KokkosTSQR_ENABLE_Complex
-  HAVE_KOKKOSTSQR_COMPLEX
+  ${PACKAGE_NAME}_ENABLE_Complex
+  HAVE_TPETRATSQR_COMPLEX
   "Enable complex arithmetic (std::complex<T>) support for TSQR.  This is currently ON if Teuchos_ENABLE_COMPLEX is ON.  The default behavior may change as we migrate TSQR to depend on new Kokkos.  New Kokkos does not currently support complex arithmetic, but this will change."
   "${Teuchos_ENABLE_COMPLEX}" 
   )
 
-# Whether to build TbbTsqr and related classes.
-#
-# Enabled by default (unless disabled explicitly at the command line)
-# if Trilinos is built with the TBB (Intel's Threading Building
-# Blocks) TPL (third-party library) enabled.
+ASSERT_DEFINED(TPL_ENABLE_CUBLAS)
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  ${PACKAGE_NAME}_ENABLE_CUBLAS
+  HAVE_TPETRATSQR_CUBLAS
+  "Enable TSQR's support for the CUBLAS TPL."
+  "${TPL_ENABLE_CUBLAS}"
+  )
+ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUBLAS)
+
+ASSERT_DEFINED(TPL_ENABLE_CUSOLVER)
 TRIBITS_ADD_OPTION_AND_DEFINE(
-  KokkosTSQR_ENABLE_TBB
-  HAVE_KOKKOSTSQR_TBB
-  "Enable Intel Threading Building Blocks (TBB) intranode parallelization of TSQR.  This option is enabled by default if you are building Trilinos with TBB enabled as a 'third-party library' (TPL), so you should not have to enable this option manually.  TSQR will work without this, but enabling it gives another parallelization option for TSQR."
-  "${TPL_ENABLE_TBB}" 
+  ${PACKAGE_NAME}_ENABLE_CUSOLVER
+  HAVE_TPETRATSQR_CUSOLVER
+  "Enable TSQR's support for the CUSOLVER TPL."
+  "${TPL_ENABLE_CUSOLVER}"
   )
+ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUSOLVER)
+
+IF(${PACKAGE_NAME}_ENABLE_CUSOLVER AND (NOT ${PACKAGE_NAME}_ENABLE_CUBLAS)) # CUSOLVER support requires CUBLAS support
+  MESSAGE(FATAL_ERROR "*** We found the CUSOLVER TPL, but not the "
+    "CUBLAS TPL.  One should not exist without the other.")
+ENDIF()
+IF((NOT ${PACKAGE_NAME}_ENABLE_CUSOLVER) AND ${PACKAGE_NAME}_ENABLE_CUBLAS) # CUBLAS support requires CUSOLVER support
+  MESSAGE(FATAL_ERROR "*** We found the CUBLAS TPL, but not the "
+    "CUSOLVER TPL.  One should not exist without the other.")
+ENDIF()
 
 # KokkosTSQR_config.h gets created in the src/ subdirectory.
 ADD_SUBDIRECTORY(src)
diff --git a/packages/tpetra/tsqr/cmake/Dependencies.cmake b/packages/tpetra/tsqr/cmake/Dependencies.cmake
index beb08e5ca843..94476683e84d 100644
--- a/packages/tpetra/tsqr/cmake/Dependencies.cmake
+++ b/packages/tpetra/tsqr/cmake/Dependencies.cmake
@@ -3,6 +3,6 @@ SET(LIB_OPTIONAL_DEP_PACKAGES)
 SET(TEST_REQUIRED_DEP_PACKAGES)
 SET(TEST_OPTIONAL_DEP_PACKAGES)
 SET(LIB_REQUIRED_DEP_TPLS)
-SET(LIB_OPTIONAL_DEP_TPLS TBB)
+SET(LIB_OPTIONAL_DEP_TPLS CUBLAS CUSOLVER)
 SET(TEST_REQUIRED_DEP_TPLS)
 SET(TEST_OPTIONAL_DEP_TPLS)
diff --git a/packages/tpetra/tsqr/cmake/TpetraTSQR_config.h.in b/packages/tpetra/tsqr/cmake/TpetraTSQR_config.h.in
index 6f5fb98dbc92..0bb958d792c6 100644
--- a/packages/tpetra/tsqr/cmake/TpetraTSQR_config.h.in
+++ b/packages/tpetra/tsqr/cmake/TpetraTSQR_config.h.in
@@ -2,9 +2,16 @@
 #define TPETRATSQR_CONFIG_H
 
 /* Define if building TSQR with std::complex<T> support */
-#cmakedefine HAVE_KOKKOSTSQR_COMPLEX
+#cmakedefine HAVE_TPETRATSQR_COMPLEX
+#ifdef HAVE_TPETRATSQR_COMPLEX
+   /* For backwards compatibility */
+#  define HAVE_KOKKOSTSQR_COMPLEX HAVE_TPETRATSQR_COMPLEX
+#endif
 
-/* Define if the TBB (Intel Threading Building Blocks) TPL is available */
-#cmakedefine HAVE_KOKKOSTSQR_TBB
+/* Define if TSQR supports the CUBLAS TPL */
+#cmakedefine HAVE_TPETRATSQR_CUBLAS
+
+/* Define if TSQR supports the CUSOLVER TPL */
+#cmakedefine HAVE_TPETRATSQR_CUSOLVER
 
 #endif // TPETRATSQR_CONFIG_H
diff --git a/packages/tpetra/tsqr/src/CMakeLists.txt b/packages/tpetra/tsqr/src/CMakeLists.txt
index 91cca32b7ec1..9e243aaaf1e5 100644
--- a/packages/tpetra/tsqr/src/CMakeLists.txt
+++ b/packages/tpetra/tsqr/src/CMakeLists.txt
@@ -12,16 +12,8 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 # files to install.
 APPEND_SET(HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
 
-# If TBB (Intel's Threading Building Blocks) is enabled, add headers
-# and sources for TBB-enabled shared-memory parallel TSQR to the
-# lists of this subpackage's headers resp. sources.
-IF (${PACKAGE_NAME}_ENABLE_TBB)
-  APPEND_GLOB(HEADERS ${DIR}/TbbTsqr*.hpp)
-  APPEND_GLOB(SOURCES ${DIR}/TbbTsqr*.cpp)
-ENDIF ()
-
-# Add all other headers and sources (those not related to TBB) to the
-# lists of this subpackage's headers resp. sources.
+# Add headers and sources to the lists of this subpackage's headers
+# resp. sources.
 APPEND_GLOB(HEADERS ${DIR}/Tsqr*.hpp)
 APPEND_GLOB(HEADERS ${DIR}/KokkosTSQR*.hpp)
 APPEND_GLOB(SOURCES ${DIR}/Tsqr*.cpp)
@@ -37,5 +29,5 @@ TRIBITS_ADD_LIBRARY(
 # / from this directory, or to / from the 'impl' subdirectory.  That ensures
 # that running "make" will also rerun CMake in order to regenerate Makefiles.
 #
-# Here is another such change, and here is another.  Another!
+# Behold: another such change, and another.
 #
diff --git a/packages/tpetra/tsqr/src/TbbTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr.hpp
deleted file mode 100644
index 996d76e94eec..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr.hpp
+++ /dev/null
@@ -1,504 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-/// \file TbbTsqr.hpp
-/// \brief Intranode TSQR, parallelized with Intel TBB.
-///
-#ifndef __TSQR_TbbTsqr_hpp
-#define __TSQR_TbbTsqr_hpp
-
-#include "TbbTsqr_TbbParallelTsqr.hpp"
-#include "Tsqr_TimeStats.hpp"
-#include "Teuchos_ParameterList.hpp"
-#include "Teuchos_ParameterListExceptions.hpp"
-#include "Teuchos_Time.hpp"
-#include <stdexcept>
-#include <string>
-#include <utility> // std::pair
-#include <vector>
-
-namespace TSQR {
-  namespace TBB {
-    /// \class TbbTsqr
-    /// \brief Intranode TSQR, parallelized with Intel TBB
-    ///
-    /// TSQR factorization for a dense, tall and skinny matrix stored
-    /// on a single node.  Parallelized using Intel's Threading
-    /// Building Blocks.
-    ///
-    /// \note TSQR only needs to know about the local ordinal type
-    ///   (LocalOrdinal), not about the global ordinal type.
-    ///   TimerType may be any class with the same interface as
-    ///   TrivialTimer; it times the divide-and-conquer base cases
-    ///   (the operations on each CPU core within the thread-parallel
-    ///   implementation).
-    template< class LocalOrdinal, class Scalar, class TimerType = Teuchos::Time >
-    class TbbTsqr : public Teuchos::Describable {
-    private:
-      /// \brief Implementation of TBB TSQR.
-      ///
-      /// If you don't have TBB available, you can test this class by
-      /// substituting in a TbbRecursiveTsqr<LocalOrdinal, Scalar>
-      /// object.  That is a nonparallel implementation that emulates
-      /// the control flow of TbbParallelTsqr.  If you do this, you
-      /// should also change the FactorOutput public typedef.
-      ///
-      /// \note This is NOT a use of the pImpl idiom, because the
-      ///   point of the pImpl idiom is to avoid including the
-      ///   implementation details of the header file of the
-      ///   implementation class.  Here, the implementation class is
-      ///   templated, so we have to include the implementation class'
-      ///   implementation details.
-      TbbParallelTsqr<LocalOrdinal, Scalar, TimerType> impl_;
-
-      // Collected running statistcs on various computations
-      mutable TimeStats factorStats_;
-      mutable TimeStats applyStats_;
-      mutable TimeStats explicitQStats_;
-      mutable TimeStats cacheBlockStats_;
-      mutable TimeStats unCacheBlockStats_;
-
-      // Timers for various computations
-      mutable TimerType factorTimer_;
-      mutable TimerType applyTimer_;
-      mutable TimerType explicitQTimer_;
-      mutable TimerType cacheBlockTimer_;
-      mutable TimerType unCacheBlockTimer_;
-
-    public:
-      typedef Scalar scalar_type;
-      typedef typename Teuchos::ScalarTraits<Scalar>::magnitudeType magnitude_type;
-      typedef LocalOrdinal ordinal_type;
-
-      /// \typedef FactorOutput
-      /// \brief Type of partial output of TBB TSQR.
-      ///
-      /// If you don't have TBB available, you can test this class by
-      /// substituting in "typename TbbRecursiveTsqr<LocalOrdinal,
-      /// Scalar>::FactorOutput" for the typedef's definition.  If you
-      /// do this, you should also change the type of \c impl_ above.
-      typedef typename TbbParallelTsqr<LocalOrdinal, Scalar, TimerType>::FactorOutput FactorOutput;
-
-      /// \brief Constructor.
-      ///
-      /// \param numCores [in] Maximum number of processing cores to use
-      ///   when factoring the matrix.  Fewer cores may be used if the
-      ///   matrix is not big enough to justify their use.
-      ///
-      /// \param cacheSizeHint [in] Cache block size hint (in bytes)
-      ///   to use in the sequential part of TSQR.  If zero or not
-      ///   specified, a reasonable default is used.  If each CPU core
-      ///   has a private cache, that cache's size (minus a little
-      ///   wiggle room) would be the appropriate value for this
-      ///   parameter.  Set to zero for the implementation to choose a
-      ///   reasonable default.
-      TbbTsqr (const size_t numCores,
-               const size_t cacheSizeHint = 0) :
-        impl_ (numCores, cacheSizeHint),
-        factorTimer_ ("TbbTsqr::factor"),
-        applyTimer_ ("TbbTsqr::apply"),
-        explicitQTimer_ ("TbbTsqr::explicit_Q"),
-        cacheBlockTimer_ ("TbbTsqr::cache_block"),
-        unCacheBlockTimer_ ("TbbTsqr::un_cache_block")
-      {}
-
-      /// \brief Constructor (that takes a parameter list).
-      ///
-      /// \param plist [in/out] On input: list of TbbTsqr parameters.
-      ///   On output: missing parameters are filled in with default
-      ///   values.
-      ///
-      /// For a list of accepted parameters and thei documentation,
-      /// see the parameter list returned by \c getValidParameters().
-      TbbTsqr (const Teuchos::RCP<Teuchos::ParameterList>& plist) :
-        impl_ (plist),
-        factorTimer_ ("TbbTsqr::factor"),
-        applyTimer_ ("TbbTsqr::apply"),
-        explicitQTimer_ ("TbbTsqr::explicit_Q"),
-        cacheBlockTimer_ ("TbbTsqr::cache_block"),
-        unCacheBlockTimer_ ("TbbTsqr::un_cache_block")
-      {}
-
-      /// \brief Constructor (that uses default parameters).
-      ///
-      /// \param plist [in/out] On input: list of TbbTsqr parameters.
-      ///   On output: missing parameters are filled in with default
-      ///   values.
-      ///
-      /// For a list of accepted parameters and thei documentation,
-      /// see the parameter list returned by \c getValidParameters().
-      TbbTsqr () :
-        impl_ (Teuchos::null),
-        factorTimer_ ("TbbTsqr::factor"),
-        applyTimer_ ("TbbTsqr::apply"),
-        explicitQTimer_ ("TbbTsqr::explicit_Q"),
-        cacheBlockTimer_ ("TbbTsqr::cache_block"),
-        unCacheBlockTimer_ ("TbbTsqr::un_cache_block")
-      {}
-
-      Teuchos::RCP<const Teuchos::ParameterList>
-      getValidParameters () const
-      {
-        return impl_.getValidParameters ();
-      }
-
-      void
-      setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist)
-      {
-        impl_.setParameterList (plist);
-      }
-
-      /// \brief Number of tasks that TSQR will use to solve the problem.
-      ///
-      /// This is the number of subproblems into which to divide the
-      /// main problem, in order to solve it in parallel.
-      size_t ntasks() const { return impl_.ntasks(); }
-
-      //! Cache size hint (in bytes) used for the factorization.
-      size_t cache_size_hint() const { return impl_.cache_size_hint(); }
-
-      /// Whether or not this QR factorization produces an R factor
-      /// with all nonnegative diagonal entries.
-      static bool QR_produces_R_factor_with_nonnegative_diagonal() {
-        typedef TbbParallelTsqr< LocalOrdinal, Scalar, TimerType > impl_type;
-        return impl_type::QR_produces_R_factor_with_nonnegative_diagonal();
-      }
-
-      //! Whether this object is ready to perform computations.
-      bool ready() const {
-        return true;
-      }
-
-      /// \brief One-line description of this object.
-      ///
-      /// This implements Teuchos::Describable::description().  For now,
-      /// SequentialTsqr uses the default implementation of
-      /// Teuchos::Describable::describe().
-      std::string description () const {
-        using std::endl;
-
-        // SequentialTsqr also implements Describable, so if you
-        // decide to implement describe(), you could call
-        // SequentialTsqr's describe() and get a nice hierarchy of
-        // descriptions.
-        std::ostringstream os;
-        os << "Intranode Tall Skinny QR (TSQR): "
-           << "Intel Threading Building Blocks (TBB) implementation"
-           << ", max " << ntasks() << "-way parallelism"
-           << ", cache size hint of " << cache_size_hint() << " bytes.";
-        return os.str();
-      }
-
-      void
-      cache_block (const LocalOrdinal nrows,
-                   const LocalOrdinal ncols,
-                   Scalar A_out[],
-                   const Scalar A_in[],
-                   const LocalOrdinal lda_in) const
-      {
-        cacheBlockTimer_.start(true);
-        impl_.cache_block (nrows, ncols, A_out, A_in, lda_in);
-        cacheBlockStats_.update (cacheBlockTimer_.stop());
-      }
-
-      void
-      un_cache_block (const LocalOrdinal nrows,
-                      const LocalOrdinal ncols,
-                      Scalar A_out[],
-                      const LocalOrdinal lda_out,
-                      const Scalar A_in[]) const
-      {
-        unCacheBlockTimer_.start(true);
-        impl_.un_cache_block (nrows, ncols, A_out, lda_out, A_in);
-        unCacheBlockStats_.update (unCacheBlockTimer_.stop());
-      }
-
-      void
-      fill_with_zeros (const LocalOrdinal nrows,
-                       const LocalOrdinal ncols,
-                       Scalar C[],
-                       const LocalOrdinal ldc,
-                       const bool contiguous_cache_blocks) const
-      {
-        impl_.fill_with_zeros (nrows, ncols, C, ldc, contiguous_cache_blocks);
-      }
-
-      template< class MatrixViewType >
-      MatrixViewType
-      top_block (const MatrixViewType& C,
-                 const bool contiguous_cache_blocks) const
-      {
-        return impl_.top_block (C, contiguous_cache_blocks);
-      }
-
-      /// \brief Compute QR factorization of the dense matrix A
-      ///
-      /// Compute the QR factorization of the dense matrix A.
-      ///
-      /// \param nrows [in] Number of rows of A.
-      ///   Precondition: nrows >= ncols.
-      ///
-      /// \param ncols [in] Number of columns of A.
-      ///   Precondition: nrows >= ncols.
-      ///
-      /// \param A [in,out] On input, the matrix to factor, stored as a
-      ///   general dense matrix in column-major order.  On output,
-      ///   overwritten with an implicit representation of the Q factor.
-      ///
-      /// \param lda [in] Leading dimension of A.
-      ///   Precondition: lda >= nrows.
-      ///
-      /// \param R [out] The final R factor of the QR factorization of
-      ///   the matrix A.  An ncols by ncols upper triangular matrix
-      ///   stored in column-major order, with leading dimension ldr.
-      ///
-      /// \param ldr [in] Leading dimension of the matrix R.
-      ///
-      /// \param b_contiguous_cache_blocks [in] Whether cache blocks are
-      ///   stored contiguously in the input matrix A and the output
-      ///   matrix Q (of explicit_Q()).  If not and you want them to be,
-      ///   you should use the cache_block() method to copy them into
-      ///   that format.  You may use the un_cache_block() method to
-      ///   copy them out of that format into the usual column-oriented
-      ///   format.
-      ///
-      /// \return FactorOutput struct, which together with the data in A
-      ///   form an implicit representation of the Q factor.  They
-      ///   should be passed into the apply() and explicit_Q() functions
-      ///   as the "factor_output" parameter.
-      FactorOutput
-      factor (const LocalOrdinal nrows,
-              const LocalOrdinal ncols,
-              Scalar A[],
-              const LocalOrdinal lda,
-              Scalar R[],
-              const LocalOrdinal ldr,
-              const bool contiguous_cache_blocks) const
-      {
-        factorTimer_.start(true);
-        return impl_.factor (nrows, ncols, A, lda, R, ldr, contiguous_cache_blocks);
-        factorStats_.update (factorTimer_.stop());
-      }
-
-      /// \brief Apply Q factor to the global dense matrix C
-      ///
-      /// Apply the Q factor (computed by factor() and represented
-      /// implicitly) to the dense matrix C.
-      ///
-      /// \param apply_type [in] Whether to compute Q*C, Q^T * C, or
-      ///   Q^H * C.
-      ///
-      /// \param nrows [in] Number of rows of the matrix C and the
-      ///   matrix Q.  Precondition: nrows >= ncols_Q, ncols_C.
-      ///
-      /// \param ncols_Q [in] Number of columns of Q
-      ///
-      /// \param Q [in] Same as the "A" output of factor()
-      ///
-      /// \param ldq [in] Same as the "lda" input of factor()
-      ///
-      /// \param factor_output [in] Return value of factor()
-      ///
-      /// \param ncols_C [in] Number of columns in C.
-      ///   Precondition: nrows_local >= ncols_C.
-      ///
-      /// \param C [in,out] On input, the matrix C, stored as a general
-      ///   dense matrix in column-major order.  On output, overwritten
-      ///   with op(Q)*C, where op(Q) = Q or Q^T.
-      ///
-      /// \param ldc [in] Leading dimension of C.
-      ///   Precondition: ldc_local >= nrows_local.
-      ///   Not applicable if C is cache-blocked in place.
-      ///
-      /// \param contiguous_cache_blocks [in] Whether or not cache
-      ///   blocks of Q and C are stored contiguously (default:
-      ///   false).
-      void
-      apply (const ApplyType& apply_type,
-             const LocalOrdinal nrows,
-             const LocalOrdinal ncols_Q,
-             const Scalar Q[],
-             const LocalOrdinal ldq,
-             const FactorOutput& factor_output,
-             const LocalOrdinal ncols_C,
-             Scalar C[],
-             const LocalOrdinal ldc,
-             const bool contiguous_cache_blocks) const
-      {
-        applyTimer_.start(true);
-        impl_.apply (apply_type, nrows, ncols_Q, Q, ldq, factor_output,
-                     ncols_C, C, ldc, contiguous_cache_blocks);
-        applyStats_.update (applyTimer_.stop());
-      }
-
-      /// \brief Compute the explicit Q factor from factor()
-      ///
-      /// Compute the explicit version of the Q factor computed by
-      /// factor() and represented implicitly (via Q_in and
-      /// factor_output).
-      ///
-      /// \param nrows [in] Number of rows of the matrix Q_in.  Also,
-      ///   the number of rows of the output matrix Q_out.
-      ///   Precondition: nrows >= ncols_Q_in.
-      ///
-      /// \param ncols_Q_in [in] Number of columns in the original matrix
-      ///   A, whose explicit Q factor we are computing.
-      ///   Precondition: nrows >= ncols_Q_in.
-      ///
-      /// \param Q_local_in [in] Same as A output of factor().
-      ///
-      /// \param ldq_local_in [in] Same as lda input of factor()
-      ///
-      /// \param ncols_Q_out [in] Number of columns of the explicit Q
-      ///   factor to compute.
-      ///
-      /// \param Q_out [out] The explicit representation of the Q factor.
-      ///
-      /// \param ldq_out [in] Leading dimension of Q_out.
-      ///
-      /// \param factor_output [in] Return value of factor().
-      void
-      explicit_Q (const LocalOrdinal nrows,
-                  const LocalOrdinal ncols_Q_in,
-                  const Scalar Q_in[],
-                  const LocalOrdinal ldq_in,
-                  const FactorOutput& factor_output,
-                  const LocalOrdinal ncols_Q_out,
-                  Scalar Q_out[],
-                  const LocalOrdinal ldq_out,
-                  const bool contiguous_cache_blocks) const
-      {
-        explicitQTimer_.start(true);
-        impl_.explicit_Q (nrows, ncols_Q_in, Q_in, ldq_in, factor_output,
-                          ncols_Q_out, Q_out, ldq_out, contiguous_cache_blocks);
-        explicitQStats_.update (explicitQTimer_.stop());
-      }
-
-      /// \brief Compute Q*B
-      ///
-      /// Compute matrix-matrix product Q*B, where Q is nrows by ncols
-      /// and B is ncols by ncols.  Respect cache blocks of Q.
-      void
-      Q_times_B (const LocalOrdinal nrows,
-                 const LocalOrdinal ncols,
-                 Scalar Q[],
-                 const LocalOrdinal ldq,
-                 const Scalar B[],
-                 const LocalOrdinal ldb,
-                 const bool contiguous_cache_blocks) const
-      {
-        impl_.Q_times_B (nrows, ncols, Q, ldq, B, ldb, contiguous_cache_blocks);
-      }
-
-      /// Compute SVD \f$R = U \Sigma V^*\f$, not in place.  Use the
-      /// resulting singular values to compute the numerical rank of R,
-      /// with respect to the relative tolerance tol.  If R is full
-      /// rank, return without modifying R.  If R is not full rank,
-      /// overwrite R with \f$\Sigma \cdot V^*\f$.
-      ///
-      /// \return Numerical rank of R: 0 <= rank <= ncols.
-      LocalOrdinal
-      reveal_R_rank (const LocalOrdinal ncols,
-                     Scalar R[],
-                     const LocalOrdinal ldr,
-                     Scalar U[],
-                     const LocalOrdinal ldu,
-                     const magnitude_type tol) const
-      {
-        return impl_.reveal_R_rank (ncols, R, ldr, U, ldu, tol);
-      }
-
-      /// \brief Rank-revealing decomposition
-      ///
-      /// Using the R factor from factor() and the explicit Q factor
-      /// from explicit_Q(), compute the SVD of R (\f$R = U \Sigma
-      /// V^*\f$).  R.  If R is full rank (with respect to the given
-      /// relative tolerance tol), don't change Q or R.  Otherwise,
-      /// compute \f$Q := Q \cdot U\f$ and \f$R := \Sigma V^*\f$ in
-      /// place (the latter may be no longer upper triangular).
-      ///
-      /// \return Rank \f$r\f$ of R: \f$ 0 \leq r \leq ncols\f$.
-      ///
-      LocalOrdinal
-      reveal_rank (const LocalOrdinal nrows,
-                   const LocalOrdinal ncols,
-                   Scalar Q[],
-                   const LocalOrdinal ldq,
-                   Scalar R[],
-                   const LocalOrdinal ldr,
-                   const magnitude_type tol,
-                   const bool contiguous_cache_blocks) const
-      {
-        return impl_.reveal_rank (nrows, ncols, Q, ldq, R, ldr, tol,
-                                  contiguous_cache_blocks);
-      }
-
-      double
-      min_seq_factor_timing () const { return impl_.min_seq_factor_timing(); }
-      double
-      max_seq_factor_timing () const { return impl_.max_seq_factor_timing(); }
-      double
-      min_seq_apply_timing () const { return impl_.min_seq_apply_timing(); }
-      double
-      max_seq_apply_timing () const { return impl_.max_seq_apply_timing(); }
-
-      void getStats (std::vector< TimeStats >& stats) {
-        const int numStats = 5;
-        stats.resize (numStats);
-        stats[0] = factorStats_;
-        stats[1] = applyStats_;
-        stats[2] = explicitQStats_;
-        stats[3] = cacheBlockStats_;
-        stats[4] = unCacheBlockStats_;
-      }
-
-      void getStatsLabels (std::vector< std::string >& labels) {
-        const int numStats = 5;
-        labels.resize (numStats);
-        labels[0] = factorTimer_.name();
-        labels[1] = applyTimer_.name();
-        labels[2] = explicitQTimer_.name();
-        labels[3] = cacheBlockTimer_.name();
-        labels[4] = unCacheBlockTimer_.name();
-      }
-    }; // class TbbTsqr
-  } // namespace TBB
-} // namespace TSQR
-
-#endif // __TSQR_TbbTsqr_hpp
diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp
deleted file mode 100644
index 0caff734b512..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr_ApplyTask.hpp
+++ /dev/null
@@ -1,228 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_TBB_ApplyTask_hpp
-#define __TSQR_TBB_ApplyTask_hpp
-
-#include <tbb/task.h>
-#include "TbbTsqr_Partitioner.hpp"
-#include "Tsqr_SequentialTsqr.hpp"
-
-namespace TSQR {
-  namespace TBB {
-
-    /// \class ApplyTask
-    /// \brief TBB task for recursive TSQR "apply Q factor" phase.
-    ///
-    template< class LocalOrdinal, class Scalar, class TimerType >
-    class ApplyTask : public tbb::task {
-    public:
-      typedef MatView<LocalOrdinal, Scalar> mat_view_type;
-      typedef MatView<LocalOrdinal, const Scalar> const_mat_view_type;
-      typedef std::pair<mat_view_type, mat_view_type> split_t;
-      typedef std::pair<const_mat_view_type, const_mat_view_type> const_split_t;
-      typedef std::pair<const_mat_view_type, mat_view_type> top_blocks_t;
-      typedef std::vector<top_blocks_t> array_top_blocks_t;
-
-      /// \typedef SeqOutput
-      /// Result of SequentialTsqr for each thread.
-      typedef typename SequentialTsqr<LocalOrdinal, Scalar>::FactorOutput SeqOutput;
-      /// \typedef ParOutput
-      ///
-      /// Array of ncores "local tau arrays" from parallel TSQR.
-      /// (Local Q factors are stored in place.)
-      typedef std::vector<std::vector<Scalar> > ParOutput;
-      /// \typedef FactorOutput
-      /// Result of SequentialTsqr for the data on each thread,
-      /// and the result of combining the threads' data.
-      typedef typename std::pair<std::vector<SeqOutput>, ParOutput> FactorOutput;
-
-      /// \brief Constructor.
-      ///
-      /// \note The timing references are only modified by one thread
-      ///   at a time; recursive calls use distinct references and
-      ///   combine the results.
-      ApplyTask (const size_t P_first__,
-                 const size_t P_last__,
-                 const_mat_view_type Q,
-                 mat_view_type C,
-                 array_top_blocks_t& top_blocks,
-                 const FactorOutput& factor_output,
-                 const SequentialTsqr<LocalOrdinal, Scalar>& seq,
-                 double& my_seq_timing,
-                 double& min_seq_timing,
-                 double& max_seq_timing,
-                 const bool contiguous_cache_blocks) :
-        P_first_ (P_first__),
-        P_last_ (P_last__),
-        Q_ (Q),
-        C_ (C),
-        top_blocks_ (top_blocks),
-        factor_output_ (factor_output),
-        seq_ (seq),
-        apply_type_ (ApplyType::NoTranspose), // FIXME: modify to support Q^T and Q^H
-        my_seq_timing_ (my_seq_timing),
-        min_seq_timing_ (min_seq_timing),
-        max_seq_timing_ (max_seq_timing),
-        contiguous_cache_blocks_ (contiguous_cache_blocks)
-      {}
-
-      tbb::task* execute ()
-      {
-        if (P_first_ > P_last_ || Q_.empty() || C_.empty())
-          return NULL;
-        else if (P_first_ == P_last_)
-          {
-            execute_base_case ();
-            return NULL;
-          }
-        else
-          {
-            // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last]
-            const size_t P_mid = (P_first_ + P_last_) / 2;
-            const_split_t Q_split =
-              partitioner_.split (Q_, P_first_, P_mid, P_last_,
-                                  contiguous_cache_blocks_);
-            split_t C_split =
-              partitioner_.split (C_, P_first_, P_mid, P_last_,
-                                  contiguous_cache_blocks_);
-
-            // The partitioner may decide that the current blocks Q_
-            // and C_ have too few rows to be worth splitting.  In
-            // that case, Q_split.second and C_split.second (the
-            // bottom block) will be empty.  We can deal with this by
-            // treating it as the base case.
-            if (Q_split.second.empty() || Q_split.second.extent(0) == 0)
-              {
-                execute_base_case ();
-                return NULL;
-              }
-
-            double top_timing;
-            double top_min_timing = 0.0;
-            double top_max_timing = 0.0;
-            double bot_timing;
-            double bot_min_timing = 0.0;
-            double bot_max_timing = 0.0;
-
-            apply_pair (P_first_, P_mid+1);
-            ApplyTask& topTask = *new( allocate_child() )
-              ApplyTask (P_first_, P_mid, Q_split.first, C_split.first,
-                         top_blocks_, factor_output_, seq_,
-                         top_timing, top_min_timing, top_max_timing,
-                         contiguous_cache_blocks_);
-            ApplyTask& botTask = *new( allocate_child() )
-              ApplyTask (P_mid+1, P_last_, Q_split.second, C_split.second,
-                         top_blocks_, factor_output_, seq_,
-                         bot_timing, bot_min_timing, bot_max_timing,
-                         contiguous_cache_blocks_);
-
-            set_ref_count (3); // 3 children (2 + 1 for the wait)
-            spawn (topTask);
-            spawn_and_wait_for_all (botTask);
-
-            top_min_timing = (top_min_timing == 0.0) ? top_timing : top_min_timing;
-            top_max_timing = (top_max_timing == 0.0) ? top_timing : top_max_timing;
-
-            bot_min_timing = (bot_min_timing == 0.0) ? bot_timing : bot_min_timing;
-            bot_max_timing = (bot_max_timing == 0.0) ? bot_timing : bot_max_timing;
-
-            min_seq_timing_ = std::min (top_min_timing, bot_min_timing);
-            max_seq_timing_ = std::min (top_max_timing, bot_max_timing);
-
-            return NULL;
-          }
-      }
-
-    private:
-      size_t P_first_, P_last_;
-      const_mat_view_type Q_;
-      mat_view_type C_;
-      array_top_blocks_t& top_blocks_;
-      const FactorOutput& factor_output_;
-      SequentialTsqr<LocalOrdinal, Scalar> seq_;
-      TSQR::ApplyType apply_type_;
-      TSQR::Combine<LocalOrdinal, Scalar> combine_;
-      Partitioner<LocalOrdinal, Scalar> partitioner_;
-      double& my_seq_timing_;
-      double& min_seq_timing_;
-      double& max_seq_timing_;
-      bool contiguous_cache_blocks_;
-
-      void
-      execute_base_case ()
-      {
-        TimerType timer("");
-        timer.start();
-        const std::vector<SeqOutput>& seq_outputs = factor_output_.first;
-        seq_.apply (apply_type_, Q_.extent(0), Q_.extent(1),
-                    Q_.data(), Q_.stride(1), seq_outputs[P_first_],
-                    C_.extent(1), C_.data(), C_.stride(1),
-                    contiguous_cache_blocks_);
-        my_seq_timing_ = timer.stop();
-      }
-
-      void
-      apply_pair (const size_t P_top,
-                  const size_t P_bot)
-      {
-        if (P_top == P_bot)
-          throw std::logic_error("apply_pair: should never get here!");
-
-        const_mat_view_type& Q_bot = top_blocks_[P_bot].first;
-        mat_view_type& C_top = top_blocks_[P_top].second;
-        mat_view_type& C_bot = top_blocks_[P_bot].second;
-
-        const ParOutput& par_output = factor_output_.second;
-        const std::vector<Scalar>& tau = par_output[P_bot];
-        std::vector<Scalar> work (C_top.extent(1));
-        combine_.apply_pair (apply_type_,
-                             C_top.extent(1), Q_bot.extent(1),
-                             Q_bot.data(), Q_bot.stride(1), tau.data(),
-                             C_top.data(), C_top.stride(1),
-                             C_bot.data(), C_bot.stride(1), work.data());
-      }
-
-    };
-
-  } // namespace TBB
-} // namespace TSQR
-
-
-#endif // __TSQR_TBB_ApplyTask_hpp
diff --git a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp
deleted file mode 100644
index 8827a1ce4091..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr_CacheBlockTask.hpp
+++ /dev/null
@@ -1,146 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_TBB_CacheBlockTask_hpp
-#define __TSQR_TBB_CacheBlockTask_hpp
-
-#include <tbb/task.h>
-#include "TbbTsqr_Partitioner.hpp"
-#include "Tsqr_SequentialTsqr.hpp"
-
-namespace TSQR {
-  namespace TBB {
-    /// \class CacheBlockTask
-    /// \brief TBB task for recursive TSQR cache blocking phase.
-    ///
-    /// "Cache blocking" here means copying the input matrix, which is
-    /// stored with noncontiguous cache blocks, to the output matrix,
-    /// which is stored with contiguous cache blocks.
-    template<class LocalOrdinal, class Scalar>
-    class CacheBlockTask : public tbb::task {
-    public:
-      typedef MatView<LocalOrdinal, Scalar> mat_view_type;
-      typedef MatView<LocalOrdinal, const Scalar> const_mat_view_type;
-      typedef std::pair<mat_view_type, mat_view_type> split_t;
-      typedef std::pair<const_mat_view_type, const_mat_view_type> const_split_t;
-
-      CacheBlockTask (const size_t P_first__,
-                      const size_t P_last__,
-                      mat_view_type& A_out,
-                      const_mat_view_type& A_in,
-                      const SequentialTsqr<LocalOrdinal, Scalar>& seq) :
-        P_first_ (P_first__),
-        P_last_ (P_last__),
-        A_out_ (A_out),
-        A_in_ (A_in),
-        seq_ (seq)
-      {}
-
-      tbb::task* execute ()
-      {
-        using tbb::task;
-
-        if (P_first_ > P_last_ || A_out_.empty() || A_in_.empty())
-          return nullptr;
-        else if (P_first_ == P_last_)
-          {
-            execute_base_case ();
-            return nullptr;
-          }
-        else
-          {
-            // Recurse on two intervals: [P_first, P_mid] and
-            // [P_mid+1, P_last].
-            const size_t P_mid = (P_first_ + P_last_) / 2;
-            split_t out_split =
-              partitioner_.split (A_out_, P_first_, P_mid, P_last_, true);
-            const_split_t in_split =
-              partitioner_.split (A_in_, P_first_, P_mid, P_last_, false);
-
-            // The partitioner may decide that the current blocks
-            // A_out_ and A_in_ have too few rows to be worth
-            // splitting.  (It should split both A_out_ and A_in_ in
-            // the same way.)  In that case, out_split.second and
-            // in_split.second (the bottom block) will be empty.  We
-            // can deal with this by treating it as the base case.
-            if (out_split.second.empty() || out_split.second.extent(0) == 0)
-              {
-                execute_base_case ();
-                return nullptr;
-              }
-
-            // "c": continuation task
-            tbb::empty_task& c =
-              *new( allocate_continuation() ) tbb::empty_task;
-            // Recurse on the split
-            CacheBlockTask& topTask = *new( c.allocate_child() )
-              CacheBlockTask (P_first_, P_mid, out_split.first,
-                              in_split.first, seq_);
-            CacheBlockTask& botTask = *new( c.allocate_child() )
-              CacheBlockTask (P_mid+1, P_last_, out_split.second,
-                              in_split.second, seq_);
-            // Set reference count of parent (in this case, the
-            // continuation task) to 2 (since 2 children -- no
-            // additional task since no waiting).
-            c.set_ref_count (2);
-            c.spawn (botTask);
-            return &topTask; // scheduler bypass optimization
-          }
-      }
-
-    private:
-      size_t P_first_, P_last_;
-      mat_view_type A_out_;
-      const_mat_view_type A_in_;
-      SequentialTsqr<LocalOrdinal, Scalar> seq_;
-      Partitioner<LocalOrdinal, Scalar> partitioner_;
-
-      void
-      execute_base_case ()
-      {
-        seq_.cache_block (A_out_.extent(0), A_out_.extent(1),
-                          A_out_.data(), A_in_.data(), A_in_.stride(1));
-      }
-    };
-
-  } // namespace TBB
-} // namespace TSQR
-
-
-#endif // __TSQR_TBB_CacheBlockTask_hpp
diff --git a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp
deleted file mode 100644
index b0ce1e40f6c2..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr_ExplicitQTask.hpp
+++ /dev/null
@@ -1,147 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_TBB_ExplicitQTask_hpp
-#define __TSQR_TBB_ExplicitQTask_hpp
-
-#include <tbb/task.h>
-#include "TbbTsqr_Partitioner.hpp"
-#include "Tsqr_SequentialTsqr.hpp"
-
-namespace TSQR {
-  namespace TBB {
-    /// \class ExplicitQTask
-    /// \brief TBB task for recursive TSQR "compute explicit Q" phase.
-    template< class LocalOrdinal, class Scalar >
-    class ExplicitQTask : public tbb::task {
-    public:
-      typedef MatView<LocalOrdinal, Scalar> mat_view_type;
-      typedef MatView<LocalOrdinal, const Scalar> const_mat_view_type;
-
-    private:
-      typedef std::pair<mat_view_type, mat_view_type> split_t;
-      typedef std::pair<const_mat_view_type, const_mat_view_type> const_split_t;
-
-    public:
-      ExplicitQTask (const size_t P_first__,
-                     const size_t P_last__,
-                     mat_view_type Q_out,
-                     const SequentialTsqr<LocalOrdinal, Scalar>& seq,
-                     const bool contiguous_cache_blocks) :
-        P_first_ (P_first__), P_last_ (P_last__), Q_out_ (Q_out),
-        seq_ (seq), contiguous_cache_blocks_ (contiguous_cache_blocks)
-      {}
-
-      tbb::task* execute ()
-      {
-        if (P_first_ > P_last_ || Q_out_.empty ()) {
-          return NULL;
-        }
-        else if (P_first_ == P_last_) {
-          execute_base_case ();
-          return NULL;
-        }
-        else {
-          // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last]
-          const size_t P_mid = (P_first_ + P_last_) / 2;
-          split_t Q_split =
-            partitioner_.split (Q_out_, P_first_, P_mid, P_last_,
-                                contiguous_cache_blocks_);
-          // The partitioner may decide that the current block Q_out
-          // has too few rows to be worth splitting.  In that case,
-          // Q_split.second (the bottom block) will be empty.  We
-          // can deal with this by treating it as the base case.
-          if (Q_split.second.empty() || Q_split.second.extent(0) == 0) {
-            execute_base_case ();
-            return NULL;
-          }
-
-          // "c": continuation task
-          tbb::empty_task& c =
-            *new( allocate_continuation() ) tbb::empty_task;
-          // Recurse on the split
-          ExplicitQTask& topTask = *new( c.allocate_child() )
-            ExplicitQTask (P_first_, P_mid, Q_split.first, seq_,
-                           contiguous_cache_blocks_);
-          ExplicitQTask& botTask = *new( c.allocate_child() )
-            ExplicitQTask (P_mid+1, P_last_, Q_split.second, seq_,
-                           contiguous_cache_blocks_);
-          // Set reference count of parent (in this case, the
-          // continuation task) to 2 (since 2 children -- no
-          // additional task since no waiting).
-          c.set_ref_count (2);
-          c.spawn (botTask);
-          return &topTask; // scheduler bypass optimization
-        }
-      }
-
-    private:
-      size_t P_first_, P_last_;
-      mat_view_type Q_out_;
-      SequentialTsqr<LocalOrdinal, Scalar> seq_;
-      Partitioner<LocalOrdinal, Scalar> partitioner_;
-      bool contiguous_cache_blocks_;
-
-      void
-      execute_base_case ()
-      {
-        // Fill my partition with zeros.
-        seq_.fill_with_zeros (Q_out_.extent(0), Q_out_.extent(1),
-                              Q_out_.data(), Q_out_.stride(1),
-                              contiguous_cache_blocks_);
-        // If our partition is the first (topmost), fill it with
-        // the first Q_out.extent(1) columns of the identity matrix.
-        if (P_first_ == 0) {
-          // Fetch the topmost cache block of my partition.  Its
-          // leading dimension should be set correctly by
-          // top_block().
-          mat_view_type Q_out_top =
-            seq_.top_block (Q_out_, contiguous_cache_blocks_);
-          // Set the top block of Q_out to the first ncols
-          // columns of the identity matrix.
-          for (LocalOrdinal j = 0; j < Q_out_top.extent(1); ++j) {
-            Q_out_top(j,j) = Scalar(1);
-          }
-        }
-      }
-    };
-  } // namespace TBB
-} // namespace TSQR
-
-#endif // __TSQR_TBB_ExplicitQTask_hpp
diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp
deleted file mode 100644
index e03757db9e18..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr_FactorTask.hpp
+++ /dev/null
@@ -1,231 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_TBB_FactorTask_hpp
-#define __TSQR_TBB_FactorTask_hpp
-
-#include <tbb/task.h>
-#include "TbbTsqr_Partitioner.hpp"
-#include "Tsqr_SequentialTsqr.hpp"
-#include "Teuchos_Assert.hpp"
-#include <algorithm>
-
-namespace TSQR {
-  namespace TBB {
-    /// \class FactorTask
-    /// \brief TBB task for recursive TSQR factorization phase.
-    template<class LocalOrdinal, class Scalar, class TimerType>
-    class FactorTask : public tbb::task {
-    public:
-      typedef MatView<LocalOrdinal, Scalar> mat_view_type;
-      typedef MatView<LocalOrdinal, const Scalar> const_mat_view_type;
-      typedef std::pair<mat_view_type, mat_view_type> split_t;
-      typedef std::pair<const_mat_view_type, const_mat_view_type> const_split_t;
-
-      /// \typedef SeqOutput
-      /// Result of SequentialTsqr for each thread.
-      typedef typename SequentialTsqr<LocalOrdinal, Scalar>::FactorOutput SeqOutput;
-      /// \typedef ParOutput
-      ///
-      /// Array of ncores "local tau arrays" from parallel TSQR.
-      /// (Local Q factors are stored in place.)
-      typedef std::vector<std::vector<Scalar> > ParOutput;
-      /// \typedef FactorOutput
-      /// Result of SequentialTsqr for the data on each thread,
-      /// and the result of combining the threads' data.
-      typedef typename std::pair<std::vector<SeqOutput>, ParOutput> FactorOutput;
-
-      /// \brief Constructor.
-      ///
-      /// \note The timing references are only modified by one thread
-      ///   at a time; recursive calls use distinct references and
-      ///   combine the results.
-      FactorTask (const size_t P_first__,
-                  const size_t P_last__,
-                  mat_view_type A,
-                  mat_view_type* const A_top_ptr,
-                  std::vector<SeqOutput>& seq_outputs,
-                  ParOutput& par_output,
-                  const SequentialTsqr<LocalOrdinal, Scalar>& seq,
-                  double& my_seq_timing,
-                  double& min_seq_timing,
-                  double& max_seq_timing,
-                  const bool contiguous_cache_blocks) :
-        P_first_ (P_first__),
-        P_last_ (P_last__),
-        A_ (A),
-        A_top_ptr_ (A_top_ptr),
-        seq_outputs_ (seq_outputs),
-        par_output_ (par_output),
-        seq_ (seq),
-        contiguous_cache_blocks_ (contiguous_cache_blocks),
-        my_seq_timing_ (my_seq_timing),
-        min_seq_timing_ (min_seq_timing),
-        max_seq_timing_ (max_seq_timing)
-      {}
-
-      tbb::task* execute ()
-      {
-        if (P_first_ > P_last_ || A_.empty())
-          return NULL;
-        else if (P_first_ == P_last_)
-          {
-            execute_base_case ();
-            return NULL;
-          }
-        else
-          {
-            // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last]
-            const size_t P_mid = (P_first_ + P_last_) / 2;
-            split_t A_split =
-              partitioner_.split (A_, P_first_, P_mid, P_last_,
-                                  contiguous_cache_blocks_);
-            // The partitioner may decide that the current block A_
-            // has too few rows to be worth splitting.  In that case,
-            // A_split.second (the bottom block) will be empty.  We
-            // can deal with this by treating it as the base case.
-            if (A_split.second.empty() || A_split.second.extent(0) == 0)
-              {
-                execute_base_case ();
-                return NULL;
-              }
-
-            double top_timing;
-            double top_min_timing = 0.0;
-            double top_max_timing = 0.0;
-            double bot_timing;
-            double bot_min_timing = 0.0;
-            double bot_max_timing = 0.0;
-
-            FactorTask& topTask = *new( allocate_child() )
-              FactorTask (P_first_, P_mid, A_split.first, A_top_ptr_,
-                          seq_outputs_, par_output_, seq_,
-                          top_timing, top_min_timing, top_max_timing,
-                          contiguous_cache_blocks_);
-            // After the task finishes, A_bot will be set to the topmost
-            // partition of A_split.second.  This will let us combine
-            // the two subproblems (using factor_pair()) after their
-            // tasks complete.
-            mat_view_type A_bot;
-            FactorTask& botTask = *new( allocate_child() )
-              FactorTask (P_mid+1, P_last_, A_split.second, &A_bot,
-                          seq_outputs_, par_output_, seq_,
-                          bot_timing, bot_min_timing, bot_max_timing,
-                          contiguous_cache_blocks_);
-            set_ref_count (3); // 3 children (2 + 1 for the wait)
-            spawn (topTask);
-            spawn_and_wait_for_all (botTask);
-
-            // Combine the two results
-            factor_pair (P_first_, P_mid+1, *A_top_ptr_, A_bot);
-
-            top_min_timing = (top_min_timing == 0.0) ? top_timing : top_min_timing;
-            top_max_timing = (top_max_timing == 0.0) ? top_timing : top_max_timing;
-
-            bot_min_timing = (bot_min_timing == 0.0) ? bot_timing : bot_min_timing;
-            bot_max_timing = (bot_max_timing == 0.0) ? bot_timing : bot_max_timing;
-
-            min_seq_timing_ = std::min (top_min_timing, bot_min_timing);
-            max_seq_timing_ = std::min (top_max_timing, bot_max_timing);
-
-            return NULL;
-          }
-      }
-
-    private:
-      const size_t P_first_, P_last_;
-      mat_view_type A_;
-      mat_view_type* const A_top_ptr_;
-      std::vector<SeqOutput>& seq_outputs_;
-      ParOutput& par_output_;
-      SequentialTsqr<LocalOrdinal, Scalar> seq_;
-      TSQR::Combine<LocalOrdinal, Scalar> combine_;
-      Partitioner<LocalOrdinal, Scalar> partitioner_;
-      const bool contiguous_cache_blocks_;
-      double& my_seq_timing_;
-      double& min_seq_timing_;
-      double& max_seq_timing_;
-
-      void
-      factor_pair (const size_t P_top,
-                   const size_t P_bot,
-                   mat_view_type& A_top, // different than A_top_
-                   mat_view_type& A_bot)
-      {
-        const char thePrefix[] = "TSQR::TBB::Factor::factor_pair: ";
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (P_top == P_bot, std::logic_error, thePrefix << "Should "
-           "never get here! P_top == P_bot (= " << P_top << "), that "
-           "is, the indices of the thread partitions are the same.");
-        // We only read and write the upper ncols x ncols triangle of
-        // each block.
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (A_top.extent(1) != A_bot.extent(1), std::logic_error,
-           thePrefix << "The top cache block A_top is "
-           << A_top.extent(0) << " x " << A_top.extent(1)
-           << ", and the bottom cache block A_bot is "
-           << A_bot.extent(0) << " x " << A_bot.extent(1)
-           << "; this means we can't factor [A_top; A_bot].");
-        const LocalOrdinal ncols = A_top.extent(1);
-        std::vector<Scalar>& tau = par_output_[P_bot];
-        std::vector<Scalar> work (ncols);
-        combine_.factor_pair (A_top, A_bot, tau.data(), work.data());
-      }
-
-      void
-      execute_base_case ()
-      {
-        TimerType timer("");
-        timer.start();
-        seq_outputs_[P_first_] =
-          seq_.factor (A_.extent(0), A_.extent(1), A_.data(),
-                       A_.stride(1), contiguous_cache_blocks_);
-        // Assign the topmost cache block of the current partition to
-        // *A_top_ptr_.  Every base case invocation does this, so that
-        // we can combine subproblems.  The root task also does this,
-        // but for a different reason: so that we can extract the R
-        // factor, once we're done with the factorization.
-        *A_top_ptr_ = seq_.top_block (A_, contiguous_cache_blocks_);
-        my_seq_timing_ = timer.stop();
-      }
-    };
-  } // namespace TBB
-} // namespace TSQR
-
-#endif // __TSQR_TBB_FactorTask_hpp
diff --git a/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp
deleted file mode 100644
index 8bc0f42264a7..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr_FillWithZerosTask.hpp
+++ /dev/null
@@ -1,135 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_TBB_FillWithZerosTask_hpp
-#define __TSQR_TBB_FillWithZerosTask_hpp
-
-#include <tbb/task.h>
-#include "TbbTsqr_Partitioner.hpp"
-#include "Tsqr_SequentialTsqr.hpp"
-
-namespace TSQR {
-  namespace TBB {
-    /// \class FillWithZerosTask
-    /// \brief TBB task for recursive TSQR "fill with zeros" phase.
-    template<class LocalOrdinal, class Scalar>
-    class FillWithZerosTask : public tbb::task {
-    public:
-      typedef MatView<LocalOrdinal, Scalar> mat_view_type;
-
-    private:
-      typedef std::pair<mat_view_type, mat_view_type> split_type;
-
-    public:
-      FillWithZerosTask (const size_t P_first,
-                         const size_t P_last,
-                         mat_view_type C,
-                         const SequentialTsqr<LocalOrdinal, Scalar>& seq,
-                         const bool contiguous_cache_blocks = false)
-        : P_first_ (P_first),
-          P_last_ (P_last),
-          C_ (C),
-          seq_ (seq),
-          contiguous_cache_blocks_ (contiguous_cache_blocks)
-      {}
-
-      tbb::task* execute ()
-      {
-        if (P_first_ > P_last_ || C_.empty()) {
-          return nullptr;
-        }
-        else if (P_first_ == P_last_) {
-          execute_base_case ();
-          return nullptr;
-        }
-        else {
-          // Recurse on two intervals: [P_first, P_mid] and
-          // [P_mid+1, P_last].
-          const size_t P_mid = (P_first_ + P_last_) / 2;
-          split_type C_split =
-            partitioner_.split (C_, P_first_, P_mid, P_last_,
-                                contiguous_cache_blocks_);
-          // The partitioner may decide that the current block C_
-          // has too few rows to be worth splitting.  In that case,
-          // C_split.second (the bottom block) will be empty.  We
-          // can deal with this by treating it as the base case.
-          if (C_split.second.empty() || C_split.second.extent(0) == 0) {
-            execute_base_case ();
-            return nullptr;
-          }
-
-          // "c": continuation task
-          tbb::empty_task& c =
-            *new( allocate_continuation() ) tbb::empty_task;
-          // Recurse on the split
-          FillWithZerosTask& topTask = *new( c.allocate_child() )
-            FillWithZerosTask (P_first_, P_mid, C_split.first, seq_,
-                               contiguous_cache_blocks_);
-          FillWithZerosTask& botTask = *new( c.allocate_child() )
-            FillWithZerosTask (P_mid+1, P_last_, C_split.second, seq_,
-                               contiguous_cache_blocks_);
-          // Set reference count of parent (in this case, the
-          // continuation task) to 2 (since 2 children -- no
-          // additional task since no waiting).
-          c.set_ref_count (2);
-          c.spawn (botTask);
-          return &topTask; // scheduler bypass optimization
-        }
-      }
-
-    private:
-      size_t P_first_, P_last_;
-      mat_view_type C_;
-      SequentialTsqr<LocalOrdinal, Scalar> seq_;
-      Partitioner<LocalOrdinal, Scalar> partitioner_;
-      bool contiguous_cache_blocks_;
-
-      void
-      execute_base_case ()
-      {
-        // Fill my partition with zeros.
-        seq_.fill_with_zeros (C_.extent(0), C_.extent(1), C_.data(),
-                              C_.stride(1), contiguous_cache_blocks_);
-      }
-    };
-  } // namespace TBB
-} // namespace TSQR
-
-
-#endif // __TSQR_TBB_FillWithZerosTask_hpp
diff --git a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp b/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp
deleted file mode 100644
index f37ab6a7a06c..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr_Partitioner.hpp
+++ /dev/null
@@ -1,137 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_TBB_Partitioner_hpp
-#define __TSQR_TBB_Partitioner_hpp
-
-#include "Tsqr_MatView.hpp"
-
-#include <cstring> // size_t
-#include <sstream>
-#include <stdexcept>
-#include <utility>
-#include <vector>
-
-namespace TSQR {
-  namespace TBB {
-    template<class Ordinal, class Scalar>
-    class Partitioner {
-    private:
-      bool
-      should_split (const Ordinal nrows,
-                    const Ordinal ncols,
-                    const size_t num_partitions) const
-      {
-        using std::invalid_argument;
-        using std::ostringstream;
-
-        if (nrows < ncols) {
-          ostringstream os;
-          os << "Partitioner::should_split: nrows (= " << nrows
-             << ") < ncols (= " << ncols << ")";
-          throw invalid_argument (os.str());
-        }
-        else if (num_partitions == 0) {
-          ostringstream os;
-          os << "Partitioner::should_split: nrows (= " << nrows
-             << ") < ncols (= " << ncols << ")";
-          throw invalid_argument (os.str());
-        }
-        // FIXME (mfh 11 Jul 2010) Need more overflow checks here.
-        return static_cast<size_t>(nrows) / num_partitions >= static_cast<size_t>(ncols);
-      }
-
-    public:
-      /// Partition into [P_first, P_mid] and [P_mid+1, P_last].  The
-      /// base case is reached when the second returned MatrixViewType
-      /// is empty.
-      template< class MatrixViewType >
-      std::pair< MatrixViewType, MatrixViewType >
-      split (const MatrixViewType& A,
-             const size_t P_first,
-             const size_t P_mid,
-             const size_t P_last,
-             const bool contiguous_cache_blocks) const
-      {
-        using ordinal_type = typename MatrixViewType::ordinal_type;
-        using pointer_type = typename MatrixViewType::pointer;
-
-        const size_t num_partitions_top = P_mid - P_first + 1;
-        //const size_t num_partitions_bottom = P_last - P_mid;
-        const size_t num_partitions = P_last - P_first + 1;
-        const ordinal_type nrows = A.extent(0);
-        const ordinal_type ncols = A.extent(1);
-
-        if (! should_split (nrows, ncols, num_partitions)) {
-          return std::make_pair (MatrixViewType(A), MatrixViewType());
-        }
-        else {
-          const ordinal_type num_rows_partition = nrows / num_partitions;
-          const ordinal_type remainder = nrows % num_partitions;
-
-          // Top partition gets the remainder rows.  Doing the
-          // multiplication before the division might make it more
-          // likely to avoid truncating the fraction, but may cause
-          // overflow of ordinal_type.
-          const ordinal_type num_rows_top =
-            num_rows_partition * num_partitions_top + remainder;
-          const ordinal_type num_rows_bot = nrows - num_rows_top;
-
-          // We don't call (const_)mat_view::split_top(), because that
-          // is for splitting off a single cache block.  Each half
-          // of the split may contain more than one cache block.
-          if (contiguous_cache_blocks) {
-            pointer_type A_bot_ptr = A.data() + num_rows_top * ncols;
-            MatrixViewType A_top (num_rows_top, ncols, A.data(), num_rows_top);
-            MatrixViewType A_bot (num_rows_bot, ncols, A_bot_ptr, num_rows_bot);
-            return std::make_pair (A_top, A_bot);
-          }
-          else {
-            pointer_type A_bot_ptr = A.data() + num_rows_top;
-            MatrixViewType A_top (num_rows_top, ncols, A.data(), A.stride(1));
-            MatrixViewType A_bot (num_rows_bot, ncols, A_bot_ptr, A.stride(1));
-            return std::make_pair (A_top, A_bot);
-          }
-        }
-      }
-    }; // class Partitioner
-  } // namespace TBB
-} // namespace TSQR
-
-#endif // __TSQR_TBB_Partitioner_hpp
diff --git a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp
deleted file mode 100644
index 7a3162b2f9a4..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr_RevealRankTask.hpp
+++ /dev/null
@@ -1,153 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_TBB_RevealRankTask_hpp
-#define __TSQR_TBB_RevealRankTask_hpp
-
-#include <tbb/task.h>
-#include "TbbTsqr_Partitioner.hpp"
-#include "Tsqr_SequentialTsqr.hpp"
-
-namespace TSQR {
-  namespace TBB {
-    /// \class RevealRankTask
-    /// \brief TBB task for recursive TSQR "rank-revealing" phase.
-    ///
-    /// This part of the factorization doesn't actually reveal the
-    /// rank in parallel; we assume that this has already been done
-    /// and the columns of U form a basis for the column space of the
-    /// R factor (in the QR factorization of the original matrix).
-    /// All we need to do here is compute Q*U in parallel, respecting
-    /// the original partitioning and cache blocking scheme.
-    template<class LocalOrdinal, class Scalar>
-    class RevealRankTask : public tbb::task {
-    public:
-      typedef MatView<LocalOrdinal, Scalar> mat_view_type;
-      typedef MatView<LocalOrdinal, const Scalar> const_mat_view_type;
-      typedef std::pair<mat_view_type, mat_view_type> split_type;
-      typedef SequentialTsqr<LocalOrdinal, Scalar> seq_tsqr_type;
-
-      RevealRankTask (const size_t P_first,
-                      const size_t P_last,
-                      const mat_view_type& Q,
-                      const const_mat_view_type& U,
-                      const seq_tsqr_type& seq,
-                      const bool contiguous_cache_blocks) :
-        P_first_ (P_first),
-        P_last_ (P_last),
-        Q_ (Q),
-        U_ (U),
-        seq_ (seq),
-        contiguous_cache_blocks_ (contiguous_cache_blocks)
-      {}
-
-      void
-      execute_base_case ()
-      {
-        // Use SequentialTsqr to compute Q*U for this core's local
-        // part of Q.  The method is called "Q_times_B" so that it
-        // doesn't suggest any orthogonality of the B input matrix,
-        // though in this case B is U and U is orthogonal
-        // (resp. unitary if Scalar is complex).
-        seq_.Q_times_B (Q_.extent(0), Q_.extent(1),
-                        Q_.data(), Q_.stride(1),
-                        U_.data(), U_.stride(1),
-                        contiguous_cache_blocks_);
-      }
-
-      tbb::task* execute ()
-      {
-        using tbb::task;
-
-        if (P_first_ > P_last_ || Q_.empty()) {
-          return nullptr; // shouldn't get here, but just in case...
-        }
-        else if (P_first_ == P_last_) {
-          execute_base_case ();
-          return nullptr;
-        }
-        else {
-          // Recurse on two intervals: [P_first, P_mid] and
-          // [P_mid+1, P_last]
-          const size_t P_mid = (P_first_ + P_last_) / 2;
-          split_type out_split =
-            partitioner_.split (Q_, P_first_, P_mid, P_last_,
-                                contiguous_cache_blocks_);
-          // The partitioner may decide that the current block Q_ has
-          // too few rows to be worth splitting.  In that case,
-          // out_split.second (the bottom block) will be empty.  We
-          // can deal with this by treating it as the base case.
-          if (out_split.second.empty() || out_split.second.extent(0) == 0) {
-            execute_base_case ();
-            return nullptr;
-          }
-
-          // "c": continuation task
-          tbb::empty_task& c =
-            *new( allocate_continuation() ) tbb::empty_task;
-          // Recurse on the split
-          RevealRankTask& topTask = *new( c.allocate_child() )
-            RevealRankTask (P_first_, P_mid, out_split.first, U_,
-                            seq_, contiguous_cache_blocks_);
-          RevealRankTask& botTask = *new( c.allocate_child() )
-            RevealRankTask (P_mid+1, P_last_, out_split.second, U_,
-                            seq_, contiguous_cache_blocks_);
-          // Set reference count of parent (in this case, the
-          // continuation task) to 2 (since 2 children -- no
-          // additional task since no waiting).
-          c.set_ref_count (2);
-          c.spawn (botTask);
-          return &topTask; // scheduler bypass optimization
-        }
-      }
-
-    private:
-      size_t P_first_, P_last_;
-      mat_view_type Q_;
-      const_mat_view_type U_;
-      SequentialTsqr<LocalOrdinal, Scalar> seq_;
-      Partitioner<LocalOrdinal, Scalar> partitioner_;
-      bool contiguous_cache_blocks_;
-    };
-
-  } // namespace TBB
-} // namespace TSQR
-
-
-#endif // __TSQR_TBB_RevealRankTask_hpp
diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbMgs.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbMgs.hpp
deleted file mode 100644
index 53a473d2e5f7..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr_TbbMgs.hpp
+++ /dev/null
@@ -1,409 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_TBB_TbbMgs_hpp
-#define __TSQR_TBB_TbbMgs_hpp
-
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-#include <numeric>
-#include <utility> // std::pair
-
-#include "Tsqr_MessengerBase.hpp"
-#include "Teuchos_ScalarTraits.hpp"
-#include "Tsqr_Util.hpp"
-#include "Teuchos_RCP.hpp"
-
-#include <tbb/blocked_range.h>
-#include <tbb/parallel_for.h>
-#include <tbb/parallel_reduce.h>
-#include <tbb/partitioner.h>
-
-namespace TSQR {
-  namespace TBB {
-
-    // Forward declaration
-    template< class LocalOrdinal, class Scalar >
-    class TbbMgs {
-    public:
-      typedef Scalar scalar_type;
-      typedef LocalOrdinal ordinal_type;
-      typedef typename Teuchos::ScalarTraits<Scalar>::magnitudeType magnitude_type;
-      typedef MessengerBase< Scalar > messenger_type;
-      typedef Teuchos::RCP< messenger_type > messenger_ptr;
-
-      TbbMgs (const messenger_ptr& messenger) :
-        messenger_ (messenger) {}
-
-      void
-      mgs (const LocalOrdinal nrows_local,
-           const LocalOrdinal ncols,
-           Scalar A_local[],
-           const LocalOrdinal lda_local,
-           Scalar R[],
-           const LocalOrdinal ldr);
-
-    private:
-      messenger_ptr messenger_;
-    };
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-    namespace details {
-
-      /// Compute y'*x (where y' means conjugate transpose in the
-      /// complex case, and transpose in the real case).
-      template< class LocalOrdinal, class Scalar >
-      class TbbDot {
-      public:
-        void
-        operator() (const tbb::blocked_range< LocalOrdinal >& r)
-        {
-          typedef Teuchos::ScalarTraits<Scalar> STS;
-
-          // The TBB book likes this copying of pointers into the local routine.
-          // It probably helps the compiler do optimizations.
-          const Scalar* const x = x_;
-          const Scalar* const y = y_;
-          Scalar local_result = result_;
-
-          for (LocalOrdinal i = r.begin(); i != r.end(); ++i) {
-            local_result += x[i] * STS::conjugate (y[i]);
-          }
-          result_ = local_result;
-        }
-        /// Result of the reduction.
-        Scalar result() const { return result_; }
-
-        /// Ordinary constructor
-        TbbDot (const Scalar* const x, const Scalar* const y) :
-          result_ (Scalar(0)), x_ (x), y_ (y) {}
-
-        /// "Split constructor" for TBB reductions
-        TbbDot (TbbDot& d, tbb::split) :
-          result_ (Scalar(0)), x_ (d.x_), y_ (d.y_)
-        {}
-        /// "Join" operator for TBB reductions; it tells TBB how to
-        /// combine two subproblems.
-        void join (const TbbDot& d) { result_ += d.result(); }
-
-      private:
-        // Default constructor doesn't make sense.
-        TbbDot ();
-
-        Scalar result_;
-        const Scalar* const x_;
-        const Scalar* const y_;
-      };
-
-      template< class LocalOrdinal, class Scalar >
-      class TbbScale {
-      public:
-        TbbScale (Scalar* const x, const Scalar& denom) :
-          x_ (x), denom_ (denom) {}
-
-        // TBB demands that this be a "const" operator, in order for
-        // the parallel_for expression to compile.  Strictly speaking,
-        // it is const, because it does not change the address of the
-        // pointer x_ (only the values stored there).
-        void
-        operator() (const tbb::blocked_range< LocalOrdinal >& r) const
-        {
-          // TBB likes arrays to have their pointers copied like this in
-          // the operator() method.  I suspect it has something to do
-          // with compiler optimizations.  If C++ supported the
-          // "restrict" keyword, here would be a good place to add it...
-          Scalar* const x = x_;
-          const Scalar denom = denom_;
-          for (LocalOrdinal i = r.begin(); i != r.end(); ++i)
-            x[i] = x[i] / denom;
-        }
-      private:
-        Scalar* const x_;
-        const Scalar denom_;
-      };
-
-      template< class LocalOrdinal, class Scalar >
-      class TbbAxpy {
-      public:
-        TbbAxpy (const Scalar& alpha, const Scalar* const x, Scalar* const y) :
-          alpha_ (alpha), x_ (x), y_ (y)
-        {}
-        // TBB demands that this be a "const" operator, in order for
-        // the parallel_for expression to compile.  Strictly speaking,
-        // it is const, because it does change the address of the
-        // pointer y_ (only the values stored there).
-        void
-        operator() (const tbb::blocked_range< LocalOrdinal >& r) const
-        {
-          const Scalar alpha = alpha_;
-          const Scalar* const x = x_;
-          Scalar* const y = y_;
-          for (LocalOrdinal i = r.begin(); i != r.end(); ++i)
-            y[i] = y[i] + alpha * x[i];
-        }
-      private:
-        const Scalar alpha_;
-        const Scalar* const x_;
-        Scalar* const y_;
-      };
-
-      template< class LocalOrdinal, class Scalar >
-      class TbbNormSquared {
-      private:
-        typedef Teuchos::ScalarTraits<Scalar> STS;
-
-      public:
-        typedef typename STS::magnitudeType magnitude_type;
-
-        void
-        operator () (const tbb::blocked_range<LocalOrdinal>& r)
-        {
-          // Doing the right thing in the complex case requires taking
-          // an absolute value.  We want to avoid this additional cost
-          // in the real case, which is why we check is_complex.
-          if (STS::isComplex) {
-            // The TBB book favors copying array pointers into the
-            // local routine.  It probably helps the compiler do
-            // optimizations.
-            const Scalar* const x = x_;
-            for (LocalOrdinal i = r.begin(); i != r.end(); ++i) {
-              // One could implement this by computing
-              //
-              // result_ += STS::real (x[i] * STS::conjugate(x[i]));
-              //
-              // However, in terms of type theory, it's much more
-              // natural to start with a magnitude_type before
-              // doing the multiplication.
-              const magnitude_type xi = STS::magnitude (x[i]);
-              result_ += xi * xi;
-            }
-          }
-          else {
-            const Scalar* const x = x_;
-            for (LocalOrdinal i = r.begin(); i != r.end(); ++i) {
-              const Scalar xi = x[i];
-              result_ += xi * xi;
-            }
-          }
-        }
-
-        magnitude_type result () const { return result_; }
-
-        TbbNormSquared (const Scalar* const x) :
-          result_ (magnitude_type(0)), x_ (x) {}
-
-        TbbNormSquared (TbbNormSquared& d, tbb::split) :
-          result_ (magnitude_type(0)), x_ (d.x_) {}
-
-        void join (const TbbNormSquared& d) { result_ += d.result (); }
-
-      private:
-        // Default constructor doesn't make sense
-        TbbNormSquared ();
-
-        magnitude_type result_;
-        const Scalar* const x_;
-      };
-
-
-      template< class LocalOrdinal, class Scalar >
-      class TbbMgsOps {
-      private:
-        typedef tbb::blocked_range< LocalOrdinal > range_type;
-        typedef Teuchos::ScalarTraits<Scalar> STS;
-
-      public:
-        typedef MessengerBase<Scalar> messenger_type;
-        typedef Teuchos::RCP<messenger_type> messenger_ptr;
-        typedef typename STS::magnitudeType magnitude_type;
-
-        TbbMgsOps (const messenger_ptr& messenger) :
-          messenger_ (messenger) {}
-
-        void
-        axpy (const LocalOrdinal nrows_local,
-              const Scalar alpha,
-              const Scalar x_local[],
-              Scalar y_local[]) const
-        {
-          using tbb::auto_partitioner;
-          using tbb::parallel_for;
-
-          TbbAxpy< LocalOrdinal, Scalar > axpyer (alpha, x_local, y_local);
-          parallel_for (range_type (0, nrows_local), axpyer, auto_partitioner ());
-        }
-
-        void
-        scale (const LocalOrdinal nrows_local,
-               Scalar x_local[],
-               const Scalar denom) const
-        {
-          using tbb::auto_partitioner;
-          using tbb::parallel_for;
-
-          // "scaler" is spelled that way (and not as "scalar") on
-          // purpose.  Think about it.
-          TbbScale<LocalOrdinal, Scalar> scaler (x_local, denom);
-          parallel_for (range_type (0, nrows_local), scaler, auto_partitioner ());
-        }
-
-        /// $y^* \cdot x$: conjugate transpose when Scalar is complex,
-        /// else regular transpose.
-        Scalar
-        dot (const LocalOrdinal nrows_local,
-             const Scalar x_local[],
-             const Scalar y_local[])
-        {
-          Scalar localResult (0);
-          if (true)
-            {
-              // FIXME (mfh 26 Aug 2010) I'm not sure why I did this
-              // (i.e., why I wrote "if (true)" here).  Certainly the
-              // branch that is currently enabled should produce
-              // correct behavior.  I suspect the nonenabled branch
-              // will not.
-              if (true) {
-                TbbDot<LocalOrdinal, Scalar> dotter (x_local, y_local);
-                dotter (range_type (0, nrows_local));
-                localResult = dotter.result ();
-              }
-              else {
-                using tbb::auto_partitioner;
-                using tbb::parallel_reduce;
-
-                TbbDot<LocalOrdinal, Scalar> dotter (x_local, y_local);
-                parallel_reduce (range_type (0, nrows_local),
-                                 dotter, auto_partitioner ());
-                localResult = dotter.result ();
-              }
-            }
-          else {
-            for (LocalOrdinal i = 0; i != nrows_local; ++i) {
-              localResult += x_local[i] * STS::conjugate (y_local[i]);
-            }
-          }
-
-          // FIXME (mfh 23 Apr 2010) Does MPI_SUM do the right thing for
-          // complex or otherwise general MPI data types?  Perhaps an MPI_Op
-          // should belong in the MessengerBase...
-          return messenger_->globalSum (localResult);
-        }
-
-        magnitude_type
-        norm2 (const LocalOrdinal nrows_local,
-               const Scalar x_local[])
-        {
-          using tbb::auto_partitioner;
-          using tbb::parallel_reduce;
-
-          TbbNormSquared< LocalOrdinal, Scalar > normer (x_local);
-          parallel_reduce (range_type (0, nrows_local), normer,
-                           auto_partitioner ());
-          const magnitude_type localResult = normer.result();
-          // FIXME (mfh 12 Oct 2010) This involves an implicit
-          // typecast from Scalar to magnitude_type.
-          const magnitude_type globalResult =
-            messenger_->globalSum (localResult);
-          // Make sure that sqrt's argument is a magnitude_type.  Of
-          // course global_result should be nonnegative real, but we
-          // want the compiler to pick up the correct sqrt function.
-          typedef Teuchos::ScalarTraits<magnitude_type> STM;
-          return STM::squareroot (globalResult);
-        }
-
-        Scalar
-        project (const LocalOrdinal nrows_local,
-                 const Scalar q_local[],
-                 Scalar v_local[])
-        {
-          const Scalar coeff = this->dot (nrows_local, v_local, q_local);
-          this->axpy (nrows_local, -coeff, q_local, v_local);
-          return coeff;
-        }
-
-      private:
-        messenger_ptr messenger_;
-      };
-    } // namespace details
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-    template<class LocalOrdinal, class Scalar>
-    void
-    TbbMgs<LocalOrdinal, Scalar>::mgs (const LocalOrdinal nrows_local,
-                                       const LocalOrdinal ncols,
-                                       Scalar A_local[],
-                                       const LocalOrdinal lda_local,
-                                       Scalar R[],
-                                       const LocalOrdinal ldr)
-    {
-      details::TbbMgsOps<LocalOrdinal, Scalar> ops (messenger_);
-
-      for (LocalOrdinal j = 0; j < ncols; ++j) {
-        Scalar* const v = &A_local[j*lda_local];
-        for (LocalOrdinal i = 0; i < j; ++i) {
-          const Scalar* const q = &A_local[i*lda_local];
-          R[i + j*ldr] = ops.project (nrows_local, q, v);
-        }
-        const magnitude_type denom = ops.norm2 (nrows_local, v);
-
-        // FIXME (mfh 29 Apr 2010)
-        //
-        // NOTE IMPLICIT CAST.  This should work for complex numbers.
-        // If it doesn't work for your Scalar data type, it means that
-        // you need a different data type for the diagonal elements of
-        // the R factor, than you need for the other elements.  This
-        // is unlikely if we're comparing MGS against a Householder QR
-        // factorization; I don't really understand how the latter
-        // would work (not that it couldn't be given a sensible
-        // interpretation) in the case of Scalars that aren't plain
-        // old real or complex numbers.
-        R[j + j*ldr] = Scalar (denom);
-        ops.scale (nrows_local, v, denom);
-      }
-    }
-  } // namespace TBB
-} // namespace TSQR
-
-#endif // __TSQR_TBB_TbbMgs_hpp
-
diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp
deleted file mode 100644
index c86123c42d8b..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr_TbbParallelTsqr.hpp
+++ /dev/null
@@ -1,690 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_TBB_TbbParallelTsqr_hpp
-#define __TSQR_TBB_TbbParallelTsqr_hpp
-
-#include <tbb/tbb.h>
-#include <tbb/task_scheduler_init.h>
-#include "TbbTsqr_FactorTask.hpp"
-#include "TbbTsqr_ApplyTask.hpp"
-#include "TbbTsqr_ExplicitQTask.hpp"
-#include "TbbTsqr_RevealRankTask.hpp"
-#include "TbbTsqr_CacheBlockTask.hpp"
-#include "TbbTsqr_UnCacheBlockTask.hpp"
-#include "TbbTsqr_FillWithZerosTask.hpp"
-#include "Tsqr_ApplyType.hpp"
-#include "Teuchos_ScalarTraits.hpp"
-#include <algorithm>
-#include <limits>
-
-namespace TSQR {
-  namespace TBB {
-    /// \class TbbParallelTsqr
-    /// \brief Parallel implementation of \c TbbTsqr.
-    /// \author Mark Hoemmen
-    ///
-    /// This class implements the functionality of \c TbbTsqr.
-    /// It is not meant to be seen by users of \c TbbTsqr.
-    ///
-    /// The third template parameter, TimerType, allows different
-    /// timer implementations.  TbbParallelTsqr times each task's
-    /// invocations of \c SequentialTsqr::factor() and \c
-    /// SequentialTsqr::apply().  \c TrivialTimer is a "timer" that
-    /// does nothing, in case you don't want to invoke timers.
-    template<class LocalOrdinal, class Scalar, class TimerType>
-    class TbbParallelTsqr {
-    private:
-      typedef MatView<LocalOrdinal, Scalar> mat_view_type;
-      typedef MatView<LocalOrdinal, const Scalar> const_mat_view_type;
-      typedef std::pair<mat_view_type, mat_view_type> split_t;
-      typedef std::pair<const_mat_view_type, const_mat_view_type> const_split_t;
-      typedef std::pair<const_mat_view_type, mat_view_type> top_blocks_t;
-      typedef std::vector<top_blocks_t> array_top_blocks_t;
-
-      template<class MatrixViewType>
-      MatrixViewType
-      top_block_helper (const size_t P_first,
-                        const size_t P_last,
-                        const MatrixViewType& C,
-                        const bool contiguous_cache_blocks) const
-      {
-        if (P_first > P_last)
-          throw std::logic_error ("P_first > P_last");
-        else if (P_first == P_last)
-          return seq_.top_block (C, contiguous_cache_blocks);
-        else
-          {
-            typedef std::pair<MatrixViewType, MatrixViewType> split_type;
-
-            // Divide [P_first, P_last] into two intervals: [P_first,
-            // P_mid] and [P_mid+1, P_last].  Recurse on the first
-            // interval [P_first, P_mid].
-            const size_t P_mid = (P_first + P_last) / 2;
-            split_type C_split = partitioner_.split (C, P_first, P_mid, P_last,
-                                                     contiguous_cache_blocks);
-            // The partitioner may decide that the current block C has
-            // too few rows to be worth splitting.  In that case,
-            // C_split.first should be the same block as C, and
-            // C_split.second (the bottom block) will be empty.  We
-            // deal with this in the same way as the base case
-            // (P_first == P_last) above.
-            if (C_split.second.empty() || C_split.second.extent(0) == 0)
-              return seq_.top_block (C_split.first, contiguous_cache_blocks);
-            else
-              return top_block_helper (P_first, P_mid, C_split.first,
-                                       contiguous_cache_blocks);
-          }
-      }
-
-    public:
-      typedef Scalar scalar_type;
-      typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
-      typedef LocalOrdinal ordinal_type;
-
-      /// Whether or not this QR factorization produces an R factor
-      /// with all nonnegative diagonal entries.
-      static bool QR_produces_R_factor_with_nonnegative_diagonal() {
-        typedef Combine<LocalOrdinal, Scalar> combine_type;
-        return combine_type::QR_produces_R_factor_with_nonnegative_diagonal ();
-      }
-
-      /// \typedef SeqOutput
-      /// \brief Results of SequentialTsqr for each core.
-      typedef typename SequentialTsqr<LocalOrdinal, Scalar>::FactorOutput SeqOutput;
-
-      /// \typedef ParOutput
-      /// \brief Array of numTasks_ "local tau arrays" from parallel TSQR.
-      ///
-      /// (Local Q factors are stored in place.)
-      typedef std::vector<std::vector<Scalar> > ParOutput;
-
-      /// \typedef FactorOutput
-      /// \brief Partial representation of the Q factor.
-      ///
-      /// The \c factor() method returns a pair: the results of
-      /// SequentialTsqr for data on each core, and the results of
-      /// combining the data on the cores.
-      typedef typename std::pair<std::vector<SeqOutput>, ParOutput> FactorOutput;
-
-      /// \brief Constructor.
-      ///
-      /// \param numTasks [in] Number of parallel tasks to use in the
-      ///   factorization.  This should be >= the number of cores with
-      ///   which Intel TBB was initialized.
-      /// \param cacheSizeHint [in] Cache size hint in bytes.  Zero
-      ///   means that TSQR will pick a reasonable nonzero default.
-      TbbParallelTsqr (const size_t numTasks = 1,
-                       const size_t cacheSizeHint = 0) :
-        seq_ (cacheSizeHint),
-        min_seq_factor_timing_ (std::numeric_limits<double>::max()),
-        max_seq_factor_timing_ (std::numeric_limits<double>::min()),
-        min_seq_apply_timing_ (std::numeric_limits<double>::max()),
-        max_seq_apply_timing_ (std::numeric_limits<double>::min())
-      {
-        if (numTasks < 1)
-          numTasks_ = 1; // default is no parallelism
-        else
-          numTasks_ = numTasks;
-      }
-
-      /// \brief Constructor (that takes a parameter list).
-      ///
-      /// \param plist [in/out] On input: list of parameters.  On
-      ///   output: missing parameters are filled in with default
-      ///   values.
-      ///
-      /// For a list of accepted parameters and thei documentation,
-      /// see the parameter list returned by \c getValidParameters().
-      TbbParallelTsqr (const Teuchos::RCP<Teuchos::ParameterList>& plist) :
-        seq_ (plist), // SequentialTsqr has a plist-accepting constructor.
-        numTasks_ (1),  // Set a safe default for now.
-        min_seq_factor_timing_ (std::numeric_limits<double>::max()),
-        max_seq_factor_timing_ (std::numeric_limits<double>::min()),
-        min_seq_apply_timing_ (std::numeric_limits<double>::max()),
-        max_seq_apply_timing_ (std::numeric_limits<double>::min())
-      {
-        if (! plist.is_null()) {
-          const int defaultNumTasks = 1; // A reasonable safe default value.
-          int numTasks = plist->get ("Num Tasks", defaultNumTasks);
-          if (numTasks < 1) { // Default is no parallelism.
-            plist->set ("Num Tasks", defaultNumTasks);
-          }
-          numTasks_ = numTasks;
-        }
-      }
-
-      Teuchos::RCP<const Teuchos::ParameterList>
-      getValidParameters () const
-      {
-        using Teuchos::ParameterList;
-        using Teuchos::parameterList;
-        using Teuchos::RCP;
-
-        // TbbTsqr recursively divides the tall skinny matrix on the
-        // node into TBB tasks.  Each task works on a block row.  The
-        // TBB task scheduler ensures that oversubscribing TBB tasks
-        // won't oversubscribe cores, so it's OK if
-        // default_num_threads() is too many.  For example, TBB might
-        // say default_num_threads() is the number of cores on the
-        // node, but the TBB task scheduler might have been
-        // initialized with the number of cores per NUMA region, for
-        // hybrid MPI + TBB parallelism.
-        const int numTasks =
-          tbb::task_scheduler_init::default_num_threads();
-        const size_t cacheSizeHint = 0;
-        const size_t sizeOfScalar = sizeof(Scalar);
-
-        RCP<ParameterList> params = parameterList ("NodeTsqr");
-        params->set ("Num Tasks", numTasks,
-                     "Number of tasks to use in the intranode parallel part "
-                     "TSQR.  There is little/no performance penalty for mild "
-                     "oversubscription, but a potential performance penalty "
-                     "for undersubscription.");
-        params->set ("Cache Size Hint", cacheSizeHint,
-                    "Cache size hint in bytes (as a size_t) to use for "
-                    "intranode TSQR.  If zero, TSQR will pick a reasonable "
-                    "default.  See the documentation of SequentialTsqr for "
-                     "a discussion of how to tune this parameter.");
-        params->set ("Size of Scalar", sizeOfScalar);
-
-        return params;
-      }
-
-      void
-      setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist)
-      {
-        seq_.setParameterList (plist);
-
-        if (! plist.is_null()) {
-          const int defaultNumCores = 1; // A reasonable safe default value.
-          int numTasks = plist->get ("Num Tasks", defaultNumCores);
-          if (numTasks < 1) { // Default is no parallelism.
-            plist->set ("Num Tasks", defaultNumCores);
-          }
-          numTasks_ = numTasks;
-        }
-      }
-
-      /// \brief Number of tasks that TSQR will use to solve the problem.
-      ///
-      /// This is the number of subproblems into which to divide the
-      /// main problem, in order to solve it in parallel.
-      size_t ntasks() const { return numTasks_; }
-
-      /// \brief Cache size hint (in bytes) used for the factorization.
-      ///
-      /// This may be different from the corresponding constructor
-      /// argument, because TSQR may revise unreasonable suggestions
-      /// into reasonable values.
-      size_t cache_size_hint() const { return seq_.cache_size_hint(); }
-
-      //! Fastest time over all tasks of the last SequentialTsqr::factor() call.
-      double
-      min_seq_factor_timing () const { return min_seq_factor_timing_; }
-      //! Slowest time over all tasks of the last SequentialTsqr::factor() call.
-      double
-      max_seq_factor_timing () const { return max_seq_factor_timing_; }
-      //! Fastest time over all tasks of the last SequentialTsqr::apply() call.
-      double
-      min_seq_apply_timing () const { return min_seq_apply_timing_; }
-      //! Slowest time over all tasks of the last SequentialTsqr::apply() call.
-      double
-      max_seq_apply_timing () const { return max_seq_apply_timing_; }
-
-      FactorOutput
-      factor (const LocalOrdinal nrows,
-              const LocalOrdinal ncols,
-              Scalar A[],
-              const LocalOrdinal lda,
-              Scalar R[],
-              const LocalOrdinal ldr,
-              const bool contiguous_cache_blocks) const
-      {
-        using tbb::task;
-
-        mat_view_type A_view (nrows, ncols, A, lda);
-        // A_top will be modified in place by exactly one task, to
-        // indicate the partition from which we may extract the R
-        // factor after finishing the factorization.
-        mat_view_type A_top;
-
-        std::vector<SeqOutput> seq_output (ntasks());
-        ParOutput par_output (ntasks(), std::vector<Scalar>(ncols));
-        if (ntasks() < 1)
-          {
-            if (! A_view.empty())
-              throw std::logic_error("Zero subproblems, but A not empty!");
-            else // Return empty results
-              return std::make_pair (seq_output, par_output);
-          }
-
-        double my_seq_timing = double(0);
-        double min_seq_timing = double(0);
-        double max_seq_timing = double(0);
-        try {
-          typedef FactorTask<LocalOrdinal, Scalar, TimerType> factor_task_t;
-
-          // When the root task completes, A_top will be set to the
-          // topmost partition of A.  We can then extract the R factor
-          // from A_top.
-          factor_task_t& root_task = *new( task::allocate_root() )
-            factor_task_t(0, ntasks()-1, A_view, &A_top, seq_output,
-                          par_output, seq_, my_seq_timing, min_seq_timing,
-                          max_seq_timing, contiguous_cache_blocks);
-          task::spawn_root_and_wait (root_task);
-        } catch (tbb::captured_exception& ex) {
-          // TBB can't guarantee on all systems that an exception
-          // thrown in another thread will have its type correctly
-          // propagated to this thread.  If it can't, then it captures
-          // the exception as a tbb:captured_exception, and propagates
-          // it to here.  It may be able to propagate the exception,
-          // though, so be prepared for that.  We deal with the latter
-          // case by allowing the exception to propagate.
-          std::ostringstream os;
-          os << "Intel TBB caught an exception, while computing the QR factor"
-            "ization of a matrix A.  Unfortunately, its type information was "
-            "lost, because the exception was thrown in another thread.  Its "
-            "\"what()\" function returns the following string: " << ex.what();
-          throw std::runtime_error (os.str());
-        }
-
-        // Copy the R factor out of A_top into R.
-        seq_.extract_R (A_top.extent(0), A_top.extent(1), A_top.data(),
-                        A_top.stride(1), R, ldr, contiguous_cache_blocks);
-
-        // Save the timings for future reference
-        if (min_seq_timing < min_seq_factor_timing_)
-          min_seq_factor_timing_ = min_seq_timing;
-        if (max_seq_timing > max_seq_factor_timing_)
-          max_seq_factor_timing_ = max_seq_timing;
-
-        return std::make_pair (seq_output, par_output);
-      }
-
-      void
-      apply (const ApplyType& apply_type,
-             const LocalOrdinal nrows,
-             const LocalOrdinal ncols_Q,
-             const Scalar Q[],
-             const LocalOrdinal ldq,
-             const FactorOutput& factor_output,
-             const LocalOrdinal ncols_C,
-             Scalar C[],
-             const LocalOrdinal ldc,
-             const bool contiguous_cache_blocks) const
-      {
-        using tbb::task;
-
-        if (apply_type.transposed())
-          throw std::logic_error ("Applying Q^T and Q^H not implemented");
-
-        const_mat_view_type Q_view (nrows, ncols_Q, Q, ldq);
-        mat_view_type C_view (nrows, ncols_C, C, ldc);
-        if (! apply_type.transposed())
-          {
-            array_top_blocks_t top_blocks (ntasks());
-            build_partition_array (0, ntasks()-1, top_blocks, Q_view,
-                                   C_view, contiguous_cache_blocks);
-            double my_seq_timing = 0.0;
-            double min_seq_timing = 0.0;
-            double max_seq_timing = 0.0;
-            try {
-              typedef ApplyTask<LocalOrdinal, Scalar, TimerType> apply_task_t;
-              apply_task_t& root_task =
-                *new( task::allocate_root() )
-                apply_task_t (0, ntasks()-1, Q_view, C_view, top_blocks,
-                              factor_output, seq_, my_seq_timing,
-                              min_seq_timing, max_seq_timing,
-                              contiguous_cache_blocks);
-              task::spawn_root_and_wait (root_task);
-            } catch (tbb::captured_exception& ex) {
-              std::ostringstream os;
-              os << "Intel TBB caught an exception, while applying a Q factor "
-                "computed previously by factor() to the matrix C.  Unfortunate"
-                "ly, its type information was lost, because the exception was "
-                "thrown in another thread.  Its \"what()\" function returns th"
-                "e following string: " << ex.what();
-              throw std::runtime_error (os.str());
-            }
-
-            // Save the timings for future reference
-            if (min_seq_timing < min_seq_apply_timing_)
-              min_seq_apply_timing_ = min_seq_timing;
-            if (max_seq_timing > max_seq_apply_timing_)
-              max_seq_apply_timing_ = max_seq_timing;
-          }
-      }
-
-
-      void
-      explicit_Q (const LocalOrdinal nrows,
-                  const LocalOrdinal ncols_Q_in,
-                  const Scalar Q_in[],
-                  const LocalOrdinal ldq_in,
-                  const FactorOutput& factor_output,
-                  const LocalOrdinal ncols_Q_out,
-                  Scalar Q_out[],
-                  const LocalOrdinal ldq_out,
-                  const bool contiguous_cache_blocks) const
-      {
-        using tbb::task;
-
-        mat_view_type Q_out_view (nrows, ncols_Q_out, Q_out, ldq_out);
-        try {
-          typedef ExplicitQTask< LocalOrdinal, Scalar > explicit_Q_task_t;
-          explicit_Q_task_t& root_task = *new( task::allocate_root() )
-            explicit_Q_task_t (0, ntasks()-1, Q_out_view, seq_,
-                               contiguous_cache_blocks);
-          task::spawn_root_and_wait (root_task);
-        } catch (tbb::captured_exception& ex) {
-          std::ostringstream os;
-          os << "Intel TBB caught an exception, while preparing to compute"
-            " the explicit Q factor from a QR factorization computed previ"
-            "ously by factor().  Unfortunately, its type information was l"
-            "ost, because the exception was thrown in another thread.  Its"
-            " \"what()\" function returns the following string: "
-             << ex.what();
-          throw std::runtime_error (os.str());
-        }
-        apply (ApplyType::NoTranspose,
-               nrows, ncols_Q_in, Q_in, ldq_in, factor_output,
-               ncols_Q_out, Q_out, ldq_out,
-               contiguous_cache_blocks);
-      }
-
-      /// \brief Compute Q*B
-      ///
-      /// Compute matrix-matrix product Q*B, where Q is nrows by ncols
-      /// and B is ncols by ncols.  Respect cache blocks of Q.
-      void
-      Q_times_B (const LocalOrdinal nrows,
-                 const LocalOrdinal ncols,
-                 Scalar Q[],
-                 const LocalOrdinal ldq,
-                 const Scalar B[],
-                 const LocalOrdinal ldb,
-                 const bool contiguous_cache_blocks) const
-      {
-        // Compute Q := Q*B in parallel.  This works much like
-        // cache_block() (which see), in that each thread's instance
-        // does not need to communicate with the others.
-        try {
-          using tbb::task;
-          typedef RevealRankTask<LocalOrdinal, Scalar> rrtask_type;
-
-          mat_view_type Q_view (nrows, ncols, Q, ldq);
-          const_mat_view_type B_view (ncols, ncols, B, ldb);
-
-          rrtask_type& root_task = *new( task::allocate_root() )
-            rrtask_type (0, ntasks()-1, Q_view, B_view, seq_,
-                         contiguous_cache_blocks);
-          task::spawn_root_and_wait (root_task);
-        } catch (tbb::captured_exception& ex) {
-          std::ostringstream os;
-          os << "Intel TBB caught an exception, while computing Q := Q*U.  "
-            "Unfortunately, its type information was lost, because the "
-            "exception was thrown in another thread.  Its \"what()\" function "
-            "returns the following string: " << ex.what();
-          throw std::runtime_error (os.str());
-        }
-      }
-
-
-      /// Compute SVD \f$R = U \Sigma V^*\f$, not in place.  Use the
-      /// resulting singular values to compute the numerical rank of R,
-      /// with respect to the relative tolerance tol.  If R is full
-      /// rank, return without modifying R.  If R is not full rank,
-      /// overwrite R with \f$\Sigma \cdot V^*\f$.
-      ///
-      /// \return Numerical rank of R: 0 <= rank <= ncols.
-      LocalOrdinal
-      reveal_R_rank (const LocalOrdinal ncols,
-                     Scalar R[],
-                     const LocalOrdinal ldr,
-                     Scalar U[],
-                     const LocalOrdinal ldu,
-                     const magnitude_type tol) const
-      {
-        return seq_.reveal_R_rank (ncols, R, ldr, U, ldu, tol);
-      }
-
-      /// \brief Rank-revealing decomposition
-      ///
-      /// Using the R factor from factor() and the explicit Q factor
-      /// from explicit_Q(), compute the SVD of R (\f$R = U \Sigma
-      /// V^*\f$).  R.  If R is full rank (with respect to the given
-      /// relative tolerance tol), don't change Q or R.  Otherwise,
-      /// compute \f$Q := Q \cdot U\f$ and \f$R := \Sigma V^*\f$ in
-      /// place (the latter may be no longer upper triangular).
-      ///
-      /// \return Rank \f$r\f$ of R: \f$ 0 \leq r \leq ncols\f$.
-      ///
-      LocalOrdinal
-      reveal_rank (const LocalOrdinal nrows,
-                   const LocalOrdinal ncols,
-                   Scalar Q[],
-                   const LocalOrdinal ldq,
-                   Scalar R[],
-                   const LocalOrdinal ldr,
-                   const magnitude_type tol,
-                   const bool contiguous_cache_blocks = false) const
-      {
-        // Take the easy exit if available.
-        if (ncols == 0)
-          return 0;
-
-        Matrix<LocalOrdinal, Scalar> U (ncols, ncols, Scalar(0));
-        const LocalOrdinal rank =
-          reveal_R_rank (ncols, R, ldr, U.data(), U.ldu(), tol);
-
-        if (rank < ncols) {
-          // If R is not full rank: reveal_R_rank() already computed
-          // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and
-          // overwrote R with \f$\Sigma V^*\f$.  Now, we compute \f$Q
-          // := Q \cdot U\f$, respecting cache blocks of Q.
-          Q_times_B (nrows, ncols, Q, ldq, U.data(), U.stride(1),
-                     contiguous_cache_blocks);
-        }
-        return rank;
-      }
-
-      void
-      cache_block (const LocalOrdinal nrows,
-                   const LocalOrdinal ncols,
-                   Scalar A_out[],
-                   const Scalar A_in[],
-                   const LocalOrdinal lda_in) const
-      {
-        using tbb::task;
-
-        const_mat_view_type A_in_view (nrows, ncols, A_in, lda_in);
-        // A_out won't have leading dimension lda_in, but that's OK,
-        // as long as all the routines are told that A_out is
-        // cache-blocked.
-        mat_view_type A_out_view (nrows, ncols, A_out, lda_in);
-        try {
-          typedef CacheBlockTask< LocalOrdinal, Scalar > cache_block_task_t;
-          cache_block_task_t& root_task = *new( task::allocate_root() )
-            cache_block_task_t (0, ntasks()-1, A_out_view, A_in_view, seq_);
-          task::spawn_root_and_wait (root_task);
-        } catch (tbb::captured_exception& ex) {
-          std::ostringstream os;
-          os << "Intel TBB caught an exception, while cache-blocking a mat"
-            "rix.  Unfortunately, its type information was lost, because t"
-            "he exception was thrown in another thread.  Its \"what()\" fu"
-            "nction returns the following string: " << ex.what();
-          throw std::runtime_error (os.str());
-        }
-      }
-
-      void
-      un_cache_block (const LocalOrdinal nrows,
-                      const LocalOrdinal ncols,
-                      Scalar A_out[],
-                      const LocalOrdinal lda_out,
-                      const Scalar A_in[]) const
-      {
-        using tbb::task;
-
-        // A_in doesn't have leading dimension lda_out, but that's OK,
-        // as long as all the routines are told that A_in is cache-
-        // blocked.
-        const_mat_view_type A_in_view (nrows, ncols, A_in, lda_out);
-        mat_view_type A_out_view (nrows, ncols, A_out, lda_out);
-        try {
-          typedef UnCacheBlockTask< LocalOrdinal, Scalar > un_cache_block_task_t;
-          un_cache_block_task_t& root_task = *new( task::allocate_root() )
-            un_cache_block_task_t (0, ntasks()-1, A_out_view, A_in_view, seq_);
-          task::spawn_root_and_wait (root_task);
-        } catch (tbb::captured_exception& ex) {
-          std::ostringstream os;
-          os << "Intel TBB caught an exception, while un-cache-blocking a "
-            "matrix.  Unfortunately, its type information was lost, becaus"
-            "e the exception was thrown in another thread.  Its \"what()\""
-            " function returns the following string: " << ex.what();
-          throw std::runtime_error (os.str());
-        }
-      }
-
-      template< class MatrixViewType >
-      MatrixViewType
-      top_block (const MatrixViewType& C,
-                 const bool contiguous_cache_blocks = false) const
-      {
-        return top_block_helper (0, ntasks()-1, C, contiguous_cache_blocks);
-      }
-
-      void
-      fill_with_zeros (const LocalOrdinal nrows,
-                       const LocalOrdinal ncols,
-                       Scalar C[],
-                       const LocalOrdinal ldc,
-                       const bool contiguous_cache_blocks) const
-      {
-        using tbb::task;
-        mat_view_type C_view (nrows, ncols, C, ldc);
-
-        try {
-          typedef FillWithZerosTask< LocalOrdinal, Scalar > fill_task_t;
-          fill_task_t& root_task = *new( task::allocate_root() )
-            fill_task_t (0, ntasks()-1, C_view, seq_, contiguous_cache_blocks);
-          task::spawn_root_and_wait (root_task);
-        } catch (tbb::captured_exception& ex) {
-          std::ostringstream os;
-          os << "Intel TBB caught an exception, while un-cache-blocking a "
-            "matrix.  Unfortunately, its type information was lost, becaus"
-            "e the exception was thrown in another thread.  Its \"what()\""
-            " function returns the following string: " << ex.what();
-          throw std::runtime_error (os.str());
-        }
-      }
-
-    private:
-      size_t numTasks_;
-      TSQR::SequentialTsqr<LocalOrdinal, Scalar> seq_;
-      TSQR::Combine<LocalOrdinal, Scalar> combine_;
-      Partitioner<LocalOrdinal, Scalar> partitioner_;
-
-      mutable double min_seq_factor_timing_;
-      mutable double max_seq_factor_timing_;
-      mutable double min_seq_apply_timing_;
-      mutable double max_seq_apply_timing_;
-
-      void
-      build_partition_array (const size_t P_first,
-                             const size_t P_last,
-                             array_top_blocks_t& top_blocks,
-                             const_mat_view_type& Q,
-                             mat_view_type& C,
-                             const bool contiguous_cache_blocks = false) const
-      {
-        if (P_first > P_last) {
-          return;
-        }
-        else if (P_first == P_last) {
-          const_mat_view_type Q_top = seq_.top_block (Q, contiguous_cache_blocks);
-          mat_view_type C_top = seq_.top_block (C, contiguous_cache_blocks);
-          top_blocks[P_first] =
-            std::make_pair (const_mat_view_type (Q_top.extent(1), Q_top.extent(1),
-                                                 Q_top.data(), Q_top.stride(1)),
-                            mat_view_type (C_top.extent(1), C_top.extent(1),
-                                           C_top.data(), C_top.stride(1)));
-        }
-        else {
-          // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last]
-          const size_t P_mid = (P_first + P_last) / 2;
-          const_split_t Q_split =
-            partitioner_.split (Q, P_first, P_mid, P_last,
-                                contiguous_cache_blocks);
-          split_t C_split =
-            partitioner_.split (C, P_first, P_mid, P_last,
-                                contiguous_cache_blocks);
-          // The partitioner may decide that the current blocks Q
-          // and C have too few rows to be worth splitting.  (The
-          // partitioner should split both Q and C in the same way.)
-          // In that case, Q_split.first should be the same block as
-          // Q, and Q_split.second (the bottom block) will be empty.
-          // Ditto for C_split.  We deal with this in the same way
-          // as the base case (P_first == P_last) above.
-          if (Q_split.second.empty() || Q_split.second.extent(0) == 0) {
-            const_mat_view_type Q_top =
-              seq_.top_block (Q, contiguous_cache_blocks);
-            mat_view_type C_top = seq_.top_block (C, contiguous_cache_blocks);
-            top_blocks[P_first] =
-              std::make_pair (const_mat_view_type (Q_top.extent(1), Q_top.extent(1),
-                                                   Q_top.data(), Q_top.stride(1)),
-                              mat_view_type (C_top.extent(1), C_top.extent(1),
-                                             C_top.data(), C_top.stride(1)));
-          }
-          else {
-            build_partition_array (P_first, P_mid, top_blocks,
-                                   Q_split.first, C_split.first,
-                                   contiguous_cache_blocks);
-            build_partition_array (P_mid+1, P_last, top_blocks,
-                                   Q_split.second, C_split.second,
-                                   contiguous_cache_blocks);
-          }
-        }
-      }
-    };
-  } // namespace TBB
-} // namespace TSQR
-
-#endif // __TSQR_TBB_TbbParallelTsqr_hpp
diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp
deleted file mode 100644
index e7f79fb0c15d..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr.hpp
+++ /dev/null
@@ -1,270 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_TbbRecursiveTsqr_hpp
-#define __TSQR_TbbRecursiveTsqr_hpp
-
-#include "Tsqr_ApplyType.hpp"
-#include "Tsqr_CacheBlocker.hpp"
-#include "Tsqr_SequentialTsqr.hpp"
-#include "TbbTsqr_Partitioner.hpp"
-#include <stdexcept>
-#include <string>
-#include <utility> // std::pair
-#include <vector>
-
-namespace TSQR {
-  namespace TBB {
-    /// \class TbbRecursiveTsqr
-    /// \brief Non-parallel "functioning stub" implementation of \c TbbTsqr.
-    template<class LocalOrdinal, class Scalar>
-    class TbbRecursiveTsqr {
-    public:
-      /// \brief Constructor.
-      ///
-      /// \param num_cores [in] Maximum parallelism to use (i.e.,
-      ///   maximum number of partitions into which to divide the
-      ///   matrix to factor).
-      ///
-      /// \param cache_size_hint [in] Approximate cache size in bytes
-      ///   per CPU core.  A hint, not a command.  If zero, set to a
-      ///   reasonable default.
-      TbbRecursiveTsqr (const size_t num_cores = 1,
-                        const size_t cache_size_hint = 0);
-
-      /// Number of cores to use to solve the problem (i.e., number of
-      /// subproblems into which to divide the main problem, to solve
-      /// it in parallel).
-      size_t ncores() const { return ncores_; }
-
-      //! Cache size hint (in bytes) used for the factorization.
-      size_t cache_size_hint() const { return seq_.cache_size_hint(); }
-
-      //! Results of SequentialTsqr for each core.
-      typedef typename SequentialTsqr<LocalOrdinal, Scalar>::FactorOutput SeqOutput;
-
-      /// \typedef ParOutput
-      /// \brief Array of ncores "local tau arrays" from parallel TSQR.
-      ///
-      /// Local Q factors are stored in place.
-      typedef std::vector<std::vector<Scalar> > ParOutput;
-
-      /// \typedef FactorOutput
-      /// \brief Return type of factor().
-      ///
-      /// factor() returns a pair: the results of SequentialTsqr for
-      /// data on each core, and the results of combining the data on
-      /// the cores.
-      typedef typename std::pair<std::vector<SeqOutput>, ParOutput> FactorOutput;
-
-      /// Copy the nrows by ncols matrix A_in (with leading dimension
-      /// lda_in >= nrows) into A_out, such that cache blocks are
-      /// arranged contiguously in memory.
-      void
-      cache_block (const LocalOrdinal nrows,
-                   const LocalOrdinal ncols,
-                   Scalar A_out[],
-                   const Scalar A_in[],
-                   const LocalOrdinal lda_in) const;
-
-      /// Copy the nrows by ncols matrix A_in, whose cache blocks are
-      /// arranged contiguously in memory, into A_out (with leading
-      /// dimension lda_out >= nrows), which is in standard
-      /// column-major order.
-      void
-      un_cache_block (const LocalOrdinal nrows,
-                      const LocalOrdinal ncols,
-                      Scalar A_out[],
-                      const LocalOrdinal lda_out,
-                      const Scalar A_in[]) const;
-
-      /// Compute the QR factorization of the nrows by ncols matrix A
-      /// (with leading dimension lda >= nrows), returning a
-      /// representation of the Q factor (which includes data stored
-      /// in-place in A), and overwriting R (an ncols by ncols matrix
-      /// in column-major order with leading dimension ldr >= ncols)
-      /// with the R factor.
-      FactorOutput
-      factor (const LocalOrdinal nrows,
-              const LocalOrdinal ncols,
-              Scalar A[],
-              const LocalOrdinal lda,
-              Scalar R[],
-              const LocalOrdinal ldr,
-              const bool contiguous_cache_blocks) const;
-
-      /// Apply the Q factor computed by factor() (which see) to the
-      /// nrows by ncols_C matrix C, with leading dimension ldc >=
-      /// nrows.
-      void
-      apply (const std::string& op,
-             const LocalOrdinal nrows,
-             const LocalOrdinal ncols_C,
-             Scalar C[],
-             const LocalOrdinal ldc,
-             const LocalOrdinal ncols_Q,
-             const Scalar Q[],
-             const LocalOrdinal ldq,
-             const FactorOutput& factor_output,
-             const bool contiguous_cache_blocks) const;
-
-      /// Compute the explicit representation of the Q factor computed
-      /// by factor().
-      void
-      explicit_Q (const LocalOrdinal nrows,
-                  const LocalOrdinal ncols_Q_in,
-                  const Scalar Q_in[],
-                  const LocalOrdinal ldq_in,
-                  const LocalOrdinal ncols_Q_out,
-                  Scalar Q_out[],
-                  const LocalOrdinal ldq_out,
-                  const FactorOutput& factor_output,
-                  const bool contiguous_cache_blocks) const;
-
-    private:
-      size_t ncores_;
-      TSQR::SequentialTsqr<LocalOrdinal, Scalar> seq_;
-      Partitioner<LocalOrdinal, Scalar> partitioner_;
-
-      typedef MatView<LocalOrdinal, Scalar> mat_view_type;
-      typedef MatView<LocalOrdinal, const Scalar> const_mat_view_type;
-      typedef std::pair<const_mat_view_type, const_mat_view_type> const_split_t;
-      typedef std::pair<mat_view_type, mat_view_type> split_t;
-      typedef std::pair<const_mat_view_type, mat_view_type> top_blocks_t;
-      typedef std::vector<top_blocks_t> array_top_blocks_t;
-
-      void
-      explicit_Q_helper (const size_t P_first,
-                         const size_t P_last,
-                         mat_view_type& Q_out,
-                         const bool contiguous_cache_blocks) const;
-
-      /// \brief Return a nonconst view of the topmost block.
-      ///
-      /// This is helpful for combining the R factors and extracting
-      /// the final R factor result.
-      mat_view_type
-      factor_helper (const size_t P_first,
-                     const size_t P_last,
-                     const size_t depth,
-                     mat_view_type A,
-                     std::vector<SeqOutput>& seq_outputs,
-                     ParOutput& par_outputs,
-                     Scalar R[],
-                     const LocalOrdinal ldr,
-                     const bool contiguous_cache_blocks) const;
-
-      bool
-      apply_helper_empty (const size_t P_first,
-                          const size_t P_last,
-                          const_mat_view_type &Q,
-                          mat_view_type& C) const;
-
-      /// \brief Build array of ncores() blocks, one for each partition.
-      ///
-      /// Each block is the topmost block in that partition.  This is
-      /// useful for apply_helper.
-      void
-      build_partition_array (const size_t P_first,
-                             const size_t P_last,
-                             array_top_blocks_t& top_blocks,
-                             const_mat_view_type& Q,
-                             mat_view_type& C,
-                             const bool contiguous_cache_blocks) const;
-
-      /// Apply Q (not Q^T or Q^H, which is why we don't ask for "op")
-      /// to C.
-      void
-      apply_helper (const size_t P_first,
-                    const size_t P_last,
-                    const_mat_view_type Q,
-                    mat_view_type C,
-                    array_top_blocks_t& top_blocks,
-                    const FactorOutput& factor_output,
-                    const bool contiguous_cache_blocks) const;
-
-      /// Apply Q^T or Q^H to C.
-      ///
-      /// \return Views of the topmost partitions of Q resp. C.
-      std::pair<const_mat_view_type, mat_view_type>
-      apply_transpose_helper (const std::string& op,
-                              const size_t P_first,
-                              const size_t P_last,
-                              const_mat_view_type Q,
-                              mat_view_type C,
-                              const FactorOutput& factor_output,
-                              const bool contiguous_cache_blocks) const;
-
-      void
-      factor_pair (const size_t P_top,
-                   const size_t P_bot,
-                   mat_view_type& A_top,
-                   mat_view_type& A_bot,
-                   std::vector< std::vector< Scalar > >& par_outputs,
-                   const bool contiguous_cache_blocks) const;
-
-      void
-      apply_pair (const std::string& trans,
-                  const size_t P_top,
-                  const size_t P_bot,
-                  const_mat_view_type& Q_bot,
-                  const std::vector< std::vector< Scalar > >& tau_arrays,
-                  mat_view_type& C_top,
-                  mat_view_type& C_bot,
-                  const bool contiguous_cache_blocks) const;
-
-      void
-      cache_block_helper (mat_view_type& A_out,
-                          const_mat_view_type& A_in,
-                          const size_t P_first,
-                          const size_t P_last) const;
-
-      void
-      un_cache_block_helper (mat_view_type& A_out,
-                             const const_mat_view_type& A_in,
-                             const size_t P_first,
-                             const size_t P_last) const;
-
-    }; // class TbbRecursiveTsqr
-  } // namespace TBB
-} // namespace TSQR
-
-#include "TSQR/TBB/TbbRecursiveTsqr_Def.hpp"
-
-#endif // __TSQR_TbbRecursiveTsqr_hpp
diff --git a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp b/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp
deleted file mode 100644
index 27aef81f0328..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr_TbbRecursiveTsqr_Def.hpp
+++ /dev/null
@@ -1,538 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_TBB_TbbRecursiveTsqr_Def_hpp
-#define __TSQR_TBB_TbbRecursiveTsqr_Def_hpp
-
-#include "TbbTsqr_TbbRecursiveTsqr.hpp"
-#include "Tsqr_Util.hpp"
-
-namespace TSQR {
-  namespace TBB {
-
-    template< class LocalOrdinal, class Scalar >
-    void
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    explicit_Q_helper (const size_t P_first,
-                       const size_t P_last,
-                       mat_view& Q_out,
-                       const bool contiguous_cache_blocks) const
-    {
-      if (P_first > P_last || Q_out.empty ()) {
-        return;
-      }
-      else if (P_first == P_last) {
-        CacheBlocker< LocalOrdinal, Scalar >
-          blocker (Q_out.extent(0), Q_out.extent(1),
-                   seq_.cache_blocking_strategy());
-        // Fill my partition with zeros.
-        blocker.fill_with_zeros (Q_out, contiguous_cache_blocks);
-
-        // If our partition is the first (topmost), fill it with
-        // the first Q_out.extent(1) columns of the identity matrix.
-        if (P_first == 0) {
-          // Fetch the topmost cache block of my partition.  Its
-          // leading dimension should be set correctly by
-          // top_block().
-          mat_view Q_out_top =
-            blocker.top_block (Q_out, contiguous_cache_blocks);
-
-          for (LocalOrdinal j = 0; j < Q_out_top.extent(1); ++j)
-            Q_out_top(j,j) = Scalar(1);
-        }
-      }
-      else {
-        // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last]
-        const size_t P_mid = (P_first + P_last) / 2;
-        split_t Q_out_split =
-          partitioner_.split (Q_out, P_first, P_mid, P_last,
-                              contiguous_cache_blocks);
-        explicit_Q_helper (P_first, P_mid, Q_out_split.first,
-                           contiguous_cache_blocks);
-        explicit_Q_helper (P_mid+1, P_last, Q_out_split.second,
-                           contiguous_cache_blocks);
-      }
-    }
-
-
-    template< class LocalOrdinal, class Scalar >
-    typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::mat_view
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    factor_helper (const size_t P_first,
-                   const size_t P_last,
-                   const size_t depth,
-                   mat_view A,
-                   std::vector<typename TbbRecursiveTsqr<LocalOrdinal, Scalar>::SeqOutput>& seq_outputs,
-                   typename TbbRecursiveTsqr<LocalOrdinal, Scalar>::ParOutput& par_outputs,
-                   Scalar R[],
-                   const LocalOrdinal ldr,
-                   const bool contiguous_cache_blocks) const
-    {
-      mat_view A_top;
-      if (P_first > P_last || A.empty()) {
-        return A;
-      }
-      else if (P_first == P_last) {
-        std::pair<SeqOutput, mat_view> results =
-          seq_.factor (A.extent(0), A.extent(1), A.data(), A.stride(1),
-                       contiguous_cache_blocks);
-        seq_outputs[P_first] = results.first;
-        A_top = A;
-      }
-      else {
-        // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last]
-        const size_t P_mid = (P_first + P_last) / 2;
-        split_t A_split =
-          partitioner_.split (A, P_first, P_mid, P_last,
-                              contiguous_cache_blocks);
-        A_top = factor_helper (P_first, P_mid, depth+1, A_split.first,
-                               seq_outputs, par_outputs, R, ldr,
-                               contiguous_cache_blocks);
-        mat_view A_bot =
-          factor_helper (P_mid+1, P_last, depth+1, A_split.second,
-                         seq_outputs, par_outputs, R, ldr,
-                         contiguous_cache_blocks);
-        // Combine the two results
-        factor_pair (P_first, P_mid+1, A_top, A_bot, par_outputs,
-                     contiguous_cache_blocks);
-      }
-
-      // If we're completely done, extract the final R factor from
-      // the topmost partition.
-      if (depth == 0) {
-        seq_.extract_R (A_top.extent(0), A_top.extent(1), A_top.data(),
-                        A_top.stride(1), R, ldr, contiguous_cache_blocks);
-      }
-      return A_top;
-    }
-
-
-    template< class LocalOrdinal, class Scalar >
-    bool
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    apply_helper_empty (const size_t P_first,
-                        const size_t P_last,
-                        const_mat_view& Q,
-                        mat_view& C) const
-    {
-      if (Q.empty ()) {
-        if (! C.empty())
-          throw std::logic_error("Q is empty but C is not!");
-        else
-          return true;
-      }
-      else if (C.empty()) {
-        if (! Q.empty())
-          throw std::logic_error("C is empty but Q is not!");
-        else
-          return true;
-      }
-      else if (P_first > P_last)
-        return true;
-      else
-        return false;
-    }
-
-
-    template< class LocalOrdinal, class Scalar >
-    void
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    build_partition_array (const size_t P_first,
-                           const size_t P_last,
-                           typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::array_top_blocks_t& top_blocks,
-                           const_mat_view& Q,
-                           mat_view& C,
-                           const bool contiguous_cache_blocks) const
-    {
-      if (P_first > P_last)
-        return;
-      else if (P_first == P_last)
-        {
-          CacheBlocker< LocalOrdinal, Scalar > blocker (Q.extent(0), Q.extent(1), seq_.cache_blocking_strategy());
-          const_mat_view Q_top = blocker.top_block (Q, contiguous_cache_blocks);
-          mat_view C_top = blocker.top_block (C, contiguous_cache_blocks);
-          top_blocks[P_first] =
-            std::make_pair (const_mat_view (Q_top.extent(1), Q_top.extent(1), Q_top.data(), Q_top.stride(1)),
-                            mat_view (C_top.extent(1), C_top.extent(1), C_top.data(), C_top.stride(1)));
-        }
-      else
-        {
-          // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last]
-          const size_t P_mid = (P_first + P_last) / 2;
-          const_split_t Q_split =
-            partitioner_.split (Q, P_first, P_mid, P_last,
-                                contiguous_cache_blocks);
-          split_t C_split =
-            partitioner_.split (C, P_first, P_mid, P_last,
-                                contiguous_cache_blocks);
-          build_partition_array (P_first, P_mid, top_blocks, Q_split.first,
-                                 C_split.first, contiguous_cache_blocks);
-          build_partition_array (P_mid+1, P_last, top_blocks, Q_split.second,
-                                 C_split.second, contiguous_cache_blocks);
-        }
-    }
-
-
-    template< class LocalOrdinal, class Scalar >
-    void
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    apply_helper (const size_t P_first,
-                  const size_t P_last,
-                  const_mat_view Q,
-                  mat_view C,
-                  typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::array_top_blocks_t& top_blocks,
-                  const FactorOutput& factor_output,
-                  const bool contiguous_cache_blocks) const
-    {
-      typedef std::pair< const_mat_view, mat_view > apply_t;
-
-      if (apply_helper_empty (P_first, P_last, Q, C))
-        return;
-      else if (P_first == P_last)
-        {
-          const std::vector< SeqOutput >& seq_outputs = factor_output.first;
-          seq_.apply ("N", Q.extent(0), Q.extent(1), Q.data(), Q.stride(1),
-                      seq_outputs[P_first], C.extent(1), C.data(),
-                      C.stride(1), contiguous_cache_blocks);
-        }
-      else
-        {
-          // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last]
-          const size_t P_mid = (P_first + P_last) / 2;
-          const_split_t Q_split =
-            partitioner_.split (Q, P_first, P_mid, P_last,
-                                contiguous_cache_blocks);
-          split_t C_split =
-            partitioner_.split (C, P_first, P_mid, P_last,
-                                contiguous_cache_blocks);
-          const ParOutput& par_output = factor_output.second;
-
-          apply_pair ("N", P_first, P_mid+1, top_blocks[P_mid+1].first,
-                      par_output, top_blocks[P_first].second,
-                      top_blocks[P_mid+1].second, contiguous_cache_blocks);
-          apply_helper (P_first, P_mid, Q_split.first, C_split.first,
-                        top_blocks, factor_output, contiguous_cache_blocks);
-          apply_helper (P_mid+1, P_last, Q_split.second, C_split.second,
-                        top_blocks, factor_output, contiguous_cache_blocks);
-        }
-    }
-
-
-    template< class LocalOrdinal, class Scalar >
-    typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::top_blocks_t
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    apply_transpose_helper (const std::string& op,
-                            const size_t P_first,
-                            const size_t P_last,
-                            const_mat_view Q,
-                            mat_view C,
-                            const typename TbbRecursiveTsqr<LocalOrdinal, Scalar>::FactorOutput& factor_output,
-                            const bool contiguous_cache_blocks) const
-    {
-      if (apply_helper_empty (P_first, P_last, Q, C)) {
-        return std::make_pair (Q, C);
-      }
-      else if (P_first == P_last) {
-        const std::vector<SeqOutput>& seq_outputs = factor_output.first;
-        seq_.apply (op, Q.extent(0), Q.extent(1), Q.data(), Q.stride(1),
-                    seq_outputs[P_first], C.extent(1), C.data(),
-                    C.stride(1), contiguous_cache_blocks);
-        return std::make_pair (Q, C);
-      }
-      else {
-        // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last]
-        const size_t P_mid = (P_first + P_last) / 2;
-
-        const_split_t Q_split =
-          partitioner_.split (Q, P_first, P_mid, P_last,
-                              contiguous_cache_blocks);
-        split_t C_split =
-          partitioner_.split (C, P_first, P_mid, P_last,
-                              contiguous_cache_blocks);
-        const ParOutput& par_output = factor_output.second;
-        top_blocks_t Top =
-          apply_transpose_helper (op, P_first, P_mid, Q_split.first,
-                                  C_split.first, factor_output,
-                                  contiguous_cache_blocks);
-        top_blocks_t Bottom =
-          apply_transpose_helper (op, P_mid+1, P_last, Q_split.second,
-                                  C_split.second, factor_output,
-                                  contiguous_cache_blocks);
-        apply_pair (op, P_first, P_mid+1, Bottom.first,
-                    par_output, Top.second, Bottom.second,
-                    contiguous_cache_blocks);
-        return Top;
-      }
-    }
-
-
-    template< class LocalOrdinal, class Scalar >
-    void
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    factor_pair (const size_t P_top,
-                 const size_t P_bot,
-                 mat_view& A_top,
-                 mat_view& A_bot,
-                 std::vector<std::vector<Scalar>>& par_outputs,
-                 const bool contiguous_cache_blocks) const
-    {
-      if (P_top == P_bot) {
-        throw std::logic_error("factor_pair: should never get here!");
-      }
-      // We only read and write the upper ncols x ncols triangle of
-      // each block.
-      const LocalOrdinal ncols = A_top.extent(1);
-      if (A_bot.extent(1) != ncols) {
-        throw std::logic_error("A_bot.extent(1) != A_top.extent(1)");
-      }
-      std::vector<Scalar>& tau = par_outputs[P_bot];
-      std::vector<Scalar> work (ncols);
-
-      TSQR::Combine<LocalOrdinal, Scalar> combine_;
-      combine_.factor_pair (A_top, A_bot, tau.data(), work.data());
-    }
-
-    template< class LocalOrdinal, class Scalar >
-    void
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    apply_pair (const std::string& trans,
-                const size_t P_top,
-                const size_t P_bot,
-                const_mat_view& Q_bot,
-                const std::vector<std::vector<Scalar> >& tau_arrays,
-                mat_view& C_top,
-                mat_view& C_bot,
-                const bool contiguous_cache_blocks) const
-    {
-      if (P_top == P_bot) {
-        throw std::logic_error ("apply_pair: should never get here!");
-      }
-      const std::vector<Scalar>& tau = tau_arrays[P_bot];
-      std::vector<Scalar> work (C_top.extent(1));
-
-      TSQR::Combine<LocalOrdinal, Scalar> combine_;
-      combine_.apply_pair (trans.c_str(), C_top.extent(1), Q_bot.extent(1),
-                           Q_bot.data(), Q_bot.stride(1), &tau[0],
-                           C_top.data(), C_top.stride(1),
-                           C_bot.data(), C_bot.stride(1), &work[0]);
-    }
-
-    template< class LocalOrdinal, class Scalar >
-    void
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    cache_block_helper (mat_view& A_out,
-                        const_mat_view& A_in,
-                        const size_t P_first,
-                        const size_t P_last) const
-    {
-      if (P_first > P_last)
-        return;
-      else if (P_first == P_last)
-        seq_.cache_block (A_out.extent(0), A_out.extent(1), A_out.data(),
-                          A_in.data(), A_in.stride(1));
-      else
-        {
-          const size_t P_mid = (P_first + P_last) / 2;
-          const_split_t A_in_split =
-            partitioner_.split (A_in, P_first, P_mid, P_last, false);
-          split_t A_out_split =
-            partitioner_.split (A_out, P_first, P_mid, P_last, true);
-          cache_block_helper (A_out_split.first, A_in_split.first,
-                              P_first, P_mid);
-          cache_block_helper (A_out_split.second, A_in_split.second,
-                              P_mid+1, P_last);
-        }
-    }
-
-    template< class LocalOrdinal, class Scalar >
-    void
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    un_cache_block_helper (mat_view& A_out,
-                           const const_mat_view& A_in,
-                           const size_t P_first,
-                           const size_t P_last) const
-    {
-      if (P_first > P_last) {
-        return;
-      }
-      else if (P_first == P_last) {
-        seq_.un_cache_block (A_out.extent(0), A_out.extent(1),
-                             A_out.data(), A_out.stride(1),
-                             A_in.data());
-      }
-      else {
-        const size_t P_mid = (P_first + P_last) / 2;
-        const const_split_t A_in_split =
-          partitioner_.split (A_in, P_first, P_mid, P_last, true);
-        split_t A_out_split =
-          partitioner_.split (A_out, P_first, P_mid, P_last, false);
-
-        un_cache_block_helper (A_out_split.first, A_in_split.first,
-                               P_first, P_mid);
-        un_cache_block_helper (A_out_split.second, A_in_split.second,
-                               P_mid+1, P_last);
-      }
-    }
-
-    template< class LocalOrdinal, class Scalar >
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    TbbRecursiveTsqr (const size_t num_cores,
-                      const size_t cache_size_hint)
-      : seq_ (cache_size_hint), ncores_ (1)
-    {
-      if (num_cores < 1)
-        ncores_ = 1; // default is no parallelism
-      else
-        ncores_ = num_cores;
-    }
-
-    template< class LocalOrdinal, class Scalar >
-    void
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    cache_block (const LocalOrdinal nrows,
-                 const LocalOrdinal ncols,
-                 Scalar A_out[],
-                 const Scalar A_in[],
-                 const LocalOrdinal lda_in) const
-    {
-      const_mat_view A_in_view (nrows, ncols, A_in, lda_in);
-      // Leading dimension doesn't matter, since we're going to cache block it.
-      mat_view A_out_view (nrows, ncols, A_out, lda_in);
-      cache_block_helper (A_out_view, A_in_view, 0, ncores()-1);
-    }
-
-    template< class LocalOrdinal, class Scalar >
-    void
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    un_cache_block (const LocalOrdinal nrows,
-                    const LocalOrdinal ncols,
-                    Scalar A_out[],
-                    const LocalOrdinal lda_out,
-                    const Scalar A_in[]) const
-    {
-      // Leading dimension doesn't matter, since it's cache-blocked.
-      const_mat_view A_in_view (nrows, ncols, A_in, lda_out);
-      mat_view A_out_view (nrows, ncols, A_out, lda_out);
-      un_cache_block_helper (A_out_view, A_in_view, 0, ncores()-1);
-    }
-
-    template< class LocalOrdinal, class Scalar >
-    typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::FactorOutput
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    factor (const LocalOrdinal nrows,
-            const LocalOrdinal ncols,
-            Scalar A[],
-            const LocalOrdinal lda,
-            Scalar R[],
-            const LocalOrdinal ldr,
-            const bool contiguous_cache_blocks) const
-    {
-      mat_view A_view (nrows, ncols, A, lda);
-      std::vector< SeqOutput > seq_outputs (ncores());
-      ParOutput par_outputs (ncores(), std::vector< Scalar >(ncols));
-      (void) factor_helper (0, ncores()-1, 0, A_view, seq_outputs,
-                            par_outputs, R, ldr, contiguous_cache_blocks);
-      return std::make_pair (seq_outputs, par_outputs);
-    }
-
-    template< class LocalOrdinal, class Scalar >
-    void
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    apply (const std::string& op,
-           const LocalOrdinal nrows,
-           const LocalOrdinal ncols_C,
-           Scalar C[],
-           const LocalOrdinal ldc,
-           const LocalOrdinal ncols_Q,
-           const Scalar Q[],
-           const LocalOrdinal ldq,
-           const typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::FactorOutput& factor_output,
-           const bool contiguous_cache_blocks) const
-    {
-      const ApplyType apply_type (op);
-      if (apply_type == ApplyType::ConjugateTranspose &&
-          Teuchos::ScalarTraits<Scalar>::isComplex)
-        throw std::logic_error("Applying Q^H for complex scalar types "
-                               "not yet implemented");
-
-      const_mat_view Q_view (nrows, ncols_Q, Q, ldq);
-      mat_view C_view (nrows, ncols_C, C, ldc);
-      if (! apply_type.transposed ()) {
-        array_top_blocks_t top_blocks (ncores ());
-        build_partition_array (0, ncores () - 1, top_blocks, Q_view,
-                               C_view, contiguous_cache_blocks);
-        apply_helper (0, ncores () - 1, Q_view, C_view, top_blocks,
-                      factor_output, contiguous_cache_blocks);
-      }
-      else {
-        apply_transpose_helper (op, 0, ncores () - 1, Q_view, C_view,
-                                factor_output, contiguous_cache_blocks);
-      }
-    }
-
-
-    template< class LocalOrdinal, class Scalar >
-    void
-    TbbRecursiveTsqr< LocalOrdinal, Scalar >::
-    explicit_Q (const LocalOrdinal nrows,
-                const LocalOrdinal ncols_Q_in,
-                const Scalar Q_in[],
-                const LocalOrdinal ldq_in,
-                const LocalOrdinal ncols_Q_out,
-                Scalar Q_out[],
-                const LocalOrdinal ldq_out,
-                const typename TbbRecursiveTsqr< LocalOrdinal, Scalar >::FactorOutput& factor_output,
-                const bool contiguous_cache_blocks) const
-    {
-      if (ncols_Q_out != ncols_Q_in)
-        throw std::logic_error("FIXME Currently, explicit_Q() only works for ncols_Q_out == ncols_Q_in");
-
-      const_mat_view Q_in_view (nrows, ncols_Q_in, Q_in, ldq_in);
-      mat_view Q_out_view (nrows, ncols_Q_out, Q_out, ldq_out);
-
-      explicit_Q_helper (0, ncores()-1, Q_out_view, contiguous_cache_blocks);
-      apply ("N", nrows, ncols_Q_out, Q_out, ldq_out, ncols_Q_in,
-             Q_in, ldq_in, factor_output, contiguous_cache_blocks);
-    }
-
-  } // namespace TBB
-} // namespace TSQR
-
-
-#endif // __TSQR_TBB_TbbRecursiveTsqr_Def_hpp
diff --git a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp b/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp
deleted file mode 100644
index dc8068c2d9eb..000000000000
--- a/packages/tpetra/tsqr/src/TbbTsqr_UnCacheBlockTask.hpp
+++ /dev/null
@@ -1,145 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_TBB_UnCacheBlockTask_hpp
-#define __TSQR_TBB_UnCacheBlockTask_hpp
-
-#include <tbb/task.h>
-#include "TbbTsqr_Partitioner.hpp"
-#include "Tsqr_SequentialTsqr.hpp"
-
-namespace TSQR {
-  namespace TBB {
-    /// \class UnCacheBlockTask
-    /// \brief TBB task for recursive TSQR un-(cache blocking) phase.
-    ///
-    /// "Un-(cache blocking)" here means copying the input matrix,
-    /// which is stored with contiguous cache blocks, to the output
-    /// matrix, which is stored with noncontiguous cache blocks.
-    template<class LocalOrdinal, class Scalar>
-    class UnCacheBlockTask : public tbb::task {
-    public:
-      typedef MatView<LocalOrdinal, Scalar> mat_view_type;
-      typedef MatView<LocalOrdinal, const Scalar> const_mat_view_type;
-      typedef std::pair< mat_view_type, mat_view_type > split_t;
-      typedef std::pair< const_mat_view_type, const_mat_view_type > const_split_t;
-
-      UnCacheBlockTask (const size_t P_first__,
-                        const size_t P_last__,
-                        mat_view_type& A_out,
-                        const_mat_view_type& A_in,
-                        const SequentialTsqr<LocalOrdinal, Scalar>& seq) :
-        P_first_ (P_first__),
-        P_last_ (P_last__),
-        A_out_ (A_out),
-        A_in_ (A_in),
-        seq_ (seq)
-      {}
-
-      tbb::task* execute ()
-      {
-        using tbb::task;
-
-        if (P_first_ > P_last_ || A_out_.empty() || A_in_.empty()) {
-          return nullptr;
-        }
-        else if (P_first_ == P_last_) {
-          execute_base_case ();
-          return nullptr;
-        }
-        else {
-          // Recurse on two intervals: [P_first, P_mid] and
-          // [P_mid+1, P_last].
-          const size_t P_mid = (P_first_ + P_last_) / 2;
-          split_t out_split =
-            partitioner_.split (A_out_, P_first_, P_mid, P_last_, false);
-          const_split_t in_split =
-            partitioner_.split (A_in_, P_first_, P_mid, P_last_, true);
-
-          // The partitioner may decide that the current blocks A_out_
-          // and A_in_ have too few rows to be worth splitting.  (It
-          // should split both A_out_ and A_in_ in the same way.)  In
-          // that case, out_split.second and in_split.second (the
-          // bottom block) will be empty.  We can deal with this by
-          // treating it as the base case.
-          if (out_split.second.empty() || out_split.second.extent(0) == 0) {
-            execute_base_case ();
-            return nullptr;
-          }
-
-          // "c": continuation task
-          tbb::empty_task& c =
-            *new( allocate_continuation() ) tbb::empty_task;
-          // Recurse on the split
-          UnCacheBlockTask& topTask = *new( c.allocate_child() )
-            UnCacheBlockTask (P_first_, P_mid, out_split.first,
-                              in_split.first, seq_);
-          UnCacheBlockTask& botTask = *new( c.allocate_child() )
-            UnCacheBlockTask (P_mid+1, P_last_, out_split.second,
-                              in_split.second, seq_);
-          // Set reference count of parent (in this case, the
-          // continuation task) to 2 (since 2 children -- no
-          // additional task since no waiting).
-          c.set_ref_count (2);
-          c.spawn (botTask);
-          return &topTask; // scheduler bypass optimization
-        }
-      }
-
-    private:
-      size_t P_first_, P_last_;
-      mat_view_type A_out_;
-      const_mat_view_type A_in_;
-      SequentialTsqr<LocalOrdinal, Scalar> seq_;
-      Partitioner<LocalOrdinal, Scalar> partitioner_;
-
-      void
-      execute_base_case ()
-      {
-        seq_.un_cache_block (A_out_.extent(0), A_out_.extent(1),
-                             A_out_.data(), A_out_.stride(1),
-                             A_in_.data());
-      }
-    };
-
-  } // namespace TBB
-} // namespace TSQR
-
-
-#endif // __TSQR_TBB_UnCacheBlockTask_hpp
diff --git a/packages/tpetra/tsqr/src/Tsqr.hpp b/packages/tpetra/tsqr/src/Tsqr.hpp
index 31d1be6b9d01..293fba119542 100644
--- a/packages/tpetra/tsqr/src/Tsqr.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr.hpp
@@ -40,8 +40,8 @@
 /// \file Tsqr.hpp
 /// \brief Parallel Tall Skinny QR (TSQR) implementation
 
-#ifndef __TSQR_Tsqr_hpp
-#define __TSQR_Tsqr_hpp
+#ifndef TSQR_TSQR_HPP
+#define TSQR_TSQR_HPP
 
 #include "Tsqr_ApplyType.hpp"
 #include "Tsqr_Matrix.hpp"
@@ -90,8 +90,7 @@ namespace TSQR {
   ///   distributed linear algebra libraries, such as Tpetra, the
   ///   local and global ordinal types may be different.
   template<class LocalOrdinal,
-           class Scalar,
-           class NodeTsqrType = SequentialTsqr<LocalOrdinal, Scalar>>
+           class Scalar>
   class Tsqr {
   public:
     typedef MatView<LocalOrdinal, Scalar> mat_view_type;
@@ -103,16 +102,16 @@ namespace TSQR {
     typedef Teuchos::ScalarTraits<Scalar> STS;
     typedef typename STS::magnitudeType magnitude_type;
 
-    typedef NodeTsqrType node_tsqr_type;
-    typedef DistTsqr<LocalOrdinal, Scalar> dist_tsqr_type;
+    using node_tsqr_type = NodeTsqr<LocalOrdinal, Scalar>;
+    using dist_tsqr_type = DistTsqr<LocalOrdinal, Scalar>;
     typedef typename Teuchos::RCP<node_tsqr_type> node_tsqr_ptr;
     typedef typename Teuchos::RCP<dist_tsqr_type> dist_tsqr_ptr;
     /// \typedef rank_type
     /// \brief "Rank" here means MPI rank, not linear algebra rank.
     typedef typename dist_tsqr_type::rank_type rank_type;
 
-    typedef typename node_tsqr_type::FactorOutput NodeOutput;
-    typedef typename dist_tsqr_type::FactorOutput DistOutput;
+    using NodeOutput = typename node_tsqr_type::factor_output_type;
+    using DistOutput = typename dist_tsqr_type::FactorOutput;
 
     /// \typedef FactorOutput
     /// \brief Return value of \c factor().
@@ -120,7 +119,8 @@ namespace TSQR {
     /// Part of the implicit representation of the Q factor returned
     /// by \c factor().  The other part of that representation is
     /// stored in the A matrix on output.
-    typedef std::pair<NodeOutput, DistOutput> FactorOutput;
+    using FactorOutput =
+      std::pair<Teuchos::RCP<NodeOutput>, DistOutput>;
 
     /// \brief Constructor
     ///
@@ -133,14 +133,9 @@ namespace TSQR {
           const dist_tsqr_ptr& distTsqr) :
       nodeTsqr_ (nodeTsqr),
       distTsqr_ (distTsqr)
-    {}
-
-    /// \brief Get the intranode part of TSQR.
-    ///
-    /// Sometimes we need this in order to do post-construction
-    /// initialization.
-    Teuchos::RCP<node_tsqr_type> getNodeTsqr () {
-      return nodeTsqr_;
+    {
+      TEUCHOS_ASSERT( ! nodeTsqr_.is_null () );
+      TEUCHOS_ASSERT( ! distTsqr_.is_null () );
     }
 
     /// \brief Cache size hint in bytes used by the intranode part of TSQR.
@@ -166,6 +161,13 @@ namespace TSQR {
         distTsqr_->QR_produces_R_factor_with_nonnegative_diagonal();
     }
 
+    /// \brief Whether the implementation wants device memory for
+    ///   "large" arrays, like the input matrix, and the output Q
+    ///   factor or C apply result.
+    bool wants_device_memory () const {
+      return nodeTsqr_->wants_device_memory ();
+    }
+
     /// \brief Compute QR factorization with explicit Q factor: "raw"
     ///   arrays interface, for column-major data.
     ///
@@ -227,84 +229,11 @@ namespace TSQR {
                     const LocalOrdinal LDR,
                     const bool forceNonnegativeDiagonal=false)
     {
-      const bool contiguousCacheBlocks = false;
-
-      // Sanity checks for matrix dimensions.
-      if (numRows < numCols) {
-        std::ostringstream os;
-        os << "In Tsqr::factorExplicit: input matrix A has " << numRows
-           << " local rows, and " << numCols << " columns.  The input "
-          "matrix must have at least as many rows on each processor as "
-          "there are columns.";
-        throw std::invalid_argument (os.str ());
-      }
-
-      // Check for quick exit, based on matrix dimensions.
-      if (numCols == 0) {
-        return;
-      }
-
-      // Fill R initially with zeros.
-      {
-        Scalar* R_j = R;
-        for (LocalOrdinal j = 0; j < numCols; ++j) {
-          for (LocalOrdinal i = 0; i < numCols; ++i) {
-            R_j[i] = STS::zero ();
-          }
-          R_j += LDR;
-        }
-      }
-      // Compute the local QR factorization, in place in A, with the R
-      // factor written to R.
-      NodeOutput nodeResults =
-        nodeTsqr_->factor (numRows, numCols, A, LDA, R, LDR,
-                           contiguousCacheBlocks);
-      // Prepare the output matrix Q by filling with zeros.
-      nodeTsqr_->fill_with_zeros (numRows, numCols, Q, LDQ,
-                                  contiguousCacheBlocks);
-      // Wrap the output matrix Q in a "view."
-      mat_view_type Q_rawView (numRows, numCols, Q, LDQ);
-      // Wrap the uppermost cache block of Q.  We will need to extract
-      // its numCols x numCols uppermost block below.  We can't just
-      // extract the numCols x numCols top block from all of Q, in
-      // case Q is arranged using contiguous cache blocks.
-      mat_view_type Q_top_block =
-        nodeTsqr_->top_block (Q_rawView, contiguousCacheBlocks);
-      if (Q_top_block.extent (0) < numCols) {
-        std::ostringstream os;
-        os << "The top block of Q has too few rows.  This means that the "
-           << "the intranode TSQR implementation has a bug in its top_block"
-           << "() method.  The top block should have at least " << numCols
-           << " rows, but instead has only " << Q_top_block.extent (1)
-           << " rows.";
-        throw std::logic_error (os.str ());
-      }
-      // Use the numCols x numCols top block of Q and the local R
-      // factor (computed above) to compute the distributed-memory
-      // part of the QR factorization.
-      {
-        mat_view_type Q_top (numCols, numCols, Q_top_block.data(),
-                            Q_top_block.stride(1));
-        mat_view_type R_view (numCols, numCols, R, LDR);
-        distTsqr_->factorExplicit (R_view, Q_top, forceNonnegativeDiagonal);
-      }
-      // Apply the local part of the Q factor to the result of the
-      // distributed-memory QR factorization, to get the explicit Q
-      // factor.
-      nodeTsqr_->apply (ApplyType::NoTranspose,
-                        numRows, numCols, A, LDA,
-                        nodeResults, numCols, Q, LDQ,
-                        contiguousCacheBlocks);
-
-      // If necessary, and if the user asked, force the R factor to
-      // have a nonnegative diagonal.
-      if (forceNonnegativeDiagonal &&
-          ! QR_produces_R_factor_with_nonnegative_diagonal ()) {
-        details::NonnegDiagForcer<LocalOrdinal, Scalar, STS::isComplex> forcer;
-        mat_view_type Q_mine (numRows, numCols, Q, LDQ);
-        mat_view_type R_mine (numCols, numCols, R, LDR);
-        forcer.force (Q_mine, R_mine);
-      }
+      constexpr bool contiguousCacheBlocks = false;
+      this->factorExplicitRaw (numRows, numCols,
+                               A, LDA, Q, LDQ, R, LDR,
+                               contiguousCacheBlocks,
+                               forceNonnegativeDiagonal);
     }
 
     void
@@ -319,6 +248,8 @@ namespace TSQR {
                        const bool contiguousCacheBlocks,
                        const bool forceNonnegativeDiagonal = false)
     {
+      const char prefix[] = "TSQR::Tsqr::factorExplicitRaw: ";
+      
       // Sanity checks for matrix dimensions.
       if (numRows < numCols) {
         std::ostringstream os;
@@ -335,23 +266,41 @@ namespace TSQR {
       }
 
       // Fill R initially with zeros.
-      {
-        Scalar* R_j = R;
-        for (LocalOrdinal j = 0; j < numCols; ++j) {
-          for (LocalOrdinal i = 0; i < numCols; ++i) {
-            R_j[i] = STS::zero ();
-          }
-          R_j += LDR;
-        }
+      mat_view_type R_view (numCols, numCols, R, LDR);
+      try {
+        deep_copy (R_view, Scalar {});
+      }
+      catch (std::exception& e) {
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (true, std::runtime_error, prefix <<
+           "deep_copy(R_view, 0.0) threw: " << e.what ());
       }
+
       // Compute the local QR factorization, in place in A, with the R
       // factor written to R.
-      NodeOutput nodeResults =
-        nodeTsqr_->factor (numRows, numCols, A, LDA, R, LDR,
-                           contiguousCacheBlocks);
+      Teuchos::RCP<NodeOutput> nodeResults;
+      try {
+        nodeResults =
+          nodeTsqr_->factor (numRows, numCols, A, LDA, R, LDR,
+                             contiguousCacheBlocks);
+      }
+      catch (std::exception& e) {
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (true, std::runtime_error, prefix <<
+           "nodeTsqr_->factor(...) threw: " << e.what ());
+      }
+      
       // Prepare the output matrix Q by filling with zeros.
-      nodeTsqr_->fill_with_zeros (numRows, numCols, Q, LDQ,
-                                  contiguousCacheBlocks);
+      try {
+        nodeTsqr_->fill_with_zeros (numRows, numCols, Q, LDQ,
+                                    contiguousCacheBlocks);
+      }
+      catch (std::exception& e) {
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (true, std::runtime_error, prefix <<
+           "nodeTsqr_->fill_with_zeros(...) threw: " << e.what ());
+      }
+      
       // Wrap the output matrix Q in a "view."
       mat_view_type Q_rawView (numRows, numCols, Q, LDQ);
       // Wrap the uppermost cache block of Q.  We will need to extract
@@ -373,27 +322,79 @@ namespace TSQR {
       // factor (computed above) to compute the distributed-memory
       // part of the QR factorization.
       {
-        mat_view_type Q_top (numCols, numCols, Q_top_block.data(),
-                            Q_top_block.stride(1));
+        mat_view_type Q_top (numCols, numCols, Q_top_block.data (),
+                             Q_top_block.stride (1));
         mat_view_type R_view (numCols, numCols, R, LDR);
-        distTsqr_->factorExplicit (R_view, Q_top, forceNonnegativeDiagonal);
+
+        if (nodeTsqr_->wants_device_memory ()) {
+          // DistTsqr doesn't know what to do with device memory, so
+          // if Q_top is device memory, we need to work in a host copy
+          // and copy back to Q_top.  Q_top is an output argument
+          // here, so we can just fill Q_top_copy with zeros.
+          matrix_type Q_top_copy (Q_top.extent (0), Q_top.extent (1),
+                                  Scalar {});
+          try {
+            distTsqr_->factorExplicit (R_view, Q_top_copy.view (),
+                                       forceNonnegativeDiagonal);
+          }
+          catch (std::exception& e) {
+            TEUCHOS_TEST_FOR_EXCEPTION
+              (true, std::runtime_error, prefix << "distTsqr_->"
+               "factorExplicit (wants_device_memory()=true case) "
+               "threw: " << e.what ());
+          }
+          try {
+            nodeTsqr_->copy_from_host (Q_top, Q_top_copy.view ());
+          }
+          catch (std::exception& e) {
+            TEUCHOS_TEST_FOR_EXCEPTION
+              (true, std::runtime_error, prefix << "nodeTsqr_->"
+               "copy_from_host threw: " << e.what ());
+          }
+        }
+        else {
+          try {
+            distTsqr_->factorExplicit (R_view, Q_top,
+                                       forceNonnegativeDiagonal);
+          }
+          catch (std::exception& e) {
+            TEUCHOS_TEST_FOR_EXCEPTION
+              (true, std::runtime_error, prefix << "distTsqr_->"
+               "factorExplicit (wants_device_memory()=false case) "
+               "threw: " << e.what ());
+          }
+        }
       }
       // Apply the local part of the Q factor to the result of the
       // distributed-memory QR factorization, to get the explicit Q
       // factor.
-      nodeTsqr_->apply (ApplyType::NoTranspose,
-                        numRows, numCols, A, LDA,
-                        nodeResults, numCols, Q, LDQ,
-                        contiguousCacheBlocks);
+      try {
+        nodeTsqr_->apply (ApplyType::NoTranspose,
+                          numRows, numCols, A, LDA,
+                          *nodeResults, numCols, Q, LDQ,
+                          contiguousCacheBlocks);
+      }
+      catch (std::exception& e) {
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (true, std::runtime_error, prefix << "nodeTsqr_->"
+           "apply threw: " << e.what ());
+      }
 
       // If necessary, and if the user asked, force the R factor to
       // have a nonnegative diagonal.
       if (forceNonnegativeDiagonal &&
           ! QR_produces_R_factor_with_nonnegative_diagonal ()) {
-        details::NonnegDiagForcer<LocalOrdinal, Scalar, STS::isComplex> forcer;
-        mat_view_type Q_mine (numRows, numCols, Q, LDQ);
-        mat_view_type R_mine (numCols, numCols, R, LDR);
-        forcer.force (Q_mine, R_mine);
+        // We ignore contiguousCacheBlocks here, since we're only
+        // looking at the top block of Q.
+        try {
+          nodeTsqr_->force_nonnegative_diagonal (numRows, numCols,
+                                                 Q, LDQ, R, LDR);
+        }
+        catch (std::exception& e) {
+          TEUCHOS_TEST_FOR_EXCEPTION
+            (true, std::runtime_error, prefix << "nodeTsqr_->"
+             "force_nonnegative_diagonal threw: " << e.what ());
+        }
       }
     }
 
@@ -451,12 +452,12 @@ namespace TSQR {
     {
       mat_view_type R_view (ncols, ncols, R, ldr);
       deep_copy (R_view, Scalar {});
-      NodeOutput nodeResults =
+      auto nodeResults =
         nodeTsqr_->factor (nrows_local, ncols, A_local, lda_local,
-                          R_view.data(), R_view.stride(1),
-                          contiguousCacheBlocks);
+                           R_view.data (), R_view.stride (1),
+                           contiguousCacheBlocks);
       DistOutput distResults = distTsqr_->factor (R_view);
-      return std::make_pair (nodeResults, distResults);
+      return {nodeResults, distResults};
     }
 
     /// \brief Apply Q factor to the global dense matrix C
@@ -496,7 +497,6 @@ namespace TSQR {
     ///
     /// \param contiguousCacheBlocks [in] Whether or not the cache
     ///   blocks of Q and C are stored contiguously.
-    ///
     void
     apply (const std::string& op,
            const LocalOrdinal nrows_local,
@@ -533,49 +533,50 @@ namespace TSQR {
       mat_view_type C_top_view (ncols_C, ncols_C, C_view_top_block.data(),
                                 C_view_top_block.stride(1));
 
+      // DistTsqr doesn't know what to do with device memory, so we
+      // need to copy the top block of C if applicable.  The NodeTsqr
+      // implementation can decide if that's necessary.
+      //
+      // The "matrix_type C_top" below is the temporary host copy of
+      // C_top_view, which is the "top block of C" that might live in
+      // device memory.
+
       if (! transposed) {
         // C_top (small compact storage) gets a deep copy of the top
         // ncols_C by ncols_C block of C_local.
-        matrix_type C_top (C_top_view);
-
-        // Compute in place on all processors' C_top blocks.
-        distTsqr_->apply (applyType, C_top.extent(1), ncols_Q, C_top.data(),
-                          C_top.stride(1), factor_output.second);
-
-        // Copy the result from C_top back into the top ncols_C by
-        // ncols_C block of C_local.
-        deep_copy (C_top_view, C_top);
-
-        // Apply the local Q factor (in Q_local and
-        // factor_output.first) to C_local.
+        matrix_type C_top = nodeTsqr_->copy_to_host (C_top_view);
+        // Compute in place on all processes' C_top blocks.
+        distTsqr_->apply (applyType, C_top.extent (1), ncols_Q,
+                          C_top.data (), C_top.stride (1),
+                          factor_output.second);
+        // Copy result back to the top block of C_local.
+        nodeTsqr_->copy_from_host (C_top_view, C_top.view ());
+        // Apply the local Q factor to C_local.
         nodeTsqr_->apply (applyType, nrows_local, ncols_Q,
-                          Q_local, ldq_local, factor_output.first,
+                          Q_local, ldq_local, *(factor_output.first),
                           ncols_C, C_local, ldc_local,
                           contiguousCacheBlocks);
       }
       else {
-        // Apply the (transpose of the) local Q factor (in Q_local
-        // and factor_output.first) to C_local.
+        // Apply the (transpose of the) local Q factor to C_local.
         nodeTsqr_->apply (applyType, nrows_local, ncols_Q,
-                          Q_local, ldq_local, factor_output.first,
+                          Q_local, ldq_local, *(factor_output.first),
                           ncols_C, C_local, ldc_local,
                           contiguousCacheBlocks);
 
         // C_top (small compact storage) gets a deep copy of the top
         // ncols_C by ncols_C block of C_local.
-        matrix_type C_top (C_top_view);
+        matrix_type C_top = nodeTsqr_->copy_to_host (C_top_view);
 
         // Compute in place on all processors' C_top blocks.
         distTsqr_->apply (applyType, ncols_C, ncols_Q, C_top.data(),
                           C_top.stride(1), factor_output.second);
-
-        // Copy the result from C_top back into the top ncols_C by
-        // ncols_C block of C_local.
-        deep_copy (C_top_view, C_top);
+        // Copy result back to the top block of C_local.
+        nodeTsqr_->copy_from_host (C_top_view, C_top.view ());
       }
     }
 
-    /// \brief Compute the explicit Q factor from factor()
+    /// \brief Compute the explicit Q factor from result of factor().
     ///
     /// Compute the explicit version of the Q factor computed by
     /// factor() and represented implicitly (via Q_local_in and
@@ -633,11 +634,11 @@ namespace TSQR {
         mat_view_type Q_out_top =
           nodeTsqr_->top_block (Q_out_view, contiguousCacheBlocks);
 
-        // Fill (topmost cache block of) Q_out with the first
-        // ncols_Q_out columns of the identity matrix.
-        for (ordinal_type j = 0; j < ncols_Q_out; ++j) {
-          Q_out_top(j, j) = Scalar (1);
-        }
+        // Q_out_top may be device memory, so we shouldn't write
+        // directly to it.  Instead, let NodeTsqr fill it with the first
+        // ncols_Q_out columns of the identity matrix.  Note that
+        // we've already filled Q_out with zeros above.
+        nodeTsqr_->set_diagonal_entries_to_one (Q_out_top);
       }
       apply ("N", nrows_local,
              ncols_Q_in, Q_local_in, ldq_local_in, factorOutput,
@@ -754,23 +755,21 @@ namespace TSQR {
       if (ncols == 0) {
         return 0;
       }
-      //
       // FIXME (mfh 16 Jul 2010) We _should_ compute the SVD of R (as
       // the copy B) on Proc 0 only.  This would ensure that all
       // processors get the same SVD and rank (esp. in a heterogeneous
       // computing environment).  For now, we just do this computation
       // redundantly, and hope that all the returned rank values are
       // the same.
-      //
-      matrix_type U (ncols, ncols, STS::zero());
+      matrix_type U (ncols, ncols, Scalar {});
       const ordinal_type rank =
-        reveal_R_rank (ncols, R, ldr, U.data(), U.stride(1), tol);
+        reveal_R_rank (ncols, R, ldr, U.data (), U.stride (1), tol);
       if (rank < ncols) {
         // If R is not full rank: reveal_R_rank() already computed
         // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and
         // overwrote R with \f$\Sigma V^*\f$.  Now, we compute \f$Q
         // := Q \cdot U\f$, respecting cache blocks of Q.
-        Q_times_B (nrows, ncols, Q, ldq, U.data(), U.stride(1),
+        Q_times_B (nrows, ncols, Q, ldq, U.data (), U.stride (1),
                    contiguousCacheBlocks);
       }
       return rank;
@@ -815,4 +814,4 @@ namespace TSQR {
 
 } // namespace TSQR
 
-#endif // __TSQR_Tsqr_hpp
+#endif // TSQR_TSQR_HPP
diff --git a/packages/tpetra/tsqr/src/TsqrAdaptor.hpp b/packages/tpetra/tsqr/src/TsqrAdaptor.hpp
index 89236be2068c..b7cf98c735e4 100644
--- a/packages/tpetra/tsqr/src/TsqrAdaptor.hpp
+++ b/packages/tpetra/tsqr/src/TsqrAdaptor.hpp
@@ -74,10 +74,7 @@ namespace TSQR {
     ///
     /// TsqrAdaptor uses the appropriate specialization of
     /// TsqrTypeAdaptor to figure out which variant of TSQR to use on
-    /// the given multivector type.  For example, with
-    /// Tpetra::MultiVector<S, LO, GO, NodeType>, if NodeType is
-    /// KokkosClassic::DoNotUse::TBBNode, the TBB-parallel intranode
-    /// variant of TSQR will be used.  The caller is responsible for
+    /// the given multivector type.  The caller is responsible for
     /// constructing the intranode and internode TSQR objects.
     ///
     /// \tparam S Scalar type
diff --git a/packages/tpetra/tsqr/src/TsqrFactory.hpp b/packages/tpetra/tsqr/src/TsqrFactory.hpp
index ad4be2e7f831..7841207a06b9 100644
--- a/packages/tpetra/tsqr/src/TsqrFactory.hpp
+++ b/packages/tpetra/tsqr/src/TsqrFactory.hpp
@@ -45,10 +45,10 @@
 ///
 /// \warning TSQR users should _not_ include this file directly.
 
-#include "Tsqr_NodeTsqrFactory.hpp"
-#include "Teuchos_Comm.hpp"
-#include "Tsqr_MessengerBase.hpp"
 #include "Tsqr.hpp"
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_ParameterList.hpp"
+#include "Teuchos_RCP.hpp"
 
 namespace TSQR {
   namespace Trilinos {
@@ -63,8 +63,8 @@ namespace TSQR {
     /// \tparam LO The (local) ordinal type used by TSQR.
     /// \tparam S The Scalar type used by TSQR; the type of the
     ///   entries of the matrices to factor.
-    /// \tparam NodeTsqrType The type of the intranode part of TSQR.
-    /// \tparam DistTsqrType The type of the internode part of TSQR.
+    /// \tparam NodeTsqrType The type of the intraprocess part of TSQR.
+    /// \tparam DistTsqrType The type of the interprocess part of TSQR.
     ///
     /// \note Unless you need to change the interface between Trilinos
     ///   and TSQR, you don't need to do anything with TsqrFactory or
@@ -72,19 +72,19 @@ namespace TSQR {
     ///   \c TsqrAdaptor.  TsqrFactory and its subclasses don't have
     ///   anything to do with any of the Trilinos multivector classes.
     ///
-    /// \note If you have implemented a new intranode TSQR
+    /// \note If you have implemented a new intraprocess TSQR
     ///   factorization type (NodeTsqrType), you <i>may</i> need to
     ///   create a subclass (not specialization) of TsqrFactory that
-    ///   knows how to instantiate that intranode TSQR class.
+    ///   knows how to instantiate that intraprocess TSQR class.
     ///   Alternately, you could write NodeTsqrType so that the
-    ///   provided default implementation of \c makeNodeTsqr() works.
+    ///   provided default implementation of makeNodeTsqr works.
     ///
-    /// \note If you have implemented a new internode TSQR
+    /// \note If you have implemented a new interprocess TSQR
     ///   factorization type (DistTsqrType), you <i>may</i> need to
     ///   create a subclass (not specialization) of TsqrFactory that
-    ///   knows how to instantiate that internode TSQR class.
+    ///   knows how to instantiate that interprocess TSQR class.
     ///   Alternately, you could write DistTsqrType so that the
-    ///   provided default implementation of \c makeDistTsqr() works.
+    ///   provided default implementation of makeDistTsqr works.
     ///
     /// \note If you want to change which TSQR implementation is
     ///   invoked for a particular multivector (MV) class, you don't
@@ -102,22 +102,22 @@ namespace TSQR {
       typedef DistTsqrType dist_tsqr_type;
 
       typedef MessengerBase<S> scalar_messenger_type;
-      typedef Tsqr<LO, S, node_tsqr_type> tsqr_type;
+      typedef Tsqr<LO, S> tsqr_type;
 
       /// \brief Instantiate and return the TSQR implementation.
       ///
       /// \param plist [in/out] Parameter list (keys depend on the
-      ///   subclass; keys are accessed in the subclass'
-      ///   makeNodeTsqr() method).  On output: On output: Missing
-      ///   parameters are filled in with default values.
+      ///   subclass; keys are accessed in the subclass' makeNodeTsqr
+      ///   method).  On output: Missing parameters are filled in
+      ///   with default values.
       ///
       /// \param nodeTsqr [out] On output, points to the
-      ///   node_tsqr_type object that TSQR will use for the intranode
-      ///   part of its computations.
+      ///   node_tsqr_type object that TSQR will use for the
+      ///   intraprocess part of its computations.
       ///
       /// \param distTsqr [out] On output, points to the
-      ///   dist_tsqr_type object that TSQR will use for the internode
-      ///   part of its computations.
+      ///   dist_tsqr_type object that TSQR will use for the
+      ///   interprocess part of its computations.
       ///
       /// \return The node_tsqr_type instance that implements TSQR.
       Teuchos::RCP<tsqr_type>
@@ -133,62 +133,57 @@ namespace TSQR {
         return rcp (new tsqr_type (nodeTsqr, distTsqr));
       }
 
-      void
-      prepareTsqr
-
-                const Teuchos::RCP<scalar_messenger_type>& messenger,
-
       //! Virtual destructor for memory safety of derived classes.
-      virtual ~TsqrFactory () {};
+      virtual ~TsqrFactory () = default;
 
     private:
-      /// \brief Instantiate and return the TSQR's intranode object.
+      /// \brief Instantiate and return TSQR's intraprocess object.
       ///
       /// \param plist [in/out] Same as the epinonymous input of
-      ///   \c makeTsqr().
+      ///   makeTsqr.
       ///
       /// \return The node_tsqr_type object that TSQR will use for the
-      ///   intranode part of its computations.
+      ///   intraprocess part of its computations.
       ///
-      /// \note For implementers: this and \c makeDistTsqr() are the
-      ///   two methods to implement.  makeTsqr()'s implementation is
+      /// \note For implementers: this and makeDistTsqr are the two
+      ///   methods to implement.  makeTsqr's implementation is
       ///   "generic"; it does not depend on node_tsqr_type or
-      ///   dist_tsqr_type.  The implementation of makeNodeTsqr()
-      ///   varies for different node_tsqr_type types.  This pattern
-      ///   is the compile-time polymorphism equivalent of the
-      ///   "Non-Virtual Interface" (NVI) idiom, where the "virtual"
-      ///   methods (here, the methods that vary for different
-      ///   template parameters) are private, and the "nonvirtual"
-      ///   methods (here, the methods that are the same for different
-      ///   template parameters) are part of the public interface.
+      ///   dist_tsqr_type.  The implementation of makeNodeTsqr varies
+      ///   for different node_tsqr_type types.  This pattern is the
+      ///   compile-time polymorphism equivalent of the "Non-Virtual
+      ///   Interface" (NVI) idiom, where the "virtual" methods (here,
+      ///   the methods that vary for different template parameters)
+      ///   are private, and the "nonvirtual" methods (here, the
+      ///   methods that are the same for different template
+      ///   parameters) are part of the public interface.
       virtual Teuchos::RCP<node_tsqr_type>
       makeNodeTsqr (const Teuchos::RCP<Teuchos::ParameterList>& plist) const
       {
         return Teuchos::rcp (new node_tsqr_type (plist));
       }
 
-      /// \brief Instantiate and return TSQR's internode object.
+      /// \brief Instantiate and return TSQR's interprocess object.
       ///
       /// \param messenger [in] Object used by TSQR for communicating
       ///   between MPI processes.
       ///
       /// \param plist [in/out] Same as the epinonymous input of
-      ///   \c makeTsqr().
+      ///   makeTsqr.
       ///
       /// \return The dist_tsqr_type object that TSQR will use for the
-      ///   internode part of its computations.
+      ///   interprocess part of its computations.
       ///
-      /// \note For implementers: this and \c makeNodeTsqr() are the
-      ///   two interesting methods.  makeTsqr()'s implementation is
+      /// \note For implementers: this and makeNodeTsqr are the two
+      ///   interesting methods.  makeTsqr's implementation is
       ///   "generic"; it does not depend on node_tsqr_type or
-      ///   dist_tsqr_type.  The implementation of makeDistTsqr()
+      ///   dist_tsqr_type.  The implementation of makeDistTsqr
       ///   varies for different dist_tsqr_type types.
       virtual Teuchos::RCP<dist_tsqr_type>
       makeDistTsqr (const Teuchos::RCP<scalar_messenger_type>& messenger,
                     const Teuchos::RCP<Teuchos::ParameterList>& plist) const
       {
-        (void) plist;
-        return Teuchos::rcp (new dist_tsqr_type (messenger));
+        auto ret = Teuchos::rcp (new dist_tsqr_type (messenger));
+        ret->setParameterList (plist); return ret; // must return: flowing off the end is UB
       }
     };
   } // namespace Trilinos
diff --git a/packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp b/packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp
index 5e6dccdbb87a..5de0142c768d 100644
--- a/packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp
+++ b/packages/tpetra/tsqr/src/TsqrTypeAdaptor.hpp
@@ -114,8 +114,8 @@ namespace TSQR {
       /// \brief Type representing the whole TSQR method.
       ///
       /// Depends on \c node_tsqr_type and \c dist_tsqr_type.
-      typedef TSQR::Tsqr<LO, S, node_tsqr_type, dist_tsqr_type> tsqr_type;
-      typedef Teuchos::RCP<tsqr_type >                          tsqr_ptr;
+      using tsqr_type = TSQR::Tsqr<LO, S>;
+      typedef Teuchos::RCP<tsqr_type> tsqr_ptr;
 
       /// \typedef factory_type
       ///
diff --git a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp
index 56c9ef51f076..b650fbf37050 100644
--- a/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_CacheBlocker.hpp
@@ -158,14 +158,14 @@ namespace TSQR {
     ///   matrix with which this CacheBlocker was initialized.
     template< class MatrixViewType >
     MatrixViewType
-    split_top_block (MatrixViewType& A, const bool contiguous_cache_blocks) const
+    split_top_block (MatrixViewType& A,
+                     const bool contiguous_cache_blocks) const
     {
       typedef typename MatrixViewType::ordinal_type ordinal_type;
       const ordinal_type nrows_top =
         strategy_.top_block_split_nrows (A.extent(0), extent(1),
                                          nrows_cache_block());
-      // split_top() sets A to A_rest, and returns A_top.
-      return A.split_top (nrows_top, contiguous_cache_blocks);
+      return split_top (A, nrows_top, contiguous_cache_blocks);
     }
 
     /// \brief View of the topmost cache block of A.
@@ -188,7 +188,7 @@ namespace TSQR {
         strategy_.top_block_split_nrows (A.extent(0), extent(1),
                                          nrows_cache_block());
       MatrixViewType A_copy (A);
-      return A_copy.split_top (nrows_top, contiguous_cache_blocks);
+      return split_top (A_copy, nrows_top, contiguous_cache_blocks);
     }
 
     /// \brief Split A in place into [A_rest; A_bot].
@@ -207,7 +207,8 @@ namespace TSQR {
     ///
     template< class MatrixViewType >
     MatrixViewType
-    split_bottom_block (MatrixViewType& A, const bool contiguous_cache_blocks) const
+    split_bottom_block (MatrixViewType& A,
+                        const bool contiguous_cache_blocks) const
     {
       typedef typename MatrixViewType::ordinal_type ordinal_type;
       // Ignore the number of columns in A, since we want to block all
@@ -216,7 +217,7 @@ namespace TSQR {
         strategy_.bottom_block_split_nrows (A.extent(0), extent(1),
                                             nrows_cache_block());
       // split_bottom() sets A to A_rest, and returns A_bot.
-      return A.split_bottom (nrows_bottom, contiguous_cache_blocks);
+      return split_bottom (A, nrows_bottom, contiguous_cache_blocks);
     }
 
     /// \brief Fill the matrix A with zeros, respecting cache blocks.
@@ -241,7 +242,7 @@ namespace TSQR {
       // won't be the correct leading dimension of A, but it won't
       // matter: we only ever operate on A_cur here, and A_cur's
       // leading dimension is set correctly by split_top_block().
-      while (! A.empty()) {
+      while (! empty (A)) {
         // This call modifies the matrix view A, but that's OK since
         // we passed the input view by copy, not by reference.
         MatrixViewType A_cur = split_top_block (A, contiguous_cache_blocks);
@@ -280,10 +281,10 @@ namespace TSQR {
       // Note: if the cache blocks are stored contiguously, lda won't
       // be the correct leading dimension of A, but it won't matter:
       // we only ever operate on A_cur here, and A_cur's leading
-      // dimension is set correctly by A_rest.split_top().
+      // dimension is set correctly by split_top_block.
       mat_view_type A_rest (num_rows, num_cols, A, lda);
 
-      while (! A_rest.empty()) {
+      while (! empty (A_rest)) {
         // This call modifies A_rest.
         mat_view_type A_cur = split_top_block (A_rest, contiguous_cache_blocks);
         deep_copy (A_cur, Scalar {});
@@ -322,8 +323,8 @@ namespace TSQR {
       // Leading dimension doesn't matter since A_out will be cache blocked.
       mat_view_type A_out_rest (num_rows, num_cols, A_out, lda_in);
 
-      while (! A_in_rest.empty()) {
-        if (A_out_rest.empty()) {
+      while (! empty (A_in_rest)) {
+        if (empty (A_out_rest)) {
           throw std::logic_error("A_out_rest is empty, but A_in_rest is not");
         }
         // This call modifies A_in_rest.
@@ -351,8 +352,8 @@ namespace TSQR {
       const_mat_view_type A_in_rest (num_rows, num_cols, A_in, lda_out);
       mat_view_type A_out_rest (num_rows, num_cols, A_out, lda_out);
 
-      while (! A_in_rest.empty()) {
-        if (A_out_rest.empty()) {
+      while (! empty (A_in_rest)) {
+        if (empty (A_out_rest)) {
           throw std::logic_error("A_out_rest is empty, but A_in_rest is not");
         }
         // This call modifies A_in_rest.
@@ -389,9 +390,9 @@ namespace TSQR {
       const ordinal_type num_cache_blocks =
         strategy_.num_cache_blocks (A.extent(0), A.extent(1), nrows_cache_block());
 
-      if (cache_block_index >= num_cache_blocks)
-        return MatrixViewType (0, 0, NULL, 0); // empty
-
+      if (cache_block_index >= num_cache_blocks) {
+        return MatrixViewType {}; // empty
+      }
       // result[0] = starting row index of the cache block
       // result[1] = number of rows in the cache block
       // result[2] = pointer offset (A.data() + result[2])
@@ -402,8 +403,7 @@ namespace TSQR {
                                        nrows_cache_block(),
                                        contiguous_cache_blocks);
       if (result[1] == 0) {
-        // For some reason, the cache block is empty.
-        return MatrixViewType (0, 0, nullptr, 0);
+        return MatrixViewType {};
       }
 
       // We expect that ordinal_type is signed, so adding signed
@@ -414,19 +414,6 @@ namespace TSQR {
                              result[3]);
     }
 
-    /// \brief Equality operator.
-    ///
-    /// Two cache blockers are "equal" if they correspond to matrices
-    /// with the same dimensions (number of rows and number of
-    /// columns), and if their cache blocking strategies are equal.
-    bool
-    operator== (const CacheBlockingStrategy<Ordinal, Scalar>& rhs) const
-    {
-      return extent(0) == rhs.extent(0) &&
-        extent(1) == rhs.extent(1) &&
-        strategy_ == rhs.strategy_;
-    }
-
   private:
     //! Number of rows in the matrix to block.
     Ordinal nrows_ = 0;
diff --git a/packages/tpetra/tsqr/src/Tsqr_CacheBlockingStrategy.hpp b/packages/tpetra/tsqr/src/Tsqr_CacheBlockingStrategy.hpp
index aa70035044ac..716c55467991 100644
--- a/packages/tpetra/tsqr/src/Tsqr_CacheBlockingStrategy.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_CacheBlockingStrategy.hpp
@@ -156,18 +156,6 @@ namespace TSQR {
     /// most cases, however.
     size_t size_of_scalar () const { return size_of_scalar_; }
 
-    //! True if and only if the two strategies are the same.
-    bool operator== (const CacheBlockingStrategy& rhs) const {
-      return cache_size_hint() == rhs.cache_size_hint() &&
-        size_of_scalar() == rhs.size_of_scalar();
-    }
-
-    //! True if and only if the two strategies are not the same.
-    bool operator!= (const CacheBlockingStrategy& rhs) const {
-      return cache_size_hint() != rhs.cache_size_hint() ||
-        size_of_scalar() != rhs.size_of_scalar();
-    }
-
     /// \brief Pointer offset for the cache block with the given index.
     ///
     /// The pointer offset depends on whether cache blocks are stored
@@ -221,14 +209,14 @@ namespace TSQR {
                         const LocalOrdinal nrows_cache_block,
                         const bool contiguous_cache_blocks) const
     {
-      if (contiguous_cache_blocks)
-        {
-          std::pair<LocalOrdinal, LocalOrdinal> result =
-            cache_block (index, nrows, ncols, nrows_cache_block);
-          return result.second; // Number of rows in the cache block
-        }
-      else
+      if (contiguous_cache_blocks) {
+        std::pair<LocalOrdinal, LocalOrdinal> result =
+          cache_block (index, nrows, ncols, nrows_cache_block);
+        return result.second; // Number of rows in the cache block
+      }
+      else {
         return lda;
+      }
     }
 
     /// \brief Start and size of cache block number \c index.
@@ -257,39 +245,46 @@ namespace TSQR {
       LocalOrdinal my_row_start, my_nrows;
 
       my_row_start = index * nrows_cache_block;
-      if (quotient == 0)
-        { // There is only one cache block.
-          if (index == 0)
-            my_nrows = remainder;
-          else
-            my_nrows = 0; // Out-of-range block, therefore empty
+      if (quotient == 0) { // There is only one cache block.
+        if (index == 0) {
+          my_nrows = remainder;
         }
-      else if (remainder < ncols)
-        { // There are quotient cache blocks.
-          if (index < 0)
-            my_nrows = 0; // Out-of-range block, therefore empty
-          else if (index < quotient - 1)
-            my_nrows = nrows_cache_block;
-          else if (index == quotient - 1)
-            // The last cache block gets the leftover rows, so that no
-            // cache block has fewer than ncols rows.
-            my_nrows = nrows_cache_block + remainder;
-          else
-            my_nrows = 0; // Out-of-range block, therefore empty
+        else {
+          my_nrows = 0; // Out-of-range block, therefore empty
         }
-      else
-        { // There are quotient+1 cache blocks.
-          if (index < 0)
-            my_nrows = 0; // Out-of-range block, therefore empty
-          else if (index < quotient)
-            my_nrows = nrows_cache_block;
-          else if (index == quotient)
-            // The last cache block has the leftover rows, which are
-            // >= ncols and < nrows_cache_block.
-            my_nrows = remainder;
-          else
-            my_nrows = 0; // Out-of-range block, therefore empty
+      }
+      else if (remainder < ncols) { // There are quotient cache blocks.
+        if (index < 0) {
+          my_nrows = 0; // Out-of-range block, therefore empty
+        }
+        else if (index < quotient - 1) {
+          my_nrows = nrows_cache_block;
+        }
+        else if (index == quotient - 1) {
+          // The last cache block gets the leftover rows, so that no
+          // cache block has fewer than ncols rows.
+          my_nrows = nrows_cache_block + remainder;
+        }
+        else {
+          my_nrows = 0; // Out-of-range block, therefore empty
         }
+      }
+      else { // There are quotient+1 cache blocks.
+        if (index < 0) {
+          my_nrows = 0; // Out-of-range block, therefore empty
+        }
+        else if (index < quotient) {
+          my_nrows = nrows_cache_block;
+        }
+        else if (index == quotient) {
+          // The last cache block has the leftover rows, which are
+          // >= ncols and < nrows_cache_block.
+          my_nrows = remainder;
+        }
+        else {
+          my_nrows = 0; // Out-of-range block, therefore empty
+        }
+      }
       return std::make_pair (my_row_start, my_nrows);
     }
 
@@ -316,7 +311,6 @@ namespace TSQR {
     /// \note This method has an \f$O(1)\f$ cost, so that
     ///   parallelization by calling this method repeatedly for a
     ///   sequence of cache block indices is not expensive.
-    ///
     std::vector<LocalOrdinal>
     cache_block_details (const LocalOrdinal index,
                          const LocalOrdinal nrows,
diff --git a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp
index 7b1f15f0f8ae..5bdd5608ba22 100644
--- a/packages/tpetra/tsqr/src/Tsqr_Combine.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_Combine.hpp
@@ -38,18 +38,17 @@
 //@HEADER
 
 /// \file Tsqr_Combine.hpp
-/// \brief TSQR's six computational kernels.
+/// \brief Interface to TSQR's six computational kernels.
 
-#ifndef __TSQR_Combine_hpp
-#define __TSQR_Combine_hpp
+#ifndef TSQR_COMBINE_HPP
+#define TSQR_COMBINE_HPP
 
-#include "Teuchos_ScalarTraits.hpp"
 #include "Tsqr_ApplyType.hpp"
-#include "Tsqr_CombineNative.hpp"
+#include "Tsqr_MatView.hpp"
 
 namespace TSQR {
   /// \class Combine
-  /// \brief TSQR's six computational kernels
+  /// \brief Interface to TSQR's six computational kernels
   /// \author Mark Hoemmen
   ///
   /// This class provides the six computational primitives required by
@@ -69,46 +68,52 @@ namespace TSQR {
   ///
   /// \tparam Ordinal Type of indices into matrices.
   /// \tparam Scalar Type of entries of matrices.
-  /// \tparam CombineImpl Type of a particular implementation of
-  ///   Combine.  Its public interface must contain this class'
-  ///   interface.
   ///
-  /// All Combine methods are implemented using CombineImpl methods
-  /// with the same name.  TSQR includes three implementations of the
-  /// CombineImpl interface:
+  /// TSQR includes two implementations of the Combine interface:
   ///
   /// <ul>
   /// <li> CombineDefault, which uses LAPACK and copies in and out of
-  ///   scratch space that it owns, </li>
+  ///   scratch space that it owns, and </li>
   /// <li> CombineNative, a C++ in-place (no scratch space) generic
-  ///   implementation), and </li>
-  /// <li> CombineFortran, a Fortran 9x in-place implementation for
-  ///   LAPACK's four data types (S, D, C, and Z). </li>
+  ///   implementation </li>
   /// </ul>
-  template< class Ordinal,
-            class Scalar,
-            class CombineImpl = CombineNative<Ordinal, Scalar, Teuchos::ScalarTraits<Scalar >::isComplex> >
+  ///
+  /// There used to be a third implementation, CombineFortran, but it
+  /// relied on a Fortran 9x compiler and was thus not often tested,
+  /// so we removed it.
+  template<class Ordinal, class Scalar>
   class Combine {
   public:
-    /// \typedef scalar_type
-    /// \brief Type of matrix entries.
-    typedef Scalar scalar_type;
-    /// \typedef ordinal_type
-    /// \brief Type of (intranode) matrix indices.
-    typedef Ordinal ordinal_type;
-    /// \typedef combine_impl_type
-    /// \brief Type of the implementation of Combine.
-    typedef CombineImpl combine_impl_type;
+    //! Type of matrix entries.
+    using scalar_type = Scalar;
+    //! Type of (intraprocess) matrix indices.
+    using ordinal_type = Ordinal;
+
+    virtual ~Combine () = default;
 
-    //! Constructor.
-    Combine () = default;
+    /// \brief Whether or not the QR factorizations computed by
+    ///   methods of this class produce an R factor with all
+    ///   nonnegative diagonal entries.
+    virtual bool
+    QR_produces_R_factor_with_nonnegative_diagonal () const = 0;
 
-    /// Whether or not the QR factorizations computed by methods of
-    /// this class produce an R factor with all nonnegative diagonal
-    /// entries.
-    static bool QR_produces_R_factor_with_nonnegative_diagonal() {
-      return combine_impl_type::QR_produces_R_factor_with_nonnegative_diagonal();
-    }
+    /// \brief Best work array size.
+    ///
+    /// \param num_rows_Q [in] Number of rows in each block of the
+    ///   matrix to factor.  ("Block" means the part of the matrix
+    ///   passed directly to factor_first or factor_inner.)
+    ///
+    /// \param num_cols_Q [in] Number of columns of the matrix to
+    ///   factor (the input/output matrix of factor_first or
+    ///   factor_inner).
+    ///
+    /// \param num_cols_C [in] Number of columns of the matrix output
+    ///   of apply_first, apply_inner, or apply_pair (use the max of
+    ///   all three).
+    virtual ordinal_type
+    work_size (const ordinal_type num_rows_Q,
+               const ordinal_type num_cols_Q,
+               const ordinal_type num_cols_C) const = 0;
 
     /// \brief Factor the first cache block.
     ///
@@ -118,84 +123,30 @@ namespace TSQR {
     /// (along with the length ncols tau array) with the implicitly
     /// stored Q factor.
     ///
-    /// \param nrows [in] Number of rows in A
-    /// \param ncols [in] Number of columns in A
     /// \param A [in/out] On input: the nrows by ncols matrix (in
     ///   column-major order, with leading dimension lda) to factor.
     ///   On output: upper triangle contains the R factor, and lower
     ///   part contains the implicitly stored Q factor.
-    /// \param lda [in] Leading dimension of A
     /// \param tau [out] Array of length ncols; on output, the
     ///   scaling factors for the Householder reflectors
     /// \param work [out] Workspace array of length ncols
-    void
-    factor_first (const MatView<Ordinal, Scalar>& A,
+    virtual void
+    factor_first (const MatView<ordinal_type, Scalar>& A,
                   Scalar tau[],
-                  Scalar work[]) const
-    {
-      return impl_.factor_first (A, tau, work);
-    }
+                  Scalar work[],
+                  const ordinal_type lwork) = 0;
 
-    /// \brief Apply the result of \c factor_first().
+    /// \brief Apply the result of factor_first() to C.
     ///
     /// Apply the Q factor, as computed by factor_first() and stored
     /// implicitly in A and tau, to the matrix C.
-    void
+    virtual void
     apply_first (const ApplyType& applyType,
-                 const MatView<Ordinal, const Scalar>& A,
+                 const MatView<ordinal_type, const Scalar>& A,
                  const Scalar tau[],
-                 const MatView<Ordinal, Scalar>& C,
-                 Scalar work[])
-    {
-      return impl_.apply_first (applyType, A, tau, C, work);
-    }
-
-    /// Apply the result of \c factor_inner().
-    ///
-    /// Apply the Q factor stored in [R; A] to [C_top; C_bot].  The C
-    /// blocks are allowed, but not required, to have different leading
-    /// dimensions (ldc_top resp. ldc_bottom).  R is upper triangular, so
-    /// we do not need it; the Householder reflectors representing the Q
-    /// factor are stored compactly in A (specifically, in all of A, not
-    /// just the lower triangle).
-    ///
-    /// In the "sequential under parallel" version of TSQR, this function
-    /// belongs to the sequential part (i.e., operating on cache blocks on
-    /// a single processor).
-    ///
-    /// \param apply_type [in] NoTranspose means apply Q, Transpose
-    ///   means apply Q^T, and ConjugateTranspose means apply Q^H.
-    /// \param m [in]         number of rows of A
-    /// \param ncols_C [in]   number of columns of [C_top; C_bot]
-    /// \param ncols_Q [in]   number of columns of [R; A]
-    /// \param A [in] m by ncols_Q matrix, in which the Householder
-    ///   reflectors representing the Q factor are stored
-    /// \param lda [in]       leading dimension of A
-    /// \param tau [in] array of length ncols_Q, storing the scaling
-    ///   factors for the Householder reflectors representing Q
-    /// \param C_top [inout]  ncols_Q by ncols_C matrix
-    /// \param ldc_top [in]   leading dimension of C_top
-    /// \param C_bot [inout]  m by ncols_C matrix
-    /// \param ldc_bot [in]   leading dimension of C_bot
-    /// \param work [out]     workspace array of length ncols_C
-    void
-    apply_inner (const ApplyType& apply_type,
-                 const Ordinal m,
-                 const Ordinal ncols_C,
-                 const Ordinal ncols_Q,
-                 const Scalar A[],
-                 const Ordinal lda,
-                 const Scalar tau[],
-                 Scalar C_top[],
-                 const Ordinal ldc_top,
-                 Scalar C_bot[],
-                 const Ordinal ldc_bot,
-                 Scalar work[]) const
-    {
-      impl_.apply_inner (apply_type, m, ncols_C, ncols_Q,
-                         A, lda, tau,
-                         C_top, ldc_top, C_bot, ldc_bot, work);
-    }
+                 const MatView<ordinal_type, Scalar>& C,
+                 Scalar work[],
+                 const ordinal_type lwork) = 0;
 
     /// \brief Factor [R; A] for square upper triangular R and cache block A.
     ///
@@ -231,61 +182,82 @@ namespace TSQR {
     ///   Corresponds to the TAU output of LAPACK's _GEQRF.
     /// \param work [out] Workspace (length >= n; don't need lwork or
     ///   workspace query)
-    void
-    factor_inner (const MatView<Ordinal, Scalar>& R,
-                  const MatView<Ordinal, Scalar>& A,
+    virtual void
+    factor_inner (const MatView<ordinal_type, Scalar>& R,
+                  const MatView<ordinal_type, Scalar>& A,
                   Scalar tau[],
-                  Scalar work[]) const
-    {
-      impl_.factor_inner (R, A, tau, work);
-    }
+                  Scalar work[],
+                  const ordinal_type lwork) = 0;
 
-    /// \brief Factor the pair of square upper triangular matrices [R_top; R_bot].
+    /// Apply the result of factor_inner().
+    ///
+    /// Apply the Q factor stored in [R; A] to [C_top; C_bot], where
+    ///
+    /// <ul>
+    /// <li> A is     m       by ncols_Q, </li>
+    /// <li> R is     ncols_Q by ncols_Q, </li>
+    /// <li> C_top is ncols_Q by ncols_C, and </li>
+    /// <li> C_bot is m       by ncols_C. </li>
+    /// </ul>
+    ///
+    /// The C blocks are allowed, but not required, to have different
+    /// strides ("leading dimensions," in BLAS and LAPACK terms).  R
+    /// is upper triangular, so we do not need an explicit version of
+    /// R here.  The Householder reflectors representing the Q factor
+    /// are stored compactly in A (specifically, in all of A, not just
+    /// the lower triangle) and tau.
+    ///
+    /// \param apply_type [in] NoTranspose means apply Q, Transpose
+    ///   means apply Q^T, and ConjugateTranspose means apply Q^H.
+    /// \param A [in] m by ncols_Q matrix, in which the Householder
+    ///   reflectors representing the Q factor are stored
+    /// \param tau [in] array of length ncols_Q, storing the scaling
+    ///   factors for the Householder reflectors representing Q
+    /// \param C_top [inout]  ncols_Q by ncols_C matrix
+    /// \param C_bot [inout]  m by ncols_C matrix
+    /// \param work [out]     workspace array of length ncols_C
+    virtual void
+    apply_inner (const ApplyType& apply_type,
+                 const MatView<ordinal_type, const Scalar>& A,
+                 const Scalar tau[],
+                 const MatView<ordinal_type, Scalar>& C_top,
+                 const MatView<ordinal_type, Scalar>& C_bot,
+                 Scalar work[],
+                 const ordinal_type lwork) = 0;
+
+    /// \brief Factor the pair of square upper triangular matrices
+    ///   [R_top; R_bot].
     ///
     /// Store the resulting R factor in R_top, and the resulting
     /// Householder reflectors implicitly in R_bot and tau.
-    void
-    factor_pair (const MatView<Ordinal, Scalar>& R_top,
-                 const MatView<Ordinal, Scalar>& R_bot,
+    virtual void
+    factor_pair (const MatView<ordinal_type, Scalar>& R_top,
+                 const MatView<ordinal_type, Scalar>& R_bot,
                  Scalar tau[],
-                 Scalar work[]) const
-    {
-      impl_.factor_pair (R_top, R_bot, tau, work);
-    }
+                 Scalar work[],
+                 const ordinal_type lwork) = 0;
 
     /// \brief Apply the result of \c factor_pair().
     ///
     /// Apply Q factor (or Q^T or Q^H) of the 2*ncols_Q by ncols_Q
     /// matrix [R_top; R_bot] (stored in R_bot and tau) to the
     /// 2*ncols_Q by ncols_C matrix [C_top; C_bot].  The two blocks
-    /// C_top and C_bot may have different leading dimensions (ldc_top
-    /// resp. ldc_bot).
+    /// C_top and C_bot need not be stored contiguously in memory, and
+    /// they may have different strides ("leading dimensions," in BLAS
+    /// and LAPACK terms).
     ///
     /// \param apply_type [in] NoTranspose means apply Q, Transpose
     ///   means apply Q^T, and ConjugateTranspose means apply Q^H.
-    void
+    virtual void
     apply_pair (const ApplyType& apply_type,
-                const Ordinal ncols_C,
-                const Ordinal ncols_Q,
-                const Scalar R_bot[],
-                const Ordinal ldr_bot,
+                const MatView<ordinal_type, const Scalar>& R_bot,
                 const Scalar tau[],
-                Scalar C_top[],
-                const Ordinal ldc_top,
-                Scalar C_bot[],
-                const Ordinal ldc_bot,
-                Scalar work[]) const
-    {
-      impl_.apply_pair (apply_type, ncols_C, ncols_Q,
-                        R_bot, ldr_bot, tau,
-                        C_top, ldc_top, C_bot, ldc_bot, work);
-    }
-
-  private:
-    //! The implementation of Combine.
-    combine_impl_type impl_;
+                const MatView<ordinal_type, Scalar>& C_top,
+                const MatView<ordinal_type, Scalar>& C_bot,
+                Scalar work[],
+                const ordinal_type lwork) = 0;
   };
 
 } // namespace TSQR
 
-#endif // __TSQR_Combine_hpp
+#endif // TSQR_COMBINE_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp
index e77802f173ec..0e7e16d42d92 100644
--- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmark.hpp
@@ -44,9 +44,6 @@
 #include "Tsqr_CombineBenchmarker.hpp"
 #include "Tsqr_CombineDefault.hpp"
 #include "Tsqr_CombineNative.hpp"
-#ifdef HAVE_KOKKOSTSQR_FORTRAN
-#  include "Tsqr_CombineFortran.hpp"
-#endif // HAVE_KOKKOSTSQR_FORTRAN
 
 #include <algorithm>
 #include <iostream>
@@ -320,52 +317,19 @@ namespace TSQR {
                                                          params.additionalData);
         const double slowdown = nativeTimings[1] / defaultTimings[1];
         const bool tooSlow = slowdown > params.allowance;
-        // FIXME (mfh 24 May 2011) Replace std::runtime_error with a
-        // more appropriately named exception.
-        TEUCHOS_TEST_FOR_EXCEPTION(params.strictPerfTests && tooSlow,
-                           std::runtime_error,
-                           "CombineNative is too slow!  For cache block "
-                           "benchmark with numRows=" << numRows << " and numCols="
-                           << numCols << ", CombineNative time (= "
-                           << nativeTimings[1] << ") / CombineDefault time (= "
-                           << defaultTimings[1] << ") = " << slowdown
-                           << " > the allowed fraction " << params.allowance
-                           << ".");
+        // FIXME (mfh 10 Dec 2019) Return an error code / bool,
+        // instead of throwing.
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (params.strictPerfTests && tooSlow, std::runtime_error,
+           "CombineNative is too slow!  For cache block benchmark "
+           "with numRows=" << numRows << " and numCols=" << numCols
+           << ", CombineNative time (= " << nativeTimings[1] <<
+           ") / CombineDefault time (= " << defaultTimings[1] <<
+           ") = " << slowdown << " > the allowed fraction " <<
+           params.allowance << ".");
       }
-
-#ifdef HAVE_KOKKOSTSQR_FORTRAN
-      std::vector<double> fortranTimings;
-      {
-        typedef CombineFortran<Scalar> combine_type;
-        std::string combineTypeName ("Fortran");
-        fortranTimings =
-          benchmarkCombineType<combine_type, TimerType> (out, params.seed,
-                                                         dataTypeName,
-                                                         combineTypeName,
-                                                         numRows,
-                                                         numCols,
-                                                         cacheBlockNumTrials,
-                                                         pairNumTrials,
-                                                         params.averageTimings,
-                                                         params.additionalData);
-        const double slowdown = fortranTimings[1] / defaultTimings[1];
-        const bool tooSlow = slowdown > params.allowance;
-        // FIXME (mfh 24 May 2011) Replace std::runtime_error with a
-        // more appropriately named exception.
-        TEUCHOS_TEST_FOR_EXCEPTION(params.strictPerfTests && tooSlow,
-                           std::runtime_error,
-                           "CombineFortran is too slow!  For cache block "
-                           "benchmark with numRows=" << numRows << " and numCols="
-                           << numCols << ", CombineFortran time (= "
-                           << fortranTimings[1] << ") / CombineDefault time (= "
-                           << defaultTimings[1] << ") = " << slowdown
-                           << " > the allowed fraction " << params.allowance
-                           << ".");
-      }
-#endif // HAVE_KOKKOSTSQR_FORTRAN
     }
 
-
     template<class TimerType>
     static void
     benchmarkAllCombineTypesAndScalars (std::ostream& out,
@@ -393,7 +357,7 @@ namespace TSQR {
         }
       if (params.testComplex)
         {
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
+#ifdef HAVE_TPETRATSQR_COMPLEX
           using std::complex;
 
           dataTypeName = "complex<float>";
@@ -403,9 +367,9 @@ namespace TSQR {
           benchmarkAllCombineTypes<complex<double>, TimerType> (out, dataTypeName,
                                                                 params, timerResolution);
 
-#else // Don't HAVE_KOKKOSTSQR_COMPLEX
+#else // Don't HAVE_TPETRATSQR_COMPLEX
           throw std::logic_error("TSQR not built with complex arithmetic support");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+#endif // HAVE_TPETRATSQR_COMPLEX
         }
     }
 
diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp
index 54d5f199b0ad..18ea69f0ad3e 100644
--- a/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_CombineBenchmarker.hpp
@@ -37,8 +37,8 @@
 // ************************************************************************
 //@HEADER
 
-#ifndef __Tsqr_CombineBenchmarker_hpp
-#define __Tsqr_CombineBenchmarker_hpp
+#ifndef TSQR_COMBINEBENCHMARKER_HPP
+#define TSQR_COMBINEBENCHMARKER_HPP
 
 #include "Tsqr_ConfigDefs.hpp"
 #include "Tsqr_Random_NormalGenerator.hpp"
@@ -60,6 +60,19 @@
 namespace TSQR {
   namespace Test {
 
+    template<class Ordinal, class Scalar>
+    void
+    fill_with_identity_columns (const MatView<Ordinal, Scalar>& A)
+    {
+      deep_copy (A, Scalar {});
+      const Ordinal numCols = A.extent (1);
+      // FIXME (mfh 08 Dec 2019) Stop writing to Matrix or MatView
+      // entries on host, in preparation for eventual GPU-ization.
+      for (Ordinal j = 0; j < numCols; ++j) {
+        A(j,j) = Scalar (1.0);
+      }
+    }
+
     /// \fn computeTimerResolution
     /// \brief Compute resolution in seconds of the TimerType timer.
     ///
@@ -74,15 +87,15 @@ namespace TSQR {
     double
     computeTimerResolution ()
     {
-      typedef TimerType timer_type;
+      using timer_type = TimerType;
       timer_type timer ("Timer resolution");
 
-      // Warmup run for the timer.
-      for (int warmup = 0; warmup < 5; ++warmup)
-        {
-          timer.start();
-          (void) timer.stop();
-        }
+      // Warmup run for the timer.  Some timer implementations need
+      // to be called at least once in order to get sensible results.
+      for (int warmup = 0; warmup < 5; ++warmup) {
+        timer.start ();
+        (void) timer.stop ();
+      }
 
       // Keep a count of the total number of times timer.stop() is
       // called (once per outer loop iteration).  If bigger than
@@ -177,21 +190,21 @@ namespace TSQR {
     template<class Ordinal, class Scalar, class CombineType, class TimerType>
     class CombineBenchmarker {
     public:
-      typedef Ordinal ordinal_type;
-      typedef Scalar scalar_type;
-      typedef CombineType combine_type;
-      typedef TimerType timer_type;
+      using ordinal_type = Ordinal;
+      using scalar_type = Scalar;
+      using combine_type = CombineType;
+      using timer_type = TimerType;
 
     private:
-      typedef Teuchos::ScalarTraits<scalar_type> STS;
-      typedef typename STS::magnitudeType magnitude_type;
-      typedef Teuchos::ScalarTraits<magnitude_type> STM;
-      typedef TSQR::Random::NormalGenerator<ordinal_type, scalar_type> normgen_type;
-      typedef TSQR::Random::MatrixGenerator<ordinal_type, scalar_type, normgen_type> matgen_type;
-      typedef Matrix<ordinal_type, scalar_type> matrix_type;
+      using mag_type =
+        typename Teuchos::ScalarTraits<scalar_type>::magnitudeType;
+      using normgen_type =
+        TSQR::Random::NormalGenerator<ordinal_type, scalar_type>;
+      using matgen_type =
+        TSQR::Random::MatrixGenerator<ordinal_type, scalar_type, normgen_type>;
+      using matrix_type = Matrix<ordinal_type, scalar_type>;
 
     public:
-
       /// \brief Constructor with user-specified seed.
       ///
       /// \param timerRes [in] Resolution in seconds of the TimerType
@@ -291,33 +304,34 @@ namespace TSQR {
 
         // Generate a random cache block A.
         matrix_type A (numRows, numCols);
-        std::vector<magnitude_type> sigmas (numCols);
+        std::vector<mag_type> sigmas (numCols);
         randomSingularValues (sigmas, numCols);
         matGen.fill_random_svd (numRows, numCols, A.data(),
                                 A.stride(1), sigmas.data());
 
         // A place to put the Q factor.
         matrix_type Q (numRows, numCols);
-        deep_copy (Q, Scalar {});
-        for (Ordinal j = 0; j < numCols; ++j) {
-          Q(j,j) = STS::one();
-        }
+        fill_with_identity_columns (Q.view ());
 
         // TAU array (Householder reflector scaling factors).
         std::vector<Scalar> tau (numCols);
-        // Work space array for factorization and applying the Q factor.
-        std::vector<Scalar> work (numCols);
 
         // The Combine instance to benchmark.
         combine_type combiner;
 
+        // Work space array for factorization and applying the Q factor.
+        const Ordinal lwork =
+          combiner.work_size (numRows, numCols, numCols);
+        std::vector<Scalar> work (lwork);
+
         // A few warmup runs just to avoid timing anomalies.
         const int numWarmupRuns = 3;
         for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) {
-          combiner.factor_first (A.view(), tau.data(), work.data());
-          combiner.apply_first (ApplyType("N"),
-                                A.view(), tau.data(),
-                                Q.view(), work.data());
+          combiner.factor_first (A.view (), tau.data (),
+                                 work.data (), lwork);
+          combiner.apply_first (ApplyType ("N"),
+                                A.view (), tau.data (),
+                                Q.view (), work.data (), lwork);
         }
 
         // How much time numTrials runs must take in order for
@@ -342,10 +356,11 @@ namespace TSQR {
           numTrials *= 2; // First value of numTrials is 4.
           timer.start();
           for (int trial = 0; trial < numTrials; ++trial) {
-            combiner.factor_first (A.view(), tau.data(), work.data());
-            combiner.apply_first (ApplyType("N"),
-                                  A.view(), tau.data(),
-                                  Q.view(), work.data());
+            combiner.factor_first (A.view (), tau.data (),
+                                   work.data (), lwork);
+            combiner.apply_first (ApplyType ("N"),
+                                  A.view (), tau.data (),
+                                  Q.view (), work.data (), lwork);
           }
           theTime = timer.stop();
         } while (theTime < minAcceptableTime && numTrials < maxNumTrials);
@@ -388,32 +403,34 @@ namespace TSQR {
 
         // Generate a random cache block A.
         matrix_type A (numRows, numCols);
-        std::vector<magnitude_type> sigmas (numCols);
+        std::vector<mag_type> sigmas (numCols);
         randomSingularValues (sigmas, numCols);
         matGen.fill_random_svd (numRows, numCols, A.data(),
                                 A.stride(1), sigmas.data());
 
         // A place to put the Q factor.
         matrix_type Q (numRows, numCols);
-        deep_copy (Q, Scalar {});
-        for (Ordinal j = 0; j < numCols; ++j)
-          Q(j,j) = STS::one();
+        fill_with_identity_columns (Q.view ());
 
         // TAU array (Householder reflector scaling factors).
         std::vector<Scalar> tau (numCols);
-        // Work space array for factorization and applying the Q factor.
-        std::vector<Scalar> work (numCols);
 
         // The Combine instance to benchmark.
         combine_type combiner;
 
+        // Work space array for factorization and applying the Q factor.
+        const Ordinal lwork =
+          combiner.work_size (numRows, numCols, numCols);
+        std::vector<Scalar> work (lwork);
+
         // A few warmup runs just to avoid timing anomalies.
         const int numWarmupRuns = 3;
         for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) {
-          combiner.factor_first (A.view(), tau.data(), work.data());
-          combiner.apply_first (ApplyType("N"),
-                                A.view(), tau.data(),
-                                Q.view(), work.data());
+          combiner.factor_first (A.view (), tau.data (),
+                                 work.data (), lwork);
+          combiner.apply_first (ApplyType ("N"),
+                                A.view (), tau.data (),
+                                Q.view (), work.data (), lwork);
         }
         //
         // The actual timing runs.
@@ -421,10 +438,11 @@ namespace TSQR {
         timer_type timer ("Combine first");
         timer.start();
         for (int trial = 0; trial < numTrials; ++trial) {
-          combiner.factor_first (A.view(), tau.data(), work.data());
-          combiner.apply_first (ApplyType("N"),
-                                A.view(), tau.data(),
-                                Q.view(), work.data());
+          combiner.factor_first (A.view (), tau.data (),
+                                 work.data (), lwork);
+          combiner.apply_first (ApplyType ("N"),
+                                A.view (), tau.data (),
+                                Q.view (), work.data (), lwork);
         }
         return timer.stop();
       }
@@ -459,53 +477,56 @@ namespace TSQR {
                            const Ordinal numCols,
                            const double accuracyFactor)
       {
-        if (numRows == 0 || numCols == 0)
+        if (numRows == 0 || numCols == 0) {
           throw std::invalid_argument("Calibrating timings is impossible for "
                                       "a matrix with either zero rows or zero "
                                       "columns.");
-        else if (accuracyFactor < 0)
+        }
+        else if (accuracyFactor < 0) {
           throw std::invalid_argument("Accuracy factor for Combine numTrials "
                                       "calibration must be nonnegative.");
+        }
         // Random matrix generator.
         matgen_type matGen (normGenS_);
 
         // Generate a random R factor first.
         matrix_type R (numCols, numCols);
-        std::vector<magnitude_type> sigmas (numCols);
+        std::vector<mag_type> sigmas (numCols);
         randomSingularValues (sigmas, numCols);
-        matGen.fill_random_R (numCols, R.data(),
-                              R.stride(1), sigmas.data());
+        matGen.fill_random_R (numCols, R.data (),
+                              R.stride (1), sigmas.data ());
 
         // Now generate a random cache block.
         matrix_type A (numRows, numCols);
         randomSingularValues (sigmas, numCols);
-        matGen.fill_random_svd (numRows, numCols, A.data(),
-                                A.stride(1), sigmas.data());
+        matGen.fill_random_svd (numRows, numCols, A.data (),
+                                A.stride (1), sigmas.data ());
 
         // A place to put the Q factor.
-        matrix_type Q (numRows + numCols, numCols);
-        deep_copy (Q, Scalar {});
-        for (Ordinal j = 0; j < numCols; ++j)
-          Q(j,j) = STS::one();
+        matrix_type Q (numCols + numRows, numCols);
+        fill_with_identity_columns (Q.view ());
+        auto Q_top_Q_bot = partition_2x1 (Q, numCols);
 
         // TAU array (Householder reflector scaling factors).
         std::vector<Scalar> tau (numCols);
-        // Work space array for factorization and applying the Q factor.
-        std::vector<Scalar> work (numCols);
 
         // The Combine instance to benchmark.
         combine_type combiner;
 
+        // Work space array for factorization and applying the Q factor.
+        const Ordinal lwork =
+          combiner.work_size (numRows, numCols, numCols);
+        std::vector<Scalar> work (lwork);
+
         // A few warmup runs just to avoid timing anomalies.
         const int numWarmupRuns = 3;
         for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) {
-          combiner.factor_inner (R.view(), A.view(),
-                                 tau.data(), work.data());
-          combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols,
-                                A.data(), A.stride(1), tau.data(),
-                                &Q(0, 0), Q.stride(1),
-                                &Q(numCols, 0), Q.stride(1),
-                                work.data());
+          combiner.factor_inner (R.view (), A.view (), tau.data (),
+                                 work.data (), lwork);
+          combiner.apply_inner (ApplyType ("N"), A.view (),
+                                tau.data (), Q_top_Q_bot.first,
+                                Q_top_Q_bot.second,
+                                work.data (), lwork);
         }
 
         // How much time numTrials runs must take in order for
@@ -530,13 +551,12 @@ namespace TSQR {
           numTrials *= 2; // First value of numTrials is 4.
           timer.start();
           for (int trial = 0; trial < numTrials; ++trial) {
-            combiner.factor_inner (R.view(), A.view(),
-                                   tau.data(), work.data());
-            combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols,
-                                  A.data(), A.stride(1), tau.data(),
-                                  &Q(0, 0), Q.stride(1),
-                                  &Q(numCols, 0), Q.stride(1),
-                                  work.data());
+            combiner.factor_inner (R.view (), A.view (), tau.data (),
+                                   work.data (), lwork);
+            combiner.apply_inner (ApplyType ("N"), A.view (),
+                                  tau.data (), Q_top_Q_bot.first,
+                                  Q_top_Q_bot.second, work.data (),
+                                  lwork);
           }
           theTime = timer.stop();
         } while (theTime < minAcceptableTime && numTrials < maxNumTrials);
@@ -544,7 +564,6 @@ namespace TSQR {
         return std::make_pair (numTrials, theTime);
       }
 
-
       /// \brief Benchmark TSQR::Combine on [R; A];
       ///
       /// TSQR::Combine implementations use factor_inner() to factor a
@@ -581,7 +600,7 @@ namespace TSQR {
 
         // Generate a random R factor first.
         matrix_type R (numCols, numCols);
-        std::vector<magnitude_type> sigmas (numCols);
+        std::vector<mag_type> sigmas (numCols);
         randomSingularValues (sigmas, numCols);
         matGen.fill_random_R (numCols, R.data(), R.stride(1), sigmas.data());
 
@@ -591,47 +610,45 @@ namespace TSQR {
         matGen.fill_random_svd (numRows, numCols, A.data(), A.stride(1), sigmas.data());
 
         // A place to put the Q factor.
-        matrix_type Q (numRows + numCols, numCols);
-        deep_copy (Q, Scalar {});
-        for (Ordinal j = 0; j < numCols; ++j)
-          Q(j,j) = STS::one();
+        matrix_type Q (numCols + numRows, numCols);
+        fill_with_identity_columns (Q.view ());
+        auto Q_top_Q_bot = partition_2x1 (Q, numCols);
 
         // TAU array (Householder reflector scaling factors).
         std::vector<Scalar> tau (numCols);
-        // Work space array for factorization and applying the Q factor.
-        std::vector<Scalar> work (numCols);
 
         // The Combine instance to benchmark.
         combine_type combiner;
 
+        // Work space array for factorization and applying the Q factor.
+        const Ordinal lwork =
+          combiner.work_size (numRows, numCols, numCols);
+        std::vector<Scalar> work (lwork);
+
         // A few warmup runs just to avoid timing anomalies.
         const int numWarmupRuns = 3;
         for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) {
-          combiner.factor_inner (R.view(), A.view(),
-                                 tau.data(), work.data());
-          combiner.apply_inner (ApplyType("N"),
-                                numRows, numCols, numCols,
-                                A.data(), A.stride(1), tau.data(),
-                                &Q(0, 0), Q.stride(1),
-                                &Q(numCols, 0), Q.stride(1),
-                                work.data());
+          combiner.factor_inner (R.view (), A.view (), tau.data (),
+                                 work.data (), lwork);
+          combiner.apply_inner (ApplyType ("N"), A.view (),
+                                tau.data (), Q_top_Q_bot.first,
+                                Q_top_Q_bot.second,
+                                work.data (), lwork);
         }
         //
         // The actual timing runs.
         //
         timer_type timer ("Combine cache block");
-        timer.start();
+        timer.start ();
         for (int trial = 0; trial < numTrials; ++trial) {
-          combiner.factor_inner (R.view(), A.view(),
-                                 tau.data(), work.data());
-          combiner.apply_inner (ApplyType("N"),
-                                numRows, numCols, numCols,
-                                A.data(), A.stride(1), tau.data(),
-                                &Q(0, 0), Q.stride(1),
-                                &Q(numCols, 0), Q.stride(1),
-                                work.data());
+          combiner.factor_inner (R.view (), A.view (), tau.data (),
+                                 work.data (), lwork);
+          combiner.apply_inner (ApplyType ("N"), A.view (),
+                                tau.data (), Q_top_Q_bot.first,
+                                Q_top_Q_bot.second,
+                                work.data (), lwork);
         }
-        return timer.stop();
+        return timer.stop ();
       }
 
       /// \brief Estimate number of trials for TSQR::Combine on [R1; R2].
@@ -672,38 +689,43 @@ namespace TSQR {
 
         // Generate R1 first.
         matrix_type R1 (numCols, numCols);
-        std::vector<magnitude_type> sigmas (numCols);
+        std::vector<mag_type> sigmas (numCols);
         randomSingularValues (sigmas, numCols);
         matGen.fill_random_R (numCols, R1.data(), R1.stride(1), sigmas.data());
 
         // Now generate R2.
         matrix_type R2 (numCols, numCols);
         randomSingularValues (sigmas, numCols);
-        matGen.fill_random_R (numCols, R2.data(), R2.stride(1), sigmas.data());
+        matGen.fill_random_R (numCols, R2.data (),
+                              R2.stride (1), sigmas.data ());
 
         // A place to put the Q factor of [R1; R2].
         matrix_type Q (2*numCols, numCols);
-        deep_copy (Q, Scalar {});
-        for (Ordinal j = 0; j < numCols; ++j)
-          Q(j,j) = STS::one();
+        fill_with_identity_columns (Q.view ());
+        auto Q_top_Q_bot = partition_2x1 (Q.view (), numCols);
+
+        auto R1_view = R1.view ();
+        auto R2_view = R2.view ();
 
         // TAU array (Householder reflector scaling factors).
         std::vector<Scalar> tau (numCols);
-        // Work space array for factorization and applying the Q factor.
-        std::vector<Scalar> work (numCols);
 
         // The Combine instance to benchmark.
         combine_type combiner;
 
+        // Work space array for factorization and applying the Q factor.
+        const Ordinal lwork =
+          combiner.work_size (2 * numCols, numCols, numCols);
+        std::vector<Scalar> work (lwork);
+
         // A few warmup runs just to avoid timing anomalies.
         const int numWarmupRuns = 3;
         for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) {
-          combiner.factor_pair (R1.view(), R2.view(), tau.data(), work.data());
-          combiner.apply_pair (ApplyType("N"), numCols, numCols,
-                               R2.data(), R2.stride(1), tau.data(),
-                               &Q(0, 0), Q.stride(1),
-                               &Q(numCols, 0), Q.stride(1),
-                               work.data());
+          combiner.factor_pair (R1_view, R2_view, tau.data (),
+                                work.data (), lwork);
+          combiner.apply_pair (ApplyType ("N"), R2_view, tau.data (),
+                               Q_top_Q_bot.first, Q_top_Q_bot.second,
+                               work.data (), lwork);
         }
 
         // How much time numTrials runs must take in order for
@@ -728,13 +750,12 @@ namespace TSQR {
           numTrials *= 2; // First value of numTrials is 4.
           timer.start();
           for (int trial = 0; trial < numTrials; ++trial) {
-            combiner.factor_pair (R1.view(), R2.view(),
-                                  tau.data(), work.data());
-            combiner.apply_pair (ApplyType("N"), numCols, numCols,
-                                 R2.data(), R2.stride(1), tau.data(),
-                                 &Q(0, 0), Q.stride(1),
-                                 &Q(numCols, 0), Q.stride(1),
-                                 work.data());
+            combiner.factor_pair (R1_view, R2_view, tau.data (),
+                                  work.data (), lwork);
+            combiner.apply_pair (ApplyType ("N"), R2_view,
+                                 tau.data (), Q_top_Q_bot.first,
+                                 Q_top_Q_bot.second,
+                                 work.data (), lwork);
           }
           theTime = timer.stop();
         } while (theTime < minAcceptableTime && numTrials < maxNumTrials);
@@ -742,7 +763,6 @@ namespace TSQR {
         return std::make_pair (numTrials, theTime);
       }
 
-
       /// \brief Benchmark TSQR::Combine on [R1; R2].
       ///
       /// TSQR::Combine implementations use factor_pair() to factor a
@@ -763,50 +783,57 @@ namespace TSQR {
       benchmarkPair (const Ordinal numCols,
                      const int numTrials)
       {
-        if (numCols == 0)
-          throw std::invalid_argument("Benchmarking does not make sense for "
-                                      "a matrix with zero columns.");
-        TEUCHOS_TEST_FOR_EXCEPTION(numTrials < 1, std::invalid_argument,
-                           "The number of trials must be positive, but "
-                           "numTrials = " << numTrials << ".");
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (numCols == 0, std::invalid_argument, "Benchmarking does "
+           "not make sense for a matrix with zero columns.");
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (numTrials < 1, std::invalid_argument, "The number of "
+           "trials must be positive, but numTrials = " << numTrials
+           << ".");
 
         // Random matrix generator.
         matgen_type matGen (normGenS_);
 
         // Generate R1 first.
         matrix_type R1 (numCols, numCols);
-        std::vector<magnitude_type> sigmas (numCols);
+        std::vector<mag_type> sigmas (numCols);
         randomSingularValues (sigmas, numCols);
-        matGen.fill_random_R (numCols, R1.data(), R1.stride(1), sigmas.data());
+        matGen.fill_random_R (numCols, R1.data (), R1.stride (1),
+                              sigmas.data ());
 
         // Now generate R2.
         matrix_type R2 (numCols, numCols);
         randomSingularValues (sigmas, numCols);
-        matGen.fill_random_R (numCols, R2.data(), R2.stride(1), sigmas.data());
+        matGen.fill_random_R (numCols, R2.data (), R2.stride (1),
+                              sigmas.data ());
 
         // A place to put the Q factor of [R1; R2].
         matrix_type Q (2*numCols, numCols);
-        deep_copy (Q, Scalar {});
-        for (Ordinal j = 0; j < numCols; ++j)
-          Q(j,j) = STS::one();
+        fill_with_identity_columns (Q.view ());
+        auto Q_top_Q_bot = partition_2x1 (Q.view (), numCols);
+
+        auto R1_view = R1.view ();
+        auto R2_view = R2.view ();
 
         // TAU array (Householder reflector scaling factors).
         std::vector<Scalar> tau (numCols);
-        // Work space array for factorization and applying the Q factor.
-        std::vector<Scalar> work (numCols);
 
         // The Combine instance to benchmark.
         combine_type combiner;
 
+        // Work space array for factorization and applying the Q factor.
+        const Ordinal lwork =
+          combiner.work_size (2 * numCols, numCols, numCols);
+        std::vector<Scalar> work (lwork);
+
         // A few warmup runs just to avoid timing anomalies.
         const int numWarmupRuns = 3;
         for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) {
-          combiner.factor_pair (R1.view(), R2.view(), tau.data(), work.data());
-          combiner.apply_pair (ApplyType("N"), numCols, numCols,
-                               R2.data(), R2.stride(1), tau.data(),
-                               &Q(0, 0), Q.stride(1),
-                               &Q(numCols, 0), Q.stride(1),
-                               work.data());
+          combiner.factor_pair (R1_view, R2_view, tau.data (),
+                                work.data (), lwork);
+          combiner.apply_pair (ApplyType ("N"), R2_view, tau.data (),
+                               Q_top_Q_bot.first, Q_top_Q_bot.second,
+                               work.data (), lwork);
         }
         //
         // The actual timing runs.
@@ -814,23 +841,21 @@ namespace TSQR {
         timer_type timer ("Combine pair");
         timer.start();
         for (int trial = 0; trial < numTrials; ++trial) {
-          combiner.factor_pair (R1.view(), R2.view(), tau.data(), work.data());
-          combiner.apply_pair (ApplyType("N"), numCols, numCols,
-                               R2.data(), R2.stride(1), tau.data(),
-                               &Q(0, 0), Q.stride(1),
-                               &Q(numCols, 0), Q.stride(1),
-                               work.data());
+          combiner.factor_pair (R1_view, R2_view, tau.data (),
+                                work.data (), lwork);
+          combiner.apply_pair (ApplyType ("N"), R2_view, tau.data (),
+                               Q_top_Q_bot.first, Q_top_Q_bot.second,
+                               work.data (), lwork);
         }
         return timer.stop();
       }
 
     private:
-
       //! Pseudorandom normal(0,1) generator for Scalar values.
       TSQR::Random::NormalGenerator<ordinal_type, scalar_type> normGenS_;
 
-      //! Pseudorandom normal(0,1) generator for magnitude_type values.
-      TSQR::Random::NormalGenerator<ordinal_type, magnitude_type> normGenM_;
+      //! Pseudorandom normal(0,1) generator for mag_type values.
+      TSQR::Random::NormalGenerator<ordinal_type, mag_type> normGenM_;
 
       //! Timer resolution (in seconds) for TimerType timers.
       double timerResolution_;
@@ -842,33 +867,33 @@ namespace TSQR {
       /// \param numValues [in] Number of random singular values to
       ///   generate.
       void
-      randomSingularValues (std::vector<magnitude_type>& sigmas,
+      randomSingularValues (std::vector<mag_type>& sigmas,
                             const Ordinal numValues)
       {
-        // Cast to avoid compiler warnings for signed / unsigned
-        // comparisons.
-        typedef typename std::vector<magnitude_type>::size_type size_type;
-        if (sigmas.size() < static_cast<size_type> (numValues))
-          sigmas.resize (numValues);
+        using STM = Teuchos::ScalarTraits<mag_type>;
 
+        if (sigmas.size () < size_t (numValues)) {
+          sigmas.resize (numValues);
+        }
         // Relative amount by which to perturb each singular value.  The
         // perturbation will be multiplied by a normal(0,1) pseudorandom
         // number drawn from magGen.
-        const magnitude_type perturbationFactor = magnitude_type(10) * STM::eps();
-        const magnitude_type one = STM::one();
-        for (Ordinal k = 0; k < numValues; ++k)
-          {
-            magnitude_type perturbation = perturbationFactor * normGenM_();
-            // If (1 - perturbation) is a small or nonpositive number,
-            // subtract instead.
-            if (one - perturbation <= perturbationFactor)
-              perturbation = -perturbation;
-            sigmas[k] = one - perturbation;
+        const mag_type perturbationFactor =
+          mag_type (10.0) * STM::eps ();
+        const mag_type one (1.0);
+        for (Ordinal k = 0; k < numValues; ++k) {
+          mag_type perturbation = perturbationFactor * normGenM_ ();
+          // If (1 - perturbation) is a small or nonpositive number,
+          // subtract instead.
+          if (one - perturbation <= perturbationFactor) {
+            perturbation = -perturbation;
           }
+          sigmas[k] = one - perturbation;
+        }
       }
     };
 
   } // namespace Test
 } // namespace TSQR
 
-#endif // __Tsqr_CombineBenchmarker_hpp
+#endif // TSQR_COMBINEBENCHMARKER_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp
index f5e5ed7c9ce7..eb5ee23b5ff0 100644
--- a/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_CombineDefault.hpp
@@ -38,15 +38,16 @@
 //@HEADER
 
 /// \file Tsqr_CombineDefault.hpp
-/// \brief Default copy-in, copy-out implementation of \c TSQR::Combine.
-///
-#ifndef __TSQR_CombineDefault_hpp
-#define __TSQR_CombineDefault_hpp
+/// \brief Default copy-in, copy-out implementation of TSQR::Combine.
 
-#include "Teuchos_ScalarTraits.hpp"
-#include "Tsqr_ApplyType.hpp"
+#ifndef TSQR_COMBINEDEFAULT_HPP
+#define TSQR_COMBINEDEFAULT_HPP
+
+#include "Tsqr_Combine.hpp"
 #include "Tsqr_Impl_Lapack.hpp"
 #include "Tsqr_Matrix.hpp"
+#include "Teuchos_Assert.hpp"
+#include "Teuchos_ScalarTraits.hpp"
 
 namespace TSQR {
 
@@ -62,13 +63,14 @@ namespace TSQR {
   /// that should be zero because of the input's structure (e.g.,
   /// upper triangular).
   template<class Ordinal, class Scalar>
-  class CombineDefault {
+  class CombineDefault : public Combine<Ordinal, Scalar> {
   public:
-    typedef Ordinal ordinal_type;
-    typedef Scalar scalar_type;
-    typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
-    typedef MatView<Ordinal, const Scalar> const_mat_view_type;
-    typedef MatView<Ordinal, Scalar> mat_view_type;
+    using ordinal_type = Ordinal;
+    using scalar_type = Scalar;
+    using const_mat_view_type = MatView<ordinal_type, const Scalar>;
+    using mat_view_type = MatView<ordinal_type, Scalar>;
+
+    ~CombineDefault () override = default;
 
     /// \brief Does the R factor have a nonnegative diagonal?
     ///
@@ -78,44 +80,74 @@ namespace TSQR {
     /// entries.  This Boolean tells you whether CombineDefault
     /// promises to compute an R factor whose diagonal entries are all
     /// nonnegative.
-    static bool QR_produces_R_factor_with_nonnegative_diagonal()
+    bool
+    QR_produces_R_factor_with_nonnegative_diagonal () const override
     {
-      return false; // lapack_type::QR_produces_R_factor_with_nonnegative_diagonal();
+      // FIXME (mfh 19 Dec 2019) This _should_ depend on Impl::Lapack.
+      return false;
+    }
+
+    /// \brief Larger of two LAPACK workspace-size queries: QR of an
+    ///   nrows x ncols matrix, and a left-side ('L', 'N') Q factor
+    ///   application to an nrows x num_cols_C matrix, where ncols =
+    ///   max(num_cols_Q, num_cols_C) and nrows = num_rows_Q + ncols.
+    ordinal_type
+    work_size (const ordinal_type num_rows_Q,
+               const ordinal_type num_cols_Q,
+               const ordinal_type num_cols_C) const override
+    {
+      const int ncols = num_cols_Q < num_cols_C ?
+        num_cols_C : num_cols_Q;
+      const int nrows = num_rows_Q + ncols;
+      const int lda = nrows;
+      const int lwork1 =
+        lapack_.compute_QR_lwork (nrows, ncols, nullptr, lda);
+      TEUCHOS_ASSERT( lwork1 >= num_cols_Q );
+      const int ldc = nrows;
+      const int lwork2 =
+        lapack_.apply_Q_factor_lwork ('L', 'N',
+                                      nrows, num_cols_C, num_cols_Q,
+                                      nullptr, lda, nullptr,
+                                      nullptr, ldc);
+      TEUCHOS_ASSERT( lwork2 >= 0 );
+      return std::max (lwork1, lwork2);
     }
 
     void
-    factor_first (const MatView<Ordinal, Scalar>& A,
+    factor_first (const MatView<ordinal_type, Scalar>& A,
                   Scalar tau[],
-                  Scalar work[])
+                  Scalar work[],
+                  const ordinal_type lwork) override
     {
-      const int lwork = A.extent (1);
       lapack_.compute_QR (A.extent (0), A.extent (1),
                           A.data (), A.stride (1),
                           tau, work, lwork);
     }
 
     void
-    factor_first (Matrix<Ordinal, Scalar>& A,
+    factor_first (Matrix<ordinal_type, Scalar>& A,
                   Scalar tau[],
-                  Scalar work[])
+                  Scalar work[],
+                  const ordinal_type lwork)
     {
-      MatView<Ordinal, Scalar> A_view
+      MatView<ordinal_type, Scalar> A_view
         (A.extent (0), A.extent (1), A.data (), A.stride (1));
-      factor_first (A_view, tau, work);
+      this->factor_first (A_view, tau, work, lwork);
     }
 
     void
     apply_first (const ApplyType& applyType,
-                 const MatView<Ordinal, const Scalar>& A,
+                 const MatView<ordinal_type, const Scalar>& A,
                  const Scalar tau[],
-                 const MatView<Ordinal, Scalar>& C,
-                 Scalar work[])
+                 const MatView<ordinal_type, Scalar>& C,
+                 Scalar work[],
+                 const ordinal_type lwork) override
     {
-      const Ordinal nrows = A.extent(0);
-      const Ordinal ncols_C = C.extent(1);
-      const Ordinal ncols_A = A.extent(1);
-      const Ordinal lda = A.stride(1);
-      const Ordinal ldc = C.stride(1);
+      const ordinal_type nrows = A.extent(0);
+      const ordinal_type ncols_C = C.extent(1);
+      const ordinal_type ncols_A = A.extent(1);
+      const ordinal_type lda = A.stride(1);
+      const ordinal_type ldc = C.stride(1);
 
       // LAPACK has the nice feature that it only reads the first
       // letter of input strings that specify things like which side
@@ -123,78 +155,76 @@ namespace TSQR {
       // transpose.  That means we can make the strings more verbose,
       // as in "Left" here for the SIDE parameter.
       const std::string trans = applyType.toString ();
-      const int lwork = ncols_C;
       lapack_.apply_Q_factor ('L', trans[0], nrows, ncols_C, ncols_A,
                               A.data(), lda, tau, C.data(), ldc,
-                              work, lwork);
+                              work, static_cast<int> (lwork));
+    }
+
+    // Unpack the views' dimensions and strides, then hand the raw
+    // pointers to factor_inner_impl, which does the factorization.
+    void
+    factor_inner (const MatView<ordinal_type, Scalar>& R,
+                  const MatView<ordinal_type, Scalar>& A,
+                  Scalar tau[],
+                  Scalar work[],
+                  const ordinal_type lwork) override
+    {
+      const ordinal_type ldr = R.stride (1), lda = A.stride (1);
+      factor_inner_impl (A.extent (0), A.extent (1), R.data (), ldr,
+                         A.data (), lda, tau, work, lwork);
     }
 
     void
     apply_inner (const ApplyType& apply_type,
-                 const Ordinal m,
-                 const Ordinal ncols_C,
-                 const Ordinal ncols_Q,
-                 const Scalar A[],
-                 const Ordinal lda,
+                 const MatView<ordinal_type, const Scalar>& A,
                  const Scalar tau[],
-                 Scalar C_top[],
-                 const Ordinal ldc_top,
-                 Scalar C_bot[],
-                 const Ordinal ldc_bot,
-                 Scalar work[])
+                 const MatView<ordinal_type, Scalar>& C_top,
+                 const MatView<ordinal_type, Scalar>& C_bot,
+                 Scalar work[],
+                 const ordinal_type lwork) override
     {
-      const Ordinal numRows = m + ncols_Q;
+      const ordinal_type m = A.extent (0);
+      TEUCHOS_ASSERT( m == ordinal_type (C_bot.extent (0)) );
+      const ordinal_type ncols_Q = A.extent (1);
+      const ordinal_type ncols_C = C_top.extent (1);
+      TEUCHOS_ASSERT( ncols_C == ordinal_type (C_bot.extent (1)) );
+      const ordinal_type numRows = ncols_Q + m;
 
       A_buf_.reshape (numRows, ncols_Q);
       deep_copy (A_buf_, Scalar {});
-      const_mat_view_type A_bot (m, ncols_Q, A, lda);
-      mat_view_type A_buf_bot (m, ncols_Q, &A_buf_(ncols_Q, 0), A_buf_.stride(1));
-      deep_copy (A_buf_bot, A_bot);
+      auto A_buf_top_bot = partition_2x1 (A_buf_.view (), ncols_Q);
+      deep_copy (A_buf_top_bot.second, A);
 
       C_buf_.reshape (numRows, ncols_C);
       deep_copy (C_buf_, Scalar {});
-      mat_view_type C_buf_top (ncols_Q, ncols_C, &C_buf_(0, 0), C_buf_.stride(1));
-      mat_view_type C_buf_bot (m, ncols_C, &C_buf_(ncols_Q, 0), C_buf_.stride(1));
-      mat_view_type C_top_view (ncols_Q, ncols_C, C_top, ldc_top);
-      mat_view_type C_bot_view (m, ncols_C, C_bot, ldc_bot);
-      deep_copy (C_buf_top, C_top_view);
-      deep_copy (C_buf_bot, C_bot_view);
+      auto C_buf_top_bot = partition_2x1 (C_buf_.view (), ncols_Q);
+      deep_copy (C_buf_top_bot.first, C_top);
+      deep_copy (C_buf_top_bot.second, C_bot);
 
       const std::string trans = apply_type.toString ();
-      const int lwork = ncols_C;
-      lapack_.apply_Q_factor ('L', trans[0], numRows, ncols_C, ncols_Q,
-                              A_buf_.data(), A_buf_.stride(1), tau,
-                              C_buf_.data(), C_buf_.stride(1),
+      lapack_.apply_Q_factor ('L', trans[0],
+                              numRows, ncols_C, ncols_Q,
+                              A_buf_.data (), A_buf_.stride (1), tau,
+                              C_buf_.data (), C_buf_.stride (1),
                               work, lwork);
       // Copy back the results.
-      deep_copy (C_top_view, C_buf_top);
-      deep_copy (C_bot_view, C_buf_bot);
-    }
-
-    void
-    factor_inner (const MatView<Ordinal, Scalar>& R,
-                  const MatView<Ordinal, Scalar>& A,
-                  Scalar tau[],
-                  Scalar work[])
-    {
-      const Ordinal m = A.extent(0);
-      const Ordinal n = A.extent(1);
-      factor_inner_impl (m, n, R.data(), R.stride(1),
-                         A.data(), A.stride(1), tau, work);
+      deep_copy (C_top, C_buf_top_bot.first);
+      deep_copy (C_bot, C_buf_top_bot.second);
     }
 
   private:
     void
-    factor_inner_impl (const Ordinal m,
-                       const Ordinal n,
+    factor_inner_impl (const ordinal_type m,
+                       const ordinal_type n,
                        Scalar R[],
-                       const Ordinal ldr,
+                       const ordinal_type ldr,
                        Scalar A[],
-                       const Ordinal lda,
+                       const ordinal_type lda,
                        Scalar tau[],
-                       Scalar work[])
+                       Scalar work[],
+                       const ordinal_type lwork)
     {
-      const Ordinal numRows = m + n;
+      const ordinal_type numRows = m + n;
 
       A_buf_.reshape (numRows, n);
       deep_copy (A_buf_, Scalar {});
@@ -202,58 +232,46 @@ namespace TSQR {
       // we only want to include the upper triangle in the
       // factorization.  Thus, only copy the upper triangle of R into
       // the appropriate place in the buffer.
-      MatView<Ordinal, Scalar> R_view (n, n, R, ldr);
-      MatView<Ordinal, Scalar> A_buf_top (n, n, A_buf_.data(),
+      MatView<ordinal_type, Scalar> R_view (n, n, R, ldr);
+      MatView<ordinal_type, Scalar> A_buf_top (n, n, A_buf_.data(),
                                           A_buf_.stride(1));
       deep_copy (A_buf_top, R_view);
 
-      MatView<Ordinal, Scalar> A_view (m, n, A, lda);
-      MatView<Ordinal, Scalar> A_buf_bot (m, n, &A_buf_(n, 0),
+      MatView<ordinal_type, Scalar> A_view (m, n, A, lda);
+      MatView<ordinal_type, Scalar> A_buf_bot (m, n, &A_buf_(n, 0),
                                           A_buf_.stride(1));
       deep_copy (A_buf_bot, A_view);
-
-      const int lwork = n;
-      lapack_.compute_QR (numRows, n, A_buf_.data(), A_buf_.stride(1),
-                          tau, work, lwork);
+      lapack_.compute_QR (numRows, n, A_buf_.data (),
+                          A_buf_.stride (1), tau, work, lwork);
       // Copy back the results.  R might be a view of the upper
       // triangle of a cache block, so only copy into the upper
       // triangle of R.
-      copy_upper_triangle (n, n, R, ldr, A_buf_top.data(),
-                           A_buf_top.stride(1));
+      copy_upper_triangle (R_view, A_buf_top);
       deep_copy (A_view, A_buf_bot);
     }
 
   public:
     void
-    factor_pair (const MatView<Ordinal, Scalar>& R_top,
-                 const MatView<Ordinal, Scalar>& R_bot,
+    factor_pair (const MatView<ordinal_type, Scalar>& R_top,
+                 const MatView<ordinal_type, Scalar>& R_bot,
                  Scalar tau[],
-                 Scalar work[])
+                 Scalar work[],
+                 const ordinal_type lwork) override
     {
-      const Ordinal numRows = Ordinal(2) * R_top.extent (1);
-      const Ordinal numCols = R_top.extent (1);
+      const ordinal_type numRows = ordinal_type(2) * R_top.extent (1);
+      const ordinal_type numCols = R_top.extent (1);
 
       A_buf_.reshape (numRows, numCols);
       deep_copy (A_buf_, Scalar {});
-      MatView<Ordinal, Scalar> A_buf_top (numCols, numCols,
-                                          &A_buf_(0, 0),
-                                          A_buf_.stride(1));
-      MatView<Ordinal, Scalar> A_buf_bot (numCols, numCols,
-                                          &A_buf_(numCols, 0),
-                                          A_buf_.stride(1));
+      auto A_buf_tb = partition_2x1 (A_buf_.view (), numCols);
       // Copy the inputs into the compute buffer.  Only touch the
       // upper triangles of R_top and R_bot, since they each may be
       // views of some cache block (where the strict lower triangle
       // contains things we don't want to include in the
       // factorization).
-      copy_upper_triangle (numCols, numCols,
-                           A_buf_top.data(), A_buf_top.stride(1),
-                           R_top.data(), R_top.stride(1));
-      copy_upper_triangle (numCols, numCols,
-                           A_buf_bot.data(), A_buf_bot.stride(1),
-                           R_bot.data(), R_bot.stride(1));
-
-      const int lwork = static_cast<int> (numCols);
+      copy_upper_triangle (A_buf_tb.first, R_top);
+      copy_upper_triangle (A_buf_tb.second, R_bot);
+
       lapack_.compute_QR (numRows, numCols,
                           A_buf_.data(), A_buf_.stride(1),
                           tau, work, lwork);
@@ -261,63 +279,49 @@ namespace TSQR {
       // two n by n row blocks of A_buf_ (this means we don't have to
       // zero out the strict lower triangles), and only touch the
       // upper triangles of R_top and R_bot.
-      copy_upper_triangle (numCols, numCols,
-                           R_top.data(), R_top.stride(1),
-                           A_buf_top.data(), A_buf_top.stride(1));
-      copy_upper_triangle (numCols, numCols,
-                           R_bot.data(), R_bot.stride(1),
-                           A_buf_bot.data(), A_buf_bot.stride(1));
+      copy_upper_triangle (R_top, A_buf_tb.first);
+      copy_upper_triangle (R_bot, A_buf_tb.second);
     }
 
     void
     apply_pair (const ApplyType& apply_type,
-                const Ordinal ncols_C,
-                const Ordinal ncols_Q,
-                const Scalar R_bot[],
-                const Ordinal ldr_bot,
+                const MatView<ordinal_type, const Scalar>& R_bot,
                 const Scalar tau[],
-                Scalar C_top[],
-                const Ordinal ldc_top,
-                Scalar C_bot[],
-                const Ordinal ldc_bot,
-                Scalar work[])
+                const MatView<ordinal_type, Scalar>& C_top,
+                const MatView<ordinal_type, Scalar>& C_bot,
+                Scalar work[],
+                const ordinal_type lwork) override
     {
-      const Ordinal numRows = Ordinal(2) * ncols_Q;
+      const ordinal_type ncols_C = C_top.extent (1);
+      const ordinal_type ncols_Q = R_bot.extent (1);
+      const ordinal_type numRows = ordinal_type(2) * ncols_Q;
 
       A_buf_.reshape (numRows, ncols_Q);
       deep_copy (A_buf_, Scalar {});
-      copy_upper_triangle (ncols_Q, ncols_Q,
-                           &A_buf_(ncols_Q, 0), A_buf_.stride(1),
-                           R_bot, ldr_bot);
-      C_buf_.reshape (numRows, ncols_C);
-
-      using view_type = MatView<Ordinal, Scalar>;
-      view_type C_top_view (ncols_Q, ncols_C, C_top, ldc_top);
-      view_type C_buf_top (ncols_Q, ncols_C,
-                           C_buf_.data (), C_buf_.stride (1));
-      deep_copy (C_buf_top, C_top_view);
+      auto A_buf_tb = partition_2x1 (A_buf_.view (), ncols_Q);
+      copy_upper_triangle (A_buf_tb.second, R_bot);
 
-      view_type C_bot_view (ncols_Q, ncols_C, C_bot, ldc_bot);
-      view_type C_buf_bot (ncols_Q, ncols_C,
-                           &C_buf_(ncols_Q, 0), C_buf_.stride (1));
-      deep_copy (C_buf_bot, C_bot_view);
+      C_buf_.reshape (numRows, ncols_C);
+      auto C_buf_tb = partition_2x1 (C_buf_.view (), ncols_Q);
+      deep_copy (C_buf_tb.first, C_top);
+      deep_copy (C_buf_tb.second, C_bot);
 
-      const int lwork = ncols_Q;
       const std::string trans = apply_type.toString ();
-      lapack_.apply_Q_factor ('L', trans[0], numRows, ncols_C, ncols_Q,
-                              A_buf_.data(), A_buf_.stride(1), tau,
-                              C_buf_.data(), C_buf_.stride(1),
+      lapack_.apply_Q_factor ('L', trans[0], numRows, ncols_C,
+                              ncols_Q, A_buf_.data (),
+                              A_buf_.stride (1), tau,
+                              C_buf_.data (), C_buf_.stride (1),
                               work, lwork);
       // Copy back the results.
-      deep_copy (C_top_view, C_buf_top);
-      deep_copy (C_bot_view, C_buf_bot);
+      deep_copy (C_top, C_buf_tb.first);
+      deep_copy (C_bot, C_buf_tb.second);
     }
 
   private:
     Impl::Lapack<Scalar> lapack_;
-    Matrix<Ordinal, Scalar> A_buf_;
-    Matrix<Ordinal, Scalar> C_buf_;
+    Matrix<ordinal_type, Scalar> A_buf_;
+    Matrix<ordinal_type, Scalar> C_buf_;
   };
 } // namespace TSQR
 
-#endif // __TSQR_CombineDefault_hpp
+#endif // TSQR_COMBINEDEFAULT_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineFactory.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineFactory.hpp
new file mode 100644
index 000000000000..e2f1dbc289e8
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_CombineFactory.hpp
@@ -0,0 +1,105 @@
+//@HEADER
+// ************************************************************************
+//
+//          Kokkos: Node API and Parallel Node Kernels
+//              Copyright (2008) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ************************************************************************
+//@HEADER
+
+/// \file Tsqr_CombineFactory.hpp
+/// \brief Factory for creating TSQR::Combine instances.
+
+#ifndef TSQR_COMBINEFACTORY_HPP
+#define TSQR_COMBINEFACTORY_HPP
+
+#include "Tsqr_CombineDefault.hpp"
+#include "Tsqr_CombineNative.hpp"
+#include "Teuchos_TestForException.hpp"
+#include <memory>
+#include <string>
+
+namespace TSQR {
+  /// \class CombineFactory
+  /// \brief Factory for creating Combine instances.
+  /// \author Mark Hoemmen
+  template<class Ordinal, class Scalar>
+  class CombineFactory {
+  public:
+    /// \brief Pick a Combine implementation from the maximum number
+    ///   of columns in the matrix to factor, or to which to apply a Q
+    ///   factor or compute an explicit Q factor: CombineDefault if
+    ///   maxNumCols >= 32, else CombineNative.
+    static std::unique_ptr<Combine<Ordinal, Scalar>>
+    create (const Ordinal maxNumCols)
+    {
+      // FIXME (mfh 19 Dec 2019) Should depend on the BLAS implementation.
+      constexpr Ordinal blas_3_threshold = 32;
+      if (maxNumCols >= blas_3_threshold) {
+        using impl_type = CombineDefault<Ordinal, Scalar>;
+        // NOTE (mfh 19 Dec 2019) std::make_unique would need C++14.
+        return std::unique_ptr<impl_type> (new impl_type);
+      }
+      else {
+        using impl_type = CombineNative<Ordinal, Scalar>;
+        return std::unique_ptr<impl_type> (new impl_type);
+      }
+    }
+
+    /// \brief Create by name: "CombineNative"/"Native" or
+    ///   "CombineDefault"/"Default"; else std::invalid_argument.
+    static std::unique_ptr<Combine<Ordinal, Scalar>>
+    create (const std::string& combineType)
+    {
+      if (combineType == "CombineNative" ||
+          combineType == "Native") {
+        using impl_type = CombineNative<Ordinal, Scalar>;
+        return std::unique_ptr<impl_type> (new impl_type);
+      }        
+      else if (combineType == "CombineDefault" ||
+               combineType == "Default") {
+        using impl_type = CombineDefault<Ordinal, Scalar>;
+        return std::unique_ptr<impl_type> (new impl_type);
+      }        
+      else {
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (true, std::invalid_argument, "TSQR::CombineFactory: "
+           "Invalid Combine subclass name \"" << combineType <<
+           "\".");
+      }
+    }
+  };
+
+} // namespace TSQR
+
+#endif // TSQR_COMBINEFACTORY_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp
index 8e44d0fe8b75..c8d5cc759be6 100644
--- a/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_CombineNative.hpp
@@ -38,10 +38,10 @@
 //@HEADER
 
 /// \file Tsqr_CombineNative.hpp
-/// \brief Interface to C++ back end of \c TSQR::Combine.
-///
-#ifndef __TSQR_CombineNative_hpp
-#define __TSQR_CombineNative_hpp
+/// \brief Interface to C++ back end of TSQR::Combine.
+
+#ifndef TSQR_COMBINENATIVE_HPP
+#define TSQR_COMBINENATIVE_HPP
 
 #include "Teuchos_ScalarTraits.hpp"
 #include "Tsqr_ApplyType.hpp"
@@ -57,30 +57,36 @@ namespace TSQR {
   /// \class CombineNative
   /// \brief Interface to C++ back end of TSQR::Combine
   ///
-  /// \c TSQR::Combine has three implementations: \c CombineDefault,
-  /// CombineNative, and \c CombineFortran.  CombineNative,
-  /// implemented in this file, is a fully C++ (therefore "native," as
-  /// opposed to \c CombineFortran (implemented in Fortran) or \c
-  /// CombineNative (implemented by wrappers around LAPACK calls))
-  /// implementation.
+  /// TSQR::Combine has two implementations: CombineDefault and
+  /// CombineNative.  (It used to have CombineFortran as well, which
+  /// was a Fortran 9x implementation wrapped in C++ wrappers.  I got
+  /// rid of that because it complicated Trilinos' build system to
+  /// have to ask whether the Fortran compiler could handle Fortran
+  /// 9x.)  CombineNative, implemented in this file, is a "fully" C++
+  /// (therefore "native") implementation of Combine.  (I'm ignoring
+  /// calls to some BLAS functions.)
   ///
-  /// \warning CombineNative has no complex-arithmetic implementation
+  /// \note CombineNative has no complex-arithmetic implementation
   ///   yet.  It's not hard to implement this (use LAPACK's ZGEQR2(P)
   ///   and ZUNM2R as models), but it will take time that the author
   ///   doesn't have at the moment.
-  ///
-  template< class Ordinal, class Scalar, bool isComplex = Teuchos::ScalarTraits< Scalar >::isComplex >
-  class CombineNative
-  {
+  template<class Ordinal,
+           class Scalar,
+           bool isComplex = Teuchos::ScalarTraits<Scalar>::isComplex>
+  class CombineNative : public Combine<Ordinal, Scalar> {
   public:
-    typedef Scalar scalar_type;
-    typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
-    typedef Ordinal ordinal_type;
+    using ordinal_type = Ordinal;
+    using scalar_type = Scalar;
 
   private:
-    typedef CombineDefault<ordinal_type, scalar_type> combine_default_type;
+    using mag_type =
+      typename Teuchos::ScalarTraits<scalar_type>::magnitudeType;
+    using combine_default_type =
+      CombineDefault<ordinal_type, scalar_type>;
 
   public:
+    ~CombineNative () override = default;
+
     /// Whether or not the QR factorizations computed by methods of
     /// this class produce an R factor with all nonnegative diagonal
     /// entries.  It depends on LAPACK because this implementation
@@ -88,358 +94,345 @@ namespace TSQR {
     /// Householder reflectors; only LAPACK versions >= 3.2 have one
     /// of {LARFGP, LARFP}, which is necessary to ensure that the BETA
     /// output of the function is always nonnegative.
-    static bool QR_produces_R_factor_with_nonnegative_diagonal() {
-      return combine_default_type::QR_produces_R_factor_with_nonnegative_diagonal();
+    bool
+    QR_produces_R_factor_with_nonnegative_diagonal () const override
+    {
+      return default_.
+        QR_produces_R_factor_with_nonnegative_diagonal ();
+    }
+
+    ordinal_type
+    work_size (const ordinal_type /* num_rows_Q */,
+               const ordinal_type num_cols_Q,
+               const ordinal_type num_cols_C) const override
+    {
+      return num_cols_Q < num_cols_C ? num_cols_C : num_cols_Q;
     }
 
     void
-    factor_first (const MatView<Ordinal, Scalar>& A,
+    factor_first (const MatView<ordinal_type, Scalar>& A,
                   Scalar tau[],
-                  Scalar work[]) const
+                  Scalar work[],
+                  const ordinal_type lwork) override
     {
-      return default_.factor_first (A, tau, work);
+      return default_.factor_first (A, tau, work, lwork);
     }
 
     void
     apply_first (const ApplyType& applyType,
-                 const MatView<Ordinal, const Scalar>& A,
+                 const MatView<ordinal_type, const Scalar>& A,
                  const Scalar tau[],
-                 const MatView<Ordinal, Scalar>& C,
-                 Scalar work[])
+                 const MatView<ordinal_type, Scalar>& C,
+                 Scalar work[],
+                 const ordinal_type lwork) override
     {
-      return default_.apply_first (applyType, A, tau, C, work);
+      return default_.apply_first (applyType, A, tau, C, work, lwork);
     }
 
     void
-    apply_inner (const ApplyType& applyType,
-                 const Ordinal m,
-                 const Ordinal ncols_C,
-                 const Ordinal ncols_Q,
-                 const Scalar A[],
-                 const Ordinal lda,
-                 const Scalar tau[],
-                 Scalar C_top[],
-                 const Ordinal ldc_top,
-                 Scalar C_bot[],
-                 const Ordinal ldc_bot,
-                 Scalar work[]) const;
+    factor_inner (const MatView<ordinal_type, Scalar>& R,
+                  const MatView<ordinal_type, Scalar>& A,
+                  Scalar tau[],
+                  Scalar work[],
+                  const ordinal_type lwork) override;
 
     void
-    factor_inner (const MatView<Ordinal, Scalar>& R,
-                  const MatView<Ordinal, Scalar>& A,
-                  Scalar tau[],
-                  Scalar work[]) const;
+    apply_inner (const ApplyType& applyType,
+                 const MatView<ordinal_type, const Scalar>& A,
+                 const Scalar tau[],
+                 const MatView<ordinal_type, Scalar>& C_top,
+                 const MatView<ordinal_type, Scalar>& C_bot,
+                 Scalar work[],
+                 const ordinal_type lwork) override;
 
     void
-    factor_pair (const MatView<Ordinal, Scalar>& R_top,
-                 const MatView<Ordinal, Scalar>& R_bot,
+    factor_pair (const MatView<ordinal_type, Scalar>& R_top,
+                 const MatView<ordinal_type, Scalar>& R_bot,
                  Scalar tau[],
-                 Scalar work[]) const;
+                 Scalar work[],
+                 const ordinal_type lwork) override;
 
     void
     apply_pair (const ApplyType& applyType,
-                const Ordinal ncols_C,
-                const Ordinal ncols_Q,
-                const Scalar R_bot[],
-                const Ordinal ldr_bot,
+                const MatView<ordinal_type, const Scalar>& R_bot,
                 const Scalar tau[],
-                Scalar C_top[],
-                const Ordinal ldc_top,
-                Scalar C_bot[],
-                const Ordinal ldc_bot,
-                Scalar work[]) const;
+                const MatView<ordinal_type, Scalar>& C_top,
+                const MatView<ordinal_type, Scalar>& C_bot,
+                Scalar work[],
+                const ordinal_type lwork) override;
 
   private:
-    mutable combine_default_type default_;
+    combine_default_type default_;
   };
 
-
   //! Specialization of CombineNative for the real-arithmetic case.
-  template< class Ordinal, class Scalar >
-  class CombineNative< Ordinal, Scalar, false >
-  {
-  private:
-    using memory_space = Kokkos::HostSpace;
-#ifdef KOKKOS_ENABLE_SERIAL
-    using execution_space = Kokkos::Serial;
-#else // NOT KOKKOS_ENABLE_SERIAL
-    using execution_space = Kokkos::HostSpace::execution_space;
-#endif // KOKKOS_ENABLE_SERIAL
-
+  template<class Ordinal, class Scalar>
+  class CombineNative<Ordinal, Scalar, false> :
+    public Combine<Ordinal, Scalar> {
   public:
-    typedef Scalar scalar_type;
-    typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
-    typedef Ordinal ordinal_type;
-    using device_type = Kokkos::Device<execution_space, memory_space>;
+    using ordinal_type = Ordinal;
+    using scalar_type = Scalar;
 
   private:
-    typedef CombineDefault<ordinal_type, scalar_type> combine_default_type;
+    using mag_type =
+      typename Teuchos::ScalarTraits<Scalar>::magnitudeType;
+    using execution_space = Kokkos::DefaultHostExecutionSpace;
+    using memory_space = Kokkos::HostSpace;    
+    using device_type = Kokkos::Device<execution_space, memory_space>;
+    template<class SC>
+    using matrix_type =
+      Kokkos::View<SC**, Kokkos::LayoutLeft, device_type,
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+    template<class SC>
+    using vector_type =
+      Kokkos::View<SC*, Kokkos::LayoutLeft, device_type,
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
 
     void
-    GER (const magnitude_type alpha,
-         const Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>& x,
-         const Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>& y,
-         const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& A) const;
+    GER (const mag_type alpha,
+         const vector_type<const scalar_type>& x,
+         const vector_type<const scalar_type>& y,
+         const matrix_type<scalar_type>& A) const;
 
     void
-    LARFG (const Ordinal n,
+    LARFG (const ordinal_type n,
            scalar_type& alpha,
-           const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& x,
+           const vector_type<scalar_type>& x,
            scalar_type& tau) const
     {
-      constexpr Ordinal incx {1};
+      constexpr ordinal_type incx {1};
       Impl::Lapack<scalar_type> lapack;
       lapack.LARFG (n, alpha, x.data (), incx, tau);
     }
 
-    magnitude_type
-    LAPY2 (const scalar_type& x, const scalar_type& y) const
-    {
-      using KAT = Kokkos::ArithTraits<scalar_type>;
-      if (KAT::isNan (x)) {
-        return x;
-      }
-      else if (KAT::isNan (y)) {
-        return y;
-      }
-      else {
-        const magnitude_type xabs = KAT::abs (x);
-        const magnitude_type yabs = KAT::abs (y);
-        const scalar_type w = xabs >= yabs ? xabs : yabs; // max (xabs, yabs);
-        const scalar_type z = xabs <= yabs ? xabs : yabs; // min (xabs, yabs);
-
-        if (z == KAT::zero ()) {
-          return w;
-        }
-        else {
-          const scalar_type z_div_w = z / w;
-          return w * KAT::sqrt (KAT::one () + z_div_w * z_div_w);
-        }
-      }
-    }
-
     void
     GEMV (const char trans[],
           const scalar_type alpha,
-          const Kokkos::View<const scalar_type**, Kokkos::LayoutLeft, device_type>& A,
-          const Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>& x,
+          const matrix_type<const scalar_type>& A,
+          const vector_type<const scalar_type>& x,
           const scalar_type beta,
-          const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& y) const;
+          const vector_type<scalar_type>& y) const;
 
     void
-    factor_pair (const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& R_top,
-                 const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& R_bot,
-                 const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& tau_view,
-                 const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& work_view) const;
+    factor_pair (const matrix_type<scalar_type>& R_top,
+                 const matrix_type<scalar_type>& R_bot,
+                 const vector_type<scalar_type>& tau_view,
+                 const vector_type<scalar_type>& work_view) const;
 
     void
-    factor_inner (const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& R_view,
-                  const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& A_view,
-                  const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& tau_view,
-                  const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& work_view) const;
+    factor_inner (const matrix_type<scalar_type>& R_view,
+                  const matrix_type<scalar_type>& A_view,
+                  const vector_type<scalar_type>& tau_view,
+                  const vector_type<scalar_type>& work_view) const;
 
     void
     apply_pair (const ApplyType& applyType,
-                const Kokkos::View<const scalar_type**, Kokkos::LayoutLeft, device_type>& R_bot, // ncols_Q
-                const Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>& tau_view,
-                const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& C_top, // ncols_C
-                const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& C_bot,
-                const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& work_view) const;
+                const matrix_type<const scalar_type>& R_bot, // ncols_Q
+                const vector_type<const scalar_type>& tau_view,
+                const matrix_type<scalar_type>& C_top, // ncols_C
+                const matrix_type<scalar_type>& C_bot,
+                const vector_type<scalar_type>& work_view) const;
 
     void
     apply_inner (const ApplyType& applyType,
-                 const Kokkos::View<const scalar_type**, Kokkos::LayoutLeft, device_type>& A,
-                 const Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>& tau,
-                 const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& C_top,
-                 const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& C_bot,
-                 const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& work) const;
+                 const matrix_type<const scalar_type>& A,
+                 const vector_type<const scalar_type>& tau,
+                 const matrix_type<scalar_type>& C_top,
+                 const matrix_type<scalar_type>& C_bot,
+                 const vector_type<scalar_type>& work) const;
 
   public:
-    CombineNative () = default;
+    ~CombineNative () override = default;
 
-    static bool QR_produces_R_factor_with_nonnegative_diagonal() {
-      return combine_default_type::QR_produces_R_factor_with_nonnegative_diagonal();
+    bool
+    QR_produces_R_factor_with_nonnegative_diagonal () const override
+    {
+      return default_.
+        QR_produces_R_factor_with_nonnegative_diagonal ();
+    }
+
+    ordinal_type
+    work_size (const ordinal_type /* num_rows_Q */,
+               const ordinal_type num_cols_Q,
+               const ordinal_type num_cols_C) const override
+    {
+      return num_cols_Q < num_cols_C ? num_cols_C : num_cols_Q;
     }
 
     void
-    factor_first (const MatView<Ordinal, Scalar>& A,
+    factor_first (const MatView<ordinal_type, Scalar>& A,
                   Scalar tau[],
-                  Scalar work[]) const
+                  Scalar work[],
+                  const ordinal_type lwork) override
     {
-      return default_.factor_first (A, tau, work);
+      return default_.factor_first (A, tau, work, lwork);
     }
 
     void
     apply_first (const ApplyType& applyType,
-                 const MatView<Ordinal, const Scalar>& A,
+                 const MatView<ordinal_type, const Scalar>& A,
                  const Scalar tau[],
-                 const MatView<Ordinal, Scalar>& C,
-                 Scalar work[])
+                 const MatView<ordinal_type, Scalar>& C,
+                 Scalar work[],
+                 const ordinal_type lwork) override
     {
-      return default_.apply_first (applyType, A, tau, C, work);
+      return default_.apply_first (applyType, A, tau, C, work, lwork);
     }
 
     void
-    factor_inner (const MatView<Ordinal, Scalar>& R,
-                  const MatView<Ordinal, Scalar>& A,
+    factor_inner (const MatView<ordinal_type, Scalar>& R,
+                  const MatView<ordinal_type, Scalar>& A,
                   Scalar tau[],
-                  Scalar work[]) const;
-
+                  Scalar work[],
+                  const ordinal_type lwork) override;
     void
     apply_inner (const ApplyType& applyType,
-                 const Ordinal m,
-                 const Ordinal ncols_C,
-                 const Ordinal ncols_Q,
-                 const Scalar A[],
-                 const Ordinal lda,
+                 const MatView<ordinal_type, const Scalar>& A,
                  const Scalar tau[],
-                 Scalar C_top[],
-                 const Ordinal ldc_top,
-                 Scalar C_bot[],
-                 const Ordinal ldc_bot,
-                 Scalar work[]) const;
+                 const MatView<ordinal_type, Scalar>& C_top,
+                 const MatView<ordinal_type, Scalar>& C_bot,
+                 Scalar work[],
+                 const ordinal_type lwork) override;
+
     void
-    factor_pair (const MatView<Ordinal, Scalar>& R_top,
-                 const MatView<Ordinal, Scalar>& R_bot,
+    factor_pair (const MatView<ordinal_type, Scalar>& R_top,
+                 const MatView<ordinal_type, Scalar>& R_bot,
                  Scalar tau[],
-                 Scalar work[]) const;
-
+                 Scalar work[],
+                 const ordinal_type lwork) override;
     void
     apply_pair (const ApplyType& applyType,
-                const Ordinal ncols_C,
-                const Ordinal ncols_Q,
-                const scalar_type R_bot[],
-                const Ordinal ldr_bot,
-                const scalar_type tau[],
-                scalar_type C_top[],
-                const Ordinal ldc_top,
-                scalar_type C_bot[],
-                const Ordinal ldc_bot,
-                scalar_type work[]) const;
+                const MatView<ordinal_type, const Scalar>& R_bot,
+                const Scalar tau[],
+                const MatView<ordinal_type, Scalar>& C_top,
+                const MatView<ordinal_type, Scalar>& C_bot,
+                Scalar work[],
+                const ordinal_type lwork) override;
 
   private:
-    mutable combine_default_type default_;
+    CombineDefault<ordinal_type, scalar_type> default_;
   };
 
-
-  /// "Forward declaration" for the complex-arithmetic case.
-  ///
-  template< class Ordinal, class Scalar >
-  class CombineNative< Ordinal, Scalar, true >
-  {
+  //! Specialization of CombineNative for complex Scalar.
+  template<class Ordinal, class Scalar>
+  class CombineNative<Ordinal, Scalar, true> :
+    public Combine<Ordinal, Scalar> {
   public:
-    typedef Scalar scalar_type;
-    typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
-    typedef Ordinal ordinal_type;
+    using ordinal_type = Ordinal;
+    using scalar_type = Scalar;
 
   private:
-    typedef CombineDefault<ordinal_type, scalar_type> combine_default_type;
+    using mag_type =
+      typename Teuchos::ScalarTraits<Scalar>::magnitudeType;
 
   public:
-    static bool QR_produces_R_factor_with_nonnegative_diagonal() {
-      return combine_default_type::QR_produces_R_factor_with_nonnegative_diagonal();
+    ~CombineNative () override = default;
+
+    bool
+    QR_produces_R_factor_with_nonnegative_diagonal () const override
+    {
+      return default_.
+        QR_produces_R_factor_with_nonnegative_diagonal ();
+    }
+
+    ordinal_type
+    work_size (const ordinal_type /* num_rows_Q */,
+               const ordinal_type num_cols_Q,
+               const ordinal_type num_cols_C) const override
+    {
+      return num_cols_Q < num_cols_C ? num_cols_C : num_cols_Q;
     }
 
     void
-    factor_first (const MatView<Ordinal, Scalar>& A,
+    factor_first (const MatView<ordinal_type, Scalar>& A,
                   Scalar tau[],
-                  Scalar work[]) const
+                  Scalar work[],
+                  const ordinal_type lwork) override
     {
-      return default_.factor_first (A, tau, work);
+      return default_.factor_first (A, tau, work, lwork);
     }
 
     void
     apply_first (const ApplyType& applyType,
-                 const MatView<Ordinal, const Scalar>& A,
+                 const MatView<ordinal_type, const Scalar>& A,
                  const Scalar tau[],
-                 const MatView<Ordinal, Scalar>& C,
-                 Scalar work[])
+                 const MatView<ordinal_type, Scalar>& C,
+                 Scalar work[],
+                 const ordinal_type lwork) override
     {
-      return default_.apply_first (applyType, A, tau, C, work);
+      return default_.apply_first (applyType, A, tau, C, work, lwork);
     }
 
     void
-    apply_inner (const ApplyType& applyType,
-                 const Ordinal m,
-                 const Ordinal ncols_C,
-                 const Ordinal ncols_Q,
-                 const Scalar A[],
-                 const Ordinal lda,
-                 const Scalar tau[],
-                 Scalar C_top[],
-                 const Ordinal ldc_top,
-                 Scalar C_bot[],
-                 const Ordinal ldc_bot,
-                 Scalar work[]) const
+    factor_inner (const MatView<ordinal_type, Scalar>& R,
+                  const MatView<ordinal_type, Scalar>& A,
+                  Scalar tau[],
+                  Scalar work[],
+                  const ordinal_type lwork) override
     {
-      return default_.apply_inner (applyType, m, ncols_C, ncols_Q,
-                                   A, lda, tau,
-                                   C_top, ldc_top, C_bot, ldc_bot,
-                                   work);
+      return default_.factor_inner (R, A, tau, work, lwork);
     }
 
     void
-    factor_inner (const MatView<Ordinal, Scalar>& R,
-                  const MatView<Ordinal, Scalar>& A,
-                  Scalar tau[],
-                  Scalar work[]) const
+    apply_inner (const ApplyType& applyType,
+                 const MatView<ordinal_type, const Scalar>& A,
+                 const Scalar tau[],
+                 const MatView<ordinal_type, Scalar>& C_top,
+                 const MatView<ordinal_type, Scalar>& C_bot,
+                 Scalar work[],
+                 const ordinal_type lwork) override
     {
-      return default_.factor_inner (R, A, tau, work);
+      return default_.apply_inner (applyType, A, tau,
+                                   C_top, C_bot, work, lwork);
     }
 
     void
-    factor_pair (const MatView<Ordinal, Scalar>& R_top,
-                 const MatView<Ordinal, Scalar>& R_bot,
+    factor_pair (const MatView<ordinal_type, Scalar>& R_top,
+                 const MatView<ordinal_type, Scalar>& R_bot,
                  Scalar tau[],
-                 Scalar work[]) const
+                 Scalar work[],
+                 const ordinal_type lwork) override
     {
-      return default_.factor_pair (R_top, R_bot, tau, work);
+      return default_.factor_pair (R_top, R_bot, tau, work, lwork);
     }
 
     void
     apply_pair (const ApplyType& applyType,
-                const Ordinal ncols_C,
-                const Ordinal ncols_Q,
-                const Scalar R_bot[],
-                const Ordinal ldr_bot,
+                const MatView<ordinal_type, const Scalar>& R_bot,
                 const Scalar tau[],
-                Scalar C_top[],
-                const Ordinal ldc_top,
-                Scalar C_bot[],
-                const Ordinal ldc_bot,
-                Scalar work[]) const
+                const MatView<ordinal_type, Scalar>& C_top,
+                const MatView<ordinal_type, Scalar>& C_bot,
+                Scalar work[],
+                const ordinal_type lwork) override
     {
-      return default_.apply_pair (applyType, ncols_C, ncols_Q,
-                                  R_bot, ldr_bot, tau,
-                                  C_top, ldc_top, C_bot, ldc_bot,
-                                  work);
+      return default_.apply_pair (applyType, R_bot, tau,
+                                  C_top, C_bot, work, lwork);
     }
 
   private:
-    mutable combine_default_type default_;
+    CombineDefault<ordinal_type, scalar_type> default_;
   };
 
-
-  template< class Ordinal, class Scalar >
+  template<class Ordinal, class Scalar>
   void
-  CombineNative< Ordinal, Scalar, false >::
-  GER (const magnitude_type alpha,
-       const Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>& x,
-       const Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>& y,
-       const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& A) const
+  CombineNative<Ordinal, Scalar, false>::
+  GER (const mag_type alpha,
+       const vector_type<const scalar_type>& x,
+       const vector_type<const scalar_type>& y,
+       const matrix_type<scalar_type>& A) const
   {
     constexpr scalar_type ZERO {0.0};
-    const Ordinal m = A.extent (0);
-    const Ordinal n = A.extent (1);
+    const ordinal_type m = A.extent (0);
+    const ordinal_type n = A.extent (1);
 
-    constexpr Ordinal incy {1};
-    //Ordinal jy = (incy > 0) ? 1 : 1 - (n-1) * incy;
-    Ordinal jy = 1;
+    constexpr ordinal_type incy {1};
+    //ordinal_type jy = (incy > 0) ? 1 : 1 - (n-1) * incy;
+    ordinal_type jy = 1;
 
-    for (Ordinal j = 0; j < n; ++j) {
+    for (ordinal_type j = 0; j < n; ++j) {
       if (y[jy-1] != ZERO) {
         const scalar_type temp = alpha * y[jy-1];
-        for (Ordinal i = 0; i < m; ++i) {
+        for (ordinal_type i = 0; i < m; ++i) {
           A(i,j) = A(i,j) + x[i] * temp;
         }
       }
@@ -447,23 +440,22 @@ namespace TSQR {
     }
   }
 
-
-  template< class Ordinal, class Scalar >
+  template<class Ordinal, class Scalar>
   void
-  CombineNative< Ordinal, Scalar, false >::
+  CombineNative<Ordinal, Scalar, false>::
   GEMV (const char trans[],
         const scalar_type alpha,
-        const Kokkos::View<const scalar_type**, Kokkos::LayoutLeft, device_type>& A,
-        const Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>& x,
+        const matrix_type<const scalar_type>& A,
+        const vector_type<const scalar_type>& x,
         const scalar_type beta,
-        const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& y) const
+        const vector_type<scalar_type>& y) const
   {
-    using y_vec_type = Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>;
-    using x_vec_type = Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>;
-    using range_type = std::pair<Ordinal, Ordinal>;
+    using y_vec_type = vector_type<scalar_type>;
+    using x_vec_type = vector_type<const scalar_type>;
+    using range_type = std::pair<ordinal_type, ordinal_type>;
 
-    const Ordinal m = A.extent (0);
-    const Ordinal n = A.extent (1);
+    const ordinal_type m = A.extent (0);
+    const ordinal_type n = A.extent (1);
 
     const bool no_trans = (trans[0] == 'N' || trans[0] == 'n');
     x_vec_type x_view = Kokkos::subview (x, range_type (0, no_trans ? n : m));
@@ -472,36 +464,36 @@ namespace TSQR {
     KokkosBlas::gemv (trans, alpha, A, x_view, beta, y_view);
   }
 
-  template< class Ordinal, class Scalar >
+  template<class Ordinal, class Scalar>
   void
-  CombineNative< Ordinal, Scalar, false >::
-  factor_inner (const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& R_view,
-                const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& A_view,
-                const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& tau_view,
-                const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& work_view) const
+  CombineNative<Ordinal, Scalar, false>::
+  factor_inner (const matrix_type<scalar_type>& R_view,
+                const matrix_type<scalar_type>& A_view,
+                const vector_type<scalar_type>& tau_view,
+                const vector_type<scalar_type>& work_view) const
   {
     using Kokkos::ALL;
     using Kokkos::subview;
-    using range_type = std::pair<Ordinal, Ordinal>;
+    using range_type = std::pair<ordinal_type, ordinal_type>;
     constexpr scalar_type ZERO {0.0};
     constexpr scalar_type ONE {1.0};
+    const ordinal_type m = A_view.extent (0);
+    const ordinal_type n = A_view.extent (1);
 
-    const Ordinal m = A_view.extent (0);
-    const Ordinal n = A_view.extent (1);
-
-    for (Ordinal k = 0; k < n; ++k) {
+    for (ordinal_type k = 0; k < n; ++k) {
       work_view(k) = ZERO;
     }
 
-    for (Ordinal k = 0; k < n-1; ++k) {
+    for (ordinal_type k = 0; k < n-1; ++k) {
       Scalar& R_kk = R_view(k, k);
       auto A_1k = subview (A_view, ALL (), k);
-      auto A_1kp1 = subview (A_view, range_type (0, m), range_type (k+1, n));
+      auto A_1kp1 =
+        subview (A_view, range_type (0, m), range_type (k+1, n));
 
       this->LARFG (m + 1, R_kk, A_1k, tau_view[k]);
       this->GEMV ("T", ONE, A_1kp1, A_1k, ZERO, work_view);
 
-      for (Ordinal j = k+1; j < n; ++j) {
+      for (ordinal_type j = k+1; j < n; ++j) {
         Scalar& R_kj = R_view(k, j);
 
         work_view(j-k-1) += R_kj;
@@ -515,58 +507,60 @@ namespace TSQR {
     this->LARFG (m+1, R_nn, A_1n, tau_view[n-1]);
   }
 
-
-  template< class Ordinal, class Scalar >
+  template<class Ordinal, class Scalar>
   void
-  CombineNative< Ordinal, Scalar, false >::
-  factor_inner (const MatView<Ordinal, Scalar>& R,
-                const MatView<Ordinal, Scalar>& A,
+  CombineNative<Ordinal, Scalar, false>::
+  factor_inner (const MatView<ordinal_type, Scalar>& R,
+                const MatView<ordinal_type, Scalar>& A,
                 Scalar tau[],
-                Scalar work[]) const
+                Scalar work[],
+                const ordinal_type lwork)
   {
     using Kokkos::ALL;
     using Kokkos::subview;
-    using mat_type =
-      Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>;
-    using nonconst_vec_type =
-      Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>;
-    using range_type = std::pair<Ordinal, Ordinal>;
-
-    mat_type A_full (A.data(), A.stride(1), A.extent(1));
-    mat_type A_view = subview (A_full, range_type (0, A.extent(0)), ALL ());
-    mat_type R_full (R.data(), R.stride(1), R.extent(1));
-    mat_type R_view = subview (R_full, range_type (0, R.extent(1)), ALL ());
-    nonconst_vec_type tau_view (tau, R.extent(1));
-    nonconst_vec_type work_view (work, R.extent(1));
+    using mat_type = matrix_type<scalar_type>;
+    using nonconst_vec_type = vector_type<scalar_type>;
+    using range = std::pair<ordinal_type, ordinal_type>;
+
+    const ordinal_type numRows (A.extent (0));
+    const ordinal_type A_numCols (A.extent (1));
+    const ordinal_type lda (A.stride (1));
+    const ordinal_type R_numCols (R.extent (1));
+
+    mat_type A_full (A.data (), lda, A_numCols);
+    mat_type A_view = subview (A_full, range (0, numRows), ALL ());
+    mat_type R_full (R.data (), R.stride (1), R_numCols);
+    mat_type R_view = subview (R_full, range (0, R_numCols), ALL ());
+    nonconst_vec_type tau_view (tau, R_numCols);
+    nonconst_vec_type work_view (work, lwork);
 
     this->factor_inner (R_view, A_view, tau_view, work_view);
   }
 
-  template< class Ordinal, class Scalar >
+  template<class Ordinal, class Scalar>
   void
-  CombineNative< Ordinal, Scalar, false >::
+  CombineNative<Ordinal, Scalar, false>::
   apply_inner (const ApplyType& applyType,
-               const Kokkos::View<const scalar_type**, Kokkos::LayoutLeft, device_type>& A,
-               const Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>& tau,
-               const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& C_top,
-               const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& C_bot,
-               const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& work) const
+               const matrix_type<const scalar_type>& A,
+               const vector_type<const scalar_type>& tau,
+               const matrix_type<scalar_type>& C_top,
+               const matrix_type<scalar_type>& C_bot,
+               const vector_type<scalar_type>& work) const
   {
     using Kokkos::ALL;
     using Kokkos::subview;
-    using const_vec_type =
-      Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>;
+    using const_vec_type = vector_type<const scalar_type>;
     constexpr scalar_type ZERO {0.0};
 
-    const Ordinal m = A.extent (0);
-    const Ordinal ncols_Q = A.extent (1);
-    const Ordinal ncols_C = C_top.extent (1);
+    const ordinal_type m = A.extent (0);
+    const ordinal_type ncols_Q = A.extent (1);
+    const ordinal_type ncols_C = C_top.extent (1);
 
-    for (Ordinal i = 0; i < ncols_C; ++i) {
+    for (ordinal_type i = 0; i < ncols_C; ++i) {
       work(i) = ZERO;
     }
 
-    Ordinal j_start, j_end, j_step;
+    ordinal_type j_start, j_end, j_step;
     if (applyType == ApplyType::NoTranspose) {
       j_start = ncols_Q - 1;
       j_end = -1; // exclusive
@@ -577,18 +571,18 @@ namespace TSQR {
       j_end = ncols_Q; // exclusive
       j_step = +1;
     }
-    for (Ordinal j = j_start; j != j_end; j += j_step) {
+    for (ordinal_type j = j_start; j != j_end; j += j_step) {
       const_vec_type A_1j = subview (A, ALL (), j);
 
       //blas.GEMV ("T", m, ncols_C, ONE, C_bot, ldc_bot, A_1j, 1, ZERO, &y[0], 1);
-      for (Ordinal i = 0; i < ncols_C; ++i) {
+      for (ordinal_type i = 0; i < ncols_C; ++i) {
         work(i) = ZERO;
-        for (Ordinal k = 0; k < m; ++k) {
+        for (ordinal_type k = 0; k < m; ++k) {
           work(i) += A_1j(k) * C_bot(k, i);
         }
         work(i) += C_top(j, i);
       }
-      for (Ordinal k = 0; k < ncols_C; ++k) {
+      for (ordinal_type k = 0; k < ncols_C; ++k) {
         C_top(j, k) -= tau[j] * work(k);
       }
 
@@ -596,70 +590,69 @@ namespace TSQR {
     }
   }
 
-  template< class Ordinal, class Scalar >
+  template<class Ordinal, class Scalar>
   void
-  CombineNative< Ordinal, Scalar, false >::
+  CombineNative<Ordinal, Scalar, false>::
   apply_inner (const ApplyType& applyType,
-               const Ordinal m,
-               const Ordinal ncols_C,
-               const Ordinal ncols_Q,
-               const Scalar A[],
-               const Ordinal lda,
+               const MatView<ordinal_type, const Scalar>& A,
                const Scalar tau[],
-               Scalar C_top[],
-               const Ordinal ldc_top,
-               Scalar C_bot[],
-               const Ordinal ldc_bot,
-               Scalar work[]) const
+               const MatView<ordinal_type, Scalar>& C_top,
+               const MatView<ordinal_type, Scalar>& C_bot,
+               Scalar work[],
+               const ordinal_type lwork)
   {
     using Kokkos::ALL;
     using Kokkos::subview;
-    using const_mat_type =
-      Kokkos::View<const scalar_type**, Kokkos::LayoutLeft, device_type>;
-    using nonconst_mat_type =
-      Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>;
-    using const_vec_type =
-      Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>;
-    using nonconst_vec_type =
-      Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>;
-    using range_type = std::pair<Ordinal, Ordinal>;
-
-    const_mat_type A_full (A, lda, ncols_Q);
+    using const_mat_type = matrix_type<const scalar_type>;
+    using nonconst_mat_type = matrix_type<scalar_type>;
+    using const_vec_type = vector_type<const scalar_type>;
+    using nonconst_vec_type = vector_type<scalar_type>;
+    using range_type = std::pair<ordinal_type, ordinal_type>;
+
+    const ordinal_type m = A.extent (0);
+    const ordinal_type ncols_Q = A.extent (1);
+    const ordinal_type ncols_C = C_top.extent (1);
+
+    const_mat_type A_full (A.data (), A.stride (1), ncols_Q);
     auto A_view = subview (A_full, range_type (0, m), ALL ());
-    nonconst_mat_type C_top_full (C_top, ldc_top, ncols_C);
+    nonconst_mat_type C_top_full
+      (C_top.data (), C_top.stride (1), ncols_C);
     auto C_top_view = subview (C_top_full, range_type (0, m), ALL ());
-    nonconst_mat_type C_bot_full (C_bot, ldc_bot, ncols_C);
+    nonconst_mat_type C_bot_full
+      (C_bot.data (), C_bot.stride (1), ncols_C);
     auto C_bot_view = subview (C_bot_full, range_type (0, m), ALL ());
     const_vec_type tau_view (tau, ncols_Q);
-    nonconst_vec_type work_view (work, ncols_C);
+    nonconst_vec_type work_view (work, lwork);
 
-    this->apply_inner (applyType, A_view, tau_view, C_top_view, C_bot_view, work_view);
+    this->apply_inner (applyType, A_view, tau_view, C_top_view,
+                       C_bot_view, work_view);
   }
 
 
-  template< class Ordinal, class Scalar >
+  template<class Ordinal, class Scalar>
   void
-  CombineNative< Ordinal, Scalar, false >::
-  factor_pair (const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& R_top,
-               const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& R_bot,
-               const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& tau_view,
-               const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& work_view) const
+  CombineNative<Ordinal, Scalar, false>::
+  factor_pair (const matrix_type<scalar_type>& R_top,
+               const matrix_type<scalar_type>& R_bot,
+               const vector_type<scalar_type>& tau_view,
+               const vector_type<scalar_type>& work_view) const
   {
     using Kokkos::ALL;
     using Kokkos::subview;
-    using range_type = std::pair<Ordinal, Ordinal>;
+    using range_type = std::pair<ordinal_type, ordinal_type>;
     constexpr scalar_type ZERO {0.0};
     constexpr scalar_type ONE {1.0};
 
-    const Ordinal n = R_top.extent (0);
-    for (Ordinal k = 0; k < n; ++k) {
+    const ordinal_type n = R_top.extent (0);
+    for (ordinal_type k = 0; k < n; ++k) {
       work_view(k) = ZERO;
     }
 
-    for (Ordinal k = 0; k < n-1; ++k) {
+    for (ordinal_type k = 0; k < n-1; ++k) {
       scalar_type& R_top_kk = R_top(k, k);
       auto R_bot_1k = subview (R_bot, ALL (), k);
-      auto R_bot_1kp1 = subview (R_bot, range_type (0, k+1), range_type (k+1, n));
+      auto R_bot_1kp1 =
+        subview (R_bot, range_type (0, k+1), range_type (k+1, n));
 
       // k+2: 1 element in R_top (R_top(k,k)), and k+1 elements in
       // R_bot (R_bot(1:k,k), in 1-based indexing notation).
@@ -669,7 +662,7 @@ namespace TSQR {
 
       this->GEMV ("T", ONE, R_bot_1kp1, R_bot_1k, ZERO, work_view);
 
-      for (Ordinal j = k+1; j < n; ++j) {
+      for (ordinal_type j = k+1; j < n; ++j) {
         scalar_type& R_top_kj = R_top(k, j);
         work_view(j-k-1) += R_top_kj;
         R_top_kj -= tau_view[k] * work_view(j-k-1);
@@ -685,106 +678,113 @@ namespace TSQR {
   }
 
 
-  template< class Ordinal, class Scalar >
+  template<class Ordinal, class Scalar>
   void
   CombineNative<Ordinal, Scalar, false>::
-  factor_pair (const MatView<Ordinal, Scalar>& R_top,
-               const MatView<Ordinal, Scalar>& R_bot,
+  factor_pair (const MatView<ordinal_type, Scalar>& R_top,
+               const MatView<ordinal_type, Scalar>& R_bot,
                Scalar tau[],
-               Scalar work[]) const
+               Scalar work[],
+               const ordinal_type lwork)
   {
     using Kokkos::ALL;
     using Kokkos::subview;
-    using range_type = std::pair<Ordinal, Ordinal>;
+    using range_type = std::pair<ordinal_type, ordinal_type>;
 
-    const Ordinal numCols = R_top.extent (1);
-    Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type> R_top_full
+    const ordinal_type numCols = R_top.extent (1);
+    matrix_type<scalar_type> R_top_full
       (R_top.data(), R_top.stride (1), numCols);
-    Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type> R_bot_full
+    matrix_type<scalar_type> R_bot_full
       (R_bot.data(), R_bot.stride (1), R_bot.extent (1));
-    Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type> tau_view
-      (tau, numCols);
-    Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type> work_view
-      (work, numCols);
+    vector_type<scalar_type> tau_view (tau, numCols);
+    vector_type<scalar_type> work_view (work, lwork);
 
     if (R_top.stride(1) == numCols) {
       if (R_bot.stride(1) == numCols) {
-        this->factor_pair (R_top_full, R_bot_full, tau_view, work_view);
+        this->factor_pair (R_top_full, R_bot_full, tau_view,
+                           work_view);
       }
       else {
-        auto R_bot_view = subview (R_bot_full, range_type (0, numCols), ALL ());
-        this->factor_pair (R_top_full, R_bot_view, tau_view, work_view);
+        auto R_bot_view =
+          subview (R_bot_full, range_type (0, numCols), ALL ());
+        this->factor_pair (R_top_full, R_bot_view, tau_view,
+                           work_view);
       }
     }
     else {
-      auto R_top_view = subview (R_top_full, range_type (0, numCols), ALL ());
+      auto R_top_view =
+        subview (R_top_full, range_type (0, numCols), ALL ());
       if (R_bot.stride(1) == numCols) {
-        this->factor_pair (R_top_view, R_bot_full, tau_view, work_view);
+        this->factor_pair (R_top_view, R_bot_full, tau_view,
+                           work_view);
       }
       else {
-        auto R_bot_view = subview (R_bot_full, range_type (0, numCols), ALL ());
-        this->factor_pair (R_top_view, R_bot_view, tau_view, work_view);
+        auto R_bot_view =
+          subview (R_bot_full, range_type (0, numCols), ALL ());
+        this->factor_pair (R_top_view, R_bot_view, tau_view,
+                           work_view);
       }
     }
   }
 
-
-  template< class Ordinal, class Scalar >
+  template<class Ordinal, class Scalar>
   void
-  CombineNative< Ordinal, Scalar, false >::
+  CombineNative<Ordinal, Scalar, false>::
   apply_pair (const ApplyType& applyType,
-              const Ordinal ncols_C,
-              const Ordinal ncols_Q,
-              const scalar_type R_bot[],
-              const Ordinal ldr_bot,
-              const scalar_type tau[],
-              scalar_type C_top[],
-              const Ordinal ldc_top,
-              scalar_type C_bot[],
-              const Ordinal ldc_bot,
-              scalar_type work[]) const
+              const MatView<ordinal_type, const Scalar>& R_bot,
+              const Scalar tau[],
+              const MatView<ordinal_type, Scalar>& C_top,
+              const MatView<ordinal_type, Scalar>& C_bot,
+              Scalar work[],
+              const ordinal_type lwork)
   {
     using Kokkos::ALL;
     using Kokkos::subview;
-    using range_type = std::pair<Ordinal, Ordinal>;
-    using const_mat_type =
-      Kokkos::View<const scalar_type**, Kokkos::LayoutLeft, device_type>;
-    using nonconst_mat_type =
-      Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>;
-    using const_vec_type =
-      Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>;
-    using nonconst_vec_type =
-      Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>;
-
-    const_mat_type R_bot_full (R_bot, ldr_bot, ncols_Q);
-    nonconst_mat_type C_top_full (C_top, ldc_top, ncols_C);
-    nonconst_mat_type C_bot_full (C_bot, ldc_bot, ncols_C);
+    using range_type = std::pair<ordinal_type, ordinal_type>;
+    using const_mat_type = matrix_type<const scalar_type>;
+    using nonconst_mat_type = matrix_type<scalar_type>;
+    using const_vec_type = vector_type<const scalar_type>;
+    using nonconst_vec_type = vector_type<scalar_type>;
+
+    const ordinal_type ncols_Q = R_bot.extent (1);
+    const ordinal_type ncols_C = C_top.extent (1);
+    const_mat_type R_bot_full
+      (R_bot.data (), R_bot.stride (1), ncols_Q);
+    nonconst_mat_type C_top_full
+      (C_top.data (), C_top.stride (1), ncols_C);
+    nonconst_mat_type C_bot_full
+      (C_bot.data (), C_bot.stride (1), ncols_C);
     const_vec_type tau_view (tau, ncols_Q);
-    nonconst_vec_type work_view (work, ncols_C);
-
-    auto R_bot_view = subview (R_bot_full, range_type (0, ncols_Q), ALL ());
-    auto C_top_view = subview (C_top_full, range_type (0, ncols_C), ALL ());
-    auto C_bot_view = subview (C_bot_full, range_type (0, ncols_C), ALL ());
-    this->apply_pair (applyType, R_bot_view, tau_view, C_top_view, C_bot_view, work_view);
+    nonconst_vec_type work_view (work, lwork);
+
+    auto R_bot_view =
+      subview (R_bot_full, range_type (0, ncols_Q), ALL ());
+    auto C_top_view =
+      subview (C_top_full, range_type (0, ncols_C), ALL ());
+    auto C_bot_view =
+      subview (C_bot_full, range_type (0, ncols_C), ALL ());
+    this->apply_pair (applyType, R_bot_view, tau_view,
+                      C_top_view, C_bot_view, work_view);
   }
 
-  template< class Ordinal, class Scalar >
+  template<class Ordinal, class Scalar>
   void
-  CombineNative< Ordinal, Scalar, false >::
+  CombineNative<Ordinal, Scalar, false>::
   apply_pair (const ApplyType& applyType,
-              const Kokkos::View<const scalar_type**, Kokkos::LayoutLeft, device_type>& R_bot, // ncols_Q
-              const Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>& tau_view,
-              const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& C_top, // ncols_C
-              const Kokkos::View<scalar_type**, Kokkos::LayoutLeft, device_type>& C_bot,
-              const Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type>& work_view) const
+              const matrix_type<const scalar_type>& R_bot, // ncols_Q
+              const vector_type<const scalar_type>& tau_view,
+              const matrix_type<scalar_type>& C_top, // ncols_C
+              const matrix_type<scalar_type>& C_bot,
+              const vector_type<scalar_type>& work_view) const
   {
-    using const_vec_type =
-      Kokkos::View<const scalar_type*, Kokkos::LayoutLeft, device_type>;
+    using Kokkos::ALL;
+    using Kokkos::subview;
+    using const_vec_type = vector_type<const scalar_type>;
     constexpr scalar_type ZERO {0.0};
-    const Ordinal ncols_C = C_top.extent (1);
-    const Ordinal ncols_Q = R_bot.extent (1);
+    const ordinal_type ncols_C = C_top.extent (1);
+    const ordinal_type ncols_Q = R_bot.extent (1);
 
-    Ordinal j_start, j_end, j_step;
+    ordinal_type j_start, j_end, j_step;
     if (applyType == ApplyType::NoTranspose) {
       j_start = ncols_Q - 1;
       j_end = -1; // exclusive
@@ -795,29 +795,29 @@ namespace TSQR {
       j_end = ncols_Q; // exclusive
       j_step = +1;
     }
-    for (Ordinal j_Q = j_start; j_Q != j_end; j_Q += j_step) {
+    for (ordinal_type j_Q = j_start; j_Q != j_end; j_Q += j_step) {
       // Using Householder reflector stored in column j_Q of R_bot
-      const_vec_type R_bot_col = Kokkos::subview (R_bot, Kokkos::ALL (), j_Q);
+      const_vec_type R_bot_col = subview (R_bot, ALL (), j_Q);
 
       // In 1-based indexing notation, with k in 1, 2, ..., ncols_C
       // (inclusive): (Output is length ncols_C row vector)
       //
       // work(1:j) := R_bot(1:j,j)' * C_bot(1:j, 1:ncols_C) - C_top(j, 1:ncols_C)
-      for (Ordinal j_C = 0; j_C < ncols_C; ++j_C) {
+      for (ordinal_type j_C = 0; j_C < ncols_C; ++j_C) {
         // For each column j_C of [C_top; C_bot], update row j_Q
         // of C_top and rows 1:j_Q of C_bot.  (Again, this is in
         // 1-based indexing notation.
 
         scalar_type work_j_C = ZERO;
-        const_vec_type C_bot_col = Kokkos::subview (C_bot, Kokkos::ALL (), j_C);
+        const_vec_type C_bot_col = subview (C_bot, ALL (), j_C);
 
-        for (Ordinal k = 0; k <= j_Q; ++k)
+        for (ordinal_type k = 0; k <= j_Q; ++k) {
           work_j_C += R_bot_col(k) * C_bot_col(k);
-
+        }
         work_j_C += C_top(j_Q, j_C);
         work_view(j_C) = work_j_C;
       }
-      for (Ordinal j_C = 0; j_C < ncols_C; ++j_C) {
+      for (ordinal_type j_C = 0; j_C < ncols_C; ++j_C) {
         C_top(j_Q, j_C) -= tau_view[j_Q] * work_view(j_C);
       }
       this->GER (-tau_view[j_Q], R_bot_col, work_view, C_bot);
@@ -825,6 +825,4 @@ namespace TSQR {
   }
 } // namespace TSQR
 
-
-
-#endif // __TSQR_CombineNative_hpp
+#endif // TSQR_COMBINENATIVE_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_CombineNodeTsqr.hpp
new file mode 100644
index 000000000000..28b41c7bf640
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_CombineNodeTsqr.hpp
@@ -0,0 +1,334 @@
+//@HEADER
+// ************************************************************************
+//
+//          Kokkos: Node API and Parallel Node Kernels
+//              Copyright (2008) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ************************************************************************
+//@HEADER
+
+/// \file Tsqr_CombineNodeTsqr.hpp
+/// \brief Declaration and definition of an implementation of NodeTsqr
+///   (intranode TSQR) that just uses Combine for all the operations
+///   on an MPI process.
+
+#ifndef TSQR_COMBINENODETSQR_HPP
+#define TSQR_COMBINENODETSQR_HPP
+
+#include "Tsqr_NodeTsqr.hpp"
+#include "Tsqr_Impl_CombineUser.hpp"
+#include "Tsqr_Impl_SystemBlas.hpp"
+#include "Teuchos_TypeNameTraits.hpp"
+#include <memory>
+
+namespace TSQR {
+  namespace Impl {
+    template<class T>
+    using span = Kokkos::View<T*, Kokkos::HostSpace,
+      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+
+    template<class Ordinal, class Scalar>
+    class CombineNodeFactorOutput : // carries the tau array that CombineNodeTsqr::factor produces
+      public NodeFactorOutput<Ordinal, Scalar> {
+    public:
+      CombineNodeFactorOutput (std::vector<Scalar>&& tau) : // takes ownership of tau
+        tau_ (std::move (tau)) // a named rvalue reference is an lvalue; without std::move this would copy
+      {}
+      ~CombineNodeFactorOutput () override = default;
+      span<const Scalar> tau () const { // unmanaged (non-owning) read-only view over tau_'s storage
+        return span<const Scalar> (tau_.data (), tau_.size ());
+      }
+    private:
+      std::vector<Scalar> tau_; // owned copy of the tau scalars passed to the constructor
+    };
+  } // namespace Impl
+
+  /// \class CombineNodeTsqr
+  /// \brief Implementation of NodeTsqr (intranode TSQR) that just
+  ///   uses Combine for all the operations on an MPI process.
+  template<class Ordinal, class Scalar>
+  class CombineNodeTsqr :
+    public NodeTsqr<Ordinal, Scalar>,
+    private Impl::CombineUser<Ordinal, Scalar> {
+  private:
+    using base_type = NodeTsqr<Ordinal, Scalar>;
+    using my_factor_output_type =
+      Impl::CombineNodeFactorOutput<Ordinal, Scalar>;
+
+  public:
+    using ordinal_type = typename base_type::ordinal_type;
+    using scalar_type = typename base_type::scalar_type;
+    using mat_view_type = typename base_type::mat_view_type;
+    using const_mat_view_type =
+      typename base_type::const_mat_view_type;
+    using magnitude_type = typename base_type::magnitude_type;
+    using factor_output_type = typename base_type::factor_output_type;
+
+    ~CombineNodeTsqr () override = default;
+
+    Teuchos::RCP<const Teuchos::ParameterList>
+    getValidParameters () const override {
+      return Teuchos::parameterList ("CombineNodeTsqr");
+    }
+
+    void
+    setParameterList (const Teuchos::RCP<Teuchos::ParameterList>&) override
+    {}
+
+    bool ready() const override {
+      return true;
+    }
+
+    size_t cache_size_hint() const override {
+      return size_t (0);
+    }
+
+    std::string description () const override {
+      using Teuchos::TypeNameTraits;
+      std::ostringstream os;
+      os << "CombineNodeTsqr<Ordinal="
+         << TypeNameTraits<Ordinal>::name() << ", Scalar="
+         << TypeNameTraits<Scalar>::name() << ">: Intranode "
+        "Intraprocess TSQR based on TSQR::Combine";
+      return os.str();
+    }
+
+  private:
+    void
+    factorImpl (const mat_view_type& R,
+                const mat_view_type& A,
+                std::vector<Scalar>& tau) const
+    { // Factors A with Combine::factor_first, then writes the resulting R factor into R.
+      const ordinal_type ncols = A.extent (1);
+      TEUCHOS_ASSERT( R.extent (0) == ncols &&
+                      R.extent (1) == ncols ); // R must be ncols by ncols
+      auto& combine = this->getCombine (ncols);
+      const ordinal_type lwork =
+        combine.work_size (A.extent (0), ncols, ncols);
+      std::vector<Scalar> work (lwork); // scratch space sized by the combiner itself
+      combine.factor_first (A, tau.data (), work.data (), lwork);
+
+      // Copy the R factor resulting from the factorization (out of the
+      // topmost block of A) into the R output argument.
+      deep_copy (R, Scalar {}); // fill R with zeros before copying the triangle in
+      copy_upper_triangle (R, A);
+    }
+
+  public:
+    Teuchos::RCP<factor_output_type>
+    factor (const ordinal_type nrows,
+            const ordinal_type ncols,
+            Scalar A[],
+            const ordinal_type lda,
+            Scalar R[],
+            const ordinal_type ldr,
+            const bool /* contiguousCacheBlocks */) const override
+    {
+      // The "contiguous cache blocks" option does nothing here, since
+      // we just defer to an internal library that expects
+      // column-major matrices.
+      mat_view_type A_view (nrows, ncols, A, lda);
+      mat_view_type R_view (ncols, ncols, R, ldr);
+      std::vector<Scalar> tau (ncols);
+      factorImpl (R_view, A_view, tau);
+      using Teuchos::rcp;
+      return rcp (new my_factor_output_type (std::move (tau)));
+    }
+
+    void
+    apply (const ApplyType& applyType,
+           const ordinal_type nrows,
+           const ordinal_type ncols_Q,
+           const Scalar Q[],
+           const ordinal_type ldq,
+           const factor_output_type& factorOutput,
+           const ordinal_type ncols_C,
+           Scalar C[],
+           const ordinal_type ldc,
+           const bool /* contiguousCacheBlocks */) const override
+    {
+      const char prefix[] = "TSQR::CombineNodeTsqr::apply: ";
+
+      // Quick exit for empty problems; otherwise validate the leading dimensions.
+      if (ncols_Q == 0 || ncols_C == 0 || nrows == 0) {
+        return;
+      }
+      else if (ldc < nrows) {
+        std::ostringstream os;
+        os << prefix << "ldc (= " << ldc << ") < nrows (= "
+           << nrows << ")";
+        throw std::invalid_argument (os.str());
+      }
+      else if (ldq < nrows) {
+        std::ostringstream os;
+        os << prefix << "ldq (= " << ldq << ") < nrows (= "
+           << nrows << ")";
+        throw std::invalid_argument (os.str());
+      }
+
+      const my_factor_output_type& output = [&] () { // downcast factorOutput; throws std::invalid_argument on a type mismatch
+        const my_factor_output_type* output_ptr =
+          dynamic_cast<const my_factor_output_type*> (&factorOutput);
+        if (output_ptr == nullptr) {
+          using Teuchos::demangleName;
+          using Teuchos::TypeNameTraits;
+          using Teuchos::typeName;
+          std::ostringstream os;
+          os << prefix << "Input factor_output_type object was not "
+            "created by the same type of NodeTsqr object as this "
+            "one.  This object has type " << typeName (*this) <<
+            " and its subclass of factor_output_type has type " <<
+            TypeNameTraits<my_factor_output_type>::name () << ", but "
+            "the input factor_output_type object has dynamic type "
+            << demangleName (typeid (factorOutput).name ());
+          throw std::invalid_argument (os.str ());
+        }
+        return *output_ptr;
+      } ();
+
+      auto& combine = this->getCombine (std::max (ncols_Q, ncols_C));
+      const ordinal_type lwork =
+        combine.work_size (nrows, ncols_C, ncols_C); // NOTE(review): ncols_Q is never passed here -- confirm the work_size arguments are intended
+      std::vector<Scalar> work (lwork);
+
+      const_mat_view_type Q_view (nrows, ncols_Q, Q, ldq);
+      mat_view_type C_view (nrows, ncols_C, C, ldc);
+      const auto tau = output.tau (); // non-owning view of the tau array stored in factorOutput
+      combine.apply_first (applyType, Q_view, tau.data (),
+                           C_view, work.data (), lwork);
+    }
+
+    void
+    explicit_Q (const ordinal_type nrows,
+                const ordinal_type ncols_Q,
+                const Scalar Q[],
+                const ordinal_type ldq,
+                const factor_output_type& factorOutput,
+                const ordinal_type ncols_C,
+                Scalar C[],
+                const ordinal_type ldc,
+                const bool contiguousCacheBlocks) const override
+    {
+      mat_view_type C_view (nrows, ncols_C, C, ldc);
+      deep_copy (C_view, Scalar {});
+      this->set_diagonal_entries_to_one (C_view);
+      // Apply the Q factor to C, to extract the first ncols_C columns
+      // of Q in explicit form.
+      apply (ApplyType::NoTranspose,
+             nrows, ncols_Q, Q, ldq, factorOutput,
+             ncols_C, C, ldc, contiguousCacheBlocks);
+    }
+
+    void
+    cache_block (const ordinal_type /* nrows */,
+                 const ordinal_type /* ncols */,
+                 Scalar /* A_out */ [],
+                 const Scalar /* A_in */ [],
+                 const ordinal_type /* lda_in */) const override
+    {}
+
+    void
+    un_cache_block (const ordinal_type /* nrows */,
+                    const ordinal_type /* ncols */,
+                    Scalar /* A_out */ [],
+                    const ordinal_type /* lda_out */,
+                    const Scalar /* A_in */ []) const override
+    {}
+
+    void
+    Q_times_B (const ordinal_type nrows,
+               const ordinal_type ncols,
+               Scalar Q[],
+               const ordinal_type ldq,
+               const Scalar B[],
+               const ordinal_type ldb,
+               const bool /* contiguousCacheBlocks */) const override
+    {
+      using Teuchos::NO_TRANS;
+
+      // We don't do any other error checking here (e.g., matrix
+      // dimensions), though it would be a good idea to do so.
+
+      // Take the easy exit if available.
+      if (ncols == 0 || nrows == 0) {
+        return;
+      }
+
+      Impl::SystemBlas<Scalar> blas;
+      mat_view_type Q_view (nrows, ncols, Q, ldq);
+      // GEMM doesn't like its input and output arguments to alias
+      // each other, so we use a (deep) copy.
+      Matrix<ordinal_type, Scalar> Q_copy (Q_view);
+
+      // Q_view := Q_copy * B.
+      blas.GEMM (NO_TRANS, NO_TRANS,
+                 nrows, ncols, ncols,
+                 Scalar (1.0), Q_copy.data (), Q_copy.stride (1),
+                 B, ldb,
+                 Scalar {}, Q_view.data (), Q_view.stride (1));
+    }
+
+    void
+    fill_with_zeros (const ordinal_type nrows,
+                     const ordinal_type ncols,
+                     Scalar A[],
+                     const ordinal_type lda,
+                     const bool /* contiguousCacheBlocks */) const override
+    {
+      mat_view_type A_view (nrows, ncols, A, lda);
+      deep_copy (A_view, Scalar {});
+    }
+
+  protected:
+    const_mat_view_type
+    const_top_block (const const_mat_view_type& C,
+                     const bool /* contiguousCacheBlocks */) const override
+    {
+      return C; // For this class, "cache blocking" does nothing.
+    }
+
+  public:
+    bool
+    QR_produces_R_factor_with_nonnegative_diagonal () const override
+    {
+      // FIXME (19 Dec 2019) If the combine type is dynamic, we can't
+      // answer this question without knowing the number of columns.
+      // Just guess for now.
+      constexpr ordinal_type fakeNumCols = 10;
+      auto& c = this->getCombine (fakeNumCols);
+      return c.QR_produces_R_factor_with_nonnegative_diagonal ();
+    }
+  };
+} // namespace TSQR
+
+#endif // TSQR_COMBINENODETSQR_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp
index 341b22ae9d32..790160667e58 100644
--- a/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp
+++ b/packages/tpetra/tsqr/src/Tsqr_CombineTest.cpp
@@ -42,7 +42,7 @@
 #include "Tsqr_Random_NormalGenerator.hpp"
 #include "Tsqr_Random_MatrixGenerator.hpp"
 
-#include "Tsqr_Combine.hpp"
+#include "Tsqr_CombineFactory.hpp"
 #include "Tsqr_LocalVerify.hpp"
 #include "Tsqr_Matrix.hpp"
 #include "Tsqr_Util.hpp"
@@ -59,36 +59,50 @@
 namespace TSQR {
   namespace Test {
 
+    template<class Ordinal, class Scalar>
+    void
+    fill_with_identity_columns (const MatView<Ordinal, Scalar>& A)
+    {
+      deep_copy (A, Scalar {});
+      const Ordinal numCols = A.extent (1);
+      // FIXME (mfh 08 Dec 2019) Eventually stop writing to Matrix or
+      // MatView entries on host, for eventual GPU-ization.
+      for (Ordinal j = 0; j < numCols; ++j) {
+        A(j,j) = Scalar (1.0);
+      }
+    }
+
     template<class Ordinal, class MagnitudeType, class NormalGenType>
-    static void
+    void
     generateSingularValues (NormalGenType& magGen,
                             std::vector<MagnitudeType>& sigma,
                             const Ordinal numValues)
     {
-      typedef MagnitudeType magnitude_type;
-      const magnitude_type machEps =
-        std::numeric_limits<magnitude_type>::epsilon();
+      using mag_type = MagnitudeType;
+      const mag_type machEps =
+        std::numeric_limits<mag_type>::epsilon();
       sigma.resize (numValues);
 
       // Relative amount by which to perturb each singular value.  The
       // perturbation will be multiplied by a normal(0,1) pseudorandom
       // number drawn from magGen.
-      const magnitude_type perturbationFactor = magnitude_type(10) * machEps;
-
-      sigma[0] = magnitude_type (1);
-      for (Ordinal k = 1; k < numValues; ++k)
-        {
-          const magnitude_type perturbation = perturbationFactor * magGen();
-          const magnitude_type beforePerturb = sigma[k-1] / magnitude_type(2);
-          const magnitude_type candidate = beforePerturb + perturbation;
-
-          // If adding the perturbation to beforePerturb would result
-          // in a nonpositive number, subtract instead.
-          if (candidate <= magnitude_type(0))
-            sigma[k] = beforePerturb - perturbation;
-          else
-            sigma[k] = candidate;
+      const mag_type perturbationFactor = mag_type(10) * machEps;
+
+      sigma[0] = mag_type (1);
+      for (Ordinal k = 1; k < numValues; ++k) {
+        const mag_type perturbation = perturbationFactor * magGen();
+        const mag_type beforePerturb = sigma[k-1] / mag_type(2);
+        const mag_type candidate = beforePerturb + perturbation;
+
+        // If adding the perturbation to beforePerturb would result
+        // in a nonpositive number, subtract instead.
+        if (candidate <= mag_type {}) {
+          sigma[k] = beforePerturb - perturbation;
+        }
+        else {
+          sigma[k] = candidate;
         }
+      }
     }
 
     static void
@@ -98,41 +112,42 @@ namespace TSQR {
       using std::endl;
 
       const char prefix[] = "%";
-      cout << prefix
-           << "method"
-           << ",kernel"
+      cout << prefix << "kernel"
+           << ",combiner"
            << ",scalarType"
            << ",numRows"
            << ",numCols"
+           << ",frobA"
            << ",absFrobResid"
            << ",absFrobOrthog"
-           << ",frobA"
            << endl;
     }
 
     template<class MagnitudeType>
     static void
-    printR1R2results (const std::string& datatype,
+    printR1R2results (const std::string& combinerName,
+                      const std::string& scalarName,
                       const int numCols,
                       const std::vector<MagnitudeType>& results)
     {
       using std::cout;
       using std::endl;
 
-      cout << "Combine"
-           << "," << "R1R2"
-           << "," << datatype
+      cout << "R1R2"
+           << "," << combinerName
+           << "," << scalarName
            << "," << (2*numCols)
            << "," << numCols
+           << "," << results[2]
            << "," << results[0]
            << "," << results[1]
-           << "," << results[2]
            << endl;
     }
 
     template<class MagnitudeType>
     static void
-    printR3Aresults (const std::string& datatype,
+    printR3Aresults (const std::string& combinerName,
+                     const std::string& scalarName,
                      const int numRows,
                      const int numCols,
                      const std::vector<MagnitudeType>& results)
@@ -140,62 +155,68 @@ namespace TSQR {
       using std::cout;
       using std::endl;
 
-      cout << "Combine"
-           << "," << "R3A"
-           << "," << datatype
+      cout << "R3A"
+           << "," << combinerName
+           << "," << scalarName
            << "," << numRows
            << "," << numCols
+           << "," << results[5]
            << "," << results[3]
            << "," << results[4]
-           << "," << results[5]
            << endl;
     }
 
     template<class MagnitudeType>
     static void
-    printResults (const std::string& datatype,
+    printResults (const std::string& combinerName,
+                  const std::string& scalarName,
                   const int numRows,
                   const int numCols,
-                  const std::vector<MagnitudeType>& results,
-                  const bool printFieldNames)
+                  const std::vector<MagnitudeType>& results)
+    {
+      printR1R2results (combinerName, scalarName, numCols, results);
+      printR3Aresults (combinerName, scalarName,
+                       numRows, numCols, results);
+    }
+
+    static void
+    printSimSeqTsqrFieldNames ()
     {
-      if (printFieldNames)
-        printCombineFieldNames();
-      printR1R2results (datatype, numCols, results);
-      printR3Aresults (datatype, numRows, numCols, results);
+      using std::cout;
+      using std::endl;
+
+      const char prefix[] = "%";
+      cout << prefix
+           << "method"
+           << ",combiner"
+           << ",scalarType"
+           << ",numRows"
+           << ",numCols"
+           << ",frobA"
+           << ",absFrobResid"
+           << ",absFrobOrthog"
+           << endl;
     }
 
     template<class MagnitudeType>
     static void
-    printSimSeqTsqrResults (const std::string& datatype,
+    printSimSeqTsqrResults (const std::string& combinerName,
+                            const std::string& scalarName,
                             const int numRows,
                             const int numCols,
-                            const std::vector<MagnitudeType>& results,
-                            const bool printFieldNames)
+                            const std::vector<MagnitudeType>& results)
     {
       using std::cout;
       using std::endl;
 
-      if (printFieldNames)
-        {
-          const char prefix[] = "%";
-          cout << prefix
-               << "method"
-               << ",scalarType"
-               << ",numRows"
-               << ",numCols"
-               << ",absFrobResid"
-               << ",absFrobOrthog"
-               << ",frobA"
-               << endl;
-        }
       cout << "CombineSimSeqTsqr"
-           << "," << datatype
+           << "," << combinerName
+           << "," << scalarName
            << "," << numRows
            << "," << numCols
+           << "," << results[2]
            << "," << results[0]
            << "," << results[1]
-           << "," << results[2]
            << endl;
     }
 
@@ -204,7 +225,8 @@ namespace TSQR {
     printMatrix (std::ostream& out,
                  const MatrixViewType& A)
     {
-      print_local_matrix (out, A.extent(0), A.extent(1), A.data(), A.stride(1));
+      print_local_matrix (out, A.extent(0), A.extent(1),
+                          A.data(), A.stride(1));
     }
 
     template<class MatrixViewType>
@@ -218,8 +240,10 @@ namespace TSQR {
                  const MatrixViewType& Q,
                  const MatrixViewType& R)
     {
-      return local_verify (A.extent(0), A.extent(1), A.data(), A.stride(1),
-                           Q.data(), Q.stride(1), R.data(), R.stride(1));
+      return local_verify (A.extent(0), A.extent(1),
+                           A.data(), A.stride(1),
+                           Q.data(), Q.stride(1),
+                           R.data(), R.stride(1));
     }
 
     /// \brief Test accuracy of TSQR::Combine
@@ -230,13 +254,17 @@ namespace TSQR {
     /// 2. [R; A] where R is ncols by ncols upper triangular, and A is
     ///    nrows by ncols general dense.
     ///
-    /// \return ($\|A - QR\|_F$, $\|I - Q^* Q\|_F$, $\|A\|_F$) for each
-    ///   test problem (so, a vector of six elements).
+    /// Print ($\|A - QR\|_F$, $\|I - Q^* Q\|_F$, $\|A\|_F$) for each
+    /// test problem (6 numbers in total).
     ///
-    template<class Ordinal, class Scalar>
-    static std::vector<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>
+    template<class Ordinal,
+             class Scalar,
+             class CombineType>
+    void
     verifyCombineTemplate (TSQR::Random::NormalGenerator<Ordinal, Scalar>& gen,
                            TSQR::Random::NormalGenerator<Ordinal, typename Teuchos::ScalarTraits<Scalar>::magnitudeType>& magGen,
+                           CombineType& combiner,
+                           const std::string& combinerName,
                            const Ordinal numRows,
                            const Ordinal numCols,
                            const bool debug)
@@ -251,11 +279,11 @@ namespace TSQR {
       using std::vector;
 
       typedef Teuchos::ScalarTraits<Scalar> STS;
-      typedef typename STS::magnitudeType magnitude_type;
+      typedef typename STS::magnitudeType mag_type;
       typedef NormalGenerator<Ordinal, Scalar> normgen_type;
       typedef MatrixGenerator<Ordinal, Scalar, normgen_type> matgen_type;
       typedef Matrix<Ordinal, Scalar> matrix_type;
-      typedef vector<magnitude_type> results_type;
+      typedef vector<mag_type> results_type;
 
       if (numRows < numCols) {
         ostringstream os;
@@ -271,37 +299,25 @@ namespace TSQR {
       // Generate four different sets of singular values.  Randomly
       // perturb them, but make sure all are positive.
       //
-      vector< magnitude_type > sigma_R1 (numCols);
-      vector< magnitude_type > sigma_R2 (numCols);
-      vector< magnitude_type > sigma_R3 (numCols);
-      vector< magnitude_type > sigma_A (numCols);
+      vector<mag_type> sigma_R1 (numCols);
+      vector<mag_type> sigma_R2 (numCols);
+      vector<mag_type> sigma_R3 (numCols);
+      vector<mag_type> sigma_A (numCols);
       generateSingularValues (magGen, sigma_R1, numCols);
       generateSingularValues (magGen, sigma_R2, numCols);
       generateSingularValues (magGen, sigma_R3, numCols);
       generateSingularValues (magGen, sigma_A, numCols);
 
-      matrix_type R1 (numCols, numCols, Scalar(0));
-      matrix_type R2 (numCols, numCols, Scalar(0));
-      matrix_type R3 (numCols, numCols, Scalar(0));
-      matrix_type A (numRows, numCols, Scalar(0));
+      matrix_type R1 (numCols, numCols, Scalar{});
+      matrix_type R2 (numCols, numCols, Scalar{});
+      matrix_type R3 (numCols, numCols, Scalar{});
+      matrix_type A (numRows, numCols, Scalar{});
       matgen_type matgen (gen);
       matgen.fill_random_R (numCols, R1.data(), R1.stride(1), &sigma_R1[0]);
       matgen.fill_random_R (numCols, R2.data(), R2.stride(1), &sigma_R2[0]);
       matgen.fill_random_R (numCols, R3.data(), R3.stride(1), &sigma_R3[0]);
       matgen.fill_random_svd (numRows, numCols, A.data(), A.stride(1), &sigma_A[0]);
 
-      if (false && debug) {
-        cerr << endl << "First test problem:" << endl;
-        print_local_matrix (cerr, numCols, numCols, R1.data(), R1.stride(1));
-        print_local_matrix (cerr, numCols, numCols, R2.data(), R2.stride(1));
-        cerr << endl;
-
-        cerr << endl << "Second test problem:" << endl;
-        print_local_matrix (cerr, numCols, numCols, R3.data(), R3.stride(1));
-        print_local_matrix (cerr, numRows, numCols, A.data(), A.stride(1));
-        cerr << endl;
-      }
-
       // Space to put the original test problem, expressed as one
       // dense matrix rather than in two blocks.  These will be deep
       // copies of the test problems, since the test problem matrices
@@ -324,18 +340,13 @@ namespace TSQR {
       }
 
       // Space to put the explicit Q factors.
-      matrix_type Q_R1R2 (Ordinal(2) * numCols, numCols, Scalar(0));
-      matrix_type Q_R3A (numRows + numCols, numCols, Scalar(0));
-
-      // Fill the explicit Q factor matrices with the first numCols
-      // columns of the identity matrix.
-      for (Ordinal k = 0; k < numCols; ++k) {
-        // FIXME (mfh 26 Nov 2019) Eventually we want to get away from
-        // direct modification of the entries of a Matrix or MatView,
-        // in favor of only doing so with a Kokkos kernel or TPL.
-        Q_R1R2(k, k) = Scalar(1.0);
-        Q_R3A(k, k) = Scalar(1.0);
-      }
+      matrix_type Q_R1R2 (Ordinal(2) * numCols, numCols, Scalar {});
+      auto Q_R1_Q_R2 = partition_2x1 (Q_R1R2.view (), numCols);
+      matrix_type Q_R3A (numCols + numRows, numCols, Scalar {});
+      auto Q_R3_A = partition_2x1 (Q_R3A.view (), numCols);
+
+      fill_with_identity_columns (Q_R1R2.view ());
+      fill_with_identity_columns (Q_R3A.view ());
 
       // tau factor arrays, one for each factorization test.
       vector<Scalar> tau_R1R2 (numCols);
@@ -343,7 +354,9 @@ namespace TSQR {
 
       // Workspace array for factorization and applying the Q factor.
       // We recycle this workspace for all tests.
-      vector<Scalar> work (numCols);
+      const Ordinal lwork =
+        combiner.work_size (numRows, numCols, numCols);
+      vector<Scalar> work (lwork);
 
       if (debug) {
         cerr << endl << "----------------------------------------" << endl
@@ -351,48 +364,47 @@ namespace TSQR {
              << "qr( [R1; R2] ), with R1 and R2 " << numCols
              << " by " << numCols << endl << endl;
       }
-      Combine<Ordinal, Scalar> combiner;
-      combiner.factor_pair (R1.view(), R2.view(),
-                            tau_R1R2.data(), work.data());
-      combiner.apply_pair (ApplyType("N"), numCols, numCols,
-                           R2.data(), R2.stride(1), tau_R1R2.data(),
-                           &Q_R1R2(0, 0), Q_R1R2.stride(1),
-                           &Q_R1R2(numCols, 0), Q_R1R2.stride(1),
-                           work.data());
+      combiner.factor_pair (R1.view (), R2.view (),
+                            tau_R1R2.data (), work.data (), lwork);
+      combiner.apply_pair (ApplyType ("N"), R2.view (),
+                           tau_R1R2.data (),
+                           Q_R1_Q_R2.first, Q_R1_Q_R2.second,
+                           work.data (), lwork);
       if (debug) {
         cerr << "Results of first test problem:" << endl;
         cerr << "-- Copy of test problem:" << endl;
-        print_local_matrix (cerr, A_R1R2.extent(0), A_R1R2.extent(1),
-                            A_R1R2.data(), A_R1R2.stride(1));
+        print_local_matrix (cerr, A_R1R2.extent (0),
+                            A_R1R2.extent (1), A_R1R2.data (),
+                            A_R1R2.stride (1));
         cerr << endl << "-- Q factor:" << endl;
-        print_local_matrix (cerr, Q_R1R2.extent(0), Q_R1R2.extent(1),
-                            Q_R1R2.data(), Q_R1R2.stride(1));
+        print_local_matrix (cerr, Q_R1R2.extent (0),
+                            Q_R1R2.extent (1), Q_R1R2.data (),
+                            Q_R1R2.stride (1));
         cerr << endl << "-- R factor:" << endl;
-        print_local_matrix (cerr, R1.extent(0), R1.extent(1),
-                            R1.data(), R1.stride(1));
+        print_local_matrix (cerr, R1.extent (0), R1.extent (1),
+                            R1.data (), R1.stride (1));
         cerr << endl;
       }
       const results_type firstResults =
-        local_verify (A_R1R2.extent(0), A_R1R2.extent(1),
-                      A_R1R2.data(), A_R1R2.stride(1),
-                      Q_R1R2.data(), Q_R1R2.stride(1),
-                      R1.data(), R1.stride(1));
+        local_verify (A_R1R2.extent (0), A_R1R2.extent (1),
+                      A_R1R2.data (), A_R1R2.stride (1),
+                      Q_R1R2.data (), Q_R1R2.stride (1),
+                      R1.data (), R1.stride (1));
       if (debug) {
         cerr << "\\| A - Q*R \\|_F = " << firstResults[0] << endl
              << "\\| I - Q'*Q \\|_F = " << firstResults[1] << endl
              << "\\| A \\|_A = " << firstResults[2] << endl;
-        cerr << endl << "----------------------------------------" << endl
-             << "TSQR::Combine second test problem:" << endl
-             << "qr( [R3; A] ), with R3 " << numCols << " by " << numCols
-             << " and A " << numRows << " by " << numCols << endl << endl;
+        cerr << endl << "----------------------------------------"
+             << endl << "TSQR::Combine second test problem:" << endl
+             << "qr( [R3; A] ), with R3 " << numCols << " by "
+             << numCols << " and A " << numRows << " by " << numCols
+             << endl << endl;
       }
-      combiner.factor_inner (R3.view(), A.view(),
-                             tau_R3A.data(), work.data());
-      combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols,
-                            A.data(), A.stride(1), tau_R3A.data(),
-                            &Q_R3A(0, 0), Q_R3A.stride(1),
-                            &Q_R3A(numCols, 0), Q_R3A.stride(1),
-                            work.data());
+      combiner.factor_inner (R3.view (), A.view (),
+                             tau_R3A.data (), work.data (), lwork);
+      combiner.apply_inner (ApplyType ("N"), A.view (),
+                            tau_R3A.data (), Q_R3_A.first,
+                            Q_R3_A.second, work.data (), lwork);
       if (debug) {
         cerr << "Results of second test problem:" << endl;
         cerr << "-- Copy of test problem:" << endl;
@@ -416,7 +428,7 @@ namespace TSQR {
              << "\\| I - Q'*Q \\|_F = " << secondResults[1] << endl
              << "\\| A \\|_A = " << secondResults[2] << endl;
       }
-      vector<magnitude_type> finalResults;
+      vector<mag_type> finalResults;
       finalResults.push_back (firstResults[0]);
       finalResults.push_back (firstResults[1]);
       finalResults.push_back (firstResults[2]);
@@ -424,14 +436,74 @@ namespace TSQR {
       finalResults.push_back (secondResults[0]);
       finalResults.push_back (secondResults[1]);
       finalResults.push_back (secondResults[2]);
-      return finalResults;
+
+      const std::string scalarName =
+        Teuchos::TypeNameTraits<Scalar>::name ();
+      printResults (combinerName, scalarName, numRows, numCols,
+                    finalResults);
+    }
+
+    /// Run verifyCombineTemplate once for each combiner that
+    /// CombineFactory creates by name ("Native", then "Default").
+    /// \param iseed [in/out] Seed; overwritten with normgenS's seed.
+    template<class Ordinal,
+             class Scalar>
+    void
+    verifyCombineTemplateAllCombiners (std::vector<int>& iseed,
+                                       const Ordinal numRows,
+                                       const Ordinal numCols,
+                                       const bool debug)
+    {
+      using mag_type =
+        typename Teuchos::ScalarTraits<Scalar>::magnitudeType;
+      Random::NormalGenerator<int, Scalar> normgenS (iseed);
+      Random::NormalGenerator<int, mag_type> normgenM (iseed);
+
+      using factory_type = CombineFactory<int, Scalar>;
+      {
+        const std::string combinerName ("Native");
+        auto combiner = factory_type::create (combinerName);
+        TEUCHOS_ASSERT( combiner.get () != nullptr );
+        // Make sure it's the right type.
+        using expected_type = CombineNative<int, Scalar>;
+        expected_type* combinerPtr =
+          dynamic_cast<expected_type*> (combiner.get ());
+        TEUCHOS_ASSERT( combinerPtr != nullptr );
+        verifyCombineTemplate (normgenS, normgenM, *combiner,
+                               combinerName, numRows, numCols,
+                               debug);
+      }
+      {
+        const std::string combinerName ("Default");
+        auto combiner = factory_type::create (combinerName);
+        TEUCHOS_ASSERT( combiner.get () != nullptr );
+        // Make sure it's the right type.
+        using expected_type = CombineDefault<int, Scalar>;
+        expected_type* combinerPtr =
+          dynamic_cast<expected_type*> (combiner.get ());
+        TEUCHOS_ASSERT( combinerPtr != nullptr );
+        verifyCombineTemplate (normgenS, normgenM, *combiner,
+                               combinerName, numRows, numCols,
+                               debug);
+      }
+
+      // Fetch the pseudorandom seed from the previous test.
+      //
+      // Even though normgenS and normgenM each updated the random
+      // seed independently, for now we just fetch the updated seed
+      // from normgenS.  This should still produce reproducible
+      // results.
+      normgenS.getSeed (iseed);
+    }
 
     //! Simulate one combine step of Sequential TSQR
-    template<class Ordinal, class Scalar>
-    static std::vector<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>
+    template<class Ordinal,
+             class Scalar,
+             class CombineType>
+    std::vector<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>
     verifyCombineSeqTemplate (TSQR::Random::NormalGenerator<Ordinal, Scalar>& gen,
                               TSQR::Random::NormalGenerator<Ordinal, typename Teuchos::ScalarTraits<Scalar>::magnitudeType>& magGen,
+                              CombineType& combiner,
                               const Ordinal numRows,
                               const Ordinal numCols,
                               const bool debug)
@@ -446,12 +518,12 @@ namespace TSQR {
       using std::vector;
 
       typedef Teuchos::ScalarTraits<Scalar> STS;
-      typedef typename STS::magnitudeType magnitude_type;
+      typedef typename STS::magnitudeType mag_type;
       typedef NormalGenerator< Ordinal, Scalar > normgen_type;
       typedef MatrixGenerator< Ordinal, Scalar, normgen_type > matgen_type;
       typedef Matrix<Ordinal, Scalar> matrix_type;
       typedef MatView<Ordinal, Scalar> mat_view_type;
-      typedef vector<magnitude_type> results_type;
+      typedef vector<mag_type> results_type;
 
       if (numRows < numCols) {
         ostringstream os;
@@ -464,32 +536,24 @@ namespace TSQR {
       }
 
       // Generate two different sets of singular values.
-      vector< magnitude_type > sigma_A1 (numCols);
-      vector< magnitude_type > sigma_A2 (numCols);
+      vector<mag_type> sigma_A1 (numCols);
+      vector<mag_type> sigma_A2 (numCols);
       generateSingularValues (magGen, sigma_A1, numCols);
       generateSingularValues (magGen, sigma_A2, numCols);
 
-      // Matrix consisting of two cache blocks.
-      matrix_type A (Ordinal(2)*numRows, numCols, Scalar(0));
+      // Matrix consisting of two "cache blocks."
+      matrix_type A (Ordinal(2)*numRows, numCols, Scalar{});
+      auto A1_A2 = partition_2x1 (A, numRows);
       // Views of the two cache blocks.
-      mat_view_type A1 (numRows, numCols, &A(0,0), A.stride(1));
-      mat_view_type A2 (numRows, numCols, &A(numRows,0), A.stride(1));
+      mat_view_type A1 = A1_A2.first;
+      mat_view_type A2 = A1_A2.second;
 
       // Fill the two cache blocks with random test problems.
       matgen_type matgen (gen);
-      matgen.fill_random_svd (numRows, numCols, A1.data(), A1.stride(1), &sigma_A1[0]);
-      matgen.fill_random_svd (numRows, numCols, A2.data(), A2.stride(1), &sigma_A2[0]);
-
-      if (false && debug) {
-        cerr << endl << "Test problem:" << endl;
-        cerr << endl << "Original matrix:" << endl;
-        printMatrix (cerr, A);
-        cerr << endl << "First cache block:" << endl;
-        printMatrix (cerr, A1);
-        cerr << endl << "Second cache block:" << endl;
-        printMatrix (cerr, A2);
-        cerr << endl;
-      }
+      matgen.fill_random_svd (numRows, numCols, A1.data(),
+                              A1.stride(1), sigma_A1.data ());
+      matgen.fill_random_svd (numRows, numCols, A2.data(),
+                              A2.stride(1), sigma_A2.data ());
 
       // Copy of the resulting test problem, stored as one dense
       // matrix rather than as two blocks.  We will use A_copy to
@@ -498,20 +562,10 @@ namespace TSQR {
       matrix_type A_copy (A);
 
       // Space to put the explicit Q factor.
-      matrix_type Q (Ordinal(2) * numRows, numCols, Scalar(0));
-
-      // Fill Q with the first numCols columns of the identity matrix.
-      for (Ordinal k = 0; k < numCols; ++k) {
-        // FIXME (mfh 26 Nov 2019) I'm assuming I can write to the
-        // Matrix or MatView on host, outside of Kokkos.  TSQR always
-        // assumed this, but if we want to use Kokkos, we'll need to
-        // get rid of that assumption.
-        Q(k, k) = Scalar(1.0);
-      }
-
-      // Two cache blocks (as views) of Q.
-      mat_view_type Q1 (numRows, numCols, &Q(0,0), Q.stride(1));
-      mat_view_type Q2 (numRows, numCols, &Q(numRows,0), Q.stride(1));
+      matrix_type Q (Ordinal(2) * numRows, numCols, Scalar {});
+      fill_with_identity_columns (Q.view ());
+      // Two "cache blocks" (as views) of Q.
+      auto Q1_Q2 = partition_2x1 (Q.view (), numRows);
 
       // Two tau factor arrays, one for each cache block.
       vector<Scalar> tau1 (numCols);
@@ -519,21 +573,23 @@ namespace TSQR {
 
       // Workspace array for factorization and applying the Q factor.
       // We recycle this workspace for all tests.
-      vector<Scalar> work (numCols);
+      const Ordinal lwork =
+        combiner.work_size (numRows, numCols, numCols);
+      vector<Scalar> work (lwork);
 
       if (debug) {
-        cerr << endl << "----------------------------------------" << endl
-             << "TSQR::Combine SequentialTsqr simulation with 2 cache blocks:"
-             << endl << "qr( [A1; A2] ), with A1 and A2 being each "
-             << numRows << " by " << numCols << endl << endl;
+        cerr << endl << "----------------------------------------"
+          << endl << "TSQR::Combine SequentialTsqr simulation with 2 "
+          "cache blocks:" << endl << "qr( [A1; A2] ), with A1 and A2 "
+          "each " << numRows << " by " << numCols << endl << endl;
       }
-      Combine<Ordinal, Scalar> combiner;
       // qr( A1 )
-      combiner.factor_first (A1, tau1.data(), work.data());
+      combiner.factor_first (A1, tau1.data (), work.data (), lwork);
       // View of numCols by numCols upper triangle of A1.
       mat_view_type R1 (numCols, numCols, A1.data(), A1.stride(1));
       // qr( [R1; A2] )
-      combiner.factor_inner (R1, A2, tau2.data(), work.data());
+      combiner.factor_inner (R1, A2, tau2.data (),
+                             work.data (), lwork);
       // Extract (a deep copy of) the R factor.
       matrix_type R (R1);
       // Zero out everything below the diagonal of R.
@@ -549,14 +605,11 @@ namespace TSQR {
 
       // Compute the explicit Q factor, by starting with A2 and
       // (working up the matrix A,) finishing with A1.
-      combiner.apply_inner (ApplyType::NoTranspose,
-                            numRows, numCols, numCols,
-                            A2.data(), A2.stride(1), tau2.data(),
-                            Q1.data(), Q1.stride(1),
-                            Q2.data(), Q2.stride(1), work.data());
-      combiner.apply_first (ApplyType::NoTranspose,
-                            A1, tau1.data(),
-                            Q1, work.data());
+      combiner.apply_inner (ApplyType::NoTranspose, A2, tau2.data (),
+                            Q1_Q2.first, Q1_Q2.second,
+                            work.data (), lwork);
+      combiner.apply_first (ApplyType::NoTranspose, A1, tau1.data (),
+                            Q1_Q2.first, work.data (), lwork);
       if (debug) {
         cerr << "Results of first test problem:" << endl;
         cerr << "-- Test matrix A:" << endl;
@@ -588,9 +641,9 @@ namespace TSQR {
     {
       using TSQR::Random::NormalGenerator;
       using std::cerr;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
+#ifdef HAVE_TPETRATSQR_COMPLEX
       using std::complex;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+#endif // HAVE_TPETRATSQR_COMPLEX
       using std::cout;
       using std::endl;
       using std::pair;
@@ -613,130 +666,122 @@ namespace TSQR {
       iseed[2] = 0;
       iseed[3] = 1;
 
-      // Whether to print the field (i.e., column) names for the
-      // output data.
-      bool doPrintFieldNames = printFieldNames;
-
       if (! simulateSequentialTsqr) {
+        printCombineFieldNames ();
         if (testReal) {
           {
-            NormalGenerator<int, float> normgenS (iseed);
-            const vector<float> resultsS =
-              verifyCombineTemplate (normgenS, normgenS, numRows,
-                                     numCols, debug);
-            // Only print field names (if at all) once per run, for
-            // the first data type.
-            printResults (string("float"), numRows, numCols,
-                          resultsS, doPrintFieldNames);
-            // Print field names at most once.
-            doPrintFieldNames = false;
-            // Fetch the pseudorandom seed from the previous test.
-            normgenS.getSeed (iseed);
+            using scalar_type = float;
+            verifyCombineTemplateAllCombiners<int, scalar_type>
+              (iseed, numRows, numCols, debug);
           }
           {
-            NormalGenerator<int, double> normgenD (iseed);
-            const vector<double> resultsD =
-              verifyCombineTemplate (normgenD, normgenD, numRows,
-                                     numCols, debug);
-            printResults (string("double"), numRows, numCols,
-                          resultsD, doPrintFieldNames);
-            doPrintFieldNames = false;
-            normgenD.getSeed (iseed);
+            using scalar_type = double;
+            verifyCombineTemplateAllCombiners<int, scalar_type>
+              (iseed, numRows, numCols, debug);
           }
         }
-
-        if (testComplex)
+        if (testComplex) {
+#ifdef HAVE_TPETRATSQR_COMPLEX
           {
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-            {
-              NormalGenerator<int, complex<float> > normgenC (iseed);
-              NormalGenerator<int, float> normgenS (iseed);
-              const vector<float> resultsC =
-                verifyCombineTemplate (normgenC, normgenS, numRows,
-                                       numCols, debug);
-              printResults (string("complex<float>"), numRows, numCols,
-                            resultsC, doPrintFieldNames);
-              doPrintFieldNames = false;
-              // Even though normgenC and normgenS each updated the
-              // random seed independently, for now we just fetch the
-              // updated seed from normgenC.  This should still
-              // produce reproducible results.
-              normgenC.getSeed (iseed);
-            }
-            {
-              NormalGenerator<int, complex<double> > normgenZ (iseed);
-              NormalGenerator<int, double> normgenD (iseed);
-              const vector<double> resultsZ =
-                verifyCombineTemplate (normgenZ, normgenD, numRows,
-                                       numCols, debug);
-              printResults (string("complex<double>"), numRows, numCols,
-                            resultsZ, doPrintFieldNames);
-              doPrintFieldNames = false;
-              normgenZ.getSeed (iseed);
-            }
-#else // NOT HAVE_KOKKOSTSQR_COMPLEX
-            TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
-                                       "Trilinos was not built with "
-                                       "complex arithmetic support");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+            using scalar_type = std::complex<float>;
+            verifyCombineTemplateAllCombiners<int, scalar_type>
+              (iseed, numRows, numCols, debug);
           }
+          {
+            using scalar_type = std::complex<double>;
+            verifyCombineTemplateAllCombiners<int, scalar_type>
+              (iseed, numRows, numCols, debug);
+          }
+#else // NOT HAVE_TPETRATSQR_COMPLEX
+          TEUCHOS_TEST_FOR_EXCEPTION
+            (true, std::logic_error, "You set testComplex=true, but "
+             "Trilinos was not built with complex arithmetic support "
+             "enabled.");
+#endif // HAVE_TPETRATSQR_COMPLEX
+        }
       }
       else { // simulateSequentialTsqr
+        printSimSeqTsqrFieldNames ();
         if (testReal) {
           {
-            NormalGenerator<int, float> normgenS (iseed);
-            const vector<float> resultsS =
-              verifyCombineSeqTemplate (normgenS, normgenS, numRows,
-                                        numCols, debug);
-            printSimSeqTsqrResults (string("float"), numRows, numCols,
-                                    resultsS, doPrintFieldNames);
-            doPrintFieldNames = false;
+            using scalar_type = float;
+
+            NormalGenerator<int, scalar_type> normgenS (iseed);
+            auto combiner =
+              CombineFactory<int, scalar_type>::create (numCols);
+            const std::string combinerName ("?");
+            const auto results =
+              verifyCombineSeqTemplate (normgenS, normgenS, *combiner,
+                                        numRows, numCols, debug);
+            const std::string scalarName =
+              Teuchos::TypeNameTraits<scalar_type>::name ();
+            printSimSeqTsqrResults (combinerName, scalarName,
+                                    numRows, numCols, results);
             normgenS.getSeed (iseed);
           }
           {
-            NormalGenerator<int, double> normgenD (iseed);
-            const vector<double> resultsD =
-              verifyCombineSeqTemplate (normgenD, normgenD, numRows,
-                                        numCols, debug);
-            printSimSeqTsqrResults (string("double"), numRows, numCols,
-                                    resultsD, doPrintFieldNames);
-            doPrintFieldNames = false;
-            normgenD.getSeed (iseed);
+            using scalar_type = double;
+
+            NormalGenerator<int, scalar_type> normgenS (iseed);
+            auto combiner =
+              CombineFactory<int, scalar_type>::create (numCols);
+            const std::string combinerName ("?");
+            const auto results =
+              verifyCombineSeqTemplate (normgenS, normgenS, *combiner,
+                                        numRows, numCols, debug);
+            const std::string scalarName =
+              Teuchos::TypeNameTraits<scalar_type>::name ();
+            printSimSeqTsqrResults (combinerName, scalarName,
+                                    numRows, numCols, results);
+            normgenS.getSeed (iseed);
           }
         }
 
         if (testComplex) {
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
+#ifdef HAVE_TPETRATSQR_COMPLEX
           {
-            NormalGenerator<int, complex<float> > normgenC (iseed);
-            NormalGenerator<int, float> normgenS (iseed);
-            const vector<float> resultsC =
-              verifyCombineSeqTemplate (normgenC, normgenS, numRows,
-                                        numCols, debug);
-            printSimSeqTsqrResults (string("complex<float>"), numRows, numCols,
-                                    resultsC, doPrintFieldNames);
-            doPrintFieldNames = false;
-            normgenC.getSeed (iseed);
+            using scalar_type = complex<float>;
+            using mag_type = float;
+
+            NormalGenerator<int, scalar_type> normgenS (iseed);
+            NormalGenerator<int, mag_type> normgenM (iseed);
+            auto combiner =
+              CombineFactory<int, scalar_type>::create (numCols);
+            const std::string combinerName ("?");
+            const auto results =
+              verifyCombineSeqTemplate (normgenS, normgenM, *combiner,
+                                        numRows, numCols, debug);
+            const std::string scalarName =
+              Teuchos::TypeNameTraits<scalar_type>::name ();
+            printSimSeqTsqrResults (combinerName, scalarName,
+                                    numRows, numCols, results);
+            normgenS.getSeed (iseed);
           }
           {
-            NormalGenerator<int, complex<double> > normgenZ (iseed);
-            NormalGenerator<int, double> normgenD (iseed);
-            const vector<double> resultsZ =
-              verifyCombineSeqTemplate (normgenZ, normgenD, numRows,
-                                        numCols, debug);
-            printSimSeqTsqrResults (string("complex<double>"), numRows,
-                                    numCols, resultsZ, doPrintFieldNames);
-            doPrintFieldNames = false;
-            normgenZ.getSeed (iseed);
+            using scalar_type = complex<double>;
+            using mag_type = double;
+
+            NormalGenerator<int, scalar_type> normgenS (iseed);
+            NormalGenerator<int, mag_type> normgenM (iseed);
+            auto combiner =
+              CombineFactory<int, scalar_type>::create (numCols);
+            const std::string combinerName ("?");
+            const auto results =
+              verifyCombineSeqTemplate (normgenS, normgenM, *combiner,
+                                        numRows, numCols, debug);
+            const std::string scalarName =
+              Teuchos::TypeNameTraits<scalar_type>::name ();
+            printSimSeqTsqrResults (combinerName, scalarName,
+                                    numRows, numCols, results);
+            normgenS.getSeed (iseed);
           }
-#else // NOT HAVE_KOKKOSTSQR_COMPLEX
-          TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
-                                     "Trilinos was not built with "
-                                     "complex arithmetic support");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+#else // NOT HAVE_TPETRATSQR_COMPLEX
+          TEUCHOS_TEST_FOR_EXCEPTION
+            (true, std::logic_error, "Trilinos was not built with "
+             "complex arithmetic support.");
+#endif // HAVE_TPETRATSQR_COMPLEX
         }
       }
     }
   } // namespace Test
 } // namespace TSQR
-
diff --git a/packages/tpetra/tsqr/src/Tsqr_CuSolverNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_CuSolverNodeTsqr.hpp
new file mode 100644
index 000000000000..3f9ef926cc34
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_CuSolverNodeTsqr.hpp
@@ -0,0 +1,1053 @@
+//@HEADER
+// ************************************************************************
+//
+//          Kokkos: Node API and Parallel Node Kernels
+//              Copyright (2008) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ************************************************************************
+//@HEADER
+
+/// \file Tsqr_CuSolverNodeTsqr.hpp
+/// \brief Declaration and definition of CuSolverNodeTsqr.
+
+#ifndef TSQR_CUSOLVERNODETSQR_HPP
+#define TSQR_CUSOLVERNODETSQR_HPP
+
+#include "TpetraTSQR_config.h"
+
+#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER)
+#include "Tsqr_NodeTsqr.hpp"
+#include "Tsqr_Impl_CuBlas.hpp"
+#include "Tsqr_Impl_CuSolver.hpp"
+#include "Kokkos_ArithTraits.hpp"
+#include <memory>
+#include <type_traits>
+
+#define TSQR_IMPL_CATCH( message ) /* must directly follow a try block */ \
+  catch (std::exception& e) { \
+    threw = true; /* threw, err, and prefix come from the expansion site */ \
+    err = std::unique_ptr<std::ostringstream> (new std::ostringstream); \
+    *err << prefix << message << std::endl << e.what (); \
+  } \
+  TEUCHOS_TEST_FOR_EXCEPTION /* evaluated after the handler has finished */ \
+    (threw, std::runtime_error, \
+     (err.get () == nullptr ? "Unknown error" : err->str ())); \
+  do {} while (false) /* lets the invocation end with a semicolon */
+
+#define TSQR_IMPL_CHECK_LAST_CUDA_ERROR( location ) /* location: label streamed into the message */ \
+  do { /* throws std::runtime_error if cudaGetLastError reports an error */ \
+    cudaError_t errCode = cudaGetLastError (); \
+    if (errCode != cudaSuccess ) { \
+      const char* errorString = cudaGetErrorString (errCode); \
+      TEUCHOS_TEST_FOR_EXCEPTION \
+        (true, std::runtime_error, "At \"" << (location) << "\", " \
+         "CUDA is in the following error state: " << errorString); \
+    } \
+  } while (false)
+
+namespace TSQR {
+  namespace Impl {
+
+    using cusolver_memory_space = Kokkos::CudaSpace; // behind the device_* aliases
+    using cusolver_execution_space = Kokkos::Cuda;
+    using host_device_type = Kokkos::Device< // behind the host_* aliases
+      Kokkos::DefaultHostExecutionSpace, Kokkos::HostSpace>;
+
+    // Mapping from Scalar to Kokkos value type.
+    // e.g., Scalar=std::complex<double> -> Kokkos::complex<double>.
+    // The lookup goes through Kokkos::ArithTraits on the non-const type.
+    template<class Scalar>
+    using non_const_kokkos_value_type = typename Kokkos::ArithTraits<
+        typename std::remove_const<Scalar>::type
+      >::val_type;
+    // Same mapping, except that a const Scalar yields a const result.
+    template<class Scalar>
+    using kokkos_view_value_type = typename std::conditional<
+        std::is_const<Scalar>::value,
+        const non_const_kokkos_value_type<Scalar>,
+        non_const_kokkos_value_type<Scalar>
+      >::type;
+
+    // vector_type, device_vector_type, and host_vector_type
+    // (rank-1 Views without the Unmanaged trait used by vec_view_type)
+    template<class T, class MemorySpace>
+    using vector_type = Kokkos::View<T*, MemorySpace>;
+
+    template<class T>
+    using device_vector_type = vector_type<T, cusolver_memory_space>;
+
+    template<class T>
+    using host_vector_type = vector_type<T, host_device_type>;
+
+    template<class T>
+    void
+    reallocDeviceVectorIfNeeded (device_vector_type<T>& vec,
+                                 const char label[], // label for a new allocation
+                                 const size_t minSize) // least acceptable vec.size()
+    {
+      using Kokkos::view_alloc;
+      using Kokkos::WithoutInitializing;
+      // Grow-only: vec is left alone if it already has minSize entries.
+      if (size_t (vec.size ()) < minSize) {
+        vec = device_vector_type<T> (); // drop the old View before allocating
+        auto alloc = view_alloc (std::string (label), WithoutInitializing);
+        vec = device_vector_type<T> (alloc, minSize); // old contents are not copied
+      }
+    }
+
+    // vec_view_type & device_vec_view_type
+    // (rank-1 Views carrying the Unmanaged memory trait)
+    template<class T, class MemorySpace>
+    using vec_view_type =
+      Kokkos::View<T*, MemorySpace,
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+
+    template<class T>
+    using device_vec_view_type = vec_view_type<T, cusolver_memory_space>;
+
+    // matrix_type & device_matrix_type
+    // (rank-2 LayoutLeft Views, without the Unmanaged trait)
+    template<class T, class MemorySpace>
+    using matrix_type = Kokkos::View<T**, Kokkos::LayoutLeft, MemorySpace>;
+
+    template<class T>
+    using device_matrix_type = matrix_type<T, cusolver_memory_space>;
+
+    // mat_view_type, device_mat_view_type, & host_mat_view_type
+    // (rank-2 LayoutLeft Views carrying the Unmanaged memory trait)
+    template<class T, class MemorySpace>
+    using mat_view_type =
+      Kokkos::View<T**, Kokkos::LayoutLeft, MemorySpace,
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+
+    template<class T>
+    using device_mat_view_type =
+      mat_view_type<T, cusolver_memory_space>;
+
+    template<class T>
+    using host_mat_view_type = mat_view_type<T, host_device_type>;
+
+    // get_mat_view, get_host_mat_view, & get_device_mat_view: wrap a raw
+    // array with leading dimension lda in an unmanaged LayoutLeft View.
+    template<class Scalar, class MemorySpace>
+    static mat_view_type<kokkos_view_value_type<Scalar>, MemorySpace>
+    get_mat_view (const size_t nrows,
+                  const size_t ncols,
+                  Scalar A[], // reinterpreted below as the Kokkos value type
+                  const size_t lda)
+    {
+      static_assert
+        (! std::is_const<non_const_kokkos_value_type<Scalar> >::value,
+         "non_const_kokkos_value_type is const.");
+      using KVVT = kokkos_view_value_type<Scalar>; // preserves const
+      static_assert
+        ((std::is_const<Scalar>::value && std::is_const<KVVT>::value) ||
+         (! std::is_const<Scalar>::value && ! std::is_const<KVVT>::value),
+         "kokkos_view_value_type failed to preserve const-ness.");
+      KVVT* A_raw = reinterpret_cast<KVVT*> (A);
+      // View all lda rows of each column, then keep only the first nrows.
+      mat_view_type<KVVT, MemorySpace> A_full (A_raw, lda, ncols);
+      const std::pair<size_t, size_t> rowRange (0, nrows);
+      return Kokkos::subview (A_full, rowRange, Kokkos::ALL ());
+    }
+
+    template<class Scalar> // raw-pointer overload; A is viewed as host memory
+    static host_mat_view_type<kokkos_view_value_type<Scalar>>
+    get_host_mat_view (const size_t nrows,
+                       const size_t ncols,
+                       Scalar A[],
+                       const size_t lda)
+    {
+      return get_mat_view<Scalar, host_device_type> // forwards unchanged
+        (nrows, ncols, A, lda);
+    }
+
+    template<class Scalar, class Ordinal> // MatView overload
+    static host_mat_view_type<kokkos_view_value_type<Scalar>>
+    get_host_mat_view (const MatView<Ordinal, Scalar>& A_host)
+    {
+      const size_t nrows (A_host.extent (0));
+      const size_t ncols (A_host.extent (1));
+      const size_t lda (A_host.stride (1)); // column stride is passed on as lda
+      return get_mat_view<Scalar, host_device_type>
+        (nrows, ncols, A_host.data (), lda);
+    }
+
+    template<class Scalar> // raw-pointer overload; A is viewed as CudaSpace memory
+    static device_mat_view_type<kokkos_view_value_type<Scalar>>
+    get_device_mat_view (const size_t nrows,
+                         const size_t ncols,
+                         Scalar A[],
+                         const size_t lda)
+    {
+      return get_mat_view<Scalar, cusolver_memory_space> (nrows, ncols, A, lda);
+    }
+
+    /// \brief Given rank-1 backing storage, return a device matrix
+    ///   view with the given dimensions (numRows by numCols), that
+    ///   has contiguous storage.  Reallocate storage if needed.
+    ///
+    /// "Contiguous storage" means that if A is the matrix view
+    /// result, then A.stride(1) == A.extent(0).  It aliases \c storage.
+    template<class T>
+    device_mat_view_type<T>
+    get_contiguous_device_mat_view (device_vector_type<T>& storage,
+                                    const size_t numRows,
+                                    const size_t numCols)
+    {
+      const char prefix[] = "TSQR::Impl::get_contiguous_device_mat_view: ";
+      // Report any CUDA error that is already pending on entry.
+      TSQR_IMPL_CHECK_LAST_CUDA_ERROR( prefix );
+
+      const size_t currentStorageSize (storage.extent (0));
+      const size_t requiredStorageSize = numRows * numCols;
+      if (currentStorageSize < requiredStorageSize) {
+        // It costs about as much to allocate 8B on device as 800B.
+        constexpr size_t minStorageSize = 100;
+        const size_t newStorageSize =
+          std::max (minStorageSize, requiredStorageSize);
+
+        // Free it first, so that two allocations won't coexist.
+        storage = device_vector_type<T> ();
+        using Kokkos::view_alloc;
+        using Kokkos::WithoutInitializing;
+        const char label[] = "matrixStorage";
+
+        TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::Impl::get_contiguous_device_mat_view: Right before allocating" );
+        // Rethrow allocation failures with the requested sizes attached.
+        try {
+          storage = device_vector_type<T>
+            (view_alloc (std::string (label), WithoutInitializing),
+             newStorageSize);
+        }
+        catch (std::exception& e) {
+          TEUCHOS_TEST_FOR_EXCEPTION
+            (true, std::runtime_error, prefix << "Allocating rank-1 "
+             "View of size " << newStorageSize << " to represent a "
+             << numRows << " x " << numCols << " matrix threw: "
+             << std::endl << e.what ());
+        }
+      }
+      return device_mat_view_type<T> (storage.data (),
+                                      numRows, numCols);
+    }
+
+    template<class T> // host counterpart of get_contiguous_device_mat_view
+    host_mat_view_type<T>
+    get_contiguous_host_mat_view (host_vector_type<T>& storage,
+                                  const size_t numRows,
+                                  const size_t numCols)
+    {
+      const char prefix[] = "TSQR::Impl::get_contiguous_host_mat_view: ";
+      // Grow-only: storage is replaced (old contents dropped) if too small.
+      const size_t currentStorageSize (storage.extent (0));
+      const size_t requiredStorageSize = numRows * numCols;
+      if (currentStorageSize < requiredStorageSize) {
+        // It costs about as much to allocate 8B on host as 800B.
+        constexpr size_t minStorageSize = 100;
+        const size_t newStorageSize =
+          std::max (minStorageSize, requiredStorageSize);
+
+        // Free it first, so that two allocations won't coexist.
+        storage = host_vector_type<T> ();
+        using Kokkos::view_alloc;
+        using Kokkos::WithoutInitializing;
+        const char label[] = "hostMatrixStorage";
+        // Rethrow allocation failures with the requested sizes attached.
+        try {
+          storage = host_vector_type<T>
+            (view_alloc (std::string (label), WithoutInitializing),
+             newStorageSize);
+        }
+        catch (std::exception& e) {
+          TEUCHOS_TEST_FOR_EXCEPTION
+            (true, std::runtime_error, prefix << "Allocating rank-1 "
+             "host View of size " << newStorageSize << " to store a "
+             << numRows << " x " << numCols << " matrix threw: "
+             << std::endl << e.what ());
+        }
+      }
+      return host_mat_view_type<T> (storage.data (), // unmanaged; aliases storage
+                                    numRows, numCols);
+    }
+
+    // info_type & const_info_type: rank-0 Views of one int in
+    // cusolver_memory_space; their data() is handed to CuSolver.
+    using info_type = Kokkos::View<int, cusolver_memory_space>;
+    using const_info_type = Kokkos::View<const int, cusolver_memory_space>;
+
+    /// \brief What CuSolverNodeTsqr::factor returns: tau and info.
+    template<class LocalOrdinal, class Scalar>
+    class CuSolverNodeFactorOutput :
+      public NodeFactorOutput<LocalOrdinal, Scalar>
+    {
+    public:
+      using kokkos_value_type = non_const_kokkos_value_type<Scalar>;
+      using const_tau_type = device_vector_type<const kokkos_value_type>;
+      using const_unmanaged_tau_type =
+        device_vec_view_type<const kokkos_value_type>;
+
+      CuSolverNodeFactorOutput (const const_tau_type& tau,
+                                const const_info_type& info) :
+        tau_ (tau), info_ (info) {}
+
+      const_unmanaged_tau_type tau () const { return tau_; }
+
+      //! Copy the stored info value out of its View and return it.
+      int info () const {
+        int hostInfo = 0;
+        Kokkos::deep_copy (hostInfo, info_);
+        return hostInfo;
+      }
+
+    private:
+      const_tau_type tau_;
+      const_info_type info_;
+    };
+
+    /// \brief Kokkos functor: for index j, set A(j,j) to one.
+    template<class ScalarType, class IndexType>
+    class SetDiagonalEntriesToOne {
+      static_assert (! std::is_const<ScalarType>::value,
+        "SetDiagonalEntriesToOne requires a View of nonconst.");
+      device_mat_view_type<ScalarType> A_;
+    public:
+      SetDiagonalEntriesToOne
+        (const device_mat_view_type<ScalarType>& A) : A_ (A) {}
+      KOKKOS_INLINE_FUNCTION void
+      operator() (const IndexType diagIndex) const {
+        A_(diagIndex, diagIndex) = ScalarType (1.0);
+      }
+    };
+
+    /// \brief Launch a parallel_for that sets A(j,j) = 1 for every j
+    ///   in [0, min(A.extent(0), A.extent(1))).
+    template<class ScalarType>
+    void
+    set_diagonal_entries_to_one (const device_mat_view_type<ScalarType>& A)
+    {
+      static_assert (! std::is_const<ScalarType>::value,
+        "set_diagonal_entries_to_one requires a View of nonconst.");
+      using index_type =
+        typename std::make_signed<decltype (A.extent (1)) >::type;
+      using functor_type = SetDiagonalEntriesToOne<ScalarType, index_type>;
+      const index_type diagLength = std::min (A.extent (0), A.extent (1));
+      Kokkos::parallel_for ("set_diagonal_entries_to_one",
+        Kokkos::RangePolicy<cusolver_execution_space, index_type>
+          (0, diagLength), functor_type (A));
+    }
+
+  } // namespace Impl
+
+  /// \class CuSolverNodeTsqr
+  /// \brief NodeTsqr implementation based on cuSOLVER.
+  /// \author Mark Hoemmen
+  template<class LocalOrdinal, class Scalar>
+  class CuSolverNodeTsqr : public NodeTsqr<LocalOrdinal, Scalar>
+  {
+  private:
+    using base_type = NodeTsqr<LocalOrdinal, Scalar>;
+    using my_factor_output_type =
+      Impl::CuSolverNodeFactorOutput<LocalOrdinal, Scalar>;
+    using kokkos_value_type =
+      Impl::non_const_kokkos_value_type<Scalar>;
+
+  public:
+    using ordinal_type = typename base_type::ordinal_type;
+    using scalar_type = typename base_type::scalar_type;
+    using factor_output_type = typename base_type::factor_output_type;
+
+    CuSolverNodeTsqr () = default;
+
+    //! A parameter list named "NodeTsqr"; no entries are added here.
+    Teuchos::RCP<const Teuchos::ParameterList>
+    getValidParameters () const override {
+      return Teuchos::parameterList ("NodeTsqr");
+    }
+
+    //! Does nothing; the input list is ignored.
+    void
+    setParameterList (const Teuchos::RCP<Teuchos::ParameterList>&) override
+    {}
+
+    //! Name of this NodeTsqr implementation.
+    std::string
+    description () const override { return "CuSolverNodeTsqr"; }
+
+    //! Always true for this implementation.
+    bool wants_device_memory () const override { return true; }
+
+    //! Always true for this implementation.
+    bool ready () const override { return true; }
+
+    //! Always false: nothing is promised about the sign of R's diagonal.
+    bool
+    QR_produces_R_factor_with_nonnegative_diagonal () const override {
+      return false;
+    }
+
+    //! Always zero for this implementation.
+    size_t
+    cache_size_hint () const override { return 0; }
+
+  private:
+    using tau_type = Impl::device_vector_type<kokkos_value_type>;
+
+    // Pass tau_ through reallocDeviceVectorIfNeeded for numCols
+    // entries, then return it.  Must return owning, since we'll pass
+    // off to factor output.
+    tau_type
+    get_tau (const LocalOrdinal numCols) const {
+      Impl::reallocDeviceVectorIfNeeded (tau_, "tau", size_t (numCols));
+      return tau_;
+    }
+
+    using work_type = Impl::device_vector_type<kokkos_value_type>;
+    using nonowning_work_type =
+      Impl::device_vec_view_type<kokkos_value_type>;
+
+    // Ask CuSolver how much workspace compute_QR needs for this
+    // problem, request at least that many (and never fewer than 128)
+    // entries for work_, and return a nonowning view of work_.
+    nonowning_work_type
+    get_work_for_geqrf (const LocalOrdinal numRows,
+                        const LocalOrdinal numCols,
+                        Scalar A[],
+                        const LocalOrdinal lda) const
+    {
+      auto info = get_info ();
+      TSQR::Impl::CuSolver<Scalar> solver
+        {TSQR::Impl::CuSolverHandle::getSingleton (), info.data ()};
+      const int lwork_needed =
+        solver.compute_QR_lwork (numRows, numCols, A, lda);
+
+      // Avoid constant reallocation by setting a minimum lwork.
+      constexpr int lwork_floor = 128;
+      const int lwork = lwork_needed < lwork_floor ? lwork_floor : lwork_needed;
+      Impl::reallocDeviceVectorIfNeeded (work_, "work", lwork);
+      return nonowning_work_type (work_);
+    }
+
+    // Like get_work_for_geqrf, but the size comes from
+    // CuSolver::apply_Q_factor_lwork, with side 'L' and the trans
+    // flag taken from apply_type.
+    nonowning_work_type
+    get_work_for_apply_Q_factor (const ApplyType& apply_type,
+                                 const LocalOrdinal nrows,
+                                 const LocalOrdinal ncols_C,
+                                 const LocalOrdinal ncols_Q,
+                                 const Scalar A[],
+                                 const LocalOrdinal lda,
+                                 const Scalar tau[],
+                                 Scalar C[],
+                                 const LocalOrdinal ldc) const
+    {
+      auto info = get_info ();
+      TSQR::Impl::CuSolver<Scalar> solver
+        {TSQR::Impl::CuSolverHandle::getSingleton (), info.data ()};
+      // The first character of the ApplyType string is the trans flag.
+      const char trans = apply_type.toString ()[0];
+      const int lwork_needed =
+        solver.apply_Q_factor_lwork ('L', trans, nrows, ncols_C, ncols_Q,
+                                     A, lda, tau, C, ldc);
+
+      // Avoid constant reallocation by setting a minimum lwork.
+      constexpr int lwork_floor = 128;
+      const int lwork =
+        lwork_needed < lwork_floor ? lwork_floor : lwork_needed;
+      Impl::reallocDeviceVectorIfNeeded (work_, "work", lwork);
+      return nonowning_work_type (work_);
+    }
+
+    // Return info_, allocating it on first use.  Must return owning,
+    // since we'll pass off to factor output.  Every caller shares
+    // the one allocation ("get last error" model).
+    Impl::info_type
+    get_info () const
+    {
+      const bool allocated = info_.data () != nullptr;
+      if (! allocated) { info_ = Impl::info_type ("info"); }
+      return info_;
+    }
+
+    // Return a packed copy of device matrix Q, held in matrixStorage_.
+    Impl::device_mat_view_type<kokkos_value_type>
+    get_Q_copy (const LocalOrdinal nrows,
+                const LocalOrdinal ncols,
+                const Scalar Q[], // DEVICE MEMORY
+                const LocalOrdinal ldq) const
+    {
+      auto packed = Impl::get_contiguous_device_mat_view
+        (matrixStorage_, nrows, ncols);
+      auto source = Impl::get_device_mat_view (nrows, ncols, Q, ldq);
+      // NOTE (mfh 17 Dec 2019) Device-to-device copy, so the
+      // Kokkos::deep_copy noncontiguity problem does not apply.
+      Kokkos::deep_copy (packed, source);
+      return packed;
+    }
+
+    // Return a packed device copy (in matrixStorage_) of host matrix B.
+    Impl::device_mat_view_type<kokkos_value_type>
+    get_B_copy (const LocalOrdinal nrows_and_ncols,
+                const Scalar B[], // HOST MEMORY
+                const LocalOrdinal ldb) const
+    {
+      const LocalOrdinal n = nrows_and_ncols;
+      auto B_copy =
+        Impl::get_contiguous_device_mat_view (matrixStorage_, n, n);
+      // Leading dimension of the packed copy: its column stride, not
+      // its column count (they only coincide because B is square).
+      const LocalOrdinal B_copy_stride (B_copy.stride (1));
+      Scalar* B_copy_raw = reinterpret_cast<Scalar*> (B_copy.data ());
+      MatView<LocalOrdinal, Scalar> B_copy_matview
+        (n, n, B_copy_raw, B_copy_stride);
+      MatView<LocalOrdinal, const Scalar> B_matview (n, n, B, ldb);
+      // copy_from_host avoids the Kokkos::deep_copy noncontiguity problem.
+      this->copy_from_host (B_copy_matview, B_matview);
+      return B_copy;
+    }
+
+    // Zero the ncols x ncols host matrix R, copy the top ncols rows
+    // of the device matrix A into it, then zero the entries below
+    // R's diagonal.  Noncontiguous A is staged via matrixStorage_.
+    void
+    extract_R (const LocalOrdinal nrows,
+               const LocalOrdinal ncols,
+               const Scalar A[], // DEVICE POINTER
+               const LocalOrdinal lda,
+               Scalar R[], // HOST POINTER
+               const LocalOrdinal ldr,
+               const bool /* contiguous_cache_blocks */) const
+    {
+      using std::endl;
+      const char prefix[] = "TSQR::CuSolverNodeTsqr::extract_R: ";
+
+      TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "Top of TSQR::CuSolverNodeTsqr::extract_R" );
+
+      std::unique_ptr<std::ostringstream> err;
+      bool threw = false;
+
+      using Impl::get_device_mat_view;
+      using a_view_type = decltype (get_device_mat_view<const Scalar>
+                                    (nrows, ncols, A, lda));
+      a_view_type A_view;
+      try {
+        A_view = get_device_mat_view<const Scalar>
+          (nrows, ncols, A, lda);
+      }
+      TSQR_IMPL_CATCH( "get_device_mat_view of A threw: " );
+
+      auto R_view =
+        Impl::get_host_mat_view<Scalar> (ncols, ncols, R, ldr);
+
+      try {
+        // Fill R (including lower triangle) with zeros.
+        //Kokkos::deep_copy (R_view, kokkos_value_type {});
+
+        // The above code throws the following exception, even though
+        // R_view is most definitely a host View:
+        //   TSQR::CuSolverNodeTsqr::extract_R:
+        //   Kokkos::deep_copy(R_view, 0) threw an exception:
+        //   cudaDeviceSynchronize() error( cudaErrorIllegalAddress): an
+        //   illegal memory access was encountered
+        //   .../kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp:120
+
+        MatView<LocalOrdinal, Scalar> R_mv (ncols, ncols, R, ldr);
+        deep_copy (R_mv, Scalar {});
+      }
+      TSQR_IMPL_CATCH( "Kokkos::deep_copy(R_view, 0.0) threw: " );
+
+      TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::CuSolverNodeTsqr::extract_R, "
+                                       "after deep_copy(R_mv, 0.0)" );
+
+      // Copy out the upper triangle of the R factor from A into R.
+      //
+      // The following (pseudo)code often does not work:
+      //   auto A_view_top = subview(A_view, {0, ncols}, ALL());
+      //   Kokkos::deep_copy(R_view, A_view_top);
+      // Kokkos throws an exception, claiming "no available copy
+      // mechanism."  This is probably because A_view is not packed.
+      // This means that cudaMemcpy won't work, so Kokkos must execute
+      // a kernel to copy the data.  However, that kernel must be able
+      // to access both Views.  In this case, it (thinks it) can't,
+      // because R_view is a HostSpace View and A_view_top is a device
+      // View (even though it may be a CudaUVMSpace View).
+
+      using Kokkos::ALL;
+      using Kokkos::subview;
+      using LO = LocalOrdinal;
+      const std::pair<LO, LO> rowRange (0, ncols);
+      auto A_view_top = subview (A_view, rowRange, ALL ());
+
+      if (size_t (A_view_top.stride (1)) == size_t (A_view_top.extent (0))) {
+        try {
+          Kokkos::deep_copy (R_view, A_view_top);
+        }
+        TSQR_IMPL_CATCH( "Kokkos::deep_copy(R_view, A_view_top) "
+                         "for contiguous A_view_top threw: ");
+        TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::CuSolverNodeTsqr::extract_R, "
+                                         "after attempting "
+                                         "Kokkos::deep_copy(R_view, A_view_top) "
+                                         "with contiguous A_view_top" );
+      }
+      else { // A_view_top is NOT contiguous
+        // Packed device version of R.
+        Impl::device_mat_view_type<kokkos_value_type> R_copy;
+        try {
+          using Impl::get_contiguous_device_mat_view;
+          R_copy = get_contiguous_device_mat_view (matrixStorage_,
+                                                   ncols, ncols);
+        }
+        TSQR_IMPL_CATCH( "R_copy = get_contiguous_device_mat_view threw: " );
+
+        TEUCHOS_ASSERT( size_t (R_copy.extent (0)) == size_t (ncols) );
+        TEUCHOS_ASSERT( size_t (R_copy.extent (1)) == size_t (ncols) );
+        TEUCHOS_ASSERT( size_t (R_copy.stride (1)) == size_t (ncols) );
+
+        try {
+          Kokkos::deep_copy (R_copy, A_view_top);
+        }
+        TSQR_IMPL_CATCH( "Kokkos::deep_copy(R_copy, A_view_top) threw: ");
+        try {
+          Kokkos::deep_copy (R_view, R_copy);
+        }
+        TSQR_IMPL_CATCH( "Kokkos::deep_copy(R_view, R_copy) threw: ");
+      }
+
+      try {
+        for (LO j = 0; j < ncols; ++j) {
+          auto R_j = subview (R_view, ALL (), j);
+          for (LO i = j + LO(1); i < LO (R_j.extent(0)); ++i) {
+            R_j(i) = kokkos_value_type {};
+          }
+        }
+      }
+      TSQR_IMPL_CATCH( "Filling lower triangle of R_view with zeros threw: ");
+    }
+
+  public:
+    // Factor the nrows x ncols device matrix A in place with
+    // cuSOLVER and copy its R factor into the host array R.  The
+    // result holds the tau and info Views from get_tau / get_info.
+    Teuchos::RCP<factor_output_type>
+    factor (const LocalOrdinal nrows,
+            const LocalOrdinal ncols,
+            Scalar A[],
+            const LocalOrdinal lda,
+            Scalar R[],
+            const LocalOrdinal ldr,
+            const bool contigCacheBlocks) const override
+    {
+      TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::CuSolverNodeTsqr::factor (top)" );
+
+      // factor() is commonly called again and again with the same
+      // pointers, and most users want explicit Q (and thus never see
+      // tau), so reuse one tau array rather than allocating each time.
+      auto tau = get_tau (ncols);
+      auto work = get_work_for_geqrf (nrows, ncols, A, lda);
+      auto info = get_info ();
+
+      // FIXME (mfh 11 Dec 2019) TSQR::Impl::CuBlas takes
+      // std::complex, but Kokkos::View stores Kokkos::complex.  We're
+      // assuming they have the same alignment here, but all of Tpetra
+      // assumes that.
+      Scalar* tau_ptr = reinterpret_cast<Scalar*> (tau.data ());
+      Scalar* work_ptr = reinterpret_cast<Scalar*> (work.data ());
+      const int work_len (work.extent (0));
+
+      TSQR::Impl::CuSolver<Scalar> solver
+        {TSQR::Impl::CuSolverHandle::getSingleton (), info.data ()};
+      TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::CuSolverNodeTsqr::factor, "
+                                       "before solver.compute_QR" );
+      try {
+        solver.compute_QR (nrows, ncols, A, lda,
+                           tau_ptr, work_ptr, work_len);
+      }
+      catch (std::exception& e) {
+        std::ostringstream msg;
+        msg << "TSQR::CuSolverNodeTsqr::factor: CuSolver::compute_QR "
+          "threw an exception: " << std::endl << e.what ();
+        throw std::runtime_error (msg.str ());
+      }
+      TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::CuSolverNodeTsqr::factor, "
+                                       "after solver.compute_QR, "
+                                       "before extract_R" );
+      try {
+        this->extract_R (nrows, ncols, A, lda, R, ldr,
+                         contigCacheBlocks);
+      }
+      catch (std::exception& e) {
+        std::ostringstream msg;
+        msg << "TSQR::CuSolverNodeTsqr::factor: extract_R "
+          "threw an exception: " << std::endl << e.what ();
+        throw std::runtime_error (msg.str ());
+      }
+      TSQR_IMPL_CHECK_LAST_CUDA_ERROR( "TSQR::CuSolverNodeTsqr::factor, "
+                                       "after extract_R" );
+
+      return Teuchos::rcp (new my_factor_output_type (tau, info));
+    }
+
+  private:
+    // Downcast factor_output to this class' own factor output type,
+    // throwing std::invalid_argument if its dynamic type differs.
+    const my_factor_output_type&
+    get_my_factor_output (const factor_output_type& factor_output) const
+    {
+      const char prefix[] = "TSQR::CuSolverNodeTsqr: ";
+
+      const auto* const my_output =
+        dynamic_cast<const my_factor_output_type*> (&factor_output);
+      if (my_output == nullptr) {
+        // Name all three types involved in the failed cast.
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (true, std::invalid_argument, prefix << "Input "
+           "factor_output_type object was not created by the same "
+           "type of CuSolverNodeTsqr object as this one.  This "
+           "object has type " << Teuchos::typeName (*this)
+           << " and its subclass of factor_output_type has type "
+           << Teuchos::TypeNameTraits<my_factor_output_type>::name ()
+           << ", but the input factor_output_type object has dynamic "
+           "type " << Teuchos::demangleName (typeid (factor_output).name ())
+           << ".");
+      }
+      return *my_output;
+    }
+
+  public:
+    // Apply the implicitly stored Q factor (Q, factor_output) to the
+    // nrows x ncols_C matrix C in place, using cuSOLVER.  apply_type
+    // supplies the trans flag; the side flag is always 'L'.
+    void
+    apply (const ApplyType& apply_type,
+           const LocalOrdinal nrows,
+           const LocalOrdinal ncols_Q,
+           const Scalar Q[],
+           const LocalOrdinal ldq,
+           const factor_output_type& factor_output,
+           const LocalOrdinal ncols_C,
+           Scalar C[],
+           const LocalOrdinal ldc,
+           const bool contigCacheBlocks) const override
+    {
+      const char prefix[] = "TSQR::CuSolverNodeTsqr::apply: ";
+
+      // Nothing to do for an empty problem.
+      if (ncols_Q == 0 || ncols_C == 0 || nrows == 0) {
+        return;
+      }
+
+      // Both leading dimensions must be at least nrows.
+      if (ldc < nrows) {
+        std::ostringstream os;
+        os << prefix << "ldc (= " << ldc << ") < nrows (= "
+           << nrows << ")";
+        throw std::invalid_argument (os.str());
+      }
+      if (ldq < nrows) {
+        std::ostringstream os;
+        os << prefix << "ldq (= " << ldq << ") < nrows (= "
+           << nrows << ")";
+        throw std::invalid_argument (os.str());
+      }
+
+      const char trans = apply_type.toString ()[0];
+      auto tau = get_my_factor_output (factor_output).tau ();
+      // FIXME (mfh 11 Dec 2019) TSQR::Impl::CuBlas takes
+      // std::complex, but Kokkos::View stores Kokkos::complex.  We're
+      // assuming they have the same alignment here, but all of Tpetra
+      // assumes that.
+      const Scalar* tau_ptr =
+        reinterpret_cast<const Scalar*> (tau.data ());
+      auto work =
+        get_work_for_apply_Q_factor (apply_type,
+                                     nrows, ncols_C, ncols_Q,
+                                     Q, ldq, tau_ptr, C, ldc);
+      Scalar* work_ptr = reinterpret_cast<Scalar*> (work.data ());
+      const int work_len (work.extent (0));
+      auto info = get_info ();
+
+      TSQR::Impl::CuSolver<Scalar> solver
+        {TSQR::Impl::CuSolverHandle::getSingleton (), info.data ()};
+      solver.apply_Q_factor ('L', trans, nrows, ncols_C, ncols_Q,
+                             Q, ldq, tau_ptr, C, ldc,
+                             work_ptr, work_len);
+    }
+
+    /// \brief Copy from a host matrix, to "native" NodeTsqr device
+    ///   storage.
+    /// \param C_dev [out] Destination; data() is used as device memory.
+    /// \param C_host [in] Source in host memory, same extents as C_dev.
+    virtual void
+    copy_from_host (const MatView<LocalOrdinal, Scalar>& C_dev,
+                    const MatView<LocalOrdinal, const Scalar>& C_host) const
+    {
+      const char prefix[] =
+        "TSQR::CuSolverNodeTsqr::copy_from_host: ";
+
+      const size_t nrows (C_dev.extent (0));
+      const size_t ncols (C_dev.extent (1));
+      TEUCHOS_ASSERT( nrows == size_t (C_host.extent (0)) );
+      TEUCHOS_ASSERT( ncols == size_t (C_host.extent (1)) );
+
+      auto C_dev_view = Impl::get_device_mat_view<Scalar>
+        (nrows, ncols, C_dev.data (), C_dev.stride (1));
+      auto C_host_view = Impl::get_host_mat_view<const Scalar>
+        (nrows, ncols, C_host.data (), C_host.stride (1));
+
+      // NOTE (mfh 17 Dec 2019) If both Views are contiguous (stride(1)
+      // == extent(0)), we can Kokkos::deep_copy directly.  Otherwise,
+      // Kokkos::deep_copy throws, claiming "no available copy
+      // mechanism": cudaMemcpy won't work, so Kokkos must run a copy
+      // kernel that can access both Views.  (Kokkos doesn't seem to
+      // exploit any of CUDA's 2-D or 3-D array copy functions.)  The
+      // fall-back path stages the data through the contiguous buffers
+      // hostMatrixStorage_ and, if C_dev is not packed, matrixStorage_.
+
+      if (C_dev_view.stride (1) == C_dev_view.extent (0) &&
+          C_host_view.stride (1) == C_host_view.extent (0)) {
+        // Both Views are contiguous.
+        try {
+          Kokkos::deep_copy (C_dev_view, C_host_view);
+        }
+        catch (std::exception& e) {
+          TEUCHOS_TEST_FOR_EXCEPTION
+            (true, std::runtime_error, prefix <<
+             "Kokkos::deep_copy(C_dev_view, C_host_view) (both "
+             "contiguous) threw: " << e.what ());
+        }
+      }
+      else {
+        // We need to make a contiguous copy of host storage.
+        auto C_host_copy = Impl::get_contiguous_host_mat_view
+          (hostMatrixStorage_, nrows, ncols);
+        TEUCHOS_ASSERT( C_host_copy.stride (1) ==
+                        C_host_copy.extent (0) );
+        try {
+          Kokkos::deep_copy (C_host_copy, C_host_view);
+        }
+        catch (std::exception& e) {
+          TEUCHOS_TEST_FOR_EXCEPTION
+            (true, std::runtime_error, prefix <<
+             "Kokkos::deep_copy(C_host_copy, C_host_view) threw: "
+             << e.what ());
+        }
+
+        if (C_dev_view.stride (1) == C_dev_view.extent (0)) {
+          try {
+            Kokkos::deep_copy (C_dev_view, C_host_copy);
+          }
+          catch (std::exception& e) {
+            TEUCHOS_TEST_FOR_EXCEPTION
+              (true, std::runtime_error, prefix <<
+               "Kokkos::deep_copy(C_dev_view, C_host_copy) threw: "
+               << e.what ());
+          }
+        }
+        else {
+          auto C_dev_copy = Impl::get_contiguous_device_mat_view
+            (matrixStorage_, nrows, ncols);
+          try {
+            Kokkos::deep_copy (C_dev_copy, C_host_copy);
+          }
+          catch (std::exception& e) {
+            TEUCHOS_TEST_FOR_EXCEPTION
+              (true, std::runtime_error, prefix <<
+               "Kokkos::deep_copy(C_dev_copy, C_host_copy) threw: "
+               << e.what ());
+          }
+          try {
+            Kokkos::deep_copy (C_dev_view, C_dev_copy);
+          }
+          catch (std::exception& e) {
+            TEUCHOS_TEST_FOR_EXCEPTION
+              (true, std::runtime_error, prefix <<
+               "Kokkos::deep_copy(C_dev_view, C_dev_copy) threw: "
+               << e.what ());
+          }
+        }
+      }
+    }
+
+    /// \brief Copy from "native" NodeTsqr device storage, to a packed
+    ///   host matrix.
+    Matrix<LocalOrdinal, Scalar>
+    copy_to_host (const MatView<LocalOrdinal, Scalar>& C) const override
+    {
+      using LO = LocalOrdinal;
+      const LO nrows (C.extent (0));
+      const LO ncols (C.extent (1));
+      const LO ldc (C.stride (1));
+      auto C_dev =
+        Impl::get_device_mat_view<const Scalar> (nrows, ncols,
+                                                 C.data (), ldc);
+      Matrix<LO, Scalar> C_copy (nrows, ncols);
+      auto C_host = Impl::get_host_mat_view (C_copy.view ());
+
+      // NOTE (mfh 17 Dec 2019) Kokkos::deep_copy(C_host, C_dev) throws
+      // ("no available copy mechanism") if C_dev is not contiguous.
+      // Test for that up front, as extract_R does, rather than pay for
+      // a throw each time; a failed direct copy still falls back.
+      bool copied = false;
+      if (size_t (C_dev.stride (1)) == size_t (C_dev.extent (0))) {
+        try {
+          Kokkos::deep_copy (C_host, C_dev);
+          copied = true;
+        }
+        catch (std::exception& /* e */) {} // fall back to staging below
+      }
+      if (! copied) { // stage through a packed device View
+        auto C_dev_copy = Impl::get_contiguous_device_mat_view
+          (matrixStorage_, nrows, ncols);
+        Kokkos::deep_copy (C_dev_copy, C_dev);
+        Kokkos::deep_copy (C_host, C_dev_copy);
+      }
+      return C_copy;
+    }
+
+    /// \brief Set the diagonal entries of C (DEVICE MEMORY) to one.
+    ///
+    /// If C has already been pre-filled with zeros, the result is
+    /// the first C.extent(1) columns of the identity matrix.
+    void
+    set_diagonal_entries_to_one
+      (const MatView<LocalOrdinal, Scalar>& C) const override
+    {
+      Impl::set_diagonal_entries_to_one
+        (Impl::get_device_mat_view (C.extent (0), C.extent (1),
+                                    C.data (), C.stride (1)));
+    }
+
+    // Fill the nrows x ncols_C device matrix C with zeros, set its
+    // diagonal to one, then apply Q to it (ApplyType::NoTranspose),
+    // all in place.
+    void
+    explicit_Q (const LocalOrdinal nrows,
+                const LocalOrdinal ncols_Q,
+                const Scalar Q[], // DEVICE MEMORY
+                const LocalOrdinal ldq,
+                const factor_output_type& factor_output,
+                const LocalOrdinal ncols_C,
+                Scalar C[], // DEVICE MEMORY
+                const LocalOrdinal ldc,
+                const bool contigCacheBlocks) const override
+    {
+      auto C_view = Impl::get_device_mat_view (nrows, ncols_C, C, ldc);
+      deep_copy (C_view, kokkos_value_type {});
+      Impl::set_diagonal_entries_to_one (C_view);
+      this->apply (ApplyType::NoTranspose, nrows, ncols_Q, Q, ldq,
+                   factor_output, ncols_C, C, ldc, contigCacheBlocks);
+    }
+
+    // Overwrite the nrows x ncols device matrix Q with Q*B, where B
+    // is an ncols x ncols matrix in host memory.
+    void
+    Q_times_B (const LocalOrdinal nrows,
+               const LocalOrdinal ncols,
+               Scalar Q[], // DEVICE MEMORY
+               const LocalOrdinal ldq,
+               const Scalar B[], // HOST MEMORY
+               const LocalOrdinal ldb,
+               const bool /* contigCacheBlocks */) const override
+    {
+      if (ncols == 0 || nrows == 0) {
+        return; // Take the easy exit if available.
+      }
+
+      // _GEMM doesn't permit the in/out matrix to alias either input,
+      // and cuBLAS needs B in device memory, so copy both inputs.
+      // Carve both copies out of ONE matrixStorage_ request, so that
+      // neither can overlap the other or be reallocated under it.
+      using mat_type = Impl::device_mat_view_type<kokkos_value_type>;
+      const size_t m (nrows);
+      const size_t n (ncols);
+      auto both =
+        Impl::get_contiguous_device_mat_view (matrixStorage_, m + n, n);
+      mat_type Q_copy (both.data (), m, n);
+      mat_type B_copy (both.data () + m * n, n, n);
+      Kokkos::deep_copy
+        (Q_copy, Impl::get_device_mat_view (nrows, ncols, Q, ldq));
+
+      // FIXME (mfh 11 Dec 2019) These casts assume that std::complex
+      // and Kokkos::complex are layout compatible, as Tpetra does.
+      const Scalar* Q_raw = reinterpret_cast<const Scalar*> (Q_copy.data ());
+      Scalar* B_raw = reinterpret_cast<Scalar*> (B_copy.data ());
+      // copy_from_host copes with B not being packed (ldb > ncols).
+      this->copy_from_host
+        (MatView<LocalOrdinal, Scalar> (ncols, ncols, B_raw, ncols),
+         MatView<LocalOrdinal, const Scalar> (ncols, ncols, B, ldb));
+      TSQR::Impl::CuBlas<Scalar> blas
+        {TSQR::Impl::CuBlasHandle::getSingleton ()};
+      blas.gemm ('N', 'N', nrows, ncols, ncols, Scalar (1.0), Q_raw,
+                 nrows, B_raw, ncols, Scalar {}, Q, ldq);
+    }
+
+    //! Does nothing; A_out is left untouched.
+    void
+    cache_block (const LocalOrdinal /* nrows */,
+                 const LocalOrdinal /* ncols */,
+                 Scalar /* A_out */ [], const Scalar /*A_in */ [],
+                 const LocalOrdinal /* lda_in */) const override
+    {}
+
+    //! Does nothing; A_out is left untouched.
+    void
+    un_cache_block (const LocalOrdinal /* nrows */,
+                    const LocalOrdinal /* ncols */,
+                    Scalar /* A_out */[], const LocalOrdinal /* lda_out */,
+                    const Scalar /* A_in */ []) const override
+    {}
+
+    //! Set every entry of the nrows x ncols device matrix A to zero.
+    void
+    fill_with_zeros (const LocalOrdinal nrows,
+                     const LocalOrdinal ncols,
+                     Scalar A[], const LocalOrdinal lda,
+                     const bool /* contigCacheBlocks */) const override
+    {
+      auto target = Impl::get_device_mat_view (nrows, ncols, A, lda);
+      Kokkos::deep_copy (target, kokkos_value_type {});
+    }
+
+  private:
+    mutable tau_type tau_;
+    mutable work_type work_;
+    mutable Impl::info_type info_;
+    mutable Impl::device_vector_type<kokkos_value_type> matrixStorage_;
+    mutable Impl::host_vector_type<kokkos_value_type> hostMatrixStorage_;
+  };
+
+} // namespace TSQR
+
+#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER
+#endif // TSQR_CUSOLVERNODETSQR_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp
index 39aba991f8cc..a0933b4cad5d 100644
--- a/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqr.hpp
@@ -40,8 +40,8 @@
 /// \file Tsqr_DistTsqr.hpp
 /// \brief Internode part of TSQR.
 ///
-#ifndef __TSQR_Tsqr_DistTsqr_hpp
-#define __TSQR_Tsqr_DistTsqr_hpp
+#ifndef TSQR_DISTTSQR_HPP
+#define TSQR_DISTTSQR_HPP
 
 #include "Tsqr_DistTsqrHelper.hpp"
 #include "Tsqr_DistTsqrRB.hpp"
@@ -64,12 +64,16 @@ namespace TSQR {
   template<class LocalOrdinal, class Scalar>
   class DistTsqr : public Teuchos::ParameterListAcceptorDefaultBase {
   public:
-    typedef Scalar scalar_type;
-    typedef LocalOrdinal ordinal_type;
-    typedef MatView<ordinal_type, scalar_type > mat_view_type;
-    typedef std::vector<std::vector<scalar_type> > VecVec;
-    typedef std::pair<VecVec, VecVec> FactorOutput;
-    typedef int rank_type;
+    using scalar_type = Scalar;
+    using ordinal_type = LocalOrdinal;
+
+  private:
+    using VecVec = std::vector<std::vector<scalar_type>>;
+
+  public:
+    using mat_view_type = MatView<ordinal_type, scalar_type>;
+    using FactorOutput = std::pair<VecVec, VecVec>;
+    using rank_type = int;
 
     /// \brief Constructor (that accepts a parameter list).
     ///
@@ -125,10 +129,10 @@ namespace TSQR {
     /// communicator, if the latter is an MPI communicator.  If it's a
     /// serial "communicator," the rank is always zero.
     rank_type rank() const {
-      TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error,
-                                 "Before using DistTsqr computational methods, "
-                                 "you must first call init() with a valid "
-                                 "MessengerBase instance.");
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (! ready (), std::logic_error, "Before using DistTsqr "
+         "computational methods, you must first call init() with a "
+         "valid MessengerBase instance.");
       return messenger_->rank();
     }
 
@@ -138,18 +142,14 @@ namespace TSQR {
     /// communicator, if the latter is an MPI communicator.  If it's a
     /// serial "communicator," the size is always one.
     rank_type size() const {
-      TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error,
-                                 "Before using DistTsqr computational methods, "
-                                 "you must first call init() with a valid "
-                                 "MessengerBase instance.");
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (! ready (), std::logic_error, "Before using DistTsqr "
+         "computational methods, you must first call init() with a "
+         "valid MessengerBase instance.");
       return messenger_->size();
     }
 
-    /// \brief Destructor.
-    ///
-    /// The destructor doesn't need to do anything, thanks to smart
-    /// pointers.
-    virtual ~DistTsqr () {}
+    virtual ~DistTsqr () = default;
 
     /// \brief Does the R factor have a nonnegative diagonal?
     ///
@@ -159,14 +159,16 @@ namespace TSQR {
     /// negative entries.  This Boolean tells you whether DistTsqr
     /// promises to compute an R factor whose diagonal entries are all
     /// nonnegative.
-    bool QR_produces_R_factor_with_nonnegative_diagonal () const {
-      TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error,
-                                 "Before using DistTsqr computational methods, "
-                                 "you must first call init() with a valid "
-                                 "MessengerBase instance.");
-      typedef Combine<ordinal_type, scalar_type> combine_type;
-      return combine_type::QR_produces_R_factor_with_nonnegative_diagonal() &&
-        reduceBroadcastImpl_->QR_produces_R_factor_with_nonnegative_diagonal();
+    bool
+    QR_produces_R_factor_with_nonnegative_diagonal () const
+    {
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (! ready (), std::logic_error, "Before using DistTsqr "
+         "computational methods, you must first call init() with a "
+         "valid MessengerBase instance.");
+      TEUCHOS_ASSERT( reduceBroadcastImpl_.getRawPtr () != nullptr );
+      return reduceBroadcastImpl_->
+        QR_produces_R_factor_with_nonnegative_diagonal ();
     }
 
     /// \brief Internode TSQR with explicit Q factor.
@@ -198,10 +200,10 @@ namespace TSQR {
                     mat_view_type Q_mine,
                     const bool forceNonnegativeDiagonal=false)
     {
-      TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error,
-                                 "Before using DistTsqr computational methods, "
-                                 "you must first call init() with a valid "
-                                 "MessengerBase instance.");
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (! ready (), std::logic_error, "Before using DistTsqr "
+         "computational methods, you must first call init() with a "
+         "valid MessengerBase instance.");
       reduceBroadcastImpl_->factorExplicit (R_mine, Q_mine,
                                             forceNonnegativeDiagonal);
     }
@@ -214,10 +216,10 @@ namespace TSQR {
     void
     getFactorExplicitTimings (std::vector<TimeStats>& stats) const
     {
-      TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error,
-                                 "Before using DistTsqr computational methods, "
-                                 "you must first call init() with a valid "
-                                 "MessengerBase instance.");
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (! ready (), std::logic_error, "Before using DistTsqr "
+         "computational methods, you must first call init() with a "
+         "valid MessengerBase instance.");
       reduceBroadcastImpl_->getStats (stats);
     }
 
@@ -229,10 +231,10 @@ namespace TSQR {
     void
     getFactorExplicitTimingLabels (std::vector<std::string>& labels) const
     {
-      TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error,
-                                 "Before using DistTsqr computational methods, "
-                                 "you must first call init() with a valid "
-                                 "MessengerBase instance.");
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (! ready (), std::logic_error, "Before using DistTsqr "
+         "computational methods, you must first call init() with a "
+         "valid MessengerBase instance.");
       reduceBroadcastImpl_->getStatsLabels (labels);
     }
 
@@ -262,10 +264,10 @@ namespace TSQR {
     FactorOutput
     factor (mat_view_type R_mine)
     {
-      TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error,
-                                 "Before using DistTsqr computational methods, "
-                                 "you must first call init() with a valid "
-                                 "MessengerBase instance.");
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (! ready (), std::logic_error, "Before using DistTsqr "
+         "computational methods, you must first call init() with a "
+         "valid MessengerBase instance.");
       VecVec Q_factors, tau_arrays;
       DistTsqrHelper<ordinal_type, scalar_type> helper;
       const ordinal_type ncols = R_mine.extent(1);
@@ -278,9 +280,13 @@ namespace TSQR {
       const int P = messenger_->size();
       const int my_rank = messenger_->rank();
       const int first_tag = 0;
-      std::vector<scalar_type> work (ncols);
-      helper.factor_helper (ncols, R_local, my_rank, 0, P-1, first_tag,
-                            messenger_.get(), Q_factors, tau_arrays, work);
+
+      const ordinal_type lwork = helper.work_size (ncols);
+      std::vector<scalar_type> work (lwork);
+      helper.factor_helper (ncols, R_local, my_rank, 0, P-1,
+                            first_tag, messenger_.get (),
+                            Q_factors, tau_arrays,
+                            work.data (), lwork);
       deep_copy (R_mine, R_local_view);
       return std::make_pair (Q_factors, tau_arrays);
     }
@@ -294,10 +300,10 @@ namespace TSQR {
            const ordinal_type ldc_mine,
            const FactorOutput& factor_output)
     {
-      TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error,
-                                 "Before using DistTsqr computational methods, "
-                                 "you must first call init() with a valid "
-                                 "MessengerBase instance.");
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (! ready (), std::logic_error, "Before using DistTsqr "
+         "computational methods, you must first call init() with a "
+         "valid MessengerBase instance.");
       const bool transposed = apply_type.transposed();
       TEUCHOS_TEST_FOR_EXCEPTION(transposed, std::logic_error,
                                  "DistTsqr: Applying Q^T or Q^H has not yet "
@@ -306,18 +312,20 @@ namespace TSQR {
       const int my_rank = messenger_->rank();
       const int first_tag = 0;
       std::vector<scalar_type> C_other (ncols_C * ncols_C);
-      std::vector<scalar_type> work (ncols_C);
+      DistTsqrHelper<ordinal_type, scalar_type> helper;
+      const ordinal_type lwork = helper.work_size (ncols_C);
+      std::vector<scalar_type> work (lwork);
 
       const VecVec& Q_factors = factor_output.first;
       const VecVec& tau_arrays = factor_output.second;
 
       // assert (Q_factors.size() == tau_arrays.size());
       const int cur_pos = Q_factors.size() - 1;
-      DistTsqrHelper<ordinal_type, scalar_type> helper;
-      helper.apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine,
-                           C_other.data(), my_rank, 0, P-1, first_tag,
-                           messenger_.get(), Q_factors, tau_arrays, cur_pos,
-                           work);
+
+      helper.apply_helper (apply_type, ncols_C, ncols_Q, C_mine,
+                           ldc_mine, C_other.data (), my_rank, 0, P-1,
+                           first_tag, messenger_.get (), Q_factors,
+                           tau_arrays, cur_pos, work.data (), lwork);
     }
 
     //! Apply the result of \c factor() to compute the explicit Q factor.
@@ -327,20 +335,18 @@ namespace TSQR {
                 const ordinal_type ldq_mine,
                 const FactorOutput& factor_output)
     {
-      TEUCHOS_TEST_FOR_EXCEPTION(! ready(), std::logic_error,
-                                 "Before using DistTsqr computational methods, "
-                                 "you must first call init() with a valid "
-                                 "MessengerBase instance.");
-      const int myRank = messenger_->rank ();
-
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (! ready (), std::logic_error, "Before using DistTsqr "
+         "computational methods, you must first call init() with a "
+         "valid MessengerBase instance.");
       MatView<ordinal_type, scalar_type> Q_mine_view
         (ncols_Q, ncols_Q, Q_mine, ldq_mine);
       deep_copy (Q_mine_view, scalar_type {});
+
+      const int myRank = messenger_->rank ();
       if (myRank == 0) {
         for (ordinal_type j = 0; j < ncols_Q; ++j) {
-          // FIXME (26 Nov 2019) Eventually, we only want to write to
-          // a matrix through a Kokkos kernel or a TPL.
-          Q_mine[j + j*ldq_mine] = scalar_type (1.0);
+          Q_mine_view(j,j) = scalar_type (1.0);
         }
       }
       apply (ApplyType::NoTranspose, ncols_Q, ncols_Q,
@@ -348,17 +354,18 @@ namespace TSQR {
     }
 
   private:
-    Teuchos::RCP<MessengerBase<scalar_type> > messenger_;
-    Teuchos::RCP<DistTsqrRB<ordinal_type, scalar_type> > reduceBroadcastImpl_;
+    Teuchos::RCP<MessengerBase<scalar_type>> messenger_;
+    Teuchos::RCP<DistTsqrRB<ordinal_type, scalar_type>> reduceBroadcastImpl_;
 
     /// \brief Whether this object is ready to perform computations.
     ///
     /// It is <i>not</i> ready until after \c init() has been called.
     bool ready() const {
-      return ! messenger_.is_null() && ! reduceBroadcastImpl_.is_null();
+      return ! messenger_.is_null () &&
+        ! reduceBroadcastImpl_.is_null ();
     }
   };
 
 } // namespace TSQR
 
-#endif // __TSQR_Tsqr_DistTsqr_hpp
+#endif // TSQR_DISTTSQR_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp
index 67ecc2b31e06..6bb60a160535 100644
--- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrHelper.hpp
@@ -37,12 +37,12 @@
 // ************************************************************************
 //@HEADER
 
-#ifndef __TSQR_Tsqr_DistTsqrHelper_hpp
-#define __TSQR_Tsqr_DistTsqrHelper_hpp
+#ifndef TSQR_DISTTSQRHELPER_HPP
+#define TSQR_DISTTSQRHELPER_HPP
 
 #include "Tsqr_MatView.hpp"
 #include "Tsqr_MessengerBase.hpp"
-#include "Tsqr_Combine.hpp"
+#include "Tsqr_Impl_CombineUser.hpp"
 #include "Tsqr_Util.hpp"
 
 #include <algorithm> // std::min, std::max
@@ -59,53 +59,61 @@ namespace TSQR {
   /// The only reason to mess with this class is if you want to change
   /// how the internode part of TSQR is implemented.
   template<class LocalOrdinal, class Scalar>
-  class DistTsqrHelper {
+  class DistTsqrHelper :
+    private Impl::CombineUser<LocalOrdinal, Scalar> {
   public:
-    DistTsqrHelper () {}
+    using ordinal_type = LocalOrdinal;
+    using scalar_type = Scalar;
+
+    ordinal_type work_size (const ordinal_type ncols) {
+      auto& combine = this->getCombine (ncols);
+      return combine.work_size (2*ncols, ncols, ncols);
+    }
 
     void
-    factor_pair (const LocalOrdinal ncols,
-                 std::vector< Scalar >& R_mine,
-                 const LocalOrdinal P_mine,
-                 const LocalOrdinal P_other,
-                 const LocalOrdinal tag,
-                 MessengerBase<Scalar>* const messenger,
-                 std::vector<std::vector<Scalar> >& Q_factors,
-                 std::vector<std::vector<Scalar> >& tau_arrays,
-                 std::vector<Scalar >& work)
+    factor_pair (const ordinal_type ncols,
+                 std::vector<scalar_type>& R_mine,
+                 const ordinal_type P_mine,
+                 const ordinal_type P_other,
+                 const ordinal_type tag,
+                 MessengerBase<scalar_type>* const messenger,
+                 std::vector<std::vector<scalar_type>>& Q_factors,
+                 std::vector<std::vector<scalar_type>>& tau_arrays,
+                 scalar_type work[],
+                 const ordinal_type lwork)
     {
       using std::endl;
       using std::ostringstream;
       using std::vector;
-
-      if (P_mine == P_other)
+      using LO = ordinal_type;
+      if (P_mine == P_other) {
         return; // nothing to do
-
+      }
       const int P_top = std::min (P_mine, P_other);
       const int P_bot = std::max (P_mine, P_other);
-      const LocalOrdinal nelts = ncols * ncols;
-      const LocalOrdinal ldr = ncols;
-      MatView<LocalOrdinal, Scalar> R_mine_view
+      const LO nelts = ncols * ncols;
+      const LO ldr = ncols;
+      MatView<LO, scalar_type> R_mine_view
         (ncols, ncols, R_mine.data (), ldr);
-      vector< Scalar > R_other (nelts);
-      MatView<LocalOrdinal, Scalar> R_other_view
+      vector<scalar_type> R_other (nelts);
+      MatView<LO, scalar_type> R_other_view
         (ncols, ncols, R_other.data (), ldr);
-      vector< Scalar > tau (ncols);
+      vector<scalar_type> tau (ncols);
 
       // Send and receive R factor.
-      messenger->swapData (R_mine.data(), R_other.data(),
+      messenger->swapData (R_mine.data (), R_other.data (),
                            nelts, P_other, tag);
 
-      Combine<LocalOrdinal, Scalar> combine;
+      auto& combine = this->getCombine (ncols);
       if (P_mine == P_top) {
         combine.factor_pair (R_mine_view, R_other_view,
-                             tau.data(), work.data());
+                             tau.data(), work, lwork);
         Q_factors.push_back (R_other);
         tau_arrays.push_back (tau);
       }
       else if (P_mine == P_bot) {
         combine.factor_pair (R_other_view, R_mine_view,
-                             tau.data(), work.data());
+                             tau.data (), work, lwork);
         Q_factors.push_back (R_mine);
         // Make sure that the "bottom" processor gets the current R
         // factor, which is returned in R_mine.
@@ -116,255 +124,265 @@ namespace TSQR {
         ostringstream os;
         os << "Should never get here: P_mine (= " << P_mine
            << ") not one of P_top, P_bot = " << P_top << ", " << P_bot;
-        throw std::logic_error (os.str());
+        throw std::logic_error (os.str ());
       }
     }
 
     void
-    factor_helper (const LocalOrdinal ncols,
-                   std::vector< Scalar >& R_mine,
-                   const LocalOrdinal my_rank,
-                   const LocalOrdinal P_first,
-                   const LocalOrdinal P_last,
-                   const LocalOrdinal tag,
-                   MessengerBase< Scalar >* const messenger,
-                   std::vector< std::vector< Scalar > >& Q_factors,
-                   std::vector< std::vector< Scalar > >& tau_arrays,
-                   std::vector< Scalar >& work)
+    factor_helper (const ordinal_type ncols,
+                   std::vector<scalar_type>& R_mine,
+                   const ordinal_type my_rank,
+                   const ordinal_type P_first,
+                   const ordinal_type P_last,
+                   const ordinal_type tag,
+                   MessengerBase<scalar_type>* const messenger,
+                   std::vector<std::vector<scalar_type>>& Q_factors,
+                   std::vector<std::vector<scalar_type>>& tau_arrays,
+                   scalar_type work[],
+                   const ordinal_type lwork)
     {
       using std::endl;
       using std::ostringstream;
       using std::vector;
 
-      if (P_last <= P_first)
+      if (P_last <= P_first) {
         return;
-      else
-        {
-          const int P = P_last - P_first + 1;
-          // Whether the interval [P_first, P_last] has an even number of
-          // elements.  Our interval splitting scheme ensures that the
-          // interval [P_first, P_mid - 1] always has an even number of
-          // elements.
-          const bool b_even = (P % 2 == 0);
-          // We split the interval [P_first, P_last] into 2 intervals:
-          // [P_first, P_mid-1], and [P_mid, P_last].  We bias the
-          // splitting procedure so that the lower interval always has an
-          // even number of processor ranks, and never has fewer processor
-          // ranks than the higher interval.
-          const int P_mid = b_even ? (P_first + P/2) : (P_first + P/2 + 1);
-
-          if (my_rank < P_mid) // Interval [P_first, P_mid-1]
-            {
-              factor_helper (ncols, R_mine, my_rank, P_first, P_mid - 1,
-                             tag + 1, messenger, Q_factors, tau_arrays, work);
-
-              // If there aren't an even number of processors in the
-              // original interval, then the last processor in the lower
-              // interval has to skip this round.
-              if (b_even || my_rank < P_mid - 1) {
-                const int my_offset = my_rank - P_first;
-                const int P_other = P_mid + my_offset;
-                if (P_other < P_mid || P_other > P_last) {
-                  throw std::logic_error ("P_other not in [P_mid,P_last] range");
-                }
-                factor_pair (ncols, R_mine, my_rank, P_other, tag,
-                             messenger, Q_factors, tau_arrays, work);
-              }
-              // If I'm skipping this round, get the "current" R factor
-              // from P_mid.
-              if (! b_even && my_rank == P_mid - 1) {
-                const int theTag = 142; // magic constant
-                messenger->recv (&R_mine[0], ncols*ncols, P_mid, theTag);
-              }
-            }
-          else // Interval [P_mid, P_last]
-            {
-              factor_helper (ncols, R_mine, my_rank, P_mid, P_last,
-                             tag + 1, messenger, Q_factors, tau_arrays, work);
-
-              const int my_offset = my_rank - P_mid;
-              const int P_other = P_first + my_offset;
-
-              if (P_other < P_first || P_other >= P_mid)
-                throw std::logic_error ("P_other not in [P_first,P_mid-1] range");
-              factor_pair (ncols, R_mine, my_rank, P_other, tag,
-                           messenger, Q_factors, tau_arrays, work);
-
-              // If Proc P_mid-1 is skipping this round, Proc P_mid will
-              // send it the "current" R factor.
-              if (! b_even)
-                {
-                  const int theTag = 142; // magic constant
-                  messenger->send (R_mine.data(), ncols*ncols, P_mid-1, theTag);
-                }
+      }
+      else {
+        const int P = P_last - P_first + 1;
+        // Whether the interval [P_first, P_last] has an even number
+        // of elements.  If it does not, the lower interval
+        // [P_first, P_mid - 1] gets one more rank than the upper
+        // interval, and its last rank (P_mid - 1) sits out the pairing.
+        const bool b_even = (P % 2 == 0);
+        // We split the interval [P_first, P_last] into 2 intervals:
+        // [P_first, P_mid-1], and [P_mid, P_last].  We bias the
+        // splitting procedure so that the lower interval gets the
+        // extra rank when P is odd, and thus never has fewer
+        // processor ranks than the higher interval.
+        const int P_mid = b_even ? (P_first + P/2) : (P_first + P/2 + 1);
+
+        if (my_rank < P_mid) { // Interval [P_first, P_mid-1]
+          factor_helper (ncols, R_mine, my_rank, P_first, P_mid - 1,
+                         tag + 1, messenger, Q_factors, tau_arrays,
+                         work, lwork);
+
+          // If there aren't an even number of processors in the
+          // original interval, then the last processor in the lower
+          // interval has to skip this round.
+          if (b_even || my_rank < P_mid - 1) {
+            const int my_offset = my_rank - P_first;
+            const int P_other = P_mid + my_offset;
+            if (P_other < P_mid || P_other > P_last) {
+              throw std::logic_error ("P_other not in [P_mid,P_last] range");
             }
+            factor_pair (ncols, R_mine, my_rank, P_other, tag,
+                         messenger, Q_factors, tau_arrays,
+                         work, lwork);
+          }
+          // If I'm skipping this round, get the "current" R factor
+          // from P_mid.
+          if (! b_even && my_rank == P_mid - 1) {
+            const int theTag = 142; // magic constant
+            messenger->recv (R_mine.data (), ncols*ncols, P_mid,
+                             theTag);
+          }
         }
+        else { // Interval [P_mid, P_last]
+          factor_helper (ncols, R_mine, my_rank, P_mid, P_last,
+                         tag + 1, messenger, Q_factors, tau_arrays,
+                         work, lwork);
+          const int my_offset = my_rank - P_mid;
+          const int P_other = P_first + my_offset;
+
+          if (P_other < P_first || P_other >= P_mid) {
+            throw std::logic_error ("P_other not in [P_first,"
+                                    "P_mid-1] range");
+          }
+          factor_pair (ncols, R_mine, my_rank, P_other, tag,
+                       messenger, Q_factors, tau_arrays, work, lwork);
+
+          // If Proc P_mid-1 is skipping this round, Proc P_mid will
+          // send it the "current" R factor.
+          if (! b_even) {
+            const int theTag = 142; // magic constant
+            messenger->send (R_mine.data(), ncols*ncols, P_mid-1, theTag);
+          }
+        }
+      }
     }
 
     void
     apply_pair (const ApplyType& apply_type,
-                const LocalOrdinal ncols_C,
-                const LocalOrdinal ncols_Q,
-                Scalar C_mine[],
-                const LocalOrdinal ldc_mine,
-                Scalar C_other[], // contiguous ncols_C x ncols_C scratch
-                const LocalOrdinal P_mine,
-                const LocalOrdinal P_other,
-                const LocalOrdinal tag,
-                MessengerBase< Scalar >* const messenger,
-                const std::vector< Scalar >& Q_cur,
-                const std::vector< Scalar >& tau_cur,
-                std::vector< Scalar >& work)
+                const ordinal_type ncols_C,
+                const ordinal_type ncols_Q,
+                scalar_type C_mine[],
+                const ordinal_type ldc_mine,
+                scalar_type C_other[], // contiguous ncols_C x ncols_C scratch
+                const ordinal_type P_mine,
+                const ordinal_type P_other,
+                const ordinal_type tag,
+                MessengerBase<scalar_type>* const messenger,
+                const std::vector<scalar_type>& Q_cur,
+                const std::vector<scalar_type>& tau_cur,
+                scalar_type work[],
+                const ordinal_type lwork)
     {
       using std::endl;
       using std::ostringstream;
       using std::vector;
+      using LO = ordinal_type;
+      using const_mat_view_type = MatView<LO, const scalar_type>;
+      using mat_view_type = MatView<LO, scalar_type>;
 
-      if (P_mine == P_other)
+      if (P_mine == P_other) {
         return; // nothing to do
-
+      }
       const int P_top = std::min (P_mine, P_other);
       const int P_bot = std::max (P_mine, P_other);
-
-      const LocalOrdinal nelts = ncols_C * ncols_C;
-      const LocalOrdinal ldq = ncols_Q;
-      const LocalOrdinal ldc_other = ncols_C;
+      const LO nelts = ncols_C * ncols_C;
+      const LO ldq = ncols_Q;
+      const LO ldc_other = ncols_C;
 
       // Send and receive C_mine resp. C_other to the other processor of
       // the pair.
-      messenger->swapData (&C_mine[0], &C_other[0], nelts, P_other, tag);
+      messenger->swapData (C_mine, C_other, nelts, P_other, tag);
 
-      Combine< LocalOrdinal, Scalar > combine;
-      if (P_mine == P_top)
-        combine.apply_pair (apply_type, ncols_C, ncols_Q, &Q_cur[0], ldq,
-                            &tau_cur[0], C_mine, ldc_mine, C_other, ldc_other,
-                            &work[0]);
-      else if (P_mine == P_bot)
-        combine.apply_pair (apply_type, ncols_C, ncols_Q, &Q_cur[0], ldq,
-                            &tau_cur[0], C_other, ldc_other, C_mine, ldc_mine,
-                            &work[0]);
-      else
-        {
-          ostringstream os;
-          os << "Should never get here: P_mine (= " << P_mine
-             << ") not one of P_top, P_bot = " << P_top << ", " << P_bot;
-          throw std::logic_error (os.str());
-        }
+      const_mat_view_type Q_bot
+        (ncols_Q, ncols_Q, Q_cur.data (), ldq);
+      auto& combine = this->getCombine (std::max (ncols_Q, ncols_C));
+      if (P_mine == P_top) {
+        mat_view_type C_top (ncols_Q, ncols_C, C_mine, ldc_mine);
+        mat_view_type C_bot (ncols_Q, ncols_C, C_other, ldc_other);
+        combine.apply_pair (apply_type, Q_bot, tau_cur.data (),
+                            C_top, C_bot, work, lwork);
+      }
+      else if (P_mine == P_bot) {
+        mat_view_type C_top (ncols_Q, ncols_C, C_other, ldc_other);
+        mat_view_type C_bot (ncols_Q, ncols_C, C_mine, ldc_mine);
+        combine.apply_pair (apply_type, Q_bot, tau_cur.data (),
+                            C_top, C_bot, work, lwork);
+      }
+      else {
+        ostringstream os;
+        os << "Should never get here: P_mine (= " << P_mine
+           << ") not one of P_top, P_bot = " << P_top << ", "
+           << P_bot;
+        throw std::logic_error (os.str ());
+      }
     }
 
     void
     apply_helper (const ApplyType& apply_type,
-                  const LocalOrdinal ncols_C,
-                  const LocalOrdinal ncols_Q,
-                  Scalar C_mine[],
-                  const LocalOrdinal ldc_mine,
-                  Scalar C_other[], // contiguous ncols_C x ncols_C scratch
-                  const LocalOrdinal my_rank,
-                  const LocalOrdinal P_first,
-                  const LocalOrdinal P_last,
-                  const LocalOrdinal tag,
-                  MessengerBase< Scalar >* const messenger,
-                  const std::vector< std::vector< Scalar > >& Q_factors,
-                  const std::vector< std::vector< Scalar > >& tau_arrays,
-                  const LocalOrdinal cur_pos,
-                  std::vector< Scalar >& work)
+                  const ordinal_type ncols_C,
+                  const ordinal_type ncols_Q,
+                  scalar_type C_mine[],
+                  const ordinal_type ldc_mine,
+                  scalar_type C_other[], // contiguous ncols_C x ncols_C scratch
+                  const ordinal_type my_rank,
+                  const ordinal_type P_first,
+                  const ordinal_type P_last,
+                  const ordinal_type tag,
+                  MessengerBase<scalar_type>* const messenger,
+                  const std::vector<std::vector<scalar_type>>& Q_factors,
+                  const std::vector<std::vector<scalar_type>>& tau_arrays,
+                  const ordinal_type cur_pos,
+                  scalar_type work[],
+                  const ordinal_type lwork)
     {
       using std::endl;
       using std::ostringstream;
       using std::vector;
 
-      if (P_last <= P_first)
+      if (P_last <= P_first) {
         return;
-      else
-        {
-          const int P = P_last - P_first + 1;
-          // Whether the interval [P_first, P_last] has an even number of
-          // elements.  Our interval splitting scheme ensures that the
-          // interval [P_first, P_mid - 1] always has an even number of
-          // elements.
-          const bool b_even = (P % 2 == 0);
-          // We split the interval [P_first, P_last] into 2 intervals:
-          // [P_first, P_mid-1], and [P_mid, P_last].  We bias the
-          // splitting procedure so that the lower interval always has an
-          // even number of processor ranks, and never has fewer processor
-          // ranks than the higher interval.
-          const int P_mid = b_even ? (P_first + P/2) : (P_first + P/2 + 1);
-
-          if (my_rank < P_mid) // Interval [P_first, P_mid - 1]
-            {
-              const bool b_participating = b_even || my_rank < P_mid - 1;
-
-              if (cur_pos < 0)
-                {
-                  ostringstream os;
-                  os << "On Proc " << my_rank << ": cur_pos (= " << cur_pos
-                     << ") < 0; lower interval [" << P_first << "," << (P_mid-1)
-                     << "]; original interval [" << P_first << "," << P_last
-                     << "]" << endl;
-                  throw std::logic_error (os.str());
-                }
-
-              // If there aren't an even number of processors in the
-              // original interval, then the last processor in the lower
-              // interval has to skip this round.  Since we skip this
-              // round, don't decrement cur_pos (else we'll skip an entry
-              // and eventually fall off the front of the array.
-              int new_cur_pos;
-              if (b_even || my_rank < P_mid - 1)
-                {
-                  if (! b_participating)
-                    throw std::logic_error("Should never get here");
-
-                  const int my_offset = my_rank - P_first;
-                  const int P_other = P_mid + my_offset;
-                  // assert (P_mid <= P_other && P_other <= P_last);
-                  if (P_other < P_mid || P_other > P_last)
-                    throw std::logic_error("Should never get here");
-
-                  apply_pair (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine,
-                              C_other, my_rank, P_other, tag, messenger,
-                              Q_factors[cur_pos], tau_arrays[cur_pos], work);
-                  new_cur_pos = cur_pos - 1;
-                }
-              else
-                {
-                  if (b_participating)
-                    throw std::logic_error("Should never get here");
-
-                  new_cur_pos = cur_pos;
-                }
-              apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine,
-                            C_other, my_rank, P_first, P_mid - 1, tag + 1,
-                            messenger, Q_factors, tau_arrays, new_cur_pos,
-                            work);
+      }
+      else {
+        const int P = P_last - P_first + 1;
+        // Whether the interval [P_first, P_last] has an even number
+        // of elements.  If it does not, the lower interval
+        // [P_first, P_mid - 1] gets one more rank than the upper
+        // interval, and its last rank (P_mid - 1) sits out the pairing.
+        const bool b_even = (P % 2 == 0);
+        // We split the interval [P_first, P_last] into 2 intervals:
+        // [P_first, P_mid-1], and [P_mid, P_last].  We bias the
+        // splitting procedure so that the lower interval gets the
+        // extra rank when P is odd, and thus never has fewer
+        // processor ranks than the higher interval.
+        const int P_mid = b_even ? (P_first + P/2) : (P_first + P/2 + 1);
+
+        if (my_rank < P_mid) { // Interval [P_first, P_mid - 1]
+          const bool b_participating = b_even || my_rank < P_mid - 1;
+
+          if (cur_pos < 0) {
+            ostringstream os;
+            os << "On Proc " << my_rank << ": cur_pos (= " << cur_pos
+               << ") < 0; lower interval [" << P_first << "," << (P_mid-1)
+               << "]; original interval [" << P_first << "," << P_last
+               << "]" << endl;
+            throw std::logic_error (os.str());
+          }
+
+          // If there isn't an even number of processors in the
+          // original interval, then the last processor in the lower
+          // interval has to skip this round.  Since we skip this
+          // round, don't decrement cur_pos (else we'll skip an entry
+          // and eventually fall off the front of the array).
+          int new_cur_pos;
+          if (b_even || my_rank < P_mid - 1) {
+            if (! b_participating) {
+              throw std::logic_error("Should never get here");
             }
-          else
-            {
-              if (cur_pos < 0)
-                {
-                  ostringstream os;
-                  os << "On Proc " << my_rank << ": cur_pos (= " << cur_pos
-                     << ") < 0; upper interval [" << P_mid << "," << P_last
-                     << "]; original interval [" << P_first << "," << P_last
-                     << "]" << endl;
-                  throw std::logic_error (os.str());
-                }
 
-              const int my_offset = my_rank - P_mid;
-              const int P_other = P_first + my_offset;
-              // assert (0 <= P_other && P_other < P_mid);
-              apply_pair (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine,
-                          C_other, my_rank, P_other, tag, messenger,
-                          Q_factors[cur_pos], tau_arrays[cur_pos], work);
-              apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine,
-                            C_other, my_rank, P_mid, P_last, tag + 1,
-                            messenger, Q_factors, tau_arrays, cur_pos - 1,
-                            work);
+            const int my_offset = my_rank - P_first;
+            const int P_other = P_mid + my_offset;
+            // assert (P_mid <= P_other && P_other <= P_last);
+            if (P_other < P_mid || P_other > P_last) {
+              throw std::logic_error("Should never get here");
+            }
+            apply_pair (apply_type, ncols_C, ncols_Q, C_mine,
+                        ldc_mine, C_other, my_rank, P_other,
+                        tag, messenger, Q_factors[cur_pos],
+                        tau_arrays[cur_pos], work, lwork);
+            new_cur_pos = cur_pos - 1;
+          }
+          else {
+            if (b_participating) {
+              throw std::logic_error("Should never get here");
             }
+            new_cur_pos = cur_pos;
+          }
+          apply_helper (apply_type, ncols_C, ncols_Q, C_mine,
+                        ldc_mine, C_other, my_rank, P_first,
+                        P_mid - 1, tag + 1, messenger, Q_factors,
+                        tau_arrays, new_cur_pos, work, lwork);
         }
+        else {
+          if (cur_pos < 0) {
+            ostringstream os;
+            os << "On Proc " << my_rank << ": cur_pos (= " << cur_pos
+               << ") < 0; upper interval [" << P_mid << "," << P_last
+               << "]; original interval [" << P_first << "," << P_last
+               << "]" << endl;
+            throw std::logic_error (os.str ());
+          }
+
+          const int my_offset = my_rank - P_mid;
+          const int P_other = P_first + my_offset;
+          // assert (0 <= P_other && P_other < P_mid);
+          apply_pair (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine,
+                      C_other, my_rank, P_other, tag, messenger,
+                      Q_factors[cur_pos], tau_arrays[cur_pos],
+                      work, lwork);
+          apply_helper (apply_type, ncols_C, ncols_Q, C_mine, ldc_mine,
+                        C_other, my_rank, P_mid, P_last, tag + 1,
+                        messenger, Q_factors, tau_arrays, cur_pos - 1,
+                        work, lwork);
+        }
+      }
     }
   };
 
 } // namespace TSQR
 
-#endif // __TSQR_Tsqr_DistTsqrHelper_hpp
+#endif // TSQR_DISTTSQRHELPER_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp
index 10035b80c6df..472fd700142c 100644
--- a/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_DistTsqrRB.hpp
@@ -39,11 +39,11 @@
 //@HEADER
 */
 
-#ifndef __TSQR_DistTsqrRB_hpp
-#define __TSQR_DistTsqrRB_hpp
+#ifndef TSQR_DISTTSQRRB_HPP
+#define TSQR_DISTTSQRRB_HPP
 
 #include "Tsqr_ApplyType.hpp"
-#include "Tsqr_Combine.hpp"
+#include "Tsqr_Impl_CombineUser.hpp"
 #include "Tsqr_Matrix.hpp"
 #include "Tsqr_StatTimeMonitor.hpp"
 
@@ -129,7 +129,6 @@ namespace TSQR {
     };
   } // namespace details
 
-
   /// \class DistTsqrRB
   /// \brief Reduce-and-Broadcast (RB) version of DistTsqr.
   /// \author Mark Hoemmen
@@ -146,15 +145,15 @@ namespace TSQR {
   /// broadcast.  The implicit Q factor data stay on the MPI process
   /// where they were computed.
   template<class LocalOrdinal, class Scalar>
-  class DistTsqrRB {
+  class DistTsqrRB : private Impl::CombineUser<LocalOrdinal, Scalar> {
   public:
-    typedef LocalOrdinal ordinal_type;
-    typedef Scalar scalar_type;
-    typedef typename Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type;
-    typedef MatView<ordinal_type, scalar_type> mat_view_type;
-    typedef Matrix<ordinal_type, scalar_type> matrix_type;
-    typedef int rank_type;
-    typedef Combine<ordinal_type, scalar_type> combine_type;
+    using ordinal_type = LocalOrdinal;
+    using scalar_type = Scalar;
+    using magnitude_type =
+      typename Teuchos::ScalarTraits<scalar_type>::magnitudeType;
+    using mat_view_type = MatView<ordinal_type, scalar_type>;
+    using matrix_type = Matrix<ordinal_type, scalar_type>;
+    using rank_type = int;
 
     /// \brief Constructor
     ///
@@ -193,10 +192,10 @@ namespace TSQR {
     /// timings from factorExplicit().  The vector gets resized if
     /// necessary to fit all the labels.
     void
-    getStatsLabels (std::vector< std::string >& labels) const
+    getStatsLabels (std::vector<std::string>& labels) const
     {
       const int numTimers = 5;
-      labels.resize (std::max (labels.size(), static_cast<size_t>(numTimers)));
+      labels.resize (std::max (labels.size (), size_t (numTimers)));
 
       labels[0] = totalTime_->name();
       labels[1] = reduceCommTime_->name();
@@ -208,7 +207,12 @@ namespace TSQR {
     /// Whether or not all diagonal entries of the R factor computed
     /// by the QR factorization are guaranteed to be nonnegative.
     bool QR_produces_R_factor_with_nonnegative_diagonal () const {
-      return combine_type::QR_produces_R_factor_with_nonnegative_diagonal();
+      // FIXME (20 Dec 2019) If the combine type is dynamic, we can't
+      // answer this question without knowing the number of columns.
+      // Just guess for now.
+      constexpr LocalOrdinal fakeNumCols = 10;
+      auto& c = this->getCombine (fakeNumCols);
+      return c.QR_produces_R_factor_with_nonnegative_diagonal ();
     }
 
     /// \brief Internode TSQR with explicit Q factor
@@ -244,25 +248,23 @@ namespace TSQR {
       // R_mine has columns, but Q_mine may have any number of
       // columns.  (It depends on how many columns of the explicit Q
       // factor we want to compute.)
-      if (R_mine.extent(0) < R_mine.extent(1))
-        {
-          std::ostringstream os;
-          os << "R factor input has fewer rows (" << R_mine.extent(0)
-             << ") than columns (" << R_mine.extent(1) << ")";
-          // This is a logic error because TSQR users should not be
-          // calling this method directly.
-          throw std::logic_error (os.str());
-        }
-      else if (Q_mine.extent(0) != R_mine.extent(1))
-        {
-          std::ostringstream os;
-          os << "Q factor input must have the same number of rows as the R "
-            "factor input has columns.  Q has " << Q_mine.extent(0)
-             << " rows, but R has " << R_mine.extent(1) << " columns.";
-          // This is a logic error because TSQR users should not be
-          // calling this method directly.
-          throw std::logic_error (os.str());
-        }
+      if (R_mine.extent(0) < R_mine.extent(1)) {
+        std::ostringstream os;
+        os << "R factor input has fewer rows (" << R_mine.extent(0)
+           << ") than columns (" << R_mine.extent(1) << ")";
+        // This is a logic error because TSQR users should not be
+        // calling this method directly.
+        throw std::logic_error (os.str());
+      }
+      else if (Q_mine.extent(0) != R_mine.extent(1)) {
+        std::ostringstream os;
+        os << "Q factor input must have the same number of rows as the R "
+          "factor input has columns.  Q has " << Q_mine.extent(0)
+           << " rows, but R has " << R_mine.extent(1) << " columns.";
+        // This is a logic error because TSQR users should not be
+        // calling this method directly.
+        throw std::logic_error (os.str());
+      }
 
       // The factorization is a recursion over processors [P_first, P_last].
       const rank_type P_mine = messenger_->rank();
@@ -389,13 +391,13 @@ namespace TSQR {
           recv_R (R_other, P_mid);
 
           std::vector<scalar_type> tau (numCols);
-          // Don't shrink the workspace array; doing so may
-          // require expensive reallocation every time we send /
-          // receive data.
-          resizeWork (numCols);
 
-          combine_.factor_pair (R_mine, R_other.view (),
-                                tau.data(), work_.data());
+          auto& combine = this->getCombine (numCols);
+          const ordinal_type lwork =
+            combine.work_size (2 * numCols, numCols, numCols);
+          work_.resize (lwork);
+          combine.factor_pair (R_mine, R_other.view (),
+                               tau.data (), work_.data (), lwork);
           QFactors.push_back (R_other);
           tauArrays.push_back (tau);
         }
@@ -413,9 +415,11 @@ namespace TSQR {
                         const rank_type P_first,
                         const rank_type P_last,
                         const rank_type curpos,
-                        std::vector< matrix_type >& QFactors,
-                        std::vector< std::vector< scalar_type > >& tauArrays)
+                        std::vector<matrix_type>& QFactors,
+                        std::vector<std::vector<scalar_type>>& tauArrays)
     {
+      using LO = LocalOrdinal;
+
       if (P_last < P_first) {
         std::ostringstream os;
         os << "explicitQBroadcast: interval [P_first=" << P_first
@@ -444,8 +448,8 @@ namespace TSQR {
             throw std::logic_error (os.str());
           }
           // Q_impl, tau: implicitly stored local Q factor.
-          matrix_type& Q_impl = QFactors[curpos];
-          std::vector<scalar_type>& tau = tauArrays[curpos];
+          auto Q_bot = QFactors[curpos].view ();
+          const scalar_type* tau = tauArrays[curpos].data ();
 
           // Apply implicitly stored local Q factor to
           //   [Q_mine;
@@ -453,13 +457,18 @@ namespace TSQR {
           // where Q_other = zeros(Q_mine.extent(0), Q_mine.extent(1)).
           // Overwrite both Q_mine and Q_other with the result.
           deep_copy (Q_other, scalar_type {});
-          combine_.apply_pair (ApplyType::NoTranspose,
-                               Q_mine.extent(1), Q_impl.extent(1),
-                               Q_impl.data(), Q_impl.stride(1),
-                               tau.data(),
-                               Q_mine.data(), Q_mine.stride(1),
-                               Q_other.data(), Q_other.stride(1),
-                               work_.data());
+
+          const LO pair_nrows
+            (Q_mine.extent (0) + Q_other.extent (0));
+          const LO pair_ncols (Q_mine.extent (1));
+          auto& combine = this->getCombine (pair_ncols);
+          const LO lwork =
+            combine.work_size (pair_nrows, pair_ncols, pair_ncols);
+          if (lwork > LO (work_.size ())) {
+            work_.resize (lwork);
+          }
+          combine.apply_pair (ApplyType::NoTranspose, Q_bot, tau,
+                              Q_mine, Q_other, work_.data (), lwork);
           // Send the resulting Q_other, and the final R factor, to P_mid.
           send_Q_R (Q_other, R_mine, P_mid);
           newpos = curpos - 1;
@@ -476,9 +485,9 @@ namespace TSQR {
                               newpos, QFactors, tauArrays);
         }
         else { // Interval [P_mid, P_last]
-            explicitQBroadcast (R_mine, Q_mine, Q_other,
-                                P_mine, P_mid, P_last,
-                                newpos, QFactors, tauArrays);
+          explicitQBroadcast (R_mine, Q_mine, Q_other,
+                              P_mine, P_mid, P_last,
+                              newpos, QFactors, tauArrays);
         }
       }
     }
@@ -499,14 +508,15 @@ namespace TSQR {
       // Don't shrink the workspace array; doing so would still be
       // correct, but may require reallocation of data when it needs
       // to grow again.
-      resizeWork (numElts);
+      work_.resize (numElts);
 
       // Pack the Q data into the workspace array.
-      mat_view_type Q_contig (Q.extent(0), Q.extent(1), work_.data(), Q.extent(0));
+      mat_view_type Q_contig (Q.extent (0), Q.extent (1),
+                              work_.data (), Q.extent (0));
       deep_copy (Q_contig, Q);
       // Pack the R data into the workspace array.
       pack_R (R, &work_[Q_size]);
-      messenger_->send (work_.data(), numElts, destProc, 0);
+      messenger_->send (work_.data (), numElts, destProc, 0);
     }
 
     template< class MatrixType1, class MatrixType2 >
@@ -525,12 +535,13 @@ namespace TSQR {
       // Don't shrink the workspace array; doing so would still be
       // correct, but may require reallocation of data when it needs
       // to grow again.
-      resizeWork (numElts);
+      work_.resize (numElts);
 
-      messenger_->recv (work_.data(), numElts, srcProc, 0);
+      messenger_->recv (work_.data (), numElts, srcProc, 0);
 
       // Unpack the C data from the workspace array.
-      deep_copy (Q, mat_view_type (Q.extent(0), Q.extent(1), work_.data(), Q.extent(0)));
+      deep_copy (Q, mat_view_type (Q.extent (0), Q.extent (1),
+                                   work_.data (), Q.extent (0)));
       // Unpack the R data from the workspace array.
       unpack_R (R, &work_[Q_size]);
     }
@@ -547,10 +558,10 @@ namespace TSQR {
       // Don't shrink the workspace array; doing so would still be
       // correct, but may require reallocation of data when it needs
       // to grow again.
-      resizeWork (numElts);
+      work_.resize (numElts);
       // Pack the R data into the workspace array.
-      pack_R (R, work_.data());
-      messenger_->send (work_.data(), numElts, destProc, 0);
+      pack_R (R, work_.data ());
+      messenger_->send (work_.data (), numElts, destProc, 0);
     }
 
     template< class MatrixType >
@@ -565,23 +576,26 @@ namespace TSQR {
       // Don't shrink the workspace array; doing so would still be
       // correct, but may require reallocation of data when it needs
       // to grow again.
-      resizeWork (numElts);
-      messenger_->recv (work_.data(), numElts, srcProc, 0);
+      work_.resize (numElts);
+      messenger_->recv (work_.data (), numElts, srcProc, 0);
       // Unpack the R data from the workspace array.
-      unpack_R (R, work_.data());
+      unpack_R (R, work_.data ());
     }
 
     template< class MatrixType >
     static void
     unpack_R (MatrixType& R, const scalar_type buf[])
     {
+      // FIXME (mfh 08 Dec 2019) Rewrite to use deep_copy; we don't
+      // want to access Matrix or MatView entries on host directly any
+      // more.
       ordinal_type curpos = 0;
-      for (ordinal_type j = 0; j < R.extent(1); ++j)
-        {
-          scalar_type* const R_j = &R(0, j);
-          for (ordinal_type i = 0; i <= j; ++i)
-            R_j[i] = buf[curpos++];
+      for (ordinal_type j = 0; j < R.extent(1); ++j) {
+        scalar_type* const R_j = &R(0, j);
+        for (ordinal_type i = 0; i <= j; ++i) {
+          R_j[i] = buf[curpos++];
         }
+      }
     }
 
     template< class ConstMatrixType >
@@ -589,37 +603,33 @@ namespace TSQR {
     pack_R (const ConstMatrixType& R, scalar_type buf[])
     {
       ordinal_type curpos = 0;
-      for (ordinal_type j = 0; j < R.extent(1); ++j)
-        {
-          const scalar_type* const R_j = &R(0, j);
-          for (ordinal_type i = 0; i <= j; ++i)
-            buf[curpos++] = R_j[i];
+      for (ordinal_type j = 0; j < R.extent(1); ++j) {
+        const scalar_type* const R_j = &R(0, j);
+        for (ordinal_type i = 0; i <= j; ++i) {
+          buf[curpos++] = R_j[i];
         }
-    }
-
-    void
-    resizeWork (const ordinal_type numElts)
-    {
-      typedef typename std::vector< scalar_type >::size_type vec_size_type;
-      work_.resize (std::max (work_.size(), static_cast< vec_size_type >(numElts)));
+      }
     }
 
   private:
-    combine_type combine_;
-    Teuchos::RCP< MessengerBase< scalar_type > > messenger_;
-    std::vector< scalar_type > work_;
+    Teuchos::RCP<MessengerBase<scalar_type>> messenger_;
+    std::vector<scalar_type> work_;
 
     // Timers for various phases of the factorization.  Time is
     // cumulative over all calls of factorExplicit().
-    Teuchos::RCP< Teuchos::Time > totalTime_;
-    Teuchos::RCP< Teuchos::Time > reduceCommTime_;
-    Teuchos::RCP< Teuchos::Time > reduceTime_;
-    Teuchos::RCP< Teuchos::Time > bcastCommTime_;
-    Teuchos::RCP< Teuchos::Time > bcastTime_;
-
-    TimeStats totalStats_, reduceCommStats_, reduceStats_, bcastCommStats_, bcastStats_;
+    Teuchos::RCP<Teuchos::Time> totalTime_;
+    Teuchos::RCP<Teuchos::Time> reduceCommTime_;
+    Teuchos::RCP<Teuchos::Time> reduceTime_;
+    Teuchos::RCP<Teuchos::Time> bcastCommTime_;
+    Teuchos::RCP<Teuchos::Time> bcastTime_;
+
+    TimeStats totalStats_;
+    TimeStats reduceCommStats_;
+    TimeStats reduceStats_;
+    TimeStats bcastCommStats_;
+    TimeStats bcastStats_;
   };
 
 } // namespace TSQR
 
-#endif // __TSQR_DistTsqrRB_hpp
+#endif // TSQR_DISTTSQRRB_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp
index 89f91f788cdc..f9d3647e3a21 100644
--- a/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_FullTsqrTest.hpp
@@ -37,17 +37,20 @@
 // ************************************************************************
 //@HEADER
 
-#ifndef __TSQR_Test_FullTsqrTest_hpp
-#define __TSQR_Test_FullTsqrTest_hpp
+#ifndef TSQR_TEST_FULLTSQRTEST_HPP
+#define TSQR_TEST_FULLTSQRTEST_HPP
 
 #include "Tsqr.hpp"
+#include "Tsqr_NodeTsqrFactory.hpp"
 #include "Tsqr_Random_NormalGenerator.hpp"
 #include "Tsqr_Random_GlobalMatrix.hpp"
+#include "Tsqr_SequentialTsqr.hpp"
 #include "Tsqr_TestSetup.hpp"
 #include "Tsqr_GlobalVerify.hpp"
 #include "Tsqr_TeuchosMessenger.hpp"
 #include "Tsqr_TestUtils.hpp"
 #include "Teuchos_ScalarTraits.hpp"
+#include "Teuchos_TypeNameTraits.hpp"
 
 #include <iostream>
 #include <stdexcept>
@@ -56,93 +59,181 @@
 namespace TSQR {
   namespace Test {
 
-    /// \class TsqrInaccurate
-    /// \brief Signals that a TSQR test failed due to insufficient accuracy.
-    class TsqrInaccurate : public std::exception {
-    public:
-      //! Constructor
-      TsqrInaccurate (const std::string& msg) : msg_ (msg) {}
-
-      //! The error message
-      const char* what() const throw() { return msg_.c_str(); }
-
-      //! Destructor (declared virtual for memory safety of subclasses).
-      virtual ~TsqrInaccurate() throw() {}
-
-    private:
-      std::string msg_;
-    };
+    template<class Scalar>
+    using kokkos_value_type = typename std::conditional<
+        std::is_const<Scalar>::value,
+        const typename Kokkos::ArithTraits<
+          typename std::remove_const<Scalar>::type>::val_type,
+        typename Kokkos::ArithTraits<Scalar>::val_type
+      >::type;
+
+    /// Wrap A's storage, without copying, in an unmanaged LayoutLeft
+    /// HostSpace View: an lda x ncols parent cut to its first nrows rows.
+    template<class LO, class Scalar>
+    Kokkos::View<kokkos_value_type<Scalar>**,
+                 Kokkos::LayoutLeft, Kokkos::HostSpace,
+                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+    getHostMatrixView (const MatView<LO, Scalar>& A)
+    {
+      using IST = kokkos_value_type<Scalar>;
+      using host_mat_view_type =
+        Kokkos::View<IST**, Kokkos::LayoutLeft, Kokkos::HostSpace,
+          Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+
+      const size_t numRows (A.extent (0));
+      const size_t numCols (A.extent (1));
+      const size_t colStride (A.stride (1));
+      IST* rawPtr = reinterpret_cast<IST*> (A.data ());
+      host_mat_view_type parent (rawPtr, colStride, numCols);
+      const std::pair<size_t, size_t> rows (0, numRows);
+      return Kokkos::subview (parent, rows, Kokkos::ALL ());
+    }
+
+    /// Allocate a LayoutLeft Kokkos::View labeled \c label, without
+    /// initializing it, and deep_copy the entries of A into it.
+    template<class LO, class Scalar>
+    Kokkos::View<typename Kokkos::ArithTraits<Scalar>::val_type**,
+                 Kokkos::LayoutLeft>
+    getDeviceMatrixCopy (const MatView<LO, Scalar>& A,
+                         const std::string& label)
+    {
+      using IST = typename Kokkos::ArithTraits<Scalar>::val_type;
+      using device_matrix_type =
+        Kokkos::View<IST**, Kokkos::LayoutLeft>;
+
+      const size_t numRows (A.extent (0));
+      const size_t numCols (A.extent (1));
+      device_matrix_type devCopy
+        (Kokkos::view_alloc (label, Kokkos::WithoutInitializing),
+         numRows, numCols);
+      Kokkos::deep_copy (devCopy, getHostMatrixView (A));
+      return devCopy;
+    }
 
     /// \class FullTsqrVerifier
-    /// \brief Test (correctness and) accuracy of Tsqr for one Scalar type.
+    /// \brief Test (correctness and) accuracy of Tsqr for one Scalar
+    ///   type.
     /// \author Mark Hoemmen
     ///
-    /// This class is meant to be used only by \c
-    /// FullTsqrVerifierCaller.  It performs one accuracy test of \c
-    /// Tsqr for the given Scalar type (that is, the type of the
-    /// matrix entries).  An accuracy test is also a correctness test.
-    /// This test computes accuracy bounds for both orthogonality and
-    /// forward errors, and if those bounds are exceeded and the
-    /// failIfInaccurate option is enabled, the test will throw a \c
+    /// \tparam Scalar Type of each matrix entry.
+    ///
+    /// This class is meant to be used only by FullTsqrVerifierCaller.
+    /// It performs one accuracy test of Tsqr for the given Scalar
+    /// type.  An accuracy test is also a correctness test.  This test
+    /// computes accuracy bounds for both orthogonality and forward
+    /// errors, and if those bounds are exceeded and the
+    /// failIfInaccurate option is enabled, the test will throw a
     /// TsqrInaccurate exception.
     ///
-    /// The test takes a \c Teuchos::ParameterList input.  For a
+    /// The test takes a Teuchos::ParameterList input.  For a
     /// ParameterList with all parameters, their default values, and
-    /// documentation, see the relevant class method in \c
+    /// documentation, see the relevant class method in
     /// FullTsqrVerifierCaller.
-    ///
-    /// This class currently only tests the version of Tsqr that is
-    /// the composition of NodeTsqrType=SequentialTsqr and
-    /// DistTsqrType=DistTsqr.  This should suffice to test
-    /// correctness, as long as the other NodeTsqrType possibilities
-    /// (such as TbbTsqr) are tested separately.
-    ///
     template<class Scalar>
     class FullTsqrVerifier {
     public:
-      typedef Scalar scalar_type;
-      typedef int ordinal_type;
-      typedef SequentialTsqr<ordinal_type, scalar_type> node_tsqr_type;
-      typedef DistTsqr<ordinal_type, scalar_type> dist_tsqr_type;
-      typedef Tsqr<ordinal_type, scalar_type, node_tsqr_type> tsqr_type;
+      using scalar_type = Scalar;
+      using ordinal_type = int;
+      using node_tsqr_type = NodeTsqr<ordinal_type, scalar_type>;
+      using dist_tsqr_type = DistTsqr<ordinal_type, scalar_type>;
+      using tsqr_type = Tsqr<ordinal_type, scalar_type>;
 
     private:
+      /// Create the NodeTsqr named by testParams' "NodeTsqr" entry
+      /// ("Default" if unset); rank 0 logs the choice when verbose.
+      static Teuchos::RCP<node_tsqr_type>
+      getNodeTsqr (const Teuchos::RCP<Teuchos::ParameterList>& testParams,
+                   const int myRank,
+                   const bool verbose,
+                   const std::string& inputPrefix)
+      {
+        using Teuchos::RCP;
+        using std::cerr;
+        using std::endl;
+        using device_type = Kokkos::DefaultExecutionSpace::device_type;
+        const char cacheSizeHintParamName[] = "Cache Size Hint";
+        const std::string prefix = inputPrefix + "  ";
+
+        // NOTE(review): nodeTsqrParams is filled in below but is not
+        // handed to the NodeTsqr instance in this function -- confirm.
+        auto nodeTsqrParams = Teuchos::parameterList ("NodeTsqr");
+
+        size_t cacheSizeHint = 0;
+        if (testParams->isType<size_t> (cacheSizeHintParamName)) {
+          cacheSizeHint = testParams->get<size_t> (cacheSizeHintParamName);
+          nodeTsqrParams->set (cacheSizeHintParamName, cacheSizeHint);
+        }
+        else if (testParams->isType<int> (cacheSizeHintParamName)) {
+          cacheSizeHint = static_cast<size_t>
+            (testParams->get<int> (cacheSizeHintParamName));
+          nodeTsqrParams->set (cacheSizeHintParamName, cacheSizeHint);
+        }
+
+        std::string nodeTsqrName ("Default");
+        if (testParams->isType<std::string> ("NodeTsqr")) {
+          nodeTsqrName = testParams->get<std::string> ("NodeTsqr");
+        }
+        if (myRank == 0 && verbose) {
+          cerr << prefix << "getNodeTsqr:" << endl
+               << prefix << "  - NodeTsqr: " << nodeTsqrName << endl
+               << prefix << "  - Cache Size Hint: " << cacheSizeHint
+               << endl;
+        }
+
+        using node_tsqr_factory_type = TSQR::NodeTsqrFactory<
+          scalar_type, ordinal_type, device_type>;
+        RCP<node_tsqr_type> nodeTsqr =
+          node_tsqr_factory_type::getNodeTsqr (nodeTsqrName);
+        TEUCHOS_ASSERT( ! nodeTsqr.is_null () );
+
+        if (myRank == 0 && verbose) {
+          using execution_space = device_type::execution_space;
+          const std::string spaceName =
+            Teuchos::TypeNameTraits<execution_space>::name ();
+          const std::string myPrefix = prefix + "  * ";
+
+          cerr << myPrefix << "execution_space: " << spaceName << endl
+               << myPrefix << "concurrency: "
+               << execution_space ().concurrency () << endl
+               << myPrefix << "Requested NodeTsqr subclass type: "
+               << nodeTsqrName << endl
+               << myPrefix << "Actual NodeTsqr subclass type: "
+               << Teuchos::typeName (*nodeTsqr) << endl;
+        }
+        return nodeTsqr;
+      }
 
       //! Instantiate and return a (full) Tsqr instance.
       static Teuchos::RCP<tsqr_type>
       getTsqr (const Teuchos::RCP<Teuchos::ParameterList>& testParams,
-               const Teuchos::RCP<const Teuchos::Comm<int> >& comm)
+               const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
+               const bool verbose)
       {
-        using Teuchos::ParameterList;
-        using Teuchos::parameterList;
-        using Teuchos::rcp_implicit_cast;
         using Teuchos::RCP;
         using Teuchos::rcp;
+        using Teuchos::rcp_implicit_cast;
+        using std::cerr;
+        using std::endl;
+        const int myRank = comm->getRank ();
 
-        const size_t cacheSizeHint = testParams->get<size_t> ("cacheSizeHint");
-        //const int numTasks = testParams->get<int> ("numTasks");
-
-        //RCP<ParameterList> tsqrParams = parameterList ("NodeTsqr");
-        //tsqrParams->set ("Cache Size Hint", cacheSizeHint);
-        //tsqrParams->set ("Num Tasks", numCores);
-
-        // TODO (mfh 21 Oct 2011) Some node_tsqr_type classes need a
-        // Kokkos Node instance.  SequentialTsqr doesn't, so this code
-        // should be fine for now.
-        RCP<node_tsqr_type> seqTsqr = rcp (new node_tsqr_type (cacheSizeHint));
+        const std::string prefix ("  ");
 
-        RCP<TeuchosMessenger<scalar_type> > scalarMess =
+        if (myRank == 0 && verbose) {
+          cerr << prefix << "- Set up TSQR::Tsqr instance" << endl;
+        }
+        auto nodeTsqr =
+          getNodeTsqr (testParams, myRank, verbose, prefix);
+        auto scalarMess =
           rcp (new TeuchosMessenger<scalar_type> (comm));
-        RCP<MessengerBase<scalar_type> > scalarMessBase =
-          rcp_implicit_cast<MessengerBase<scalar_type> > (scalarMess);
-        RCP<dist_tsqr_type> distTsqr = rcp (new dist_tsqr_type);
+        auto scalarMessBase =
+          rcp_implicit_cast<MessengerBase<scalar_type>> (scalarMess);
+        RCP<dist_tsqr_type> distTsqr (new dist_tsqr_type);
         distTsqr->init (scalarMessBase);
 
-        return rcp (new tsqr_type (seqTsqr, distTsqr));
+        return rcp (new tsqr_type (nodeTsqr, distTsqr));
       }
 
     public:
-
       /// \brief Run the test for the Scalar type.
       ///
       /// \param comm [in] Communicator over which to run the test.
@@ -151,7 +242,9 @@ namespace TSQR {
       /// \param randomSeed [in/out] On input: the random seed for
       ///   LAPACK's pseudorandom number generator.  On output: the
       ///   updated random seed.
-      static void
+      ///
+      /// \return Whether the test passed.
+      static bool
       run (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
            const Teuchos::RCP<Teuchos::ParameterList>& testParams,
            std::vector<int>& randomSeed)
@@ -159,31 +252,52 @@ namespace TSQR {
         using std::cerr;
         using std::cout;
         using std::endl;
-        using Teuchos::arcp;
         using Teuchos::ParameterList;
         using Teuchos::parameterList;
         using Teuchos::RCP;
         using Teuchos::rcp;
-        using Teuchos::rcp_const_cast;
         using Teuchos::rcp_implicit_cast;
-        typedef Matrix<ordinal_type, scalar_type> matrix_type;
-        typedef MatView<ordinal_type, scalar_type> mat_view_type;
-        typedef typename tsqr_type::FactorOutput factor_output_type;
-
-        const int myRank = Teuchos::rank (*comm);
-        const int numProcs = Teuchos::size (*comm);
-
-        // Construct TSQR implementation instance.
-        RCP<tsqr_type> tsqr = getTsqr (testParams, comm);
+        using matrix_type = Matrix<ordinal_type, scalar_type>;
+        using mat_view_type = MatView<ordinal_type, scalar_type>;
+
+        bool success = true;
+
+        TEUCHOS_ASSERT( ! comm.is_null () );
+        TEUCHOS_ASSERT( ! testParams.is_null () );
+
+        const int myRank = comm->getRank ();
+        const int numProcs = comm->getSize ();
+        const bool verbose = testParams->get<bool> ("verbose");
+        const ordinal_type numRowsLocal =
+          testParams->get<ordinal_type> ("numRowsLocal");
+        const ordinal_type numCols =
+          testParams->get<ordinal_type> ("numCols");
+        //const int numCores = testParams->get<int> ("numCores");
+        const bool contiguousCacheBlocks =
+          testParams->get<bool> ("contiguousCacheBlocks");
+        const bool testFactorExplicit =
+          testParams->get<bool> ("testFactorExplicit");
+        const bool testRankRevealing =
+          testParams->get<bool> ("testRankRevealing");
+
+        if (myRank == 0 && verbose) {
+          cerr << "Full TSQR test: Scalar="
+               << Teuchos::TypeNameTraits<Scalar>::name () << endl
+               << "  - Command-line arguments:" << endl
+               << "    * numRowsLocal: " << numRowsLocal << endl
+               << "    * numCols: " << numCols << endl
+               << "    * contiguousCacheBlocks: "
+               << (contiguousCacheBlocks ? "true" : "false") << endl
+               << "    * testFactorExplicit: "
+               << (testFactorExplicit ? "true" : "false") << endl
+               << "    * testRankRevealing: "
+               << (testRankRevealing ? "true" : "false") << endl
+               << "    * verbose: "
+               << (verbose ? "true" : "false") << endl;
+        }
 
-        // Fetch test parameters from the input parameter list.
-        const ordinal_type numRowsLocal = testParams->get<ordinal_type> ("numRowsLocal");
-        const ordinal_type numCols = testParams->get<ordinal_type> ("numCols");
-        const int numCores = testParams->get<int> ("numCores");
-        const bool contiguousCacheBlocks = testParams->get<bool> ("contiguousCacheBlocks");
-        const bool testFactorExplicit = testParams->get<bool> ("testFactorExplicit");
-        const bool testRankRevealing = testParams->get<bool> ("testRankRevealing");
-        const bool debug = testParams->get<bool> ("debug");
+        RCP<tsqr_type> tsqr = getTsqr (testParams, comm, verbose);
+        TEUCHOS_ASSERT( ! tsqr.is_null () );
 
         // Space for each process's local part of the test problem.
         // A_local, A_copy, and Q_local are distributed matrices, and
@@ -193,7 +307,7 @@ namespace TSQR {
         matrix_type Q_local (numRowsLocal, numCols);
         matrix_type R (numCols, numCols);
 
-        // Start out by filling the test problem with zeros.
+        // Start by filling the test problem with zeros.
         deep_copy (A_local, Scalar {});
         deep_copy (A_copy, Scalar {});
         deep_copy (Q_local, Scalar {});
@@ -222,15 +336,21 @@ namespace TSQR {
 
         // We need a Messenger for Ordinal-type data, so that we can
         // build a global random test matrix.
-        RCP<MessengerBase<ordinal_type>> ordinalMessenger =
-          rcp_implicit_cast<MessengerBase<ordinal_type>> (rcp (new TeuchosMessenger<ordinal_type> (comm)));
+        auto ordinalMessenger =
+          rcp_implicit_cast<MessengerBase<ordinal_type>>
+            (rcp (new TeuchosMessenger<ordinal_type> (comm)));
 
         // We also need a Messenger for Scalar-type data.  The TSQR
         // implementation already constructed one, but it's OK to
         // construct another one; TeuchosMessenger is just a thin
         // wrapper over the Teuchos::Comm object.
-        RCP<MessengerBase<scalar_type>> scalarMessenger =
-          rcp_implicit_cast<MessengerBase<scalar_type>> (rcp (new TeuchosMessenger<scalar_type> (comm)));
+        auto scalarMessenger =
+          rcp_implicit_cast<MessengerBase<scalar_type>>
+            (rcp (new TeuchosMessenger<scalar_type> (comm)));
+
+        if (myRank == 0 && verbose) {
+          cerr << "  - Generate test problem" << endl;
+        }
 
         {
           // Generate a global distributed matrix (whose part local to
@@ -239,68 +359,201 @@ namespace TSQR {
           using TSQR::Random::randomGlobalMatrix;
           mat_view_type A_local_view (A_local.extent(0),
                                       A_local.extent(1),
-                                      A_local.data(), A_local.stride(1));
+                                      A_local.data(),
+                                      A_local.stride(1));
           const magnitude_type* const singVals = singularValues.data();
-          randomGlobalMatrix<mat_view_type, generator_type> (&gen, A_local_view, singVals,
-                                                             ordinalMessenger.getRawPtr(),
-                                                             scalarMessenger.getRawPtr());
+          randomGlobalMatrix (&gen, A_local_view, singVals,
+                              ordinalMessenger.getRawPtr(),
+                              scalarMessenger.getRawPtr());
         }
         // Save the pseudorandom number generator's seed for any later
         // tests.  The generator keeps its own copy of the seed and
         // updates it internally, so we have to ask for its copy.
         gen.getSeed (randomSeed);
 
+        if (myRank == 0 && verbose) {
+          cerr << "-- tsqr->wants_device_memory() = "
+               << (tsqr->wants_device_memory () ? "true" : "false")
+               << endl;
+        }
+
+        using IST =
+          typename Kokkos::ArithTraits<scalar_type>::val_type;
+        using device_matrix_type =
+          Kokkos::View<IST**, Kokkos::LayoutLeft>;
+
+        auto A_h = getHostMatrixView (A_local.view ());
+        auto A_copy_h = getHostMatrixView (A_copy.view ());
+        auto Q_h = getHostMatrixView (Q_local.view ());
+        device_matrix_type A_d;
+        device_matrix_type A_copy_d;
+        device_matrix_type Q_d;
+        if (tsqr->wants_device_memory ()) {
+          A_d = getDeviceMatrixCopy (A_local.view (), "A_d");
+          // Don't copy A_copy yet; see below.
+          A_copy_d = device_matrix_type ("A_copy_d",
+                                         numRowsLocal, numCols);
+          Q_d = device_matrix_type ("Q_d", numRowsLocal, numCols);
+        }
+
         // If specified in the test parameters, rearrange cache blocks
         // in the copy.  Otherwise, just copy the test problem into
         // A_copy.  The factorization overwrites the input matrix, so
         // we have to make a copy in order to validate the final
         // result.
-        if (contiguousCacheBlocks) {
-          tsqr->cache_block (numRowsLocal, numCols, A_copy.data(),
-                             A_local.data(), A_local.stride(1));
-          if (debug) {
-            Teuchos::barrier (*comm);
-            if (myRank == 0)
-              cerr << "-- Finished Tsqr::cache_block" << endl;
+
+        if (! contiguousCacheBlocks) {
+          if (myRank == 0 && verbose) {
+            cerr << "  - Copy A into A_copy" << endl;
+          }
+          deep_copy (A_copy, A_local);
+          if (tsqr->wants_device_memory ()) {
+            deep_copy (A_copy_d, A_d);
           }
         }
         else {
-          deep_copy (A_copy, A_local);
+          if (myRank == 0 && verbose) {
+            cerr << "  - Copy A into A_copy via cache_block" << endl;
+          }
+          if (tsqr->wants_device_memory ()) {
+            Scalar* A_copy_d_raw =
+              reinterpret_cast<Scalar*> (A_copy_d.data ());
+            const Scalar* A_d_raw =
+              reinterpret_cast<const Scalar*> (A_d.data ());
+            tsqr->cache_block (numRowsLocal, numCols, A_copy_d_raw,
+                               A_d_raw, A_d.stride (1));
+            deep_copy (A_copy_h, A_copy_d);
+          }
+          else {
+            tsqr->cache_block (numRowsLocal, numCols, A_copy.data (),
+                               A_local.data (), A_local.stride (1));
+          }
+          if (myRank == 0 && verbose) {
+            cerr << "  - Finished cache-blocking the test problem"
+                 << endl;
+          }
         }
 
-        // "factorExplicit" is an alternate, hopefully faster way of
-        // factoring the matrix, when only the explicit Q factor is
-        // wanted.
         if (testFactorExplicit) {
-          tsqr->factorExplicitRaw (A_copy.extent (0), A_copy.extent (1),
-                                   A_copy.data (), A_copy.stride (1),
-                                   Q_local.data (), Q_local.stride (1),
-                                   R.data (), R.stride (1),
-                                   contiguousCacheBlocks);
-          if (debug) {
-            Teuchos::barrier (*comm);
-            if (myRank == 0)
-              cerr << "-- Finished Tsqr::factorExplicit" << endl;
+          if (myRank == 0 && verbose) {
+            cerr << "  - Call factorExplicitRaw" << endl;
+          }
+          try {
+            if (tsqr->wants_device_memory ()) {
+              Scalar* A_raw =
+                reinterpret_cast<Scalar*> (A_copy_d.data ());
+              Scalar* Q_raw = reinterpret_cast<Scalar*> (Q_d.data ());
+              tsqr->factorExplicitRaw (A_copy_d.extent (0),
+                                       A_copy_d.extent (1),
+                                       A_raw,
+                                       A_copy_d.stride (1),
+                                       Q_raw,
+                                       Q_d.stride (1),
+                                       R.data (), R.stride (1),
+                                       contiguousCacheBlocks);
+              if (myRank == 0 && verbose) {
+                cerr << "  - Finished factorExplicitRaw; now "
+                  "deep_copy(Q_h, Q_d)" << endl;
+              }
+              deep_copy (Q_h, Q_d);
+            }
+            else {
+              Scalar* A_raw = A_copy.data ();
+              Scalar* Q_raw = Q_local.data ();
+              tsqr->factorExplicitRaw (A_copy.extent (0),
+                                       A_copy.extent (1),
+                                       A_raw,
+                                       A_copy.stride (1),
+                                       Q_raw,
+                                       Q_local.stride (1),
+                                       R.data (), R.stride (1),
+                                       contiguousCacheBlocks);
+              if (myRank == 0 && verbose) {
+                cerr << "  - Finished factorExplicitRaw" << endl;
+              }
+            }
+          }
+          catch (std::exception& e) {
+            std::ostringstream os;
+            os << "Proc " << myRank << " threw an exception: "
+               << e.what () << endl;
+            cerr << os.str ();
+            MPI_Abort (MPI_COMM_WORLD, -1);
+          }
+
+          bool found_nonzero_in_R = false;
+          for (ordinal_type j = 0; j < numCols; ++j) {
+            for (ordinal_type i = 0; i < numCols; ++i) {
+              if (R(i,j) != scalar_type {}) {
+                found_nonzero_in_R = true;
+              }
+            }
+          }
+
+          if (! found_nonzero_in_R) {
+            success = false;
+            if (myRank == 0) {
+              const std::string prefix
+                (verbose ? "  - *** " : "*** ");
+              const std::string scalarName =
+                Teuchos::TypeNameTraits<scalar_type>::name ();
+              cerr << prefix << "For Scalar=" << scalarName
+                   << ": R factor resulting from factorExplicitRaw "
+                   << "is zero." << endl;
+            }
           }
         }
         else {
-          // Factor the (copy of the) matrix.
-          factor_output_type factorOutput =
-            tsqr->factor (numRowsLocal, numCols, A_copy.data(), A_copy.stride(1),
-                          R.data(), R.stride(1), contiguousCacheBlocks);
-          if (debug) {
-            Teuchos::barrier (*comm);
-            if (myRank == 0)
-              cerr << "-- Finished Tsqr::factor" << endl;
+          if (myRank == 0 && verbose) {
+            cerr << "  - Call factor" << endl;
           }
-          // Compute the explicit Q factor in Q_local.
-          tsqr->explicit_Q (numRowsLocal, numCols, A_copy.data(), A_copy.stride(1),
-                            factorOutput, numCols, Q_local.data(), Q_local.stride(1),
-                            contiguousCacheBlocks);
-          if (debug) {
-            Teuchos::barrier (*comm);
-            if (myRank == 0)
-              cerr << "-- Finished Tsqr::explicit_Q" << endl;
+          auto factorOutput = [&] () {
+            if (tsqr->wants_device_memory ()) {
+              Scalar* A_raw =
+                reinterpret_cast<Scalar*> (A_copy_d.data ());
+              auto result =
+                tsqr->factor (numRowsLocal, numCols,
+                              A_raw, A_copy_d.stride (1),
+                              R.data (), R.stride (1),
+                              contiguousCacheBlocks);
+              deep_copy (A_copy_h, A_copy_d);
+              return result;
+            }
+            else {
+              // Host path: A_copy_d is unallocated; use A_copy.
+              Scalar* A_raw = A_copy.data ();
+              return tsqr->factor (numRowsLocal, numCols,
+                                   A_raw, A_copy.stride (1),
+                                   R.data (), R.stride (1),
+                                   contiguousCacheBlocks);
+            }
+          } ();
+
+          if (myRank == 0 && verbose) {
+            cerr << "  - Finished factor; call explicit_Q" << endl;
+          }
+          if (tsqr->wants_device_memory ()) {
+            const Scalar* A_raw =
+              reinterpret_cast<const Scalar*> (A_copy_d.data ());
+            Scalar* Q_raw = reinterpret_cast<Scalar*> (Q_d.data ());
+            tsqr->explicit_Q (numRowsLocal, numCols,
+                              A_raw, A_copy_d.stride (1),
+                              factorOutput, numCols,
+                              Q_raw, Q_d.stride (1),
+                              contiguousCacheBlocks);
+            deep_copy (Q_h, Q_d);
+          }
+          else {
+            const Scalar* A_raw = A_copy.data ();
+            Scalar* Q_raw = Q_local.data ();
+            tsqr->explicit_Q (numRowsLocal, numCols,
+                              A_raw, A_copy.stride (1),
+                              factorOutput, numCols,
+                              Q_raw, Q_local.stride (1),
+                              contiguousCacheBlocks);
+          }
+          if (myRank == 0 && verbose) {
+            cerr << "  - Finished explicit_Q" << endl;
           }
         }
 
@@ -318,12 +571,23 @@ namespace TSQR {
           // tolerance of zero to test the purported rank with the
           // actual numerical rank.
           const magnitude_type tol = STM::zero();
-          const ordinal_type rank =
-            tsqr->revealRankRaw (Q_local.extent (0), Q_local.extent (1),
-                                 Q_local.data (), Q_local.stride (1),
-                                 R.data (), R.stride (1), tol,
-                                 contiguousCacheBlocks);
-
+          if (myRank == 0 && verbose) {
+            cerr << "  - Call revealRankRaw" << endl;
+          }
+          const ordinal_type rank = [&] () {
+            Scalar* Q_raw = tsqr->wants_device_memory () ?
+              reinterpret_cast<Scalar*> (Q_d.data ()) :
+              Q_local.data ();
+            const ordinal_type ldq = tsqr->wants_device_memory () ?
+              Q_d.stride (1) : Q_local.stride (1);
+            return tsqr->revealRankRaw (numRowsLocal, numCols,
+                                        Q_raw, ldq,
+                                        R.data (), R.stride (1),
+                                        tol, contiguousCacheBlocks);
+          } ();
+          if (myRank == 0 && verbose) {
+            cerr << "  - Finished revealRankRaw" << endl;
+          }
           magnitude_type two_to_the_numCols = STM::one();
           for (int k = 0; k < numCols; ++k) {
             const magnitude_type two = STM::one() + STM::one();
@@ -333,22 +597,19 @@ namespace TSQR {
           // rounding error (so the test only fails if something is
           // really broken).
           if (two_to_the_numCols > magnitude_type(10) * STM::eps ()) {
-            TEUCHOS_TEST_FOR_EXCEPTION(
-              rank != numCols, std::logic_error, "The matrix of " << numCols
-              << " columns should have full numerical rank, but Tsqr reports "
-              "that it has rank " << rank << ".  Please report this bug to "
-              "the Kokkos developers.");
-            if (debug) {
-              Teuchos::barrier (*comm);
-              if (myRank == 0)
-                cerr << "-- Tested rank-revealing capability" << endl;
+            TEUCHOS_TEST_FOR_EXCEPTION
+              (rank != numCols, std::logic_error, "The matrix of " <<
+               numCols << " columns should have full numerical rank, "
+               "but Tsqr reports that it has rank " << rank << ".  "
+               "Please report this bug to the Kokkos developers.");
+            if (myRank == 0 && verbose) {
+              cerr << "  - Tested rank-revealing capability" << endl;
             }
           }
           else {
-            if (debug) {
-              Teuchos::barrier (*comm);
-              if (myRank == 0)
-                cerr << "-- Not testing rank-revealing capability; too many columns" << endl;
+            if (myRank == 0 && verbose) {
+              cerr << "  - Not testing rank-revealing capability; "
+                "too many columns" << endl;
             }
           }
         }
@@ -356,29 +617,49 @@ namespace TSQR {
         // were used.  This is only necessary because global_verify()
         // doesn't currently support contiguous cache blocks.
         if (contiguousCacheBlocks) {
-          // We can use A_copy as scratch space for
-          // un-cache-blocking Q_local, since we're done using
-          // A_copy for other things.
-          tsqr->un_cache_block (numRowsLocal, numCols, A_copy.data(),
-                                A_copy.stride(1), Q_local.data());
-          // Overwrite Q_local with the un-cache-blocked Q factor.
-          deep_copy (Q_local, A_copy);
-          if (debug) {
-            Teuchos::barrier (*comm);
-            if (myRank == 0)
-              cerr << "-- Finished Tsqr::un_cache_block" << endl;
+          // Use A_copy(_d) as scratch for un-cache-blocking Q_local.
+          if (myRank == 0 && verbose) {
+            cerr << "  - Call Tsqr::un_cache_block" << endl;
+          }
+          if (tsqr->wants_device_memory ()) {
+            Scalar* A_copy_d_raw =
+              reinterpret_cast<Scalar*> (A_copy_d.data ());
+            const Scalar* Q_d_raw =
+              reinterpret_cast<const Scalar*> (Q_d.data ());
+            tsqr->un_cache_block (numRowsLocal, numCols,
+                                  A_copy_d_raw,
+                                  A_copy_d.stride (1),
+                                  Q_d_raw);
+            deep_copy (Q_h, A_copy_d);
+          }
+          else {
+            tsqr->un_cache_block (numRowsLocal, numCols,
+                                  A_copy.data (),
+                                  A_copy.stride (1),
+                                  Q_local.data ());
+            deep_copy (Q_local, A_copy);
+          }
+          if (myRank == 0 && verbose) {
+            cerr << "  - Finished Tsqr::un_cache_block" << endl;
+          }
+        }
+        else {
+          if (tsqr->wants_device_memory ()) {
+            deep_copy (Q_h, Q_d);
           }
         }
 
-        // Test accuracy of the factorization.
-        const std::vector<magnitude_type> results =
-          global_verify (numRowsLocal, numCols, A_local.data(), A_local.stride(1),
-                         Q_local.data(), Q_local.stride(1), R.data(), R.stride(1),
+        if (myRank == 0 && verbose) {
+          cerr << "  - Call global_verify" << endl;
+        }
+        const auto results =
+          global_verify (numRowsLocal, numCols,
+                         A_local.data(), A_local.stride(1),
+                         Q_local.data(), Q_local.stride(1),
+                         R.data(), R.stride(1),
                          scalarMessenger.getRawPtr());
-        if (debug) {
-          Teuchos::barrier (*comm);
-          if (myRank == 0)
-            cerr << "-- Finished global_verify" << endl;
+        if (myRank == 0 && verbose) {
+          cerr << "  - Finished global_verify" << endl;
         }
 
         // Print the results on Proc 0.
@@ -390,7 +671,6 @@ namespace TSQR {
                  << ",numRowsLocal"
                  << ",numCols"
                  << ",numProcs"
-                 << ",numCores"
                  << ",cacheSizeHint"
                  << ",contiguousCacheBlocks"
                  << ",absFrobResid"
@@ -401,12 +681,13 @@ namespace TSQR {
             testParams->set ("printFieldNames", false);
           }
           if (testParams->get<bool> ("printResults")) {
+            const std::string scalarName =
+              Teuchos::TypeNameTraits<scalar_type>::name ();
             cout << "Tsqr"
-                 << "," << Teuchos::TypeNameTraits<scalar_type>::name()
+                 << "," << scalarName
                  << "," << numRowsLocal
                  << "," << numCols
                  << "," << numProcs
-                 << "," << numCores
                  << "," << tsqr->cache_size_hint()
                  << "," << contiguousCacheBlocks
                  << "," << results[0]
@@ -414,7 +695,7 @@ namespace TSQR {
                  << "," << results[2]
                  << endl;
           }
-        } // if (myRank == 0)
+        }
 
         // If requested, check accuracy and fail if results are not
         // sufficiently accurate.
@@ -447,28 +728,52 @@ namespace TSQR {
             magnitude_type(10*numCols*numCols) * STM::eps();
 
           // Avoid division by zero.
-          const magnitude_type relResidError =
-            results[0] / (results[2] == STM::zero() ? STM::one() : results[2]);
-          TEUCHOS_TEST_FOR_EXCEPTION(
-            relResidError > relResidBound, TsqrInaccurate, "Full Tsqr "
-            "has an inaccurate relative residual ||A - QR||_F"
-            << (results[2] == STM::zero() ? " / ||A||_F" : "")
-            << " = " << relResidError << ", which is greater than the bound "
-            << relResidBound << " by a factor of "
-            << relResidError / relResidBound << ".");
+          const magnitude_type relResidError = results[0] /
+            (results[2] == STM::zero() ? STM::one() : results[2]);
+
+          if (relResidError > relResidBound) {
+            success = false;
+            if (myRank == 0) {
+              const std::string prefix
+                (verbose ? "  - *** " : "*** ");
+              const std::string scalarName =
+                Teuchos::TypeNameTraits<scalar_type>::name ();
+              // Label as relative only if we divided by results[2].
+              const std::string relResStr
+                (results[2] == STM::zero() ? "" : " / ||A||_F");
+              cerr << prefix << "For Scalar=" << scalarName
+                   << ": Inaccurate residual ||A - QR||_F"
+                   << relResStr
+                   << " = " << relResidError << "." << endl
+                   << prefix << "It's greater than the bound "
+                   << relResidBound << " by a factor of "
+                   << relResidError / relResidBound << "." << endl;
+            }
+          }
           const magnitude_type orthoError = results[1];
-          TEUCHOS_TEST_FOR_EXCEPTION(
-            orthoError > orthoBound, TsqrInaccurate,
-            "Full Tsqr has an inaccurate orthogonality measure ||I - Q^* Q||_F"
-            << results[1] << " = " << orthoError << ", which is greater than "
-            "the bound " << orthoBound << " by a factor of "
-            << orthoError / orthoBound << ".");
+          if (orthoError > orthoBound) {
+            success = false;
+            if (myRank == 0) {
+              const std::string prefix
+                (verbose ? "  - *** " : "*** ");
+              const std::string scalarName =
+                Teuchos::TypeNameTraits<scalar_type>::name ();
+              cerr << prefix << "For Scalar=" << scalarName
+                   << ": Inaccurate orthogonality measure "
+                   << "||I - Q^* Q||_F = " << orthoError << "."
+                   << endl << prefix << "It's greater than the bound "
+                   << orthoBound << " by a factor of "
+                   << orthoError / orthoBound << "." << endl;
+            }
+          }
         } // if (the tests should fail on inaccuracy)
+        return success;
       }
     };
 
     /// \class FullTsqrVerifierCallerImpl
-    /// \brief This class implements a "function template specialization."
+    /// \brief This class implements a "function template
+    ///   specialization."
     /// \author Mark Hoemmen
     ///
     /// We want to make FullTsqrVerifierCaller::run() a template
@@ -489,7 +794,7 @@ namespace TSQR {
     template<class TypeListType>
     class FullTsqrVerifierCallerImpl {
     public:
-      static void
+      static bool
       run (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
            const Teuchos::RCP<Teuchos::ParameterList>& testParams,
            std::vector<int>& randomSeed);
@@ -499,17 +804,21 @@ namespace TSQR {
     // Partial specialization for Cons<CarType, CdrType>.
     //
     template<class CarType, class CdrType>
-    class FullTsqrVerifierCallerImpl<TSQR::Test::Cons<CarType, CdrType> > {
+    class FullTsqrVerifierCallerImpl<TSQR::Test::Cons<CarType, CdrType>>
+    {
     public:
-      static void
+      static bool
       run (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
            const Teuchos::RCP<Teuchos::ParameterList>& testParams,
            std::vector<int>& randomSeed)
       {
-        typedef CarType car_type;
-        typedef CdrType cdr_type;
-        FullTsqrVerifier<car_type>::run (comm, testParams, randomSeed);
-        FullTsqrVerifierCallerImpl<cdr_type>::run (comm, testParams, randomSeed);
+        using car_type = FullTsqrVerifier<CarType>;
+        using cdr_type = FullTsqrVerifierCallerImpl<CdrType>;
+        const bool success1 =
+          car_type::run (comm, testParams, randomSeed);
+        const bool success2 =
+          cdr_type::run (comm, testParams, randomSeed);
+        return success1 && success2;
       }
     };
 
@@ -519,22 +828,23 @@ namespace TSQR {
     template<>
     class FullTsqrVerifierCallerImpl<TSQR::Test::NullCons> {
     public:
-      static void
+      static bool
       run (const Teuchos::RCP<const Teuchos::Comm<int> >&,
            const Teuchos::RCP<Teuchos::ParameterList>&,
            std::vector<int>&)
       {
-        // We're at the end of the type list, so do nothing.
+        return true;
       }
     };
 
     /// \class FullTsqrVerifierCaller
-    /// \brief Invokes FullTsqrVerifier::run() over all Scalar types in a type list.
+    /// \brief Invokes FullTsqrVerifier::run() over all Scalar types
+    ///   in a type list.
     /// \author Mark Hoemmen
     ///
     /// Use this class to test the full TSQR implementation in Tsqr.
     /// It will test Tsqr over a list of Scalar types that you define,
-    /// using \c Cons and \c NullCons.
+    /// using Cons and NullCons.
     class FullTsqrVerifierCaller {
     public:
       /// \typedef ordinal_type
@@ -556,7 +866,7 @@ namespace TSQR {
         RCP<ParameterList> plist = parameterList ("FullTsqrVerifier");
 
         const size_t cacheSizeHint = 0;
-        const int numCores = 1;
+        // const int numCores = 1;
         const ordinal_type numRowsLocal = 100;
         const ordinal_type numCols = 10;
         const bool contiguousCacheBlocks = false;
@@ -565,67 +875,67 @@ namespace TSQR {
         const bool printFieldNames = true;
         const bool printResults = true;
         const bool failIfInaccurate = true;
-        const bool debug = false;
+        const std::string nodeTsqr ("Default");
+        const bool verbose = false;
 
         // Parameters for configuring Tsqr itself.
-        plist->set ("cacheSizeHint", cacheSizeHint,
+        plist->set ("Cache Size Hint", cacheSizeHint,
                     "Cache size hint in bytes.  "
                     "Zero means TSQR picks a reasonable default.");
-        plist->set ("numCores", numCores,
-                    "Number of partition(s) to use for TbbTsqr (if "
-                    "applicable).  Must be a positive integer.");
 
         // Parameters for testing Tsqr.
         plist->set ("numRowsLocal", numRowsLocal,
-                    "Number of rows per (MPI) process in the test matrix.  "
-                    "Must be >= the number of columns.");
+                    "Number of rows per (MPI) process in the test "
+                    "matrix.  Must be >= the number of columns.");
         plist->set ("numCols", numCols,
                     "Number of columns in the test matrix.");
         plist->set ("contiguousCacheBlocks", contiguousCacheBlocks,
-                    "Whether to test the factorization with contiguously "
-                    "stored cache blocks.");
+                    "Whether to test the factorization with "
+                    "contiguously stored cache blocks.");
         plist->set ("testFactorExplicit", testFactorExplicit,
-                    "Whether to test TSQR's factorExplicit() (a hopefully "
-                    "faster path than calling factor() and explicit_Q() in "
-                    "sequence).");
+                    "Whether to test TSQR's factorExplicit() (a "
+                    "hopefully faster path than calling factor() and "
+                    "explicit_Q() in sequence).");
         plist->set ("testRankRevealing", testRankRevealing,
                     "Whether to test TSQR's rank-revealing capability.");
         plist->set ("printFieldNames", printFieldNames,
-                    "Whether to print field names (this is only done once, "
-                    "for all Scalar types tested).");
+                    "Whether to print field names (this is only done "
+                    "once, for all Scalar types tested).");
         plist->set ("printResults", printResults,
                     "Whether to print test results.");
         plist->set ("failIfInaccurate", failIfInaccurate,
                     "Whether to fail the test if the factorization "
                     "is not sufficiently accurate.");
-        plist->set ("debug", debug,
-                    "Whether to print debugging output.");
+        plist->set ("NodeTsqr", nodeTsqr, "NodeTsqr subclass to use; "
+                    "\"Default\" means let TSQR pick it");
+        plist->set ("verbose", verbose,
+                    "Whether to print verbose debugging output.");
         return plist;
       }
 
-      /// \brief Run TsqrVerifier<T>::run() for every type in the type list.
+      /// \brief Run FullTsqrVerifier<T>::run() for every type in the
+      ///   type list.
       ///
-      /// TypeListType should be either a \c NullCons (representing an
+      /// TypeListType should be either a NullCons (representing an
       /// empty type list, in which case this function does nothing),
-      /// or a \c Cons (whose CarType is a Scalar type to test, and
-      /// whose CdrType is either a NullCons or a Cons).
+      /// or a Cons (whose CarType is a Scalar type to test, and whose
+      /// CdrType is either a NullCons or a Cons).
       ///
       /// \param testParams [in/out] List of parameters for all tests
-      ///   to run.  Call \c getValidParameterList() to get a valid
-      ///   list of parameters with default values and documentation.
+      ///   to run.  Call getValidParameterList() to get a valid list
+      ///   of parameters with default values and documentation.
       ///
       template<class TypeListType>
-      void
+      bool
       run (const Teuchos::RCP<Teuchos::ParameterList>& testParams)
       {
         // Using a class with a static method is a way to implement
         // "partial specialization of function templates" (which by
         // itself is not allowed in C++).
-        typedef FullTsqrVerifierCallerImpl<TypeListType> impl_type;
-        impl_type::run (comm_, testParams, randomSeed_);
+        using impl_type = FullTsqrVerifierCallerImpl<TypeListType>;
+        return impl_type::run (comm_, testParams, randomSeed_);
       }
 
-
       /// \brief Full constructor.
       ///
       /// \param comm [in] Communicator (with one or more processes)
@@ -660,17 +970,19 @@ namespace TSQR {
       static std::vector<int>
       validateRandomSeed (const std::vector<int>& seed)
       {
-        TEUCHOS_TEST_FOR_EXCEPTION(
-          seed.size () < 4, std::invalid_argument, "Invalid random seed: "
-          "Need an array of four integers.");
-        for (std::vector<int>::size_type k = 0; k < seed.size (); ++k) {
-          TEUCHOS_TEST_FOR_EXCEPTION(
-            seed[k] < 0 || seed[k] > 4095, std::invalid_argument, "Invalid "
-            "random seed: Each of the four integers must be in [0, 4095].");
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (seed.size () < 4, std::invalid_argument, "Invalid random "
+           "seed: Need an array of four integers, but you gave us "
+           << seed.size () << " of them.");
+        for (size_t k = 0; k < seed.size (); ++k) {
+          TEUCHOS_TEST_FOR_EXCEPTION
+            (seed[k] < 0 || seed[k] > 4095, std::invalid_argument,
+             "seed[" << k << "]=" << seed[k] << " is invalid.  "
+             "Each of the four seeds must be in [0, 4095].");
         }
-        TEUCHOS_TEST_FOR_EXCEPTION(
-          seed[3] % 2 != 1, std::invalid_argument, "Invalid random seed: "
-          "The last of the four integers must be odd.");
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (seed[3] % 2 != 1, std::invalid_argument, "seed[3]="
+           << seed[3] << " is invalid: it must be odd.");
         return seed;
       }
 
@@ -691,7 +1003,7 @@ namespace TSQR {
       ///
       /// This communicator may include one or more processes.
       /// MPI is not required (it may be a "serial communicator").
-      Teuchos::RCP<const Teuchos::Comm<int> > comm_;
+      Teuchos::RCP<const Teuchos::Comm<int>> comm_;
 
       /// \brief The seed for LAPACK's pseudorandom number generator.
       ///
@@ -704,5 +1016,4 @@ namespace TSQR {
   } // namespace Test
 } // namespace TSQR
 
-#endif // __TSQR_Test_FullTsqrTest_hpp
-
+#endif // TSQR_TEST_FULLTSQRTEST_HPP
diff --git a/packages/tpetra/tsqr/src/TsqrFactory_TbbTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CombineUser.hpp
similarity index 55%
rename from packages/tpetra/tsqr/src/TsqrFactory_TbbTsqr.hpp
rename to packages/tpetra/tsqr/src/Tsqr_Impl_CombineUser.hpp
index 4e5d22e1403c..fab3efa79671 100644
--- a/packages/tpetra/tsqr/src/TsqrFactory_TbbTsqr.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CombineUser.hpp
@@ -37,54 +37,49 @@
 // ************************************************************************
 //@HEADER
 
-#ifndef __TSQR_Trilinos_TsqrFactory_TbbTsqr_hpp
-#define __TSQR_Trilinos_TsqrFactory_TbbTsqr_hpp
+#ifndef TSQR_COMBINEUSER_HPP
+#define TSQR_COMBINEUSER_HPP
 
-/// \file TsqrFactory_TbbTsqr.hpp
-///
-/// \warning Trilinos users should _not_ include this file directly.
-
-#include "Tsqr_ConfigDefs.hpp"
-
-#ifdef HAVE_KOKKOSTSQR_TBB
-#  include "TbbTsqr.hpp"
-#endif // HAVE_KOKKOSTSQR_TBB
+#include "Tsqr_CombineFactory.hpp"
 
 namespace TSQR {
-  namespace Trilinos {
+namespace Impl {
 
-#ifdef HAVE_KOKKOSTSQR_TBB
-    /// \class TbbTsqrFactory
-    /// \brief Subclass of TsqrFactory that uses \c TSQR::TBB::TbbTsqr.
-    /// \author Mark Hoemmen
-    ///
-    /// \tparam LO "LocalOrdinal": the type of indices into the
-    ///   node-local part of the matrix.
-    ///
-    /// \tparam S "Scalar": the type of entries in the node-local part
-    ///   of the matrix.
-    ///
-    /// All of this class' public methods, other than the constructor
-    /// and destructor, are implemented in the parent class.
-    template<class LO, class S>
-    class TbbTsqrFactory :
-      public TsqrFactory<LO, S, TSQR::TBB::TbbTsqr<LO, S>, DistTsqr<LO, S> > {
-    public:
-      // Help C++ pull in the typedefs from the base class.  C++ needs
-      // help when both the base and the derived classes are
-      // templated.
-      typedef typename base_type::node_tsqr_type node_tsqr_type;
-      typedef typename base_type::dist_tsqr_type dist_tsqr_type;
-      typedef typename base_type::tsqr_type tsqr_type;
-      typedef typename base_type::scalar_messenger_type scalar_messenger_type;
+/// \class CombineUser
+/// \brief Private base class for TSQR classes that use Combine.
+///
+/// Classes that use Combine should inherit privately from this class,
+/// in order to reuse getCombine.  One Combine instance is created lazily and then cached.
+template<class LocalOrdinal, class Scalar>
+class CombineUser {
+public:
+  /// \brief Given the maximum number of columns that the caller
+  ///   intends to give to Combine functions, return the best choice
+  ///   of Combine implementation.
+  Combine<LocalOrdinal, Scalar>&
+  getCombine (const LocalOrdinal maxNumCols) const {
+    if (combine_.get () == nullptr) { // only the first call (of either overload) creates the instance
+      using factory_type = CombineFactory<LocalOrdinal, Scalar>;
+      combine_ = factory_type::create (maxNumCols);
+    }
+    return *combine_; // once cached, maxNumCols no longer influences the result
+  }
 
-      TbbTsqrFactory () {}
-      virtual ~TbbTsqrFactory () {}
-    };
-#endif // HAVE_KOKKOSTSQR_TBB
+  //! Return a specific Combine implementation.
+  Combine<LocalOrdinal, Scalar>&
+  getCombine (const std::string& combineType) const {
+    if (combine_.get () == nullptr) { // shares the cache with the maxNumCols overload
+      using factory_type = CombineFactory<LocalOrdinal, Scalar>;
+      combine_ = factory_type::create (combineType);
+    }
+    return *combine_; // once cached, combineType no longer influences the result
+  }
 
-  } // namespace Trilinos
-} // namespace TSQR
+private:
+  mutable std::unique_ptr<Combine<LocalOrdinal, Scalar>> combine_; // mutable so the const getters can fill it in
+};
 
+} // namespace Impl
+} // namespace TSQR
 
-#endif // __TSQR_Trilinos_TsqrFactory_TbbTsqr_hpp
+#endif // TSQR_COMBINEUSER_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlas.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlas.cpp
new file mode 100644
index 000000000000..4a7fdaccf368
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlas.cpp
@@ -0,0 +1,149 @@
+#include "Tsqr_Impl_CuBlas.hpp"
+#if defined(HAVE_TPETRATSQR_CUBLAS)
+#include "Tsqr_Impl_CuBlasHandle.hpp"
+#include "Tsqr_Impl_CuTypes.hpp"
+#include "Teuchos_Assert.hpp"
+
+namespace TSQR {
+namespace Impl {
+
+template<class T>
+class RawCuBlas {}; // Empty primary template; only the explicit specializations below provide gemm.
+
+template<>
+class RawCuBlas<double> { // gemm for double entries
+public:
+  using impl_scalar_type = double; // element type handed to cuBLAS
+
+  static cublasStatus_t // forwards every argument to cublasDgemm and returns its status unchanged
+  gemm (cublasHandle_t handle,
+        cublasOperation_t transa,
+        cublasOperation_t transb,
+        const int m, const int n, const int k,
+        const impl_scalar_type* alpha,
+        const impl_scalar_type* A, const int lda,
+        const impl_scalar_type* B, const int ldb,
+        const impl_scalar_type* beta,
+        impl_scalar_type* C, const int ldc)
+  {
+    return cublasDgemm (handle, transa, transb, m, n, k,
+                        alpha, A, lda, B, ldb, beta, C, ldc);
+  }
+};
+
+template<>
+class RawCuBlas<float> { // gemm for float entries
+public:
+  using impl_scalar_type = float; // element type handed to cuBLAS
+
+  static cublasStatus_t // forwards every argument to cublasSgemm and returns its status unchanged
+  gemm (cublasHandle_t handle,
+        cublasOperation_t transa,
+        cublasOperation_t transb,
+        const int m, const int n, const int k,
+        const impl_scalar_type* alpha,
+        const impl_scalar_type* A, const int lda,
+        const impl_scalar_type* B, const int ldb,
+        const impl_scalar_type* beta,
+        impl_scalar_type* C, const int ldc)
+  {
+    return cublasSgemm (handle, transa, transb, m, n, k,
+                        alpha, A, lda, B, ldb, beta, C, ldc);
+  }
+};
+
+#if defined(HAVE_TPETRATSQR_COMPLEX)
+template<>
+class RawCuBlas<CudaValue<std::complex<double>>::type> { // gemm for the CUDA-side type that CudaValue maps std::complex<double> to
+public:
+  using impl_scalar_type = CudaValue<std::complex<double>>::type; // element type handed to cuBLAS
+
+  static cublasStatus_t // forwards every argument to cublasZgemm and returns its status unchanged
+  gemm (cublasHandle_t handle,
+        cublasOperation_t transa,
+        cublasOperation_t transb,
+        const int m, const int n, const int k,
+        const impl_scalar_type* alpha,
+        const impl_scalar_type* A, const int lda,
+        const impl_scalar_type* B, const int ldb,
+        const impl_scalar_type* beta,
+        impl_scalar_type* C, const int ldc)
+  {
+    return cublasZgemm (handle, transa, transb, m, n, k,
+                        alpha, A, lda, B, ldb, beta, C, ldc);
+  }
+};
+
+template<>
+class RawCuBlas<CudaValue<std::complex<float>>::type> { // gemm for the CUDA-side type that CudaValue maps std::complex<float> to
+public:
+  using impl_scalar_type = CudaValue<std::complex<float>>::type; // element type handed to cuBLAS
+
+  static cublasStatus_t // forwards every argument to cublasCgemm and returns its status unchanged
+  gemm (cublasHandle_t handle,
+        cublasOperation_t transa,
+        cublasOperation_t transb,
+        const int m, const int n, const int k,
+        const impl_scalar_type* alpha,
+        const impl_scalar_type* A, const int lda,
+        const impl_scalar_type* B, const int ldb,
+        const impl_scalar_type* beta,
+        impl_scalar_type* C, const int ldc)
+  {
+    return cublasCgemm (handle, transa, transb, m, n, k,
+                        alpha, A, lda, B, ldb, beta, C, ldc);
+  }
+};
+#endif // defined(HAVE_TPETRATSQR_COMPLEX)
+
+template<class Scalar>
+CuBlas<Scalar>::CuBlas (CuBlasHandle handle) : // keeps a copy of the handle wrapper for later gemm calls
+  handle_ (handle) {}
+
+template<class Scalar>
+void // failure is reported through the TEUCHOS_ASSERT at the end, not a return value
+CuBlas<Scalar>::
+gemm (const char transa,
+      const char transb,
+      const int m, const int n, const int k,
+      const Scalar alpha,
+      const Scalar* A, const int lda,
+      const Scalar* B, const int ldb,
+      const Scalar beta,
+      Scalar* C, const int ldc)
+{
+  auto rawHandle = // CuBlasHandle exposes the handle as void*; cast it back to cuBLAS's type
+    reinterpret_cast<cublasHandle_t> (handle_.getHandle ());
+  const cublasOperation_t cuTransa = cuBlasTrans (transa); // char flag -> cuBLAS enum
+  const cublasOperation_t cuTransb = cuBlasTrans (transb);
+
+  using IST = typename CudaValue<Scalar>::type; // scalar type used on the cuBLAS side
+  const IST alpha_raw = CudaValue<Scalar>::makeValue (alpha); // local copy, passed by address below
+  const IST* A_raw = reinterpret_cast<const IST*> (A); // NOTE(review): relies on Scalar and IST sharing a layout -- confirm in CudaValue
+  const IST* B_raw = reinterpret_cast<const IST*> (B);
+  const IST beta_raw = CudaValue<Scalar>::makeValue (beta); // local copy, passed by address below
+  IST* C_raw = reinterpret_cast<IST*> (C);
+
+  using impl_type = RawCuBlas<IST>; // picks the S/D/C/Z gemm wrapper for this scalar type
+  // https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-gemm
+  // says that alpha and beta may be host or device pointers.
+  const auto status =
+    impl_type::gemm (rawHandle, cuTransa, cuTransb,
+                     m, n, k,
+                     &alpha_raw, A_raw, lda,
+                     B_raw, ldb,
+                     &beta_raw, C_raw, ldc);
+  TEUCHOS_ASSERT( status == CUBLAS_STATUS_SUCCESS ); // any other status fails the assertion
+}
+
+template class CuBlas<double>; // explicit instantiations; these pair with the extern template declarations in Tsqr_Impl_CuBlas.hpp
+template class CuBlas<float>;
+#if defined(HAVE_TPETRATSQR_COMPLEX)
+template class CuBlas<std::complex<double>>; // complex versions exist only when complex support is configured
+template class CuBlas<std::complex<float>>;
+#endif // defined(HAVE_TPETRATSQR_COMPLEX)
+
+} // namespace Impl
+} // namespace TSQR
+
+#endif // HAVE_TPETRATSQR_CUBLAS
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlas.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlas.hpp
new file mode 100644
index 000000000000..08ef1c989878
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlas.hpp
@@ -0,0 +1,44 @@
+#ifndef TSQR_IMPL_CUBLAS_HPP
+#define TSQR_IMPL_CUBLAS_HPP
+
+#include "TpetraTSQR_config.h"
+#if defined(HAVE_TPETRATSQR_CUBLAS)
+#  include "Tsqr_Impl_CuBlasHandle.hpp"
+#  if defined(HAVE_TPETRATSQR_COMPLEX)
+#    include <complex>
+#  endif // HAVE_TPETRATSQR_COMPLEX
+
+namespace TSQR {
+namespace Impl {
+
+template<class Scalar>
+class CuBlas { // Thin wrapper that calls cuBLAS gemm for one Scalar type through a stored handle.
+public:
+  CuBlas (CuBlasHandle handle); // stores a copy of the handle wrapper
+
+  void // member functions are defined in Tsqr_Impl_CuBlas.cpp
+  gemm (const char transa,
+        const char transb,
+        const int m, const int n, const int k,
+        const Scalar alpha,
+        const Scalar* A, const int lda,
+        const Scalar* B, const int ldb,
+        const Scalar beta,
+        Scalar* C, const int ldc);
+
+private:
+  CuBlasHandle handle_; // handle wrapper supplied at construction
+};
+
+extern template class CuBlas<double>; // suppress implicit instantiation; Tsqr_Impl_CuBlas.cpp instantiates these explicitly
+extern template class CuBlas<float>;
+#if defined(HAVE_TPETRATSQR_COMPLEX)
+extern template class CuBlas<std::complex<double>>; // complex versions only when complex support is configured
+extern template class CuBlas<std::complex<float>>;
+#endif // defined(HAVE_TPETRATSQR_COMPLEX)
+
+} // namespace Impl
+} // namespace TSQR
+
+#endif // HAVE_TPETRATSQR_CUBLAS
+#endif // TSQR_IMPL_CUBLAS_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlasHandle.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlasHandle.cpp
new file mode 100644
index 000000000000..352fe743b725
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlasHandle.cpp
@@ -0,0 +1,38 @@
+#include "Tsqr_Impl_CuBlasHandle.hpp"
+
+#ifdef HAVE_TPETRATSQR_CUBLAS
+#include "Kokkos_Core.hpp"
+#include "Teuchos_Assert.hpp"
+#include <cublas_v2.h>
+
+namespace TSQR {
+namespace Impl {
+
+cublasHandle_t cuBlasRawHandle_ = nullptr; // shared handle: created in getSingleton, destroyed by its finalize hook. NOTE(review): has external linkage -- make it file-local if no other file refers to it
+
+CuBlasHandle::CuBlasHandle (void* handle) : // simply records the pointer it is given
+  handle_ (handle)
+{}
+
+CuBlasHandle CuBlasHandle::getSingleton ()
+{
+  // Key on the handle itself (no separate flag), so a call made after the finalize hook ran creates a fresh handle.
+  if (cuBlasRawHandle_ == nullptr) {
+    cublasHandle_t newHandle = nullptr; // create into a local so a failed create never publishes a bad handle
+    const auto status = cublasCreate (&newHandle);
+    TEUCHOS_ASSERT( status == CUBLAS_STATUS_SUCCESS );
+    cuBlasRawHandle_ = newHandle;
+    Kokkos::push_finalize_hook ([] () { // registered only after creation succeeded, so retries never stack hooks
+      if (cuBlasRawHandle_ != nullptr) {
+        (void) cublasDestroy (cuBlasRawHandle_); // best effort; the status is deliberately ignored
+        cuBlasRawHandle_ = nullptr;
+      }
+    });
+  }
+  TEUCHOS_ASSERT( cuBlasRawHandle_ != nullptr );
+  return CuBlasHandle (cuBlasRawHandle_); // non-owning wrapper around the shared handle
+}
+
+} // namespace Impl
+} // namespace TSQR
+#endif // HAVE_TPETRATSQR_CUBLAS
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlasHandle.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlasHandle.hpp
new file mode 100644
index 000000000000..05899aaeb28d
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuBlasHandle.hpp
@@ -0,0 +1,33 @@
+#ifndef TSQR_IMPL_CUBLASHANDLE_HPP
+#define TSQR_IMPL_CUBLASHANDLE_HPP
+
+#include "TpetraTSQR_config.h"
+#ifdef HAVE_TPETRATSQR_CUBLAS
+
+namespace TSQR {
+namespace Impl {
+
+class CuBlasHandle { // Opaque wrapper that keeps <cublas_v2.h> out of this header.
+private:
+  // This is actually a cublasHandle_t, which is a pointer type.
+  void* handle_ {nullptr};
+
+  CuBlasHandle (void* handle); // private: instances are obtained through getSingleton
+
+public:
+  static CuBlasHandle getSingleton (); // defined in Tsqr_Impl_CuBlasHandle.cpp
+
+  // This is not really encapsulation, because the "handle" type is
+  // just a pointer.  However, it lets us define cuBlas wrapper
+  // functions without needing to make them friends of CuBlasHandle.
+  void* getHandle () const { // callers cast the result back to cublasHandle_t
+    return handle_;
+  }
+};
+
+} // namespace Impl
+} // namespace TSQR
+
+#endif // HAVE_TPETRATSQR_CUBLAS
+
+#endif // TSQR_IMPL_CUBLASHANDLE_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolver.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolver.cpp
new file mode 100644
index 000000000000..e4f01e920285
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolver.cpp
@@ -0,0 +1,603 @@
+#include "Tsqr_Impl_CuSolver.hpp"
+#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER)
+#include "Tsqr_Impl_CuSolverHandle.hpp"
+#include "Tsqr_Impl_CuTypes.hpp"
+#include "Teuchos_Assert.hpp"
+
+namespace TSQR {
+namespace Impl {
+
+template<class T>
+class RawCuSolver {}; // Empty primary template; only the explicit specializations below provide the QR wrappers.
+
+template<>
+class RawCuSolver<double> { // cuSOLVER dense QR wrappers for double entries
+public:
+  using impl_scalar_type = double; // element type handed to cuSOLVER
+
+  static cusolverStatus_t // workspace query: forwards to cusolverDnDgeqrf_bufferSize
+  compute_QR_lwork (cusolverDnHandle_t handle,
+                    int m,
+                    int n,
+                    impl_scalar_type* A,
+                    int lda,
+                    int *lwork)
+  {
+    return cusolverDnDgeqrf_bufferSize (handle, m, n, A, lda, lwork);
+  }
+
+  static cusolverStatus_t // QR factorization: forwards to cusolverDnDgeqrf
+  compute_QR (cusolverDnHandle_t handle,
+              int m,
+              int n,
+              impl_scalar_type* A,
+              int lda,
+              impl_scalar_type* tau,
+              impl_scalar_type* work,
+              int lwork,
+              int* info)
+  {
+    return cusolverDnDgeqrf (handle, m, n, A, lda, tau,
+                             work, lwork, info);
+  }
+
+  static cusolverStatus_t // workspace query: forwards to cusolverDnDormqr_bufferSize
+  apply_Q_factor_lwork (cusolverDnHandle_t handle,
+                        cublasSideMode_t side,
+                        cublasOperation_t trans,
+                        int m,
+                        int n,
+                        int k,
+                        const impl_scalar_type* A,
+                        int lda,
+                        const impl_scalar_type* tau,
+                        const impl_scalar_type* C,
+                        int ldc,
+                        int *lwork)
+  {
+    return cusolverDnDormqr_bufferSize (handle, side, trans,
+                                        m, n, k, A, lda, tau,
+                                        C, ldc, lwork);
+  }
+
+  static cusolverStatus_t // apply Q to C: forwards to cusolverDnDormqr
+  apply_Q_factor (cusolverDnHandle_t handle,
+                  cublasSideMode_t side,
+                  cublasOperation_t trans,
+                  int m,
+                  int n,
+                  int k,
+                  const impl_scalar_type* A,
+                  int lda,
+                  const impl_scalar_type* tau,
+                  impl_scalar_type* C,
+                  int ldc,
+                  impl_scalar_type* work,
+                  int lwork,
+                  int* devInfo)
+  {
+    return cusolverDnDormqr (handle, side, trans, m, n, k,
+                             A, lda, tau, C, ldc,
+                             work, lwork, devInfo);
+  }
+
+  static cusolverStatus_t // workspace query: forwards to cusolverDnDorgqr_bufferSize
+  compute_explicit_Q_lwork (cusolverDnHandle_t handle,
+                            int m,
+                            int n,
+                            int k,
+                            const impl_scalar_type *A,
+                            int lda,
+                            const impl_scalar_type *tau,
+                            int *lwork)
+  {
+    return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda,
+                                       tau, lwork);
+  }
+
+  static cusolverStatus_t // form explicit Q in A: forwards to cusolverDnDorgqr
+  compute_explicit_Q (cusolverDnHandle_t handle,
+                      int m,
+                      int n,
+                      int k,
+                      impl_scalar_type *A,
+                      int lda,
+                      const impl_scalar_type *tau,
+                      impl_scalar_type *work,
+                      int lwork,
+                      int *devInfo)
+  {
+    return cusolverDnDorgqr(handle, m, n, k, A, lda, tau,
+                            work, lwork, devInfo);
+  }
+};
+
+template<>
+class RawCuSolver<float> { // cuSOLVER dense QR wrappers for float entries
+public:
+  using impl_scalar_type = float; // element type handed to cuSOLVER
+
+  static cusolverStatus_t // workspace query: forwards to cusolverDnSgeqrf_bufferSize
+  compute_QR_lwork (cusolverDnHandle_t handle,
+                    int m,
+                    int n,
+                    impl_scalar_type* A,
+                    int lda,
+                    int *lwork)
+  {
+    return cusolverDnSgeqrf_bufferSize (handle, m, n, A, lda, lwork);
+  }
+
+  static cusolverStatus_t // QR factorization: forwards to cusolverDnSgeqrf
+  compute_QR (cusolverDnHandle_t handle,
+              int m,
+              int n,
+              impl_scalar_type* A,
+              int lda,
+              impl_scalar_type* tau,
+              impl_scalar_type* work,
+              int lwork,
+              int* info)
+  {
+    return cusolverDnSgeqrf (handle, m, n, A, lda, tau,
+                             work, lwork, info);
+  }
+
+  static cusolverStatus_t // workspace query: forwards to cusolverDnSormqr_bufferSize
+  apply_Q_factor_lwork (cusolverDnHandle_t handle,
+                        cublasSideMode_t side,
+                        cublasOperation_t trans,
+                        int m,
+                        int n,
+                        int k,
+                        const impl_scalar_type* A,
+                        int lda,
+                        const impl_scalar_type* tau,
+                        const impl_scalar_type* C,
+                        int ldc,
+                        int *lwork)
+  {
+    return cusolverDnSormqr_bufferSize (handle, side, trans,
+                                        m, n, k, A, lda, tau,
+                                        C, ldc, lwork);
+  }
+
+  static cusolverStatus_t // apply Q to C: forwards to cusolverDnSormqr
+  apply_Q_factor (cusolverDnHandle_t handle,
+                  cublasSideMode_t side,
+                  cublasOperation_t trans,
+                  int m,
+                  int n,
+                  int k,
+                  const impl_scalar_type* A,
+                  int lda,
+                  const impl_scalar_type* tau,
+                  impl_scalar_type* C,
+                  int ldc,
+                  impl_scalar_type* work,
+                  int lwork,
+                  int* devInfo)
+  {
+    return cusolverDnSormqr (handle, side, trans, m, n, k,
+                             A, lda, tau, C, ldc,
+                             work, lwork, devInfo);
+  }
+
+  static cusolverStatus_t // workspace query: forwards to cusolverDnSorgqr_bufferSize
+  compute_explicit_Q_lwork (cusolverDnHandle_t handle,
+                            int m,
+                            int n,
+                            int k,
+                            const impl_scalar_type *A,
+                            int lda,
+                            const impl_scalar_type *tau,
+                            int *lwork)
+  {
+    return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda,
+                                       tau, lwork);
+  }
+
+  static cusolverStatus_t // form explicit Q in A: forwards to cusolverDnSorgqr
+  compute_explicit_Q (cusolverDnHandle_t handle,
+                      int m,
+                      int n,
+                      int k,
+                      impl_scalar_type *A,
+                      int lda,
+                      const impl_scalar_type *tau,
+                      impl_scalar_type *work,
+                      int lwork,
+                      int *devInfo)
+  {
+    return cusolverDnSorgqr(handle, m, n, k, A, lda, tau,
+                            work, lwork, devInfo);
+  }
+};
+
+#if defined(HAVE_TPETRATSQR_COMPLEX)
+template<>
+class RawCuSolver<CudaValue<std::complex<double>>::type> { // cuSOLVER dense QR wrappers for the CUDA-side complex<double> type
+public:
+  using impl_scalar_type = CudaValue<std::complex<double>>::type; // element type handed to cuSOLVER
+
+  static cusolverStatus_t // workspace query: forwards to cusolverDnZgeqrf_bufferSize
+  compute_QR_lwork (cusolverDnHandle_t handle,
+                    int m,
+                    int n,
+                    impl_scalar_type* A,
+                    int lda,
+                    int *lwork)
+  {
+    return cusolverDnZgeqrf_bufferSize (handle, m, n, A, lda, lwork);
+  }
+
+  static cusolverStatus_t // QR factorization: forwards to cusolverDnZgeqrf
+  compute_QR (cusolverDnHandle_t handle,
+              int m,
+              int n,
+              impl_scalar_type* A,
+              int lda,
+              impl_scalar_type* tau,
+              impl_scalar_type* work,
+              int lwork,
+              int* info)
+  {
+    return cusolverDnZgeqrf (handle, m, n, A, lda, tau,
+                             work, lwork, info);
+  }
+
+  static cusolverStatus_t // workspace query: forwards to cusolverDnZunmqr_bufferSize (complex analogue of ormqr)
+  apply_Q_factor_lwork (cusolverDnHandle_t handle,
+                        cublasSideMode_t side,
+                        cublasOperation_t trans,
+                        int m,
+                        int n,
+                        int k,
+                        const impl_scalar_type* A,
+                        int lda,
+                        const impl_scalar_type* tau,
+                        const impl_scalar_type* C,
+                        int ldc,
+                        int *lwork)
+  {
+    return cusolverDnZunmqr_bufferSize (handle, side, trans,
+                                        m, n, k, A, lda, tau,
+                                        C, ldc, lwork);
+  }
+
+  static cusolverStatus_t // apply Q to C: forwards to cusolverDnZunmqr
+  apply_Q_factor (cusolverDnHandle_t handle,
+                  cublasSideMode_t side,
+                  cublasOperation_t trans,
+                  int m,
+                  int n,
+                  int k,
+                  const impl_scalar_type* A,
+                  int lda,
+                  const impl_scalar_type* tau,
+                  impl_scalar_type* C,
+                  int ldc,
+                  impl_scalar_type* work,
+                  int lwork,
+                  int* devInfo)
+  {
+    return cusolverDnZunmqr (handle, side, trans, m, n, k,
+                             A, lda, tau, C, ldc,
+                             work, lwork, devInfo);
+  }
+
+  static cusolverStatus_t // workspace query: forwards to cusolverDnZungqr_bufferSize (complex analogue of orgqr)
+  compute_explicit_Q_lwork (cusolverDnHandle_t handle,
+                            int m,
+                            int n,
+                            int k,
+                            const impl_scalar_type *A,
+                            int lda,
+                            const impl_scalar_type *tau,
+                            int *lwork)
+  {
+    return cusolverDnZungqr_bufferSize(handle, m, n, k, A, lda,
+                                       tau, lwork);
+  }
+
+  static cusolverStatus_t // form explicit Q in A: forwards to cusolverDnZungqr
+  compute_explicit_Q (cusolverDnHandle_t handle,
+                      int m,
+                      int n,
+                      int k,
+                      impl_scalar_type *A,
+                      int lda,
+                      const impl_scalar_type *tau,
+                      impl_scalar_type *work,
+                      int lwork,
+                      int *devInfo)
+  {
+    return cusolverDnZungqr(handle, m, n, k, A, lda, tau,
+                            work, lwork, devInfo);
+  }
+};
+
+template<>
+class RawCuSolver<CudaValue<std::complex<float>>::type> { // cuSOLVER dense QR wrappers for the CUDA-side complex<float> type
+public:
+  using impl_scalar_type = CudaValue<std::complex<float>>::type; // element type handed to cuSOLVER
+
+  static cusolverStatus_t // workspace query: forwards to cusolverDnCgeqrf_bufferSize
+  compute_QR_lwork (cusolverDnHandle_t handle,
+                    int m,
+                    int n,
+                    impl_scalar_type* A,
+                    int lda,
+                    int *lwork)
+  {
+    return cusolverDnCgeqrf_bufferSize (handle, m, n, A, lda, lwork);
+  }
+
+  static cusolverStatus_t // QR factorization: forwards to cusolverDnCgeqrf
+  compute_QR (cusolverDnHandle_t handle,
+              int m,
+              int n,
+              impl_scalar_type* A,
+              int lda,
+              impl_scalar_type* tau,
+              impl_scalar_type* work,
+              int lwork,
+              int* info)
+  {
+    return cusolverDnCgeqrf (handle, m, n, A, lda, tau,
+                             work, lwork, info);
+  }
+
+  static cusolverStatus_t // workspace query: forwards to cusolverDnCunmqr_bufferSize (complex analogue of ormqr)
+  apply_Q_factor_lwork (cusolverDnHandle_t handle,
+                        cublasSideMode_t side,
+                        cublasOperation_t trans,
+                        int m,
+                        int n,
+                        int k,
+                        const impl_scalar_type* A,
+                        int lda,
+                        const impl_scalar_type* tau,
+                        const impl_scalar_type* C,
+                        int ldc,
+                        int *lwork)
+  {
+    return cusolverDnCunmqr_bufferSize (handle, side, trans,
+                                        m, n, k, A, lda, tau,
+                                        C, ldc, lwork);
+  }
+
+  static cusolverStatus_t // apply Q to C: forwards to cusolverDnCunmqr
+  apply_Q_factor (cusolverDnHandle_t handle,
+                  cublasSideMode_t side,
+                  cublasOperation_t trans,
+                  int m,
+                  int n,
+                  int k,
+                  const impl_scalar_type* A,
+                  int lda,
+                  const impl_scalar_type* tau,
+                  impl_scalar_type* C,
+                  int ldc,
+                  impl_scalar_type* work,
+                  int lwork,
+                  int* devInfo)
+  {
+    return cusolverDnCunmqr (handle, side, trans, m, n, k,
+                             A, lda, tau, C, ldc,
+                             work, lwork, devInfo);
+  }
+
+  static cusolverStatus_t // workspace query: forwards to cusolverDnCungqr_bufferSize (complex analogue of orgqr)
+  compute_explicit_Q_lwork (cusolverDnHandle_t handle,
+                            int m,
+                            int n,
+                            int k,
+                            const impl_scalar_type *A,
+                            int lda,
+                            const impl_scalar_type *tau,
+                            int *lwork)
+  {
+    return cusolverDnCungqr_bufferSize(handle, m, n, k, A, lda,
+                                       tau, lwork);
+  }
+
+  static cusolverStatus_t // form explicit Q in A: forwards to cusolverDnCungqr
+  compute_explicit_Q (cusolverDnHandle_t handle,
+                      int m,
+                      int n,
+                      int k,
+                      impl_scalar_type *A,
+                      int lda,
+                      const impl_scalar_type *tau,
+                      impl_scalar_type *work,
+                      int lwork,
+                      int *devInfo)
+  {
+    return cusolverDnCungqr(handle, m, n, k, A, lda, tau,
+                            work, lwork, devInfo);
+  }
+};
+#endif // defined(HAVE_TPETRATSQR_COMPLEX)
+
+template<class Scalar>
+CuSolver<Scalar>::CuSolver (CuSolverHandle handle, int* const info) : // info is only stored here, never dereferenced
+  handle_ (handle), info_ (info) // info_ is later passed as the info/devInfo argument of the cuSOLVER calls. NOTE(review): presumably must point to device memory -- confirm
+{}
+
+template<class Scalar>
+int // returns the lwork value reported by the geqrf buffer-size query
+CuSolver<Scalar>::
+compute_QR_lwork (const int nrows,
+                  const int ncols,
+                  Scalar A[],
+                  const int lda) const
+{
+  auto rawHandle = // the handle wrapper exposes an untyped pointer; cast it back to cuSOLVER's type
+    reinterpret_cast<cusolverDnHandle_t> (handle_.getHandle ());
+  int lwork = 0;
+
+  using IST = typename CudaValue<Scalar>::type; // scalar type used on the cuSOLVER side
+  IST* A_raw = reinterpret_cast<IST*> (A);
+
+  using impl_type = RawCuSolver<IST>; // picks the S/D/C/Z wrapper for this scalar type
+  const auto status =
+    impl_type::compute_QR_lwork (rawHandle, nrows, ncols,
+                                 A_raw, lda, &lwork);
+  TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS ); // any other status fails the assertion
+  return lwork;
+}
+
+template<class Scalar>
+void // runs geqrf on A; only the returned cuSOLVER status is checked here
+CuSolver<Scalar>::
+compute_QR (const int nrows,
+            const int ncols,
+            Scalar A[],
+            const int lda,
+            Scalar tau[],
+            Scalar work[],
+            const int lwork) const
+{
+  auto rawHandle = // the handle wrapper exposes an untyped pointer; cast it back to cuSOLVER's type
+    reinterpret_cast<cusolverDnHandle_t> (handle_.getHandle ());
+
+  using IST = typename CudaValue<Scalar>::type; // scalar type used on the cuSOLVER side
+  IST* A_raw = reinterpret_cast<IST*> (A);
+  IST* tau_raw = reinterpret_cast<IST*> (tau);
+  IST* work_raw = reinterpret_cast<IST*> (work);
+
+  using impl_type = RawCuSolver<IST>;
+  const auto status =
+    impl_type::compute_QR (rawHandle, nrows, ncols, A_raw, lda,
+                           tau_raw, work_raw, lwork, info_); // the value written through info_ is not inspected in this function
+  TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS );
+}
+
+template<class Scalar>
+int // returns the lwork value reported by the ormqr/unmqr buffer-size query
+CuSolver<Scalar>::
+apply_Q_factor_lwork (const char side,
+                      const char trans,
+                      const int nrows,
+                      const int ncols_C,
+                      const int ncols_Q,
+                      const Scalar Q[],
+                      const int ldq,
+                      const Scalar tau[],
+                      Scalar C[],
+                      const int ldc) const
+{
+  auto rawHandle = // the handle wrapper exposes an untyped pointer; cast it back to cuSOLVER's type
+    reinterpret_cast<cusolverDnHandle_t> (handle_.getHandle ());
+  const cublasSideMode_t cuSide = cuBlasSide (side); // char flags -> cuBLAS enums
+  const cublasOperation_t cuTrans = cuBlasTrans (trans);
+  int lwork = 0;
+
+  using IST = typename CudaValue<Scalar>::type; // scalar type used on the cuSOLVER side
+  const IST* Q_raw = reinterpret_cast<const IST*> (Q);
+  const IST* tau_raw = reinterpret_cast<const IST*> (tau);
+  const IST* C_raw = reinterpret_cast<const IST*> (C); // the wrapper's C parameter is const, so const is added here
+
+  using impl_type = RawCuSolver<IST>;
+  const auto status =
+    impl_type::apply_Q_factor_lwork (rawHandle, cuSide, cuTrans,
+                                     nrows, ncols_C, ncols_Q,
+                                     Q_raw, ldq, tau_raw,
+                                     C_raw, ldc, &lwork);
+  TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS ); // any other status fails the assertion
+  return lwork;
+}
+
+template<class Scalar>
+void // runs ormqr/unmqr on C; only the returned cuSOLVER status is checked here
+CuSolver<Scalar>::
+apply_Q_factor (const char side,
+                const char trans,
+                const int nrows,
+                const int ncols_C,
+                const int ncols_Q,
+                const Scalar Q[],
+                const int ldq,
+                const Scalar tau[],
+                Scalar C[],
+                const int ldc,
+                Scalar work[],
+                const int lwork) const
+{
+  auto rawHandle = // the handle wrapper exposes an untyped pointer; cast it back to cuSOLVER's type
+    reinterpret_cast<cusolverDnHandle_t> (handle_.getHandle ());
+  const cublasSideMode_t cuSide = cuBlasSide (side); // char flags -> cuBLAS enums
+  const cublasOperation_t cuTrans = cuBlasTrans (trans);
+
+  using IST = typename CudaValue<Scalar>::type; // scalar type used on the cuSOLVER side
+  const IST* Q_raw = reinterpret_cast<const IST*> (Q);
+  const IST* tau_raw = reinterpret_cast<const IST*> (tau);
+  IST* C_raw = reinterpret_cast<IST*> (C);
+  IST* work_raw = reinterpret_cast<IST*> (work);
+
+  using impl_type = RawCuSolver<IST>;
+  const auto status =
+    impl_type::apply_Q_factor (rawHandle, cuSide, cuTrans,
+                               nrows, ncols_C, ncols_Q,
+                               Q_raw, ldq, tau_raw, C_raw, ldc,
+                               work_raw, lwork, info_); // the value written through info_ is not inspected in this function
+  TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS );
+}
+
+template<class Scalar>
+int // returns the lwork value reported by the orgqr/ungqr buffer-size query
+CuSolver<Scalar>::
+compute_explicit_Q_lwork(const int m, const int n, const int k,
+                         Scalar A[], const int lda,
+                         const Scalar tau[]) const
+{
+  auto rawHandle = // the handle wrapper exposes an untyped pointer; cast it back to cuSOLVER's type
+    reinterpret_cast<cusolverDnHandle_t> (handle_.getHandle ());
+  int lwork = 0;
+
+  using IST = typename CudaValue<Scalar>::type; // scalar type used on the cuSOLVER side
+  const IST* A_raw = reinterpret_cast<const IST*> (A); // the wrapper's A parameter is const, so const is added here
+  const IST* tau_raw = reinterpret_cast<const IST*> (tau);
+
+  using impl_type = RawCuSolver<IST>;
+  const auto status =
+    impl_type::compute_explicit_Q_lwork (rawHandle, m, n, k,
+                                         A_raw, lda, tau_raw, &lwork);
+  TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS ); // any other status fails the assertion
+  return lwork;
+}
+
+template<class Scalar>
+void // runs orgqr/ungqr on A; only the returned cuSOLVER status is checked here
+CuSolver<Scalar>::
+compute_explicit_Q(const int m, const int n, const int k,
+                   Scalar A[], const int lda,
+                   const Scalar tau[],
+                   Scalar work[], const int lwork) const
+{
+  auto rawHandle = // the handle wrapper exposes an untyped pointer; cast it back to cuSOLVER's type
+    reinterpret_cast<cusolverDnHandle_t> (handle_.getHandle ());
+  using IST = typename CudaValue<Scalar>::type; // scalar type used on the cuSOLVER side
+  IST* A_raw = reinterpret_cast<IST*> (A);
+  const IST* tau_raw = reinterpret_cast<const IST*> (tau);
+  IST* work_raw = reinterpret_cast<IST*> (work);
+
+  using impl_type = RawCuSolver<IST>;
+  const auto status =
+    impl_type::compute_explicit_Q (rawHandle, m, n, k, A_raw, lda,
+                                   tau_raw, work_raw, lwork, info_); // the value written through info_ is not inspected in this function
+  TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS );
+}
+
+template class CuSolver<double>; // explicit instantiations of every CuSolver member defined above
+template class CuSolver<float>;
+#if defined(HAVE_TPETRATSQR_COMPLEX)
+template class CuSolver<std::complex<double>>; // complex versions exist only when complex support is configured
+template class CuSolver<std::complex<float>>;
+#endif // defined(HAVE_TPETRATSQR_COMPLEX)
+
+} // namespace Impl
+} // namespace TSQR
+
+#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolver.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolver.hpp
new file mode 100644
index 000000000000..7123b8d4479c
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolver.hpp
@@ -0,0 +1,91 @@
+#ifndef TSQR_IMPL_CUSOLVER_HPP
+#define TSQR_IMPL_CUSOLVER_HPP
+
+#include "TpetraTSQR_config.h"
+#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER)
+#include "Tsqr_Impl_CuBlasHandle.hpp"
+#include "Tsqr_Impl_CuSolverHandle.hpp"
+#if defined(HAVE_TPETRATSQR_COMPLEX)
+#  include <complex>
+#endif // HAVE_TPETRATSQR_COMPLEX
+#include "Tsqr_Impl_RawQR.hpp"
+
+namespace TSQR {
+namespace Impl {
+
+template<class Scalar>
+class CuSolver : public RawQR<Scalar> { // RawQR implementation whose arrays are taken as device memory
+public:
+  CuSolver(CuSolverHandle handle, int* const info); // NOTE(review): info presumably initializes info_; definition not in this header
+
+  bool wants_device_memory () const override { return true; } // overrides RawQR's default of false
+
+  int
+  compute_QR_lwork(const int nrows,
+                   const int ncols,
+                   Scalar A_raw[],
+                   const int lda) const override;
+
+  void
+  compute_QR(const int nrows,
+             const int ncols,
+             Scalar A[],
+             const int lda,
+             Scalar tau[],
+             Scalar work[],
+             const int lwork) const override;
+
+  int
+  apply_Q_factor_lwork(const char side,
+                       const char trans,
+                       const int nrows,
+                       const int ncols_C,
+                       const int ncols_Q,
+                       const Scalar Q[],
+                       const int ldq,
+                       const Scalar tau[],
+                       Scalar C[],
+                       const int ldc) const override;
+
+  void
+  apply_Q_factor(const char side,
+                 const char trans,
+                 const int nrows,
+                 const int ncols_C,
+                 const int ncols_Q,
+                 const Scalar Q[],
+                 const int ldq,
+                 const Scalar tau[],
+                 Scalar C[],
+                 const int ldc,
+                 Scalar work[],
+                 const int lwork) const override;
+
+  int
+  compute_explicit_Q_lwork(const int m, const int n, const int k,
+                           Scalar A[], const int lda,
+                           const Scalar tau[]) const override;
+
+  void
+  compute_explicit_Q(const int m, const int n, const int k,
+                     Scalar A[], const int lda,
+                     const Scalar tau[],
+                     Scalar work[], const int lwork) const override;
+
+private:
+  CuSolverHandle handle_; // wrapper whose getHandle() result is cast to cusolverDnHandle_t
+  int* info_; // DEVICE MEMORY; passed as the last argument of RawCuSolver calls
+};
+
+extern template class CuSolver<double>; // instantiated in Tsqr_Impl_CuSolver.cpp
+extern template class CuSolver<float>; // instantiated in Tsqr_Impl_CuSolver.cpp
+#if defined(HAVE_TPETRATSQR_COMPLEX)
+extern template class CuSolver<std::complex<double>>; // only when complex support is enabled
+extern template class CuSolver<std::complex<float>>; // only when complex support is enabled
+#endif // defined(HAVE_TPETRATSQR_COMPLEX)
+
+} // namespace Impl
+} // namespace TSQR
+
+#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER
+#endif // TSQR_IMPL_CUSOLVER_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolverHandle.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolverHandle.cpp
new file mode 100644
index 000000000000..23be0a6cec51
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolverHandle.cpp
@@ -0,0 +1,38 @@
+#include "Tsqr_Impl_CuSolverHandle.hpp"
+
+#ifdef HAVE_TPETRATSQR_CUSOLVER
+#include "Kokkos_Core.hpp"
+#include "Teuchos_Assert.hpp"
+#include <cusolverDn.h>
+
+namespace TSQR {
+namespace Impl {
+
+cusolverDnHandle_t cuSolverRawHandle_ = nullptr; // set by getSingleton; destroyed and reset to nullptr by its Kokkos finalize hook
+
+CuSolverHandle::CuSolverHandle (void* handle)
+  : handle_ {handle} // only stores the pointer; creates and destroys nothing
+{}
+
+CuSolverHandle CuSolverHandle::getSingleton () // returns a wrapper around the one shared raw handle, creating it on first use
+{
+  static int called_before = 0; // set to 1 only after the creation status check below; NOTE(review): no synchronization visible here
+  if (called_before == 0) {
+    auto finalizer = [] () { // destroys the shared handle, if any, and resets the global to nullptr
+      if (cuSolverRawHandle_ != nullptr) {
+        (void) cusolverDnDestroy (cuSolverRawHandle_); // returned status is deliberately discarded
+        cuSolverRawHandle_ = nullptr;
+      }
+    };
+    Kokkos::push_finalize_hook (finalizer); // NOTE(review): registered before creation; if the assert below throws, a later call would register it again -- confirm
+    auto status = cusolverDnCreate (&cuSolverRawHandle_);
+    TEUCHOS_ASSERT( status == CUSOLVER_STATUS_SUCCESS );
+    called_before = 1;
+  }
+  TEUCHOS_ASSERT( cuSolverRawHandle_ != nullptr ); // checked on every call, not just the first
+  return CuSolverHandle (cuSolverRawHandle_);
+}
+
+} // namespace Impl
+} // namespace TSQR
+#endif // HAVE_TPETRATSQR_CUSOLVER
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolverHandle.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolverHandle.hpp
new file mode 100644
index 000000000000..802f81e3c742
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuSolverHandle.hpp
@@ -0,0 +1,33 @@
+#ifndef TSQR_IMPL_CUSOLVERHANDLE_HPP
+#define TSQR_IMPL_CUSOLVERHANDLE_HPP
+
+#include "TpetraTSQR_config.h"
+#ifdef HAVE_TPETRATSQR_CUSOLVER
+
+namespace TSQR {
+namespace Impl {
+
+class CuSolverHandle { // copyable wrapper around a raw cuSOLVER handle pointer
+private:
+  // This is actually a cusolverDnHandle_t, which is a pointer type.
+  void* handle_ {nullptr};
+
+  explicit CuSolverHandle (void* handle); // private and explicit: instances come from getSingleton only
+
+public:
+  static CuSolverHandle getSingleton (); // defined in Tsqr_Impl_CuSolverHandle.cpp
+
+  // This is not really encapsulation, because the "handle" type is
+  // just a pointer.  However, it lets us define cuSolver wrapper
+  // functions without needing to make them friends of CuSolverHandle.
+  void* getHandle () const {
+    return handle_;
+  }
+};
+
+} // namespace Impl
+} // namespace TSQR
+
+#endif // HAVE_TPETRATSQR_CUSOLVER
+
+#endif // TSQR_IMPL_CUSOLVERHANDLE_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuTypes.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuTypes.cpp
new file mode 100644
index 000000000000..edccc391d01a
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuTypes.cpp
@@ -0,0 +1,33 @@
+#include "Tsqr_Impl_CuTypes.hpp"
+#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER)
+
+namespace TSQR {
+namespace Impl {
+
+cublasSideMode_t cuBlasSide (const char side)
+{
+  // 'L' or 'l' selects the left side; every other character
+  // (not just 'R' / 'r') is treated as the right side.
+  const bool leftSide = (side == 'L' || side == 'l');
+  return leftSide
+    ? CUBLAS_SIDE_LEFT
+    : CUBLAS_SIDE_RIGHT;
+}
+
+cublasOperation_t cuBlasTrans (const char trans)
+{
+  // Anything other than C/c or T/t maps to "no transpose."
+  switch (trans) {
+  case 'C': case 'c':
+    return CUBLAS_OP_C;
+  case 'T': case 't':
+    return CUBLAS_OP_T;
+  default:
+    return CUBLAS_OP_N;
+  }
+}
+
+} // namespace Impl
+} // namespace TSQR
+
+#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_CuTypes.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_CuTypes.hpp
new file mode 100644
index 000000000000..6f271895dc08
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_CuTypes.hpp
@@ -0,0 +1,96 @@
+#ifndef TSQR_IMPL_CUTYPES_HPP
+#define TSQR_IMPL_CUTYPES_HPP
+
+#include "TpetraTSQR_config.h"
+#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER)
+#include <cublas_v2.h> // for cublasSideMode_t etc.
+#include <cusolverDn.h>
+#if defined(HAVE_TPETRATSQR_COMPLEX)
+#  include <complex>
+#endif // HAVE_TPETRATSQR_COMPLEX
+
+namespace TSQR {
+namespace Impl {
+
+template<class Scalar>
+struct CudaValue {}; // primary template is intentionally empty; only the explicit specializations below provide members
+
+template<>
+struct CudaValue<double> {
+  // double needs no conversion: the CUDA value type is double itself.
+  using type = double;
+
+  // Identity conversion.
+  static type makeValue (const double x) { return x; }
+
+  // No alignment condition is checked for double arrays;
+  // this always reports success.
+  static bool arrayCorrectlyAligned (const double* const) { return true; }
+};
+
+template<>
+struct CudaValue<float> { // float maps to itself as the CUDA value type
+  using type = float;
+
+  static type makeValue (const float x) { // identity conversion
+    return x;
+  }
+
+  static bool arrayCorrectlyAligned (const float* const /* x */) { // float* (was double*), so float arrays can be passed
+    return true; // no alignment condition is checked for float arrays
+  }
+};
+
+#if defined(HAVE_TPETRATSQR_COMPLEX)
+// FIXME (mfh 10 Dec 2019) CUDA's built-in complex types must be
+// aligned to the whole type, not just to double or float (as with
+// std::complex or (currently) Kokkos::complex).
+template<>
+struct CudaValue<std::complex<double>> {
+  using type = cuDoubleComplex;
+
+  // Build the CUDA complex value from the real and imaginary parts.
+  static type makeValue (const std::complex<double> x) {
+    return make_cuDoubleComplex (x.real (), x.imag ());
+  }
+
+  // True for a null pointer, or for an address that is a multiple of
+  // sizeof(std::complex<double>) -- the whole complex number, not
+  // just one of its two components.
+  static bool
+  arrayCorrectlyAligned (const std::complex<double>* const x)
+  {
+    const size_t address = reinterpret_cast<size_t> (x);
+    return x == nullptr || address % sizeof (std::complex<double>) == 0;
+  }
+};
+
+template<>
+struct CudaValue<std::complex<float>> {
+  using type = cuFloatComplex;
+
+  // Build the CUDA complex value from the real and imaginary parts.
+  static type makeValue (const std::complex<float> x) {
+    return make_cuFloatComplex (x.real (), x.imag ());
+  }
+
+  // True for a null pointer, or for an address that is a multiple of
+  // sizeof(std::complex<float>) -- the whole complex number, not
+  // just one of its two components.
+  static bool
+  arrayCorrectlyAligned (const std::complex<float>* const x)
+  {
+    const size_t address = reinterpret_cast<size_t> (x);
+    return x == nullptr || address % sizeof (std::complex<float>) == 0;
+  }
+};
+#endif // defined(HAVE_TPETRATSQR_COMPLEX)
+
+cublasSideMode_t cuBlasSide (const char side); // 'L'/'l' -> CUBLAS_SIDE_LEFT; any other character -> CUBLAS_SIDE_RIGHT
+cublasOperation_t cuBlasTrans (const char trans); // 'C'/'c' -> CUBLAS_OP_C; 'T'/'t' -> CUBLAS_OP_T; otherwise CUBLAS_OP_N
+
+} // namespace Impl
+} // namespace TSQR
+
+#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER
+#endif // TSQR_IMPL_CUTYPES_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp
index 51d105b6bc68..fed10d62136e 100644
--- a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.cpp
@@ -6,119 +6,215 @@
 namespace TSQR {
 namespace Impl {
 
-#define TSQR_IMPL_LAPACK_IMPL( Scalar ) \
-void Lapack<Scalar>:: \
-LARNV(const int idist, int seed[], const int n, \
-      value_type v[]) const \
-{ \
-  Teuchos::LAPACK<int, value_type> lapack; \
-  lapack.LARNV(idist, seed, n, v); \
-} \
-  \
-void Lapack<Scalar>:: \
-POTRF(const char UPLO, const int n, \
-      value_type A[], const int lda) const \
-{ \
-  Teuchos::LAPACK<int, value_type> lapack; \
-  int info = 0; \
-  lapack.POTRF(UPLO, n, A, lda, &info); \
-  if (info != 0) { \
-    std::ostringstream os; \
-    os << "LAPACK POTRF (Cholesky factorization) " \
-       << "failed with INFO = " << info << "."; \
-    throw std::logic_error (os.str ()); \
-  } \
-} \
-  \
-void Lapack<Scalar>:: \
-GESVD(const char JOBU, const char JOBVT, \
-      const int m, const int n, \
-      value_type A[], const int lda, \
-      magnitude_type S[], value_type U[], const int ldu, \
-      value_type V[], const int ldv, \
-      value_type WORK[], const int lwork, \
-      magnitude_type RWORK[]) const \
-{ \
-  Teuchos::LAPACK<int, value_type> lapack; \
-  int info = 0; \
-  lapack.GESVD(JOBU, JOBVT, m, n, A, lda, S, \
-               U, ldu, V, ldv, WORK, lwork, RWORK, &info); \
-  if (info != 0) { \
-    std::ostringstream os; \
-    os << "LAPACK GESVD (singular value decomposition) " \
-       << "failed with INFO = " << info << "."; \
-    throw std::logic_error (os.str ()); \
-  } \
-} \
-  \
-void Lapack<Scalar>:: \
-LARFG(const int n, value_type& alpha, value_type x[], \
-      const int incx, value_type& tau) const \
-{ \
-  Teuchos::LAPACK<int, value_type> lapack; \
-  lapack.LARFG(n, &alpha, x, incx, &tau); \
-} \
-  \
-void Lapack<Scalar>:: \
-compute_QR(const int m, const int n, value_type A[], const int lda, \
-           value_type TAU[], value_type WORK[], const int lwork) const \
-{ \
-  Teuchos::LAPACK<int, value_type> lapack; \
-  int info = 0; \
-  lapack.GEQRF(m, n, A, lda, TAU, WORK, lwork, &info); \
-  if (info != 0) { \
-    std::ostringstream os; \
-    os << "LAPACK GEQRF (QR factorization) failed with INFO = " \
-       << info << "."; \
-    throw std::logic_error (os.str()); \
-  } \
-} \
-  \
-void Lapack<Scalar>:: \
-apply_Q_factor(const char SIDE, const char TRANS, \
-               const int m, const int n, const int k, \
-               const value_type A[], const int lda, \
-               const value_type TAU[], \
-               value_type C[], const int ldc, \
-               value_type WORK[], const int lwork) const \
-{ \
-  Teuchos::LAPACK<int, value_type> lapack; \
-  int info = 0; \
-  value_type* A_nc = const_cast<value_type*>(A); \
-  lapack.UNMQR(SIDE, TRANS, m, n, k, A_nc, lda, TAU, C, ldc, WORK, \
-               lwork, &info); \
-  if (info != 0) { \
-    std::ostringstream os; \
-    os << "LAPACK UNMQR (apply Q factor from GEQRF) failed with " \
-       "INFO = " << info << "."; \
-    throw std::logic_error (os.str()); \
-  } \
-} \
-  \
-void Lapack<Scalar>:: \
-compute_explicit_Q(const int m, const int n, const int k, \
-                   value_type A[], const int lda, \
-                   const value_type TAU[], value_type WORK[], \
-                   const int lwork) const \
-{ \
-  Teuchos::LAPACK<int, value_type> lapack; \
-  int info = 0; \
-  lapack.UNGQR(m, n, k, A, lda, TAU, WORK, lwork, &info); \
-  if (info != 0) { \
-    std::ostringstream os; \
-    os << "LAPACK UNGQR (compute explicit Q factor from GEQRF) " \
-      "failed with INFO = " << info << "."; \
-    throw std::logic_error (os.str()); \
-  } \
+template<class Scalar>
+void Lapack<Scalar>::
+LARNV(const int idist, int seed[], const int n,
+      value_type v[]) const
+{
+  // Forward directly to Teuchos::LAPACK; no INFO code is involved here.
+  Teuchos::LAPACK<int, value_type> ().LARNV(idist, seed, n, v);
 }
 
-TSQR_IMPL_LAPACK_IMPL( float )
-TSQR_IMPL_LAPACK_IMPL( double )
+template<class Scalar>
+void Lapack<Scalar>::
+POTRF(const char UPLO, const int n,
+      value_type A[], const int lda) const
+{
+  int errorCode = 0;
+  Teuchos::LAPACK<int, value_type> teuchosLapack;
+  teuchosLapack.POTRF(UPLO, n, A, lda, &errorCode);
+  if (errorCode == 0) return; // success: nothing to report
+  // Nonzero INFO: report it by throwing.
+  std::ostringstream message;
+  message << "LAPACK POTRF (Cholesky factorization) "
+          << "failed with INFO = " << errorCode << ".";
+  throw std::logic_error (message.str ());
+}
+
+template<class Scalar>
+void Lapack<Scalar>::
+GESVD(const char JOBU, const char JOBVT,
+      const int m, const int n,
+      value_type A[], const int lda,
+      magnitude_type S[], value_type U[], const int ldu,
+      value_type V[], const int ldv,
+      value_type WORK[], const int lwork,
+      magnitude_type RWORK[]) const
+{
+  int errorCode = 0;
+  Teuchos::LAPACK<int, value_type> teuchosLapack;
+  teuchosLapack.GESVD(JOBU, JOBVT, m, n, A, lda, S, U, ldu, V, ldv,
+                      WORK, lwork, RWORK, &errorCode);
+  if (errorCode == 0) return; // success: nothing to report
+  // Nonzero INFO: report it by throwing.
+  std::ostringstream message;
+  message << "LAPACK GESVD (singular value decomposition) "
+          << "failed with INFO = " << errorCode << ".";
+  throw std::logic_error (message.str ());
+}
+
+template<class Scalar>
+void Lapack<Scalar>::
+LARFG(const int n, value_type& alpha, value_type x[],
+      const int incx, value_type& tau) const
+{
+  // This wrapper takes alpha and tau by reference and passes their addresses on.
+  Teuchos::LAPACK<int, value_type> ().LARFG(n, &alpha, x, incx, &tau);
+}
+
+template<class Scalar>
+int Lapack<Scalar>::
+compute_QR_lwork (const int m, const int n,
+                  value_type A[], const int lda) const
+{
+  Teuchos::LAPACK<int, value_type> lapack;
+  Scalar WORK {}; // single-element workspace; read back below as the query result
+  int lwork = -1; // lwork = -1 makes this GEQRF call a workspace size query
+  int info = 0;
+  lapack.GEQRF(m, n, A, lda, nullptr, &WORK, lwork, &info); // TAU is passed as nullptr for the query
+  if (info != 0) {
+    std::ostringstream os;
+    os << "LAPACK GEQRF (QR factorization) LWORK query "
+      "failed with INFO = " << info << ".";
+    throw std::logic_error (os.str ());
+  }
+  using STS = Teuchos::ScalarTraits<Scalar>;
+  using mag_type = typename STS::magnitudeType;
+  lwork = mag_type (STS::real (WORK)); // real part of WORK, converted to int on assignment
+  if (lwork < mag_type {}) { // reject a negative size even though INFO was 0
+    std::ostringstream os;
+    os << "LAPACK GEQRF (QR factorization) LWORK query "
+      "returned INFO=0, but WORK=" << lwork << " < 0.";
+    throw std::logic_error (os.str ());
+  }
+  return lwork;
+}
+
+template<class Scalar>
+void Lapack<Scalar>::
+compute_QR(const int m, const int n, value_type A[], const int lda,
+           value_type TAU[], value_type WORK[], const int lwork) const
+{
+  int errorCode = 0;
+  Teuchos::LAPACK<int, value_type> teuchosLapack;
+  teuchosLapack.GEQRF(m, n, A, lda, TAU, WORK, lwork, &errorCode);
+  if (errorCode == 0) return; // success: nothing to report
+  // Nonzero INFO: report it by throwing.
+  std::ostringstream message;
+  message << "LAPACK GEQRF (QR factorization) failed with INFO = "
+          << errorCode << ".";
+  throw std::logic_error (message.str());
+}
+
+template<class Scalar>
+int Lapack<Scalar>::
+apply_Q_factor_lwork(const char SIDE, const char TRANS,
+                     const int m, const int n, const int k,
+                     const value_type A[], const int lda,
+                     const value_type TAU[],
+                     value_type C[], const int ldc) const
+{
+  Teuchos::LAPACK<int, value_type> lapack;
+  value_type WORK {}; // single-element workspace; read back below as the query result
+  int lwork = -1; // lwork = -1 makes this UNMQR call a workspace size query
+  int info = 0;
+  value_type* A_nc = const_cast<value_type*>(A); // the UNMQR call below is given a non-const A pointer
+  lapack.UNMQR(SIDE, TRANS, m, n, k, A_nc, lda, TAU, C, ldc, &WORK,
+               lwork, &info);
+  if (info != 0) {
+    std::ostringstream os;
+    os << "LAPACK UNMQR (apply Q factor from GEQRF) LWORK query "
+      "failed with INFO = " << info << ".";
+    throw std::logic_error (os.str());
+  }
+  using STS = Teuchos::ScalarTraits<Scalar>;
+  using mag_type = typename STS::magnitudeType;
+  lwork = mag_type (STS::real (WORK)); // real part of WORK, converted to int on assignment
+  if (lwork < mag_type {}) { // reject a negative size even though INFO was 0
+    std::ostringstream os;
+    os << "LAPACK UNMQR (apply Q factor from GEQRF) LWORK query "
+      "returned INFO=0, but WORK=" << lwork << " < 0.";
+    throw std::logic_error (os.str ());
+  }
+  return lwork;
+}
+
+template<class Scalar>
+void Lapack<Scalar>::
+apply_Q_factor(const char SIDE, const char TRANS,
+               const int m, const int n, const int k,
+               const value_type A[], const int lda,
+               const value_type TAU[],
+               value_type C[], const int ldc,
+               value_type WORK[], const int lwork) const
+{
+  int errorCode = 0;
+  Teuchos::LAPACK<int, value_type> teuchosLapack;
+  // UNMQR is handed A through a non-const pointer, hence the const_cast.
+  teuchosLapack.UNMQR(SIDE, TRANS, m, n, k, const_cast<value_type*>(A),
+                      lda, TAU, C, ldc, WORK, lwork, &errorCode);
+  if (errorCode == 0) return; // success: nothing to report
+  // Nonzero INFO: report it by throwing.
+  std::ostringstream message;
+  message << "LAPACK UNMQR (apply Q factor from GEQRF) failed with "
+    "INFO = " << errorCode << ".";
+  throw std::logic_error (message.str());
+}
+
+template<class Scalar>
+int Lapack<Scalar>::
+compute_explicit_Q_lwork (const int m, const int n, const int k,
+                          value_type A[], const int lda,
+                          const value_type TAU[]) const
+{
+  Teuchos::LAPACK<int, value_type> lapack;
+  Scalar WORK {}; // single-element workspace; read back below as the query result
+  int lwork = -1; // lwork = -1 makes this UNGQR call a workspace size query
+  int info = 0;
+  lapack.UNGQR(m, n, k, A, lda, TAU, &WORK, lwork, &info);
+  if (info != 0) {
+    std::ostringstream os;
+    os << "LAPACK UNGQR (compute explicit Q factor from GEQRF) "
+      "LWORK query failed with INFO = " << info << ".";
+    throw std::logic_error (os.str());
+  }
+  using STS = Teuchos::ScalarTraits<Scalar>;
+  using mag_type = typename STS::magnitudeType;
+  lwork = mag_type (STS::real (WORK)); // real part of WORK, converted to int on assignment
+  if (lwork < mag_type {}) { // reject a negative size even though INFO was 0
+    std::ostringstream os;
+    os << "LAPACK UNGQR (compute explicit Q factor from GEQRF) "
+      "LWORK query returned INFO=0, but WORK=" << lwork << " < 0.";
+    throw std::logic_error (os.str ());
+  }
+  return lwork;
+}
+
+template<class Scalar>
+void Lapack<Scalar>::
+compute_explicit_Q(const int m, const int n, const int k,
+                   value_type A[], const int lda,
+                   const value_type TAU[], value_type WORK[],
+                   const int lwork) const
+{
+  int errorCode = 0;
+  Teuchos::LAPACK<int, value_type> teuchosLapack;
+  teuchosLapack.UNGQR(m, n, k, A, lda, TAU, WORK, lwork, &errorCode);
+  if (errorCode == 0) return; // success: nothing to report
+  // Nonzero INFO: report it by throwing.
+  std::ostringstream message;
+  message << "LAPACK UNGQR (compute explicit Q factor from GEQRF) "
+    "failed with INFO = " << errorCode << ".";
+  throw std::logic_error (message.str());
+}
+
+template class Lapack<float>; // explicit instantiation definition
+template class Lapack<double>; // explicit instantiation definition
 
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-TSQR_IMPL_LAPACK_IMPL( std::complex<float> )
-TSQR_IMPL_LAPACK_IMPL( std::complex<double> )
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+#ifdef HAVE_TPETRATSQR_COMPLEX
+template class Lapack<std::complex<float>>; // only when complex support is enabled
+template class Lapack<std::complex<double>>; // only when complex support is enabled
+#endif // HAVE_TPETRATSQR_COMPLEX
 
 } // namespace Impl
 } // namespace TSQR
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp
index 392f2aa4f6c4..8dc20b55b4d5 100644
--- a/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_Lapack.hpp
@@ -8,73 +8,92 @@
 namespace TSQR {
 namespace Impl {
 
+/// \brief Implementation of RawQR that uses the system's LAPACK
+///   library via Teuchos::LAPACK.
+///
+/// This class provides functions not in RawQR for the sake of
+/// CombineNative.  CombineNative needs LARFG, but it's not properly
+/// part of RawQR.  It doesn't make sense to launch a device kernel
+/// from host for every column of the matrix, especially not when
+/// cuSOLVER already has all the needed QR factorization and apply Q
+/// factor functions.
 template<class Scalar>
-class Lapack {};
-
-// CombineNative needs LARFG, but it's not properly part of RawQR.
-// RawQR needs to be able to wrap lots of different functions,
-// including whatever cuSOLVER provides.  It doesn't make sense to
-// launch a device kernel from host for ever column of the matrix,
-// especially not when cuSOLVER already has all the needed QR
-// factorization and apply Q factor functions.
-
-#define TSQR_IMPL_LAPACK_DECL( Scalar ) \
-template<> \
-class Lapack<Scalar> : public RawQR<Scalar> { \
-public: \
-  using value_type = Scalar; \
-  using magnitude_type = decltype(std::abs(Scalar{})); \
-  \
-  ~Lapack() = default; \
-  \
-  void \
-  compute_QR(const int m, const int n, value_type A[], \
-             const int lda, value_type TAU[], value_type WORK[], \
-             const int lwork) const override; \
-  \
-  void \
-  apply_Q_factor(const char SIDE, const char TRANS, \
-                 const int m, const int n, const int k, \
-                 const value_type A[], const int lda, \
-                 const value_type TAU[], \
-                 value_type C[], const int ldc, \
-                 value_type WORK[], const int lwork) const override; \
-  \
-  void \
-  compute_explicit_Q(const int m, const int n, const int k, \
-                     value_type A[], const int lda, \
-                     const value_type TAU[], value_type WORK[], \
-                     const int lwork) const override; \
-  \
-  void \
-  GESVD(const char JOBU, const char JOBVT, \
-        const int m, const int n, \
-        value_type A[], const int lda, \
-        magnitude_type S[], value_type U[], const int ldu, \
-        value_type V[], const int ldv, \
-        value_type WORK[], const int lwork, \
-        magnitude_type RWORK[]) const; \
-  \
-  void \
-  LARFG(const int n, value_type& alpha, value_type x[], \
-        const int incx, value_type& tau) const; \
-  \
-  void \
-  POTRF(const char UPLO, const int n, \
-        value_type A[], const int lda) const; \
-  \
-  void \
-  LARNV(const int idist, int seed[], const int n, \
-        value_type v[]) const; \
+class Lapack : public RawQR<Scalar> {
+public:
+  using value_type = Scalar;
+  using magnitude_type = decltype(std::abs(Scalar{})); // type that std::abs returns for a Scalar
+
+  // NOTE (mfh 22 Dec 2019) I would normally write "= default;" here,
+  // but Intel 17 appears to have a bug that requires an explicit
+  // nondefault definition.  See discussion here:
+  //
+  // https://github.com/trilinos/Trilinos/pull/6488#issuecomment-568351758
+  ~Lapack() override {}
+
+  int
+  compute_QR_lwork(const int m, const int n,
+                   value_type A[], const int lda) const override; // GEQRF workspace size query
+
+  void
+  compute_QR(const int m, const int n, value_type A[],
+             const int lda, value_type TAU[], value_type WORK[],
+             const int lwork) const override; // GEQRF; throws std::logic_error on nonzero INFO
+
+  int
+  apply_Q_factor_lwork(const char SIDE, const char TRANS,
+                       const int m, const int n, const int k,
+                       const value_type A[], const int lda,
+                       const value_type TAU[],
+                       value_type C[], const int ldc) const override; // UNMQR workspace size query
+
+  void
+  apply_Q_factor(const char SIDE, const char TRANS,
+                 const int m, const int n, const int k,
+                 const value_type A[], const int lda,
+                 const value_type TAU[],
+                 value_type C[], const int ldc,
+                 value_type WORK[], const int lwork) const override; // UNMQR; throws std::logic_error on nonzero INFO
+
+  int
+  compute_explicit_Q_lwork(const int m, const int n, const int k,
+                           value_type A[], const int lda,
+                           const value_type TAU[]) const override; // UNGQR workspace size query
+
+  void
+  compute_explicit_Q(const int m, const int n, const int k,
+                     value_type A[], const int lda,
+                     const value_type TAU[], value_type WORK[],
+                     const int lwork) const override; // UNGQR; throws std::logic_error on nonzero INFO
+
+  void
+  GESVD(const char JOBU, const char JOBVT,
+        const int m, const int n,
+        value_type A[], const int lda,
+        magnitude_type S[], value_type U[], const int ldu,
+        value_type V[], const int ldv,
+        value_type WORK[], const int lwork,
+        magnitude_type RWORK[]) const; // not a RawQR override; throws std::logic_error on nonzero INFO
+
+  void
+  LARFG(const int n, value_type& alpha, value_type x[],
+        const int incx, value_type& tau) const; // not a RawQR override; no INFO check
+
+  void
+  POTRF(const char UPLO, const int n,
+        value_type A[], const int lda) const; // not a RawQR override; throws std::logic_error on nonzero INFO
+
+  void
+  LARNV(const int idist, int seed[], const int n,
+        value_type v[]) const; // not a RawQR override; no INFO check
 };
 
-TSQR_IMPL_LAPACK_DECL( float )
-TSQR_IMPL_LAPACK_DECL( double )
+extern template class Lapack<float>; // instantiated in Tsqr_Impl_Lapack.cpp
+extern template class Lapack<double>; // instantiated in Tsqr_Impl_Lapack.cpp
 
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-TSQR_IMPL_LAPACK_DECL( std::complex<float> )
-TSQR_IMPL_LAPACK_DECL( std::complex<double> )
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+#ifdef HAVE_TPETRATSQR_COMPLEX
+extern template class Lapack<std::complex<float>>; // only when complex support is enabled
+extern template class Lapack<std::complex<double>>; // only when complex support is enabled
+#endif // HAVE_TPETRATSQR_COMPLEX
 
 } // namespace Impl
 } // namespace TSQR
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp
index 307aa103e9a9..f078bb72dec9 100644
--- a/packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_RawQR.hpp
@@ -13,12 +13,6 @@ namespace Impl {
 /// CUDA stream instance (cudaStream_t) and a cuSOLVER handle
 /// (cusolverDnHandle_t).
 ///
-/// WORK size query ("LWORK query") happens as in LAPACK, by passing
-/// in lwork = -1.  A cuSOLVER Implementation would just check if
-/// lwork is -1, and call cusolverDn?geqrf_bufferSize in that case
-/// (replace the question mark with S, D, C, or Z as appropriate for
-/// the Scalar type).
-///
 /// Methods are virtual because they are meant to be called from host.
 /// (For the CUDA case, we plan to make cuSOLVER calls from host; we
 /// don't need to call QR from device.)
@@ -29,6 +23,18 @@ class RawQR {
 
   virtual ~RawQR() = default;
 
+  /// \brief Whether the subclass takes arrays and pointers as
+  ///   "device" (GPU) memory.
+  ///
+  /// Unlike with NodeTsqr, this means <i>all</i> arrays and pointers,
+  /// not just "large" ones.  The default implementation returns false.
+  virtual bool wants_device_memory() const { return false; }
+
+  //! Get recommended work array size (the lwork to pass) for compute_QR; subclasses must implement this.
+  virtual int
+  compute_QR_lwork(const int m, const int n,
+                   value_type A[], const int lda) const = 0;
+
   //! Compute QR factorization of a general m by n matrix A.
   virtual void
   compute_QR(const int m, const int n,
@@ -36,6 +42,14 @@ class RawQR {
              value_type TAU[],
              value_type WORK[], const int lwork) const = 0;
 
+  //! Get recommended work array size (the lwork to pass) for apply_Q_factor; subclasses must implement this.
+  virtual int
+  apply_Q_factor_lwork(const char SIDE, const char TRANS,
+                       const int m, const int n, const int k,
+                       const value_type A[], const int lda,
+                       const value_type TAU[],
+                       value_type C[], const int ldc) const = 0;
+
   /// \brief Apply Householder reflectors.
   ///
   /// Overwrite the general complex m by n matrix C with the product
@@ -52,6 +66,12 @@ class RawQR {
                  value_type C[], const int ldc,
                  value_type WORK[], const int lwork) const = 0;
 
+  //! Get recommended work array size (the lwork to pass) for compute_explicit_Q; subclasses must implement this.
+  virtual int
+  compute_explicit_Q_lwork(const int m, const int n, const int k,
+                           value_type A[], const int lda,
+                           const value_type TAU[]) const = 0;
+
   /// \brief Compute explicit QR factor from QR factorization (GEQRF).
   ///
   /// Generate the m by n matrix Q with orthonormal (or unitary, if
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp
index bc19ef78be03..25219f6d28b7 100644
--- a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.cpp
@@ -90,10 +90,10 @@ TRSM(const Teuchos::ESide side, const Teuchos::EUplo uplo, \
 TSQR_IMPL_SYSTEMBLAS_IMPL( float )
 TSQR_IMPL_SYSTEMBLAS_IMPL( double )
 
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
+#ifdef HAVE_TPETRATSQR_COMPLEX
 TSQR_IMPL_SYSTEMBLAS_IMPL( std::complex<float> )
 TSQR_IMPL_SYSTEMBLAS_IMPL( std::complex<double> )
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+#endif // HAVE_TPETRATSQR_COMPLEX
 
 } // namespace Impl
 } // namespace TSQR
diff --git a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp
index 1e49ddc266c8..7b1599e41df1 100644
--- a/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_Impl_SystemBlas.hpp
@@ -58,10 +58,10 @@ public: \
 TSQR_IMPL_SYSTEMBLAS_DECL( float )
 TSQR_IMPL_SYSTEMBLAS_DECL( double )
 
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
+#ifdef HAVE_TPETRATSQR_COMPLEX
 TSQR_IMPL_SYSTEMBLAS_DECL( std::complex<float> )
 TSQR_IMPL_SYSTEMBLAS_DECL( std::complex<double> )
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+#endif // HAVE_TPETRATSQR_COMPLEX
 
 } // namespace Impl
 } // namespace TSQR
diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp
deleted file mode 100644
index 71b823b19558..000000000000
--- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqr.hpp
+++ /dev/null
@@ -1,1728 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-/// \file Tsqr_KokkosNodeTsqr.hpp
-/// \brief Parallel intranode TSQR implemented using Kokkos::parallel_for.
-
-#ifndef __TSQR_KokkosNodeTsqr_hpp
-#define __TSQR_KokkosNodeTsqr_hpp
-
-#include "Tsqr_CacheBlocker.hpp"
-#include "Tsqr_Combine.hpp"
-#include "Tsqr_NodeTsqr.hpp"
-#include "Tsqr_Impl_SystemBlas.hpp"
-
-#include "Teuchos_ParameterListAcceptorDefaultBase.hpp"
-#include "Kokkos_Core.hpp"
-
-namespace TSQR {
-  namespace details {
-    /// \brief Half-exclusive range of my partition's cache block indices.
-    ///
-    /// \c FactorFirstPass (used by the factor() method of \c
-    /// KokkosNodeTsqr) breaks up the matrix into contiguous
-    /// partitions of row blocks.  The index argument of Kokkos'
-    /// parallel_for is the (zero-based) partition index.  This
-    /// function returns the half-exclusive range of the cache block
-    /// indices belonging to the partition partitionIndex.
-    ///
-    /// \param numRows [in] Number of rows in the matrix.
-    /// \param numCols [in] Number of columns in the matrix.
-    /// \param partitionIndex [in] Zero-based index of the partition.
-    ///   This is specifically an int and not a LocalOrdinal, because
-    ///   partition indices are arguments to Kokkos Node API methods
-    ///   parallel_for and parallel_reduce.  Cache block indices are
-    ///   of LocalOrdinal type and should not be mixed with partition
-    ///   indices, even though in most cases LocalOrdinal == int.
-    /// \param numPartitions [in] Total number of partitions; a
-    ///   positive integer.
-    /// \param strategy [in] The cache blocking strategy to use.
-    ///
-    /// \return (start cache block index, end cache block index).
-    ///   This is a half-exclusive range: it does not include the end
-    ///   point.  Thus, if the two indices are equal, the range is
-    ///   empty.
-    template<class LocalOrdinal, class Scalar>
-    std::pair<LocalOrdinal, LocalOrdinal>
-    cacheBlockIndexRange (const LocalOrdinal numRows,
-                          const LocalOrdinal numCols,
-                          const int partitionIndex,
-                          const int numPartitions,
-                          const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy)
-    {
-      using LO = LocalOrdinal;
-      // The input index is a zero-based index of the current
-      // partition (not the "current cache block" -- a partition
-      // contains zero or more cache blocks).  If the input index is
-      // out of range, then return, since there is nothing to do.
-      //
-      // The nice thing about partitioning over cache blocks is that
-      // the cache blocking strategy guarantees that exactly one of
-      // the following is true:
-      //
-      // 1. The partition is empty (contains zero cache blocks)
-      // 2. All cache blocks in the partition are valid (none
-      //    contains more columns than rows)
-
-      // Return an empty partition (an empty cache block range) if
-      // the partition index is out of range.
-      if (partitionIndex >= numPartitions) {
-        return {0, 0};
-      }
-
-      const LO numRowsCacheBlock =
-        strategy.cache_block_num_rows (numCols);
-      const LO numCacheBlocks =
-        strategy.num_cache_blocks (numRows, numCols, numRowsCacheBlock);
-
-      // Figure out how many cache blocks my partition contains.  If
-      // the number of partitions doesn't evenly divide the number
-      // of cache blocks, we spread out the remainder among the
-      // first few threads.
-      const LO quotient = numCacheBlocks / numPartitions;
-      const LO remainder = numCacheBlocks - quotient * numPartitions;
-      const LO myNumCacheBlocks = (partitionIndex < remainder) ?
-        (quotient + 1) : quotient;
-
-      // If there are no cache blocks, there is nothing to factor.
-      // Return an empty cache block range to indicate this.
-      if (myNumCacheBlocks == 0) {
-        return {0, 0};
-      }
-
-      // Index of my first cache block (inclusive).
-      const LO myFirstCacheBlockIndex = (partitionIndex < remainder) ?
-        partitionIndex * (quotient+1) :
-        remainder * (quotient+1) + (partitionIndex - remainder) * quotient;
-      // Index of my last cache block (exclusive).
-      const LO myLastCacheBlockIndex = (partitionIndex+1 < remainder) ?
-        (partitionIndex+1) * (quotient+1) :
-        remainder * (quotient+1) + (partitionIndex+1 - remainder) * quotient;
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (myLastCacheBlockIndex <= myFirstCacheBlockIndex,
-         std::logic_error, "Partition " << (partitionIndex+1) << " of "
-         << numPartitions << ":  My range of cache block indices ["
-         << myFirstCacheBlockIndex << ", " << myLastCacheBlockIndex
-         << ") is empty.");
-      return {myFirstCacheBlockIndex, myLastCacheBlockIndex};
-    }
-
-
-    /// \class FactorFirstPass
-    /// \brief First pass of KokkosNodeTsqr's factorization.
-    /// \author Mark Hoemmen
-    template<class LocalOrdinal, class Scalar>
-    class FactorFirstPass {
-    public:
-      typedef MatView<LocalOrdinal, Scalar> mat_view_type;
-
-    private:
-      mat_view_type A_;
-      // While tauArrays_ is shared among tasks (i.e., partitions),
-      // there are no race conditions among entries, since each
-      // partition writes its own entry.  Ditto for topBlocks_.
-      std::vector<std::vector<Scalar> >& tauArrays_;
-      std::vector<mat_view_type>& topBlocks_;
-      CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
-      int numPartitions_;
-      bool contiguousCacheBlocks_;
-
-      std::vector<Scalar>
-      factorFirstCacheBlock (Combine<LocalOrdinal, Scalar>& combine,
-                             const mat_view_type& A_top,
-                             std::vector<Scalar>& work) const
-      {
-        std::vector<Scalar> tau (A_top.extent(1));
-
-        // We should only call this if A_top.extent(1) > 0 and therefore
-        // work.size() > 0, but we've already checked for that, so we
-        // don't have to check again.
-        combine.factor_first (A_top, tau.data(), work.data());
-        return tau;
-      }
-
-      std::vector<Scalar>
-      factorCacheBlock (Combine<LocalOrdinal, Scalar>& combine,
-                        const mat_view_type& A_top,
-                        const mat_view_type& A_cur,
-                        std::vector<Scalar>& work) const
-      {
-        std::vector<Scalar> tau (A_top.extent(1));
-
-        // We should only call this if A_top.extent(1) > 0 and therefore
-        // tau.size() > 0 and work.size() > 0, but we've already
-        // checked for that, so we don't have to check again.
-        combine.factor_inner (A_top, A_cur, tau.data(), work.data());
-        return tau;
-      }
-
-      /// \brief Factor the given cache block range using sequential TSQR.
-      ///
-      /// \param cbIndices [in] Half-exclusive range of cache block indices.
-      /// \param partitionIndex [in] Zero-based index of my partition.
-      ///
-      /// \return A view of the top block of the cache block range.
-      mat_view_type
-      factor (const std::pair<LocalOrdinal, LocalOrdinal> cbIndices,
-              const int partitionIndex) const
-      {
-        const char suffix[] = "  Please report this bug to the Tpetra developers.";
-        using cb_range_type = CacheBlockRange<mat_view_type>;
-
-        // Workspace is created here, because it must not be shared
-        // among threads.
-        std::vector<Scalar> work (A_.extent(1));
-
-        // Range of cache blocks to factor.
-        cb_range_type cbRange (A_, strategy_, cbIndices.first,
-                               cbIndices.second, contiguousCacheBlocks_);
-        // Iterator in the forward direction over the range of cache
-        // blocks to factor.
-        typedef typename CacheBlockRange<mat_view_type>::iterator range_iter_type;
-        range_iter_type cbIter = cbRange.begin();
-
-        // Remember the top (first) block.
-        mat_view_type A_top = *cbIter;
-        if (A_top.empty ()) {
-          return A_top;
-        }
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (cbIndices.first >= cbIndices.second, std::logic_error,
-           "FactorFirstPass::factor: A_top is not empty, but the "
-           "cache block index range " << cbIndices.first << ","
-           << cbIndices.second << " is empty." << suffix);
-
-        // Current cache block index.
-        LocalOrdinal curTauIdx = cbIndices.first;
-
-        // Factor the first cache block.
-        Combine<LocalOrdinal, Scalar> combine;
-        tauArrays_[curTauIdx++] = factorFirstCacheBlock (combine, A_top, work);
-
-        // Move past the first cache block.
-        ++cbIter;
-
-        // Number of cache block(s) we have factored thus far.
-        LocalOrdinal count = 1;
-
-        // Factor the remaining cache block(s).
-        range_iter_type cbEnd = cbRange.end();
-        while (cbIter != cbEnd) {
-          mat_view_type A_cur = *cbIter;
-          // Iteration over cache blocks of a partition should
-          // always result in nonempty cache blocks.
-          TEUCHOS_TEST_FOR_EXCEPTION
-            (A_cur.empty (), std::logic_error, "FactorFirstPass::factor: "
-             "The current cache block (the " << count << "-th to factor in the "
-             "range [" << cbIndices.first << "," << cbIndices.second << ") of "
-             "cache block indices) in partition " << (partitionIndex+1) << " "
-             "(out of " << numPartitions_ << " partitions) is empty." << suffix);
-          TEUCHOS_TEST_FOR_EXCEPTION
-            (static_cast<size_t>(curTauIdx) >= tauArrays_.size(),
-             std::logic_error, "FactorFirstPass::factor: curTauIdx (= "
-             << curTauIdx << ") >= tauArrays_.size() (= "
-             << tauArrays_.size() << ")." << suffix);
-          tauArrays_[curTauIdx++] =
-            factorCacheBlock (combine, A_top, A_cur, work);
-          ++count;
-          ++cbIter;
-        }
-        return A_top;
-      }
-
-    public:
-      /// \brief Constructor
-      ///
-      /// \param A [in/out] On input: View of the matrix to factor.
-      ///   On output: (Part of) the implicitly stored Q factor.
-      ///   (The other part is tauArrays.)
-      /// \param tauArrays [out] Where to write the "TAU" arrays
-      ///   (implicit factorization results) for each cache block.
-      ///   (TAU is what LAPACK's QR factorization routines call this
-      ///   array; see the LAPACK documentation for an explanation.)
-      ///   Indexed by the cache block index; one TAU array per cache
-      ///   block.
-      /// \param strategy [in] Cache blocking strategy to use.
-      /// \param numPartitions [in] Number of partitions (positive
-      ///   integer), and therefore the maximum parallelism available
-      ///   to the algorithm.  Oversubscribing processors is OK, but
-      ///   should not be done to excess.  This is an int, and not a
-      ///   LocalOrdinal, because it is the argument to Kokkos'
-      ///   parallel_for.
-      /// \param contiguousCacheBlocks [in] Whether the cache blocks
-      ///   of A are stored contiguously.
-      FactorFirstPass (const mat_view_type& A,
-                       std::vector<std::vector<Scalar> >& tauArrays,
-                       std::vector<mat_view_type>& topBlocks,
-                       const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
-                       const int numPartitions,
-                       const bool contiguousCacheBlocks = false) :
-        A_ (A),
-        tauArrays_ (tauArrays),
-        topBlocks_ (topBlocks),
-        strategy_ (strategy),
-        numPartitions_ (numPartitions),
-        contiguousCacheBlocks_ (contiguousCacheBlocks)
-      {
-        TEUCHOS_TEST_FOR_EXCEPTION(A_.empty(), std::logic_error,
-                           "TSQR::FactorFirstPass constructor: A is empty.  "
-                           "Please report this bug to the Kokkos developers.");
-        TEUCHOS_TEST_FOR_EXCEPTION(numPartitions < 1, std::logic_error,
-                           "TSQR::FactorFirstPass constructor: numPartitions "
-                           "must be positive, but numPartitions = "
-                           << numPartitions << ".  Please report this bug to "
-                           "the Kokkos developers.");
-      }
-
-      /// \brief First pass of intranode TSQR factorization.
-      ///
-      /// Invoked by Kokkos' parallel_for template method.  This
-      /// routine parallelizes over contiguous partitions of the
-      /// matrix.  Each partition in turn contains cache blocks.
-      /// Partitions do not break up cache blocks.  (This ensures that
-      /// the cache blocking scheme is the same as that used by
-      /// SequentialTsqr, as long as the cache blocking strategies are
-      /// the same.  However, the implicit Q factor is not compatible
-      /// with that of SequentialTsqr.)
-      ///
-      /// This method also saves a view of the top block of the
-      /// partition in the topBlocks_ array.  This is useful for the
-      /// next factorization pass.
-      ///
-      /// \param partitionIndex [in] Zero-based index of the
-      ///   partition.  If greater than or equal to the number of
-      ///   partitions, this routine does nothing.
-      void operator() (const int partitionIndex) const
-      {
-        if (partitionIndex < 0 || partitionIndex >= numPartitions_ || A_.empty ()) {
-          return;
-        }
-        else {
-          const std::pair<LocalOrdinal, LocalOrdinal> cbIndices =
-            cacheBlockIndexRange (A_.extent(0), A_.extent(1), partitionIndex,
-                                  numPartitions_, strategy_);
-          // It's legitimate, though suboptimal, for some partitions
-          // not to get any work to do (in this case, not to get any
-          // cache blocks to factor).
-          if (cbIndices.second <= cbIndices.first) {
-            return;
-          } else {
-            topBlocks_[partitionIndex] = factor (cbIndices, partitionIndex);
-          }
-        }
-      }
-    };
-
-    /// \class ApplyFirstPass
-    /// \brief "First" pass of applying KokkosNodeTsqr's implicit Q factor.
-    /// \author Mark Hoemmen
-    ///
-    /// We call this ApplyFirstPass as a reminder that this algorithm
-    /// has the same form as FactorFirstPass and uses the results of
-    /// the latter, even though ApplyFirstPass is really the last pass
-    /// of applying the implicit Q factor.
-    template<class LocalOrdinal, class Scalar>
-    class ApplyFirstPass {
-    public:
-      using const_mat_view_type = MatView<LocalOrdinal, const Scalar>;
-      using mat_view_type = MatView<LocalOrdinal, Scalar>;
-
-    private:
-      ApplyType applyType_;
-      const_mat_view_type Q_;
-      const std::vector<std::vector<Scalar> >& tauArrays_;
-      const std::vector<mat_view_type>& topBlocks_;
-      mat_view_type C_;
-      CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
-      int numPartitions_;
-      bool explicitQ_, contiguousCacheBlocks_;
-
-      void
-      applyFirstCacheBlock (Combine<LocalOrdinal, Scalar>& combine,
-                            const ApplyType& applyType,
-                            const const_mat_view_type& Q_top,
-                            const std::vector<Scalar>& tau,
-                            const mat_view_type& C_top,
-                            std::vector<Scalar>& work) const
-      {
-        TEUCHOS_TEST_FOR_EXCEPTION(tau.size() < static_cast<size_t> (Q_top.extent(1)),
-                           std::logic_error,
-                           "ApplyFirstPass::applyFirstCacheBlock: tau.size() "
-                           "(= " << tau.size() << ") < number of columns "
-                           << Q_top.extent(1) << " in the Q factor.  Please "
-                           "report this bug to the Kokkos developers.");
-
-        // If we get this far, it's fair to assume that we have
-        // checked whether tau and work have nonzero lengths.
-        combine.apply_first (applyType, Q_top, tau.data(),
-                             C_top, work.data());
-      }
-
-      void
-      applyCacheBlock (Combine<LocalOrdinal, Scalar>& combine,
-                       const ApplyType& applyType,
-                       const const_mat_view_type& Q_cur,
-                       const std::vector<Scalar>& tau,
-                       const mat_view_type& C_top,
-                       const mat_view_type& C_cur,
-                       std::vector<Scalar>& work) const
-      {
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (tau.size() < static_cast<size_t> (Q_cur.extent(1)),
-           std::logic_error, "ApplyFirstPass::applyCacheBlock: tau.size() "
-           "(= " << tau.size() << ") < number of columns "
-           << Q_cur.extent(1) << " in the Q factor."
-           "  Please report this bug to the Tpetra developers.");
-
-        // If we get this far, it's fair to assume that we have
-        // checked whether tau and work have nonzero lengths.
-        combine.apply_inner (applyType, C_cur.extent(0), C_cur.extent(1),
-                             Q_cur.extent(1), Q_cur.data(), Q_cur.stride(1),
-                             tau.data(),
-                             C_top.data(), C_top.stride(1),
-                             C_cur.data(), C_cur.stride(1),
-                             work.data());
-      }
-
-      /// \fn apply
-      /// \brief Apply the sequential part of the implicit Q factor to C.
-      ///
-      /// \param applyType [in] Whether we are applying Q, Q^T, or Q^H.
-      /// \param cbIndices [in] Half-exclusive range of cache block
-      ///   indices.
-      /// \param partitionIndex [in] The argument to \c operator(); the
-      ///   index of the partition which instance of ApplyFirstPass
-      ///   is currently processing.
-      void
-      apply (const ApplyType& applyType,
-             const std::pair<LocalOrdinal, LocalOrdinal> cbIndices,
-             const int partitionIndex) const
-      {
-        using const_range_type = CacheBlockRange<const_mat_view_type>;
-        using range_type = CacheBlockRange<mat_view_type>;
-        const char suffix[] = "  Please report this bug to the Tpetra developers.";
-
-        if (cbIndices.first >= cbIndices.second) {
-          return; // My range of cache blocks is empty; nothing to do
-        }
-
-        // Q_range: Range of cache blocks in the Q factor.
-        // C_range: Range of cache blocks in the matrix C.
-        const_range_type Q_range (Q_, strategy_,
-                                  cbIndices.first, cbIndices.second,
-                                  contiguousCacheBlocks_);
-        range_type C_range (C_, strategy_,
-                            cbIndices.first, cbIndices.second,
-                            contiguousCacheBlocks_);
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (Q_range.empty(), std::logic_error,
-           "Q_range is empty, but the range of cache block "
-           "indices [" << cbIndices.first << ", "
-           << cbIndices.second << ") is not empty." << suffix);
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (C_range.empty(), std::logic_error,
-           "C_range is empty, but the range of cache block "
-           "indices [" << cbIndices.first << ", "
-           << cbIndices.second << ") is not empty." << suffix);
-
-        // Task-local workspace array of length C_.extent(1).  Workspace
-        // must be per task, else there will be race conditions as
-        // different tasks attempt to write to and read from the same
-        // workspace simultaneously.
-        std::vector<Scalar> work (C_.extent(1));
-
-        Combine<LocalOrdinal, Scalar> combine;
-        if (applyType.transposed ()) {
-          auto Q_rangeIter = Q_range.begin();
-          auto C_rangeIter = C_range.begin();
-          TEUCHOS_TEST_FOR_EXCEPTION
-            (Q_rangeIter == Q_range.end(), std::logic_error,
-             "The Q cache block range claims to be nonempty, "
-             "but the iterator range is empty." << suffix);
-          TEUCHOS_TEST_FOR_EXCEPTION
-            (C_rangeIter == C_range.end(), std::logic_error,
-             "The C cache block range claims to be nonempty, "
-             "but the iterator range is empty." << suffix);
-
-          // Q_top: Topmost cache block in the cache block range of Q.
-          // C_top: Topmost cache block in the cache block range of C.
-          const_mat_view_type Q_top = *Q_rangeIter;
-          mat_view_type C_top = *C_rangeIter;
-          if (explicitQ_) {
-            deep_copy (C_top, Scalar {});
-            if (partitionIndex == 0) {
-              for (LocalOrdinal j = 0; j < C_top.extent(1); ++j) {
-                C_top(j,j) = Scalar (1.0);
-              }
-            }
-          }
-          LocalOrdinal curTauIndex = cbIndices.first;
-
-          // Apply the first block.
-          applyFirstCacheBlock (combine, applyType, Q_top,
-                                tauArrays_[curTauIndex++], C_top, work);
-
-          // Apply the rest of the blocks, if any.
-          ++Q_rangeIter;
-          ++C_rangeIter;
-          while (Q_rangeIter != Q_range.end ()) {
-            TEUCHOS_TEST_FOR_EXCEPTION
-              (C_rangeIter == C_range.end(), std::logic_error,
-               "When applying Q^T or Q^H to C: The Q cache "
-               "block iterator is not yet at the end, but "
-               "the C cache block iterator is." << suffix);
-            const_mat_view_type Q_cur = *Q_rangeIter;
-            mat_view_type C_cur = *C_rangeIter;
-            ++Q_rangeIter;
-            ++C_rangeIter;
-            if (explicitQ_) {
-              deep_copy (C_cur, Scalar {});
-            }
-            applyCacheBlock (combine, applyType, Q_cur,
-                             tauArrays_[curTauIndex++],
-                             C_top, C_cur, work);
-          }
-        }
-        else {
-          // Q_top: Topmost cache block in the cache block range of Q.
-          // C_top: Topmost cache block in the cache block range of C.
-          const_mat_view_type Q_top = *(Q_range.begin());
-          mat_view_type C_top = *(C_range.begin());
-
-          if (explicitQ_) {
-            // We've already filled the top ncols x ncols block of
-            // C_top with data (that's the result of applying the
-            // internode part of the Q factor via DistTsqr).  However,
-            // we still need to fill the rest of C_top (everything but
-            // the top ncols rows of C_top) with zeros.
-            mat_view_type C_top_rest (C_top.extent(0) - C_top.extent(1),
-                                      C_top.extent(1),
-                                      C_top.data() + C_top.extent(1),
-                                      C_top.stride(1));
-            deep_copy (C_top_rest, Scalar {});
-          }
-          LocalOrdinal curTauIndex = cbIndices.second-1;
-
-          // When applying Q (rather than Q^T or Q^H), we apply the
-          // cache blocks in reverse order.
-          typename const_range_type::iterator Q_rangeIter = Q_range.rbegin();
-          typename range_type::iterator C_rangeIter = C_range.rbegin();
-          TEUCHOS_TEST_FOR_EXCEPTION
-            (Q_rangeIter == Q_range.rend(), std::logic_error,
-             "The Q cache block range claims to be nonempty, "
-             "but the iterator range is empty." << suffix);
-          TEUCHOS_TEST_FOR_EXCEPTION
-            (C_rangeIter == C_range.rend(), std::logic_error,
-             "The C cache block range claims to be nonempty, "
-             "but the iterator range is empty." << suffix);
-
-          // Equality of cache block range iterators only tests the
-          // cache block index, not reverse-ness.  This means we can
-          // compare a reverse-direction iterator (Q_rangeIter) with
-          // a forward-direction iterator (Q_range.begin()).
-          //
-          // We do this because we need to handle the topmost block
-          // of Q_range separately (applyFirstCacheBlock(), rather
-          // than applyCacheBlock()).
-          while (Q_rangeIter != Q_range.begin ()) {
-            const_mat_view_type Q_cur = *Q_rangeIter;
-            mat_view_type C_cur = *C_rangeIter;
-
-            if (explicitQ_) {
-              deep_copy (C_cur, Scalar {});
-            }
-            TEUCHOS_TEST_FOR_EXCEPTION
-              (curTauIndex < cbIndices.first, std::logic_error,
-               "curTauIndex=" << curTauIndex << " out of valid "
-               "range [" << cbIndices.first << ","
-               << cbIndices.second << ")." << suffix);
-            applyCacheBlock (combine, applyType, Q_cur,
-                             tauArrays_[curTauIndex--],
-                             C_top, C_cur, work);
-            ++Q_rangeIter;
-            ++C_rangeIter;
-          }
-          TEUCHOS_TEST_FOR_EXCEPTION
-            (curTauIndex < cbIndices.first, std::logic_error,
-             "curTauIndex=" << curTauIndex << " out of valid range "
-             "[" << cbIndices.first << "," << cbIndices.second << ")."
-             << suffix);
-          // Apply the first block.
-          applyFirstCacheBlock (combine, applyType, Q_top,
-                                tauArrays_[curTauIndex--], C_top, work);
-        }
-      }
-
-    public:
-      /// \brief Constructor
-      ///
-      /// \param applyType [in] Whether we are applying Q, Q^T, or Q^H.
-      /// \param A [in/out] On input: View of the matrix to factor.
-      ///   On output: (Part of) the implicitly stored Q factor.
-      ///   (The other part is tauArrays.)
-      /// \param tauArrays [in] Where to write the "TAU" arrays
-      ///   (implicit factorization results) for each cache block.
-      ///   (TAU is what LAPACK's QR factorization routines call this
-      ///   array; see the LAPACK documentation for an explanation.)
-      ///   Indexed by the cache block index; one TAU array per cache
-      ///   block.
-      /// \param strategy [in] Cache blocking strategy to use.
-      /// \param numPartitions [in] Number of partitions (positive
-      ///   integer), and therefore the maximum parallelism available
-      ///   to the algorithm.  Oversubscribing processors is OK, but
-      ///   should not be done to excess.  This is an int, and not a
-      ///   LocalOrdinal, because it is the argument to Kokkos'
-      ///   parallel_for.
-      /// \param contiguousCacheBlocks [in] Whether the cache blocks
-      ///   of A are stored contiguously.
-      ApplyFirstPass (const ApplyType& applyType,
-                      const const_mat_view_type& Q,
-                      const std::vector<std::vector<Scalar>>& tauArrays,
-                      const std::vector<mat_view_type>& topBlocks,
-                      const mat_view_type& C,
-                      const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
-                      const int numPartitions,
-                      const bool explicitQ = false,
-                      const bool contiguousCacheBlocks = false) :
-        applyType_ (applyType),
-        Q_ (Q),
-        tauArrays_ (tauArrays),
-        topBlocks_ (topBlocks),
-        C_ (C),
-        strategy_ (strategy),
-        numPartitions_ (numPartitions),
-        explicitQ_ (explicitQ),
-        contiguousCacheBlocks_ (contiguousCacheBlocks)
-      {}
-
-      /// \brief First pass of applying intranode TSQR's implicit Q factor.
-      ///
-      /// Invoked by Kokkos' parallel_for template method.  This
-      /// routine parallelizes over contiguous partitions of the C
-      /// matrix.  Each partition in turn contains cache blocks.  We
-      /// take care not to break up the cache blocks among partitions;
-      /// this ensures that the cache blocking scheme is the same as
-      /// SequentialTsqr uses.  (However, the implicit Q factor is not
-      /// compatible with that of SequentialTsqr.)
-      ///
-      /// \param partitionIndex [in] Zero-based index of the partition
-      ///   which this instance of ApplyFirstPass is currently
-      ///   processing.  If greater than or equal to the number of
-      ///   partitions, this routine does nothing.
-      void operator() (const int partitionIndex) const
-      {
-        const char prefix[] = "TSQR::ApplyFirstPass::operator(): ";
-        const char suffix[] = "  Please report this bug to the Tpetra developers.";
-
-        if (partitionIndex < 0 || partitionIndex >= numPartitions_ ||
-            Q_.empty () || C_.empty ()) {
-          return;
-        }
-
-        // We use the same cache block indices for Q and for C.
-        std::pair<LocalOrdinal, LocalOrdinal> cbIndices =
-          cacheBlockIndexRange (Q_.extent(0), Q_.extent(1), partitionIndex,
-                                numPartitions_, strategy_);
-        if (cbIndices.second <= cbIndices.first)
-          return;
-        {
-          std::pair<size_t, size_t> cbInds (size_t (cbIndices.first),
-                                            size_t (cbIndices.second));
-          TEUCHOS_TEST_FOR_EXCEPTION
-            (cbIndices.first < LocalOrdinal(0), std::logic_error,
-             prefix << "cacheBlockIndexRange(" << Q_.extent (0) << ", "
-             << Q_.extent(1) << ", " << partitionIndex << ", "
-             << numPartitions_ << ", strategy) returned a cache block "
-             "range " << cbIndices.first << "," << cbIndices.second <<
-             " with negative starting index." << suffix);
-          TEUCHOS_TEST_FOR_EXCEPTION
-            (cbInds.second > tauArrays_.size (), std::logic_error,
-             prefix << "cacheBlockIndexRange(" << Q_.extent (0) << ", "
-             << Q_.extent(1) << ", " << partitionIndex << ", "
-             << numPartitions_ << ", strategy) returned a cache block "
-             "range" << cbIndices.first << "," << cbIndices.second <<
-             " with starting index larger than the number of tau "
-             "arrays " << tauArrays_.size () << "." << suffix);
-        }
-        apply (applyType_, cbIndices, partitionIndex);
-      }
-    };
-
-    /// \class CacheBlockFunctor
-    /// \brief Kokkos functor for KokkosNodeTsqr's (un_)cache_block() methods.
-    /// \author Mark Hoemmen
-    template<class LocalOrdinal, class Scalar>
-    class CacheBlockFunctor {
-    private:
-      using const_mat_view_type = MatView<LocalOrdinal, const Scalar>;
-      using mat_view_type = MatView<LocalOrdinal, Scalar>;
-      using const_range_type = CacheBlockRange<const_mat_view_type>;
-      using range_type = CacheBlockRange<mat_view_type>;
-
-      const_mat_view_type A_in_;
-      mat_view_type A_out_;
-      CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
-      int numPartitions_;
-      bool unblock_;
-
-      /// \brief Copy one range of cache blocks into another.
-      ///
-      /// \param cbInputRange [in] Range of input cache blocks.
-      /// \param cbOutputRange [out] Range of output cache blocks.
-      void copyRange (const_range_type& cbInputRange,
-                      range_type& cbOutputRange) const
-      {
-        typedef typename const_range_type::iterator input_iter_type;
-        typedef typename range_type::iterator output_iter_type;
-
-        input_iter_type inputIter = cbInputRange.begin();
-        output_iter_type outputIter = cbOutputRange.begin();
-
-        input_iter_type inputEnd = cbInputRange.end();
-        // TODO (mfh 29 Jun 2012) In a debug build, check in the loop
-        // below whether outputIter == cbOutputRange.end().  If so,
-        // throw std::logic_error.  Don't declare outputEnd unless
-        // we're in a debug build, because otherwise the compiler may
-        // report warnings (gcc 4.5 doesn't; gcc 4.6 does).
-        // output_iter_type outputEnd = cbOutputRange.end();
-
-        while (inputIter != inputEnd) {
-          const_mat_view_type A_in_cur = *inputIter;
-          mat_view_type A_out_cur = *outputIter;
-          deep_copy (A_out_cur, A_in_cur);
-          ++inputIter;
-          ++outputIter;
-        }
-      }
-
-    public:
-      /// \brief Constructor
-      ///
-      /// \param A_in [in] The matrix to (un-)cache-block.
-      /// \param A_out [in/out] Result of (un-)cache-blocking the
-      ///   matrix A_in.
-      /// \param strategy [in] Cache blocking strategy.
-      /// \param numPartitions [in] Number of partitions; maximum
-      ///   available parallelism.
-      /// \param unblock [in] If false, cache-block A_in (a matrix in
-      ///   column-major order) into A_out.  If true, un-cache-block
-      ///   A_in into A_out (a matrix in column-major order).
-      CacheBlockFunctor (const const_mat_view_type A_in,
-                         const mat_view_type A_out,
-                         const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
-                         const int numPartitions,
-                         const bool unblock) :
-        A_in_ (A_in),
-        A_out_ (A_out),
-        strategy_ (strategy),
-        numPartitions_ (numPartitions),
-        unblock_ (unblock)
-      {
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (A_in_.extent(0) != A_out_.extent(0) ||
-           A_in_.extent(1) != A_out_.extent(1),
-           std::invalid_argument,
-           "A_in and A_out do not have the same dimensions: "
-           "A_in is " << A_in_.extent(0) << " by "
-           << A_in_.extent(1) << ", but A_out is "
-           << A_out_.extent(0) << " by "
-           << A_out_.extent(1) << ".");
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (numPartitions_ < 1, std::invalid_argument,
-           "The number of partitions " << numPartitions_
-           << " is not a positive integer.");
-      }
-
-      /// \brief Method called by Kokkos::parallel_for.
-      ///
-      /// \param partitionIndex [in] Zero-based index of the partition
-      ///   of the matrix.  We parallelize over partitions.
-      ///   Partitions respect cache blocks.
-      void operator() (const int partitionIndex) const
-      {
-        if (partitionIndex < 0 || partitionIndex >= numPartitions_ ||
-            A_in_.empty()) {
-          return;
-        }
-        else {
-          using index_range_type = std::pair<LocalOrdinal, LocalOrdinal>;
-          const index_range_type cbIndices =
-            cacheBlockIndexRange (A_in_.extent (0), A_in_.extent (1),
-                                  partitionIndex, numPartitions_, strategy_);
-          // It's perfectly legal for a partitioning to assign zero
-          // cache block indices to a particular partition.  In that
-          // case, this task has nothing to do.
-          if (cbIndices.first >= cbIndices.second) {
-            return;
-          }
-          else {
-            // If unblock_ is false, then A_in_ is in column-major
-            // order, and we want to cache-block it into A_out_.  If
-            // unblock_ is true, then A_in_ is cache-blocked, and we
-            // want to un-cache-block it into A_out_ (a matrix in
-            // column-major order).
-            const_range_type inputRange (A_in_, strategy_, cbIndices.first,
-                                         cbIndices.second, unblock_);
-            range_type outputRange (A_out_, strategy_, cbIndices.first,
-                                    cbIndices.second, ! unblock_);
-            copyRange (inputRange, outputRange);
-          }
-        }
-      }
-    };
-
-    /// \class MultFunctor
-    /// \brief Kokkos functor for \c KokkosNodeTsqr::Q_times_B().
-    /// \author Mark Hoemmen
-    template<class LocalOrdinal, class Scalar>
-    class MultFunctor {
-    private:
-      using const_mat_view_type = MatView<LocalOrdinal, const Scalar>;
-      using mat_view_type = MatView<LocalOrdinal, Scalar>;
-      using range_type = CacheBlockRange<mat_view_type>;
-
-      mat_view_type Q_;
-      const_mat_view_type B_;
-      CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
-      int numPartitions_;
-      bool contiguousCacheBlocks_;
-
-      // This uses SystemBlas for now.
-      // In the future, we may want to use a TPL.
-      // That means we could switch to RawBlas.
-      void
-      multBlock (Impl::SystemBlas<Scalar>& blas,
-                 const mat_view_type& Q_cur,
-                 Matrix<LocalOrdinal, Scalar>& Q_temp) const
-      {
-        using Teuchos::NO_TRANS;
-        const LocalOrdinal numCols = Q_cur.extent (1);
-
-        // GEMM doesn't like aliased arguments, so we use a copy.  We
-        // only copy the current cache block, rather than all of Q;
-        // this saves memory.
-        Q_temp.reshape (Q_cur.extent (0), numCols);
-        deep_copy (Q_temp, Q_cur);
-
-        // Q_cur := Q_temp * B.
-        blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.extent(0), numCols, numCols,
-                   Scalar (1.0),
-                   Q_temp.data(), Q_temp.stride(1), B_.data(), B_.stride(1),
-                   Scalar(0), Q_cur.data(), Q_cur.stride(1));
-      }
-
-      /// \brief Multiply (in place) each cache block in the range by B_.
-      ///
-      /// \param cbRange [in/out] Range of cache blocks.
-      void multRange (range_type& cbRange) const
-      {
-        typedef typename range_type::iterator iter_type;
-        iter_type iter = cbRange.begin();
-        iter_type end = cbRange.end();
-
-        // Temporary storage for the BLAS' matrix-matrix multiply
-        // routine (which forbids aliasing of any input argument and
-        // the output argument).
-        Matrix<LocalOrdinal, Scalar> Q_temp;
-        Impl::SystemBlas<Scalar> blas;
-        while (iter != end) {
-          mat_view_type Q_cur = *iter;
-          multBlock (blas, Q_cur, Q_temp);
-          ++iter;
-        }
-      }
-
-    public:
-      /// \brief Constructor
-      ///
-      /// \param Q [in/out] Matrix to multiply in place by B.
-      /// \param B [in] \f$Q := Q * B\f$.
-      /// \param strategy [in] Cache-blocking strategy.
-      /// \param numPartitions [in] Number of partitions of the matrix
-      ///   Q; maximum available parallelism.
-      /// \param contiguousCacheBlocks [in] Whether the cache blocks
-      ///   of Q are stored contiguously.
-      MultFunctor (const mat_view_type Q,
-                   const const_mat_view_type B,
-                   const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
-                   const int numPartitions,
-                   const bool contiguousCacheBlocks) :
-        Q_ (Q),
-        B_ (B),
-        strategy_ (strategy),
-        numPartitions_ (numPartitions),
-        contiguousCacheBlocks_ (contiguousCacheBlocks)
-      {}
-
-      /// \brief Method called by Kokkos' parallel_for.
-      ///
-      /// \param partitionIndex [in] Zero-based index of the partition
-      ///   of the matrix.  We parallelize over partitions.
-      ///   Partitions respect cache blocks.
-      void operator() (const int partitionIndex) const
-      {
-        if (partitionIndex < 0 || partitionIndex >= numPartitions_ ||
-            Q_.empty ()) {
-          return;
-        }
-        else {
-          typedef std::pair<LocalOrdinal, LocalOrdinal> index_range_type;
-          const index_range_type cbIndices =
-            cacheBlockIndexRange (Q_.extent (0), Q_.extent (1), partitionIndex,
-                                  numPartitions_, strategy_);
-          if (cbIndices.first >= cbIndices.second) {
-            return;
-          }
-          else {
-            range_type range (Q_, strategy_, cbIndices.first,
-                              cbIndices.second, contiguousCacheBlocks_);
-            multRange (range);
-          }
-        }
-      }
-    };
-
-    /// \class FillFunctor
-    /// \brief Kokkos functor for \c KokkosNodeTsqr::fill_with_zeros().
-    /// \author Mark Hoemmen
-    template<class LocalOrdinal, class Scalar>
-    class FillFunctor {
-    private:
-      using mat_view_type = MatView<LocalOrdinal, Scalar>;
-      using range_type = CacheBlockRange<mat_view_type>;
-
-      mat_view_type A_;
-      CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
-      const Scalar value_;
-      int numPartitions_;
-      bool contiguousCacheBlocks_;
-
-      //! Fill (in place) each cache block in the range with value.
-      void fillRange (range_type& cbRange, const Scalar value) const
-      {
-        typedef typename range_type::iterator iter_type;
-        iter_type iter = cbRange.begin();
-        iter_type end = cbRange.end();
-        while (iter != end) {
-          mat_view_type A_cur = *iter;
-          deep_copy (A_cur, value);
-          ++iter;
-        }
-      }
-
-    public:
-      /// \brief Constructor
-      ///
-      /// \param A [in/out] Matrix to fill with the value.
-      /// \param strategy [in] Cache-blocking strategy.
-      /// \param value [in] The value with which to fill A.
-      /// \param numPartitions [in] Number of partitions of
-      ///   the matrix A; maximum available parallelism.
-      /// \param contiguousCacheBlocks [in] Whether the cache
-      ///   blocks of A are stored contiguously.
-      FillFunctor (const mat_view_type A,
-                   const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
-                   const Scalar value,
-                   const int numPartitions,
-                   const bool contiguousCacheBlocks) :
-        A_ (A),
-        strategy_ (strategy),
-        value_ (value),
-        numPartitions_ (numPartitions),
-        contiguousCacheBlocks_ (contiguousCacheBlocks)
-      {}
-
-      /// \brief Method called by Kokkos' parallel_for.
-      ///
-      /// \param partitionIndex [in] Zero-based index of the partition
-      ///   of the matrix.  We parallelize over partitions.
-      ///   Partitions respect cache blocks.
-      void operator() (const int partitionIndex) const
-      {
-        if (partitionIndex < 0 || partitionIndex >= numPartitions_ ||
-            A_.empty ()) {
-          return;
-        }
-        else {
-          typedef std::pair<LocalOrdinal, LocalOrdinal> index_range_type;
-          const index_range_type cbIndices =
-            cacheBlockIndexRange (A_.extent(0), A_.extent(1), partitionIndex,
-                                  numPartitions_, strategy_);
-          if (cbIndices.first >= cbIndices.second) {
-            return;
-          }
-          else {
-            range_type range (A_, strategy_, cbIndices.first,
-                              cbIndices.second, contiguousCacheBlocks_);
-            fillRange (range, value_);
-          }
-        }
-      }
-    };
-  } // namespace details
-
-  /// \class KokkosNodeTsqrFactorOutput
-  /// \brief Part of KokkosNodeTsqr's implicit Q representation.
-  /// \author Mark Hoemmen
-  ///
-  /// The \c KokkoNodeTsqr::factor() method represents the Q factor of
-  /// the matrix A implicitly.  Part of that representation is in the
-  /// A matrix on output, and the other part is returned as an object
-  /// of this type.  The apply() and explicit_Q() methods need both
-  /// parts of the implicit Q representation in order to do their
-  /// work.
-  template<class LocalOrdinal, class Scalar>
-  struct KokkosNodeTsqrFactorOutput {
-    typedef MatView<LocalOrdinal, Scalar> mat_view_type;
-
-    /// \brief Constructor
-    ///
-    /// \param theNumCacheBlocks [in] Total number of cache blocks
-    ///   (over all partitions).
-    /// \param theNumPartitions [in] Number of partitions.  This is
-    ///   an int because partition indices are ints, and the latter
-    ///   are ints because they end up as range arguments to Kokkos'
-    ///   parallel_for.
-    KokkosNodeTsqrFactorOutput (const size_t theNumCacheBlocks,
-                                const int theNumPartitions) :
-      firstPassTauArrays (theNumCacheBlocks)
-    {
-      // Protect the cast to size_t from a negative number of
-      // partitions.
-      TEUCHOS_TEST_FOR_EXCEPTION(theNumPartitions < 1, std::invalid_argument,
-                         "TSQR::KokkosNodeTsqrFactorOutput: Invalid number of "
-                         "partitions " << theNumPartitions << "; number of "
-                         "partitions must be a positive integer.");
-      // If there's only one partition, we don't even need a second
-      // pass (it's just sequential TSQR), and we don't need a TAU
-      // array for the top partition.
-      secondPassTauArrays.resize (size_t (theNumPartitions-1));
-      topBlocks.resize (size_t (theNumPartitions));
-    }
-
-    //! Total number of cache blocks in the matrix (over all partitions).
-    int numCacheBlocks() const { return firstPassTauArrays.size(); }
-
-    //! Number of partitions of the matrix; max available parallelism.
-    int numPartitions() const { return topBlocks.size(); }
-
-    //! TAU arrays from the first pass; one per cache block.
-    std::vector<std::vector<Scalar>> firstPassTauArrays;
-
-    /// \brief TAU arrays from the second pass.
-    ///
-    /// There is one TAU array per partition, except for the topmost
-    /// partition.
-    ///
-    /// For now, KokkosNodeTsqr::factor() uses only two passes over
-    /// the matrix.  firstPassTauArrays contains the result of the
-    /// pass over cache blocks, and secondPassTauArrays contains the
-    /// result of combining the upper triangular R factors from the
-    /// first pass.  Later, we may add more passes, in which case we
-    /// will likely combine firstPassTauArrays and secondPassTauArrays
-    /// into a single std::vector (variable number of passes) or
-    /// Teuchos::Tuple (fixed number of passes).
-    std::vector<std::vector<Scalar>> secondPassTauArrays;
-
-    /// \brief Views of the topmost cache blocks in each partition.
-    ///
-    /// One entry for each partition.
-    std::vector<mat_view_type> topBlocks;
-  };
-
-  /// \class KokkosNodeTsqr
-  /// \brief Intranode (within an MPI process) TSQR parallelized using
-  ///   Kokkos::DefaultHostExecutionSpace.
-  /// \author Mark Hoemmen
-  ///
-  /// \tparam LocalOrdinal The type of indices in the (node-local)
-  ///   matrix.
-  ///
-  /// \tparam Scalar The type of entries in the (node-local) matrix.
-  ///
-  /// This implementation of the intranode part of TSQR factors the
-  /// matrix in two passes.  The first pass parallelizes over
-  /// partitions, doing Sequential TSQR over each partition.  The
-  /// second pass combines the R factors from the partitions, and is
-  /// not currently parallel.  Thus, the overall algorithm is similar
-  /// to that of TbbTsqr, except that:
-  /// <ul>
-  /// <li> TbbTsqr partitions differently; KokkosNodeTsqr's partitions
-  ///      use the same layout of cache blocks as SequentialTsqr,
-  ///      whereas TbbTsqr uses a different layout. </li>
-  /// <li> TbbTsqr reduces the R factors in parallel; it only needs
-  ///      one "pass." </li>
-  /// </ul>
-  template<class LocalOrdinal, class Scalar>
-  class KokkosNodeTsqr :
-    public NodeTsqr<LocalOrdinal, Scalar, KokkosNodeTsqrFactorOutput<LocalOrdinal, Scalar>>,
-    public Teuchos::ParameterListAcceptorDefaultBase
-  {
-  public:
-    typedef LocalOrdinal local_ordinal_type;
-    typedef Scalar scalar_type;
-
-    using const_mat_view_type = MatView<LocalOrdinal, const Scalar>;
-    using mat_view_type = MatView<LocalOrdinal, Scalar>;
-
-    /// \typedef FactorOutput
-    /// \brief Part of the implicit Q representation returned by factor().
-    typedef typename NodeTsqr<LocalOrdinal, Scalar, KokkosNodeTsqrFactorOutput<LocalOrdinal, Scalar> >::factor_output_type FactorOutput;
-
-    /// \brief Constructor (with user-specified parameters).
-    ///
-    /// \param params [in/out] List of parameters.  Missing parameters
-    ///   will be filled in with default values.
-    KokkosNodeTsqr (const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null)
-    {
-      setParameterList (params);
-    }
-
-    /// \brief Whether this object is ready to perform computations.
-    bool ready() const {
-      return true;
-    }
-
-    /// \brief One-line description of this object.
-    ///
-    /// This implements Teuchos::Describable::description().
-    std::string description () const {
-      using Teuchos::TypeNameTraits;
-      std::ostringstream os;
-      os << "KokkosNodeTsqr<LocalOrdinal="
-         << TypeNameTraits<LocalOrdinal>::name()
-         << ", Scalar="
-         << TypeNameTraits<Scalar>::name()
-         << ">: \"Cache Size Hint\"=" << strategy_.cache_size_hint()
-         << ", \"Size of Scalar\"=" << strategy_.size_of_scalar()
-         << ", \"Num Tasks\"=" << numPartitions_;
-      return os.str();
-    }
-
-    /// \brief Validate and read in parameters.
-    ///
-    /// \param paramList [in/out] On input: non-null parameter list
-    ///   containing zero or more of the parameters in \c
-    ///   getValidParameters().  On output: missing parameters (i.e.,
-    ///   parameters in \c getValidParameters() but not in the input
-    ///   list) are filled in with default values.
-    void
-    setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& paramList)
-    {
-      using Teuchos::ParameterList;
-      using Teuchos::parameterList;
-      using Teuchos::RCP;
-      using Teuchos::rcp;
-
-      RCP<ParameterList> plist;
-      if (paramList.is_null()) {
-        plist = rcp (new ParameterList (*getValidParameters ()));
-      }
-      else {
-        plist = paramList;
-        plist->validateParametersAndSetDefaults (*getValidParameters ());
-      }
-      // Get values of parameters.  We do this "transactionally" so
-      // that (except for validation and filling in defaults above)
-      // this method has the strong exception guarantee (it either
-      // returns, or throws an exception with no externally visible
-      // side effects).
-      size_t cacheSizeHint, sizeOfScalar;
-      int numPartitions;
-      try {
-        cacheSizeHint = plist->get<size_t> ("Cache Size Hint");
-        sizeOfScalar = plist->get<size_t> ("Size of Scalar");
-        numPartitions = plist->get<int> ("Num Tasks");
-      }
-      catch (Teuchos::Exceptions::InvalidParameter& e) {
-        std::ostringstream os;
-        os << "Failed to read default parameters after setting defaults.  Pleas"
-          "e report this bug to the Kokkos developers.  Original exception mess"
-          "age: " << e.what();
-        TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str());
-      }
-      numPartitions_ = numPartitions;
-
-      // Recreate the cache blocking strategy.
-      typedef CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_type;
-      strategy_ = strategy_type (cacheSizeHint, sizeOfScalar);
-
-      // Save the input parameter list.
-      setMyParamList (plist);
-    }
-
-    /// \brief Default valid parameter list.
-    ///
-    /// The returned list contains all parameters accepted by \c
-    /// KokkosNodeTsqr, with their default values and documentation.
-    Teuchos::RCP<const Teuchos::ParameterList>
-    getValidParameters() const
-    {
-      using Teuchos::ParameterList;
-      using Teuchos::parameterList;
-      using Teuchos::RCP;
-
-      if (defaultParams_.is_null()) {
-        RCP<ParameterList> params = parameterList ("Intranode TSQR");
-        params->set ("Cache Size Hint",
-                     static_cast<size_t>(0),
-                     std::string("Cache size in bytes; a hint for TSQR.  Set to t"
-                                 "he size of the largest private cache per CPU co"
-                                 "re, or the fraction of shared cache per core.  "
-                                 "If zero, we pick a reasonable default."));
-        params->set ("Size of Scalar",
-                     sizeof(Scalar),
-                     std::string ("Size in bytes of the Scalar type.  In most "
-                                  "cases, the default sizeof(Scalar) is fine.  "
-                                  "Set a non-default value only when Scalar's "
-                                  "data is dynamically allocated (such as for a "
-                                  "type with precision variable at run time)."));
-
-        // The number of partitions is an int rather than a
-        // LocalOrdinal, to ensure that it is always stored with the
-        // same type, despite the type of LocalOrdinal.  Besides, Kokkos
-        // wants an int anyway.
-        params->set ("Num Tasks",
-                     defaultNumPartitions (),
-                     std::string ("Number of partitions; the maximum available pa"
-                                  "rallelelism in intranode TSQR.  Slight oversub"
-                                  "scription is OK; undersubscription may have a "
-                                  "performance cost."));
-        defaultParams_ = params;
-      }
-      return defaultParams_;
-    }
-
-    FactorOutput
-    factor (const LocalOrdinal numRows,
-            const LocalOrdinal numCols,
-            Scalar A[],
-            const LocalOrdinal lda,
-            Scalar R[],
-            const LocalOrdinal ldr,
-            const bool contiguousCacheBlocks) const
-    {
-      mat_view_type A_view (numRows, numCols, A, lda);
-      mat_view_type R_view (numCols, numCols, R, ldr);
-      return factorImpl (A_view, R_view, contiguousCacheBlocks);
-    }
-
-    void
-    apply (const ApplyType& applyType,
-           const LocalOrdinal nrows,
-           const LocalOrdinal ncols_Q,
-           const Scalar Q[],
-           const LocalOrdinal ldq,
-           const FactorOutput& factorOutput,
-           const LocalOrdinal ncols_C,
-           Scalar C[],
-           const LocalOrdinal ldc,
-           const bool contiguousCacheBlocks) const
-    {
-      const_mat_view_type Q_view (nrows, ncols_Q, Q, ldq);
-      mat_view_type C_view (nrows, ncols_C, C, ldc);
-      applyImpl (applyType, Q_view, factorOutput, C_view,
-                 false, contiguousCacheBlocks);
-    }
-
-    void
-    explicit_Q (const LocalOrdinal nrows,
-                const LocalOrdinal ncols_Q,
-                const Scalar Q[],
-                const LocalOrdinal ldq,
-                const FactorOutput& factorOutput,
-                const LocalOrdinal ncols_C,
-                Scalar C[],
-                const LocalOrdinal ldc,
-                const bool contiguousCacheBlocks) const
-    {
-      const_mat_view_type Q_view (nrows, ncols_Q, Q, ldq);
-      mat_view_type C_view (nrows, ncols_C, C, ldc);
-      applyImpl (ApplyType::NoTranspose, Q_view, factorOutput,
-                 C_view, true, contiguousCacheBlocks);
-    }
-
-    bool QR_produces_R_factor_with_nonnegative_diagonal () const {
-      return combine_.QR_produces_R_factor_with_nonnegative_diagonal ();
-    }
-
-    size_t cache_size_hint() const {
-      return strategy_.cache_size_hint();
-    }
-
-    void
-    fill_with_zeros (const LocalOrdinal nrows,
-                     const LocalOrdinal ncols,
-                     Scalar A[],
-                     const LocalOrdinal lda,
-                     const bool contiguousCacheBlocks) const
-    {
-      mat_view_type A_view (nrows, ncols, A, lda);
-
-      using functor_type = details::FillFunctor<LocalOrdinal, Scalar>;
-      const Scalar ZERO {};
-      functor_type functor (A_view, strategy_, ZERO, numPartitions_,
-                            contiguousCacheBlocks);
-      using execution_space = Kokkos::DefaultHostExecutionSpace;
-      Kokkos::RangePolicy<execution_space, Kokkos::IndexType<int>>
-        range (0, numPartitions_);
-      Kokkos::parallel_for ("KokkosNodeTsqr::fill_with_zeros", range, functor);
-    }
-
-    void
-    cache_block (const LocalOrdinal nrows,
-                 const LocalOrdinal ncols,
-                 Scalar A_out[],
-                 const Scalar A_in[],
-                 const LocalOrdinal lda_in) const
-    {
-      const_mat_view_type A_in_view (nrows, ncols, A_in, lda_in);
-
-      // The leading dimension of A_out doesn't matter here, since its
-      // cache blocks are to be stored contiguously.  We set it
-      // arbitrarily to a sensible value.
-      mat_view_type A_out_view (nrows, ncols, A_out, nrows);
-
-      using functor_type = details::CacheBlockFunctor<LocalOrdinal, Scalar>;
-      functor_type functor (A_in_view, A_out_view, strategy_,
-                            numPartitions_, false);
-      using execution_space = Kokkos::DefaultHostExecutionSpace;
-      Kokkos::RangePolicy<execution_space, Kokkos::IndexType<int>>
-        range (0, numPartitions_);
-      Kokkos::parallel_for ("KokkosNodeTsqr::cache_block", range, functor);
-    }
-
-    void
-    un_cache_block (const LocalOrdinal nrows,
-                    const LocalOrdinal ncols,
-                    Scalar A_out[],
-                    const LocalOrdinal lda_out,
-                    const Scalar A_in[]) const
-    {
-      // The leading dimension of A_in doesn't matter here, since its
-      // cache blocks are contiguously stored.  We set it arbitrarily
-      // to a sensible value.
-      const_mat_view_type A_in_view (nrows, ncols, A_in, nrows);
-      mat_view_type A_out_view (nrows, ncols, A_out, lda_out);
-
-      using functor_type = details::CacheBlockFunctor<LocalOrdinal, Scalar>;
-      functor_type functor (A_in_view, A_out_view, strategy_,
-                            numPartitions_, true);
-      using execution_space = Kokkos::DefaultHostExecutionSpace;
-      Kokkos::RangePolicy<execution_space, Kokkos::IndexType<int>>
-        range (0, numPartitions_);
-      Kokkos::parallel_for ("KokkosNodeTsqr::un_cache_block", range, functor);
-    }
-
-    void
-    Q_times_B (const LocalOrdinal nrows,
-               const LocalOrdinal ncols,
-               Scalar Q[],
-               const LocalOrdinal ldq,
-               const Scalar B[],
-               const LocalOrdinal ldb,
-               const bool contiguousCacheBlocks) const
-    {
-      mat_view_type Q_view (nrows, ncols, Q, ldq);
-      const_mat_view_type B_view (ncols, ncols, B, ldb);
-
-      using functor_type = details::MultFunctor<LocalOrdinal, Scalar>;
-      functor_type functor (Q_view, B_view, strategy_, numPartitions_,
-                            contiguousCacheBlocks);
-      using execution_space = Kokkos::DefaultHostExecutionSpace;
-      Kokkos::RangePolicy<execution_space, Kokkos::IndexType<int>>
-        range (0, numPartitions_);
-      Kokkos::parallel_for ("KokkosNodeTsqr::Q_times_B", range, functor);
-    }
-
-  private:
-    //! Implementation of fundamental TSQR kernels.
-    Combine<LocalOrdinal, Scalar> combine_;
-
-    //! Workspace for Combine operations.
-    mutable std::vector<Scalar> work_;
-
-    //! Cache blocking strategy.
-    CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
-
-    /// \brief Number of partitions; max available parallelism.
-    ///
-    /// The number of partitions is an int rather than a LocalOrdinal,
-    /// to ensure that it is always stored in the ParameterList with
-    /// the same type, despite the type of LocalOrdinal.  Besides,
-    /// Kokkos wants an int anyway.
-    int numPartitions_;
-
-    //! Default parameter list (set by \c getValidParameters()).
-    mutable Teuchos::RCP<const Teuchos::ParameterList> defaultParams_;
-
-    //! Default number of partitions.
-    int
-    defaultNumPartitions () const
-    {
-      return Kokkos::DefaultHostExecutionSpace::concurrency ();
-    }
-
-    FactorOutput
-    factorImpl (mat_view_type A,
-                mat_view_type R,
-                const bool contiguousCacheBlocks) const
-    {
-      const char prefix[] = "KokkosNodeTsqr::factorImpl: ";
-      const char suffix[] = "  Please report this bug to the Tpetra developers.";
-      using LO = LocalOrdinal;
-      using execution_space = Kokkos::DefaultHostExecutionSpace;
-      Kokkos::RangePolicy<execution_space, Kokkos::IndexType<int>>
-        range (0, numPartitions_);
-
-      if (A.empty ()) {
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (! R.empty (), std::logic_error, prefix << "A is empty, "
-           "but R is not." << suffix);
-        return FactorOutput (0, 0);
-      }
-      const LO numRowsPerCacheBlock =
-        strategy_.cache_block_num_rows (A.extent(1));
-      const LO numCacheBlocks =
-        strategy_.num_cache_blocks (A.extent(0), A.extent(1), numRowsPerCacheBlock);
-      //
-      // Compute the first factorization pass (over partitions).
-      //
-      FactorOutput result (numCacheBlocks, numPartitions_);
-      using first_pass_type = details::FactorFirstPass<LO, Scalar>;
-      first_pass_type firstPass (A, result.firstPassTauArrays,
-                                 result.topBlocks, strategy_,
-                                 numPartitions_, contiguousCacheBlocks);
-      Kokkos::parallel_for ("KokkosNodeTsqr::factorImpl::firstPass",
-                            range, firstPass);
-
-      // Each partition collected a view of its top block, where that
-      // partition's R factor is stored.  The second pass reduces
-      // those R factors.  We do this on one thread to avoid the
-      // overhead of parallelizing it.  If the typical use case is
-      // oversubscription, you should parallelize this step with
-      // multiple passes.  Note that we can't use parallel_reduce,
-      // because the tree topology matters.
-      factorSecondPass (result.topBlocks, result.secondPassTauArrays,
-                        numPartitions_);
-
-      // The "topmost top block" contains the resulting R factor.
-      const mat_view_type& R_top = result.topBlocks[0];
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (R_top.empty (), std::logic_error, prefix << "After "
-         "factorSecondPass: result.topBlocks[0] is an empty view."
-         << suffix);
-      mat_view_type R_top_square (R_top.extent(1), R_top.extent(1),
-                                  R_top.data(), R_top.stride(1));
-      deep_copy (R, Scalar {});
-      // Only copy the upper triangle of R_top into R.
-      copy_upper_triangle (R.extent(1), R.extent(1), R.data(), R.stride(1),
-                           R_top.data(), R_top.stride(1));
-      return result;
-    }
-
-    void
-    applyImpl (const ApplyType& applyType,
-               const const_mat_view_type& Q,
-               const FactorOutput& factorOutput,
-               const mat_view_type& C,
-               const bool explicitQ,
-               const bool contiguousCacheBlocks) const
-    {
-      const char prefix[] = "KokkosNodeTsqr::applyImpl: ";
-      const char suffix[] = "  Please report this bug to the Tpetra developers.";
-      using LO = LocalOrdinal;
-      using details::cacheBlockIndexRange;
-      using first_pass_type = details::ApplyFirstPass<LO, Scalar>;
-      using execution_space = Kokkos::DefaultHostExecutionSpace;
-
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (numPartitions_ != factorOutput.numPartitions(),
-         std::invalid_argument, prefix << "KokkosNodeTsqr's number "
-         "of partitions " << numPartitions_ << " does not match the "
-         "given factorOutput's number of partitions "
-         << factorOutput.numPartitions() << ".  This likely means "
-         "that the given factorOutput object comes from a different "
-         "instance of KokkosNodeTsqr." << suffix);
-      const int numParts = numPartitions_;
-      first_pass_type firstPass (applyType, Q,
-                                 factorOutput.firstPassTauArrays,
-                                 factorOutput.topBlocks, C, strategy_,
-                                 numParts, explicitQ,
-                                 contiguousCacheBlocks);
-      // Get a view of each partition's top block of the C matrix.
-      std::vector<mat_view_type> topBlocksOfC (numParts);
-      {
-        using index_range_type = std::pair<LO, LO>;
-        using blocker_type = CacheBlocker<LO, Scalar>;
-        blocker_type C_blocker (C.extent(0), C.extent(1), strategy_);
-
-        // For each partition, collect its top block of C.
-        for (int partIdx = 0; partIdx < numParts; ++partIdx) {
-          const index_range_type cbIndices =
-            cacheBlockIndexRange (C.extent(0), C.extent(1), partIdx,
-                                  numParts, strategy_);
-          if (cbIndices.first >= cbIndices.second) {
-            topBlocksOfC[partIdx] = mat_view_type (0, 0, nullptr, 0);
-          } else {
-            topBlocksOfC[partIdx] =
-              C_blocker.get_cache_block (C, cbIndices.first,
-                                         contiguousCacheBlocks);
-          }
-        }
-      }
-
-      Kokkos::RangePolicy<execution_space, Kokkos::IndexType<int>>
-        range(0, numPartitions_);
-      if (applyType.transposed ()) {
-        Kokkos::parallel_for ("KokkosNodeTsqr::applyImpl::firstPass",
-                              range, firstPass);
-        applySecondPass (applyType, factorOutput, topBlocksOfC,
-                         strategy_, explicitQ);
-      }
-      else {
-        applySecondPass (applyType, factorOutput, topBlocksOfC,
-                         strategy_, explicitQ);
-        Kokkos::parallel_for ("KokkosNodeTsqr::applyImpl::firstPass",
-                              range, firstPass);
-      }
-    }
-
-    std::vector<Scalar>
-    factorPair (const mat_view_type& R_top,
-                const mat_view_type& R_bot) const
-    {
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (R_top.empty (), std::logic_error, "R_top is empty!");
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (R_bot.empty(), std::logic_error, "R_bot is empty!");
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (work_.size() == 0, std::logic_error,
-         "Workspace array work_ has length zero.");
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (work_.size() < size_t (R_top.extent(1)), std::logic_error,
-         "Workspace array work_ has length = " << work_.size()
-         << " < R_top.extent(1) = " << R_top.extent(1) << ".");
-
-      std::vector<Scalar> tau (R_top.extent (1));
-
-      // Our convention for such helper methods is for the immediate
-      // parent to allocate workspace (the work_ array in this case).
-      //
-      // The statement below only works if R_top and R_bot have a
-      // nonzero (and the same) number of columns, but we have already
-      // checked that above.
-      combine_.factor_pair (R_top, R_bot, tau.data(), work_.data());
-      return tau;
-    }
-
-    void
-    factorSecondPass (std::vector<mat_view_type >& topBlocks,
-                      std::vector<std::vector<Scalar> >& tauArrays,
-                      const int numPartitions) const
-    {
-      const char prefix[] = "KokkosNodeTsqr::factorSecondPass: ";
-      const char suffix[] = "  Please report this bug to the Tpetra developers.";
-
-      if (numPartitions <= 1)
-        return; // Done!
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (topBlocks.size () < size_t (numPartitions), std::logic_error,
-         prefix << "topBlocks.size() (= " << topBlocks.size() << ") "
-         "< numPartitions (= " << numPartitions << ")." << suffix);
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (tauArrays.size () < size_t (numPartitions-1),
-         std::logic_error, prefix << "topBlocks.size() (= "
-         << topBlocks.size() << ") < numPartitions-1 (= "
-         << (numPartitions-1) << ")." << suffix);
-      // The top partition (partition index zero) should always be
-      // nonempty if we get this far, so its top block should also be
-      // nonempty.
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (topBlocks[0].empty(), std::logic_error,
-         prefix << "topBlocks[0] is empty." << suffix);
-      // However, other partitions besides the top one might be empty,
-      // in which case their top blocks will be empty.  We skip over
-      // the empty partitions in the loop below.
-      work_.resize (size_t (topBlocks[0].extent(1)));
-      for (int partIdx = 1; partIdx < numPartitions; ++partIdx) {
-        if (! topBlocks[partIdx].empty ()) {
-          tauArrays[partIdx-1] = factorPair (topBlocks[0], topBlocks[partIdx]);
-        }
-      }
-    }
-
-    void
-    applyPair (const ApplyType& applyType,
-               const mat_view_type& R_bot,
-               const std::vector<Scalar>& tau,
-               const mat_view_type& C_top,
-               const mat_view_type& C_bot) const
-    {
-      // Our convention for such helper methods is for the immediate
-      // parent to allocate workspace (the work_ array in this case).
-      //
-      // The statement below only works if C_top, R_bot, and C_bot
-      // have a nonzero (and the same) number of columns, but we have
-      // already checked that above.
-      combine_.apply_pair (applyType, C_top.extent(1), R_bot.extent(1),
-                           R_bot.data(), R_bot.stride(1), tau.data(),
-                           C_top.data(), C_top.stride(1),
-                           C_bot.data(), C_bot.stride(1), work_.data());
-    }
-
-    void
-    applySecondPass (const ApplyType& applyType,
-                     const FactorOutput& factorOutput,
-                     std::vector<mat_view_type >& topBlocksOfC,
-                     const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
-                     const bool explicitQ) const
-    {
-      const char prefix[] = "KokkosNodeTsqr::applySecondPass: ";
-      const char suffix[] = "  Please report this bug to the Tpetra developers.";
-
-      const int numParts = factorOutput.numPartitions();
-      if (numParts <= 1)
-        return; // Done!
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (topBlocksOfC.size () != size_t (numParts), std::logic_error,
-         prefix << "topBlocksOfC.size() (= " << topBlocksOfC.size()
-         << ") != number of partitions (= " << numParts << ")."
-         << suffix);
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (factorOutput.secondPassTauArrays.size () != size_t (numParts-1),
-         std::logic_error, prefix <<
-         "factorOutput.secondPassTauArrays.size() (= "
-         << factorOutput.secondPassTauArrays.size()
-         << ") != number of partitions minus 1 (= "
-         << (numParts-1) << ")." << suffix);
-      const LocalOrdinal numCols = topBlocksOfC[0].extent(1);
-      work_.resize (size_t (numCols));
-
-      // Top blocks of C are the whole cache blocks.  We only want to
-      // affect the top ncols x ncols part of each of those blocks in
-      // this method.
-      mat_view_type C_top_square (numCols, numCols, topBlocksOfC[0].data(),
-                                  topBlocksOfC[0].stride(1));
-      if (applyType.transposed ()) {
-        // Don't include the topmost (index 0) partition in the
-        // iteration; that corresponds to C_top_square.
-        for (int partIdx = 1; partIdx < numParts; ++partIdx) {
-          // It's legitimate for some partitions not to have any
-          // cache blocks.  In that case, their top block will be
-          // empty, and we can skip over them.
-          const mat_view_type& C_cur = topBlocksOfC[partIdx];
-          if (! C_cur.empty()) {
-            mat_view_type C_cur_square (numCols, numCols, C_cur.data (),
-                                        C_cur.stride (1));
-            // If explicitQ: We've already done the first pass and
-            // filled the top blocks of C.
-            applyPair (applyType, factorOutput.topBlocks[partIdx],
-                       factorOutput.secondPassTauArrays[partIdx-1],
-                       C_top_square, C_cur_square);
-          }
-        }
-      } else {
-        // In non-transposed mode, when computing the first
-        // C.extent(1) columns of the explicit Q factor, intranode
-        // TSQR would run after internode TSQR (i.e., DistTsqr)
-        // (even if only running on a single node in non-MPI mode).
-        // Therefore, internode TSQR is responsible for filling the
-        // top block of this node's part of the C matrix.
-        //
-        // Don't include the topmost partition in the iteration;
-        // that corresponds to C_top_square.
-        for (int partIdx = numParts - 1; partIdx > 0; --partIdx) {
-          // It's legitimate for some partitions not to have any
-          // cache blocks.  In that case, their top block will be
-          // empty, and we can skip over them.
-          const mat_view_type& C_cur = topBlocksOfC[partIdx];
-          if (! C_cur.empty()) {
-            mat_view_type C_cur_square (numCols, numCols,
-                                        C_cur.data (),
-                                        C_cur.stride (1));
-            // The "first" pass (actually the last, only named
-            // "first" by analogy with factorFirstPass()) will
-            // fill the rest of these top blocks.  For now, we
-            // just fill the top n x n part of the top blocks
-            // with zeros.
-            if (explicitQ) {
-              deep_copy (C_cur_square, Scalar {});
-            }
-            applyPair (applyType, factorOutput.topBlocks[partIdx],
-                       factorOutput.secondPassTauArrays[partIdx-1],
-                       C_top_square, C_cur_square);
-          }
-        }
-      }
-    }
-
-  protected:
-
-    /// \brief Return the topmost cache block of the matrix C.
-    ///
-    /// NodeTsqr's top_block() method must be implemented using its
-    /// subclasses' const_top_block() method.  This is because
-    /// top_block() is a template method, and template methods cannot
-    /// be virtual.
-    ///
-    /// \param C [in] View of a matrix, with at least as many rows as
-    ///   columns.
-    /// \param contiguous_cache_blocks [in] Whether the cache blocks
-    ///   of C are stored contiguously.
-    ///
-    /// \return View of the topmost cache block of the matrix C.
-    const_mat_view_type
-    const_top_block (const const_mat_view_type& C,
-                     const bool contiguous_cache_blocks) const
-    {
-      typedef CacheBlocker<LocalOrdinal, Scalar> blocker_type;
-      blocker_type blocker (C.extent(0), C.extent(1), strategy_);
-
-      // C_top_block is a view of the topmost cache block of C.
-      // C_top_block should have >= ncols rows, otherwise either cache
-      // blocking is broken or the input matrix C itself had fewer
-      // rows than columns.
-      const_mat_view_type C_top = blocker.top_block (C, contiguous_cache_blocks);
-      return C_top;
-    }
-  };
-} // namespace TSQR
-
-#endif // __TSQR_KokkosNodeTsqr_hpp
diff --git a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp
deleted file mode 100644
index ab3f0411d22d..000000000000
--- a/packages/tpetra/tsqr/src/Tsqr_KokkosNodeTsqrTest.hpp
+++ /dev/null
@@ -1,511 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_Test_KokkosNodeTsqrTest_hpp
-#define __TSQR_Test_KokkosNodeTsqrTest_hpp
-
-#include "Tsqr_nodeTestProblem.hpp"
-#include "Tsqr_verifyTimerConcept.hpp"
-#include "Tsqr_Random_NormalGenerator.hpp"
-#include "Tsqr_LocalVerify.hpp"
-#include "Tsqr_Matrix.hpp"
-#include "Tsqr_KokkosNodeTsqr.hpp"
-#include "Teuchos_ScalarTraits.hpp"
-#include "Teuchos_Time.hpp"
-#include "Teuchos_TypeNameTraits.hpp"
-#include <algorithm>
-#include <iostream>
-#include <limits>
-#include <stdexcept>
-
-namespace TSQR {
-  namespace Test {
-    /// \fn verifyKokkosNodeTsqr
-    /// \brief Test accuracy of KokkosNodeTsqr's QR factorization.
-    ///
-    /// Test the accuracy of KokkosNodeTsqr's QR factorization on a
-    /// numRows by numCols matrix, and print results to stdout.
-    ///
-    /// \param gen [in/out] Pseudorandom number generator for the
-    ///   normal(0,1) distribution.
-    /// \param numRows [in] Number of rows in the test matrix.
-    /// \param numCols [in] Number of columns in the test matrix.
-    /// \param numPartitions [in] Number of parallel partitions (must
-    ///   be a positive integer).
-    /// \param cacheSizeHint [in] Cache size hint, in bytes.  Zero
-    ///   means pick a reasonable default.
-    /// \param contiguousCacheBlocks [in] Whether cache blocks in the
-    ///   matrix to factor should be stored contiguously.
-    /// \param printFieldNames [in] If humanReadable is true, this is
-    ///   ignored; otherwise, whether to print a line of field names
-    ///   before the line of output.
-    /// \param humanReadable [in] Whether to print output that is easy
-    ///   for humans to read, or instead to print output that is easy
-    ///   for a script to parse.
-    /// \param debug [in] Whether to print extra debugging output to
-    ///   stderr.
-    template<class Ordinal, class Scalar>
-    void
-    verifyKokkosNodeTsqr (TSQR::Random::NormalGenerator<Ordinal, Scalar>& gen,
-                          const Ordinal numRows,
-                          const Ordinal numCols,
-                          const int numPartitions,
-                          const size_t cacheSizeHint,
-                          const bool contiguousCacheBlocks,
-                          const bool printFieldNames,
-                          const bool humanReadable,
-                          const bool debug)
-    {
-      using Teuchos::ParameterList;
-      using Teuchos::parameterList;
-      using Teuchos::RCP;
-      using Teuchos::TypeNameTraits;
-      using std::cerr;
-      using std::cout;
-      using std::endl;
-      using node_tsqr_type = TSQR::KokkosNodeTsqr<Ordinal, Scalar>;
-      typedef typename node_tsqr_type::FactorOutput factor_output_type;
-      typedef Teuchos::ScalarTraits<Scalar> STS;
-      typedef typename STS::magnitudeType magnitude_type;
-      // typedef Teuchos::Time timer_type;
-      typedef Matrix<Ordinal, Scalar> matrix_type;
-      typedef MatView<Ordinal, Scalar> mat_view_type;
-
-      const std::string scalarTypeName = TypeNameTraits<Scalar>::name();
-
-      // Set up TSQR implementation.
-      RCP<ParameterList> params = parameterList ("Intranode TSQR");
-      params->set ("Cache Size Hint", cacheSizeHint);
-      params->set ("Num Tasks", numPartitions);
-      node_tsqr_type actor (params);
-      if (debug) {
-        cerr << actor.description() << endl;
-        if (contiguousCacheBlocks) {
-          cerr << "-- Test with contiguous cache blocks" << endl;
-        }
-      }
-
-      // Allocate space for test problem.
-      matrix_type A (numRows, numCols);
-      matrix_type A_copy (numRows, numCols);
-      matrix_type Q (numRows, numCols);
-      matrix_type R (numCols, numCols);
-      if (std::numeric_limits<Scalar>::has_quiet_NaN) {
-        deep_copy (A, std::numeric_limits<Scalar>::quiet_NaN());
-        deep_copy (A_copy, std::numeric_limits<Scalar>::quiet_NaN());
-        deep_copy (Q, std::numeric_limits<Scalar>::quiet_NaN());
-        deep_copy (R, std::numeric_limits<Scalar>::quiet_NaN());
-      }
-      else {
-        deep_copy (A, Scalar {});
-        deep_copy (A_copy, Scalar {});
-        deep_copy (Q, Scalar {});
-        deep_copy (R, Scalar {});
-      }
-      const Ordinal lda = numRows;
-      const Ordinal ldq = numRows;
-      const Ordinal ldr = numCols;
-
-      // Create a test problem
-      nodeTestProblem (gen, numRows, numCols, A.data(), A.stride(1), true);
-
-      if (debug) {
-        cerr << "-- Generated test problem" << endl;
-        // Don't print the matrix if it's too big.
-        if (A.extent(0) <= 30) {
-          cerr << "A = " << endl;
-          print_local_matrix (cerr, A.extent(0), A.extent(1),
-                              A.data(), A.stride(1));
-          cerr << endl << endl;
-        }
-      }
-
-      // Copy A into A_copy, since TSQR overwrites the input.  If
-      // specified, rearrange the data in A_copy so that the data in
-      // each cache block is contiguously stored.
-      if (! contiguousCacheBlocks) {
-        deep_copy (A_copy, A);
-        if (debug) {
-          cerr << "-- Copied test problem from A into A_copy" << endl;
-          // Don't print the matrix if it's too big.
-          if (A_copy.extent(0) <= 30) {
-            cerr << "A_copy = " << endl;
-            print_local_matrix (cerr, A_copy.extent(0), A_copy.extent(1),
-                                A_copy.data(), A_copy.stride(1));
-            cerr << endl << endl;
-          }
-        }
-      }
-      else {
-        actor.cache_block (numRows, numCols, A_copy.data(), A.data(), A.stride(1));
-        if (debug) {
-          cerr << "-- Reorganized test matrix to have contiguous "
-            "cache blocks" << endl;
-          // Don't print the matrix if it's too big.
-          if (A_copy.extent(0) <= 30) {
-            cerr << "A_copy = " << endl;
-            print_local_matrix (cerr, A_copy.extent(0), A_copy.extent(1),
-                                A_copy.data(), A_copy.stride(1));
-            cerr << endl << endl;
-          }
-        }
-
-        // Verify cache blocking, when in debug mode.
-        if (debug) {
-          matrix_type A2 (numRows, numCols);
-          if (std::numeric_limits<Scalar>::has_quiet_NaN) {
-            deep_copy (A2, std::numeric_limits<Scalar>::quiet_NaN());
-          }
-
-          actor.un_cache_block (numRows, numCols, A2.data(), A2.stride(1), A_copy.data());
-          if (matrix_equal (A, A2)) {
-            if (debug)
-              cerr << "-- Cache blocking test succeeded!" << endl;
-          }
-          else {
-            if (debug) {
-              cerr << "*** Cache blocking test failed! A != A2 ***"
-                   << endl << endl;
-              // Don't print the matrices if they are too big.
-              if (A.extent(0) <= 30 && A2.extent(0) <= 30) {
-                cerr << "A = " << endl;
-                print_local_matrix (cerr, A.extent(0), A.extent(1),
-                                    A.data(), A.stride(1));
-                cerr << endl << "A2 = " << endl;
-                print_local_matrix (cerr, A2.extent(0), A2.extent(1),
-                                    A2.data(), A2.stride(1));
-                cerr << endl;
-              }
-            }
-            throw std::logic_error ("Cache blocking failed");
-          }
-        }
-      }
-
-      // Fill R with zeros, since the factorization may not
-      // necessarily overwrite the strict lower triangle of R.
-      if (debug) {
-        cerr << "-- Filling R with zeros" << endl;
-      }
-      deep_copy (R, Scalar {});
-
-      if (debug) {
-        cerr << "-- Calling factor()" << endl;
-      }
-
-      // Factor the matrix and compute the explicit Q factor
-      factor_output_type factor_output =
-        actor.factor (numRows, numCols, A_copy.data(), A_copy.stride(1),
-                      R.data(), R.stride(1), contiguousCacheBlocks);
-      if (debug) {
-        cerr << "-- Finished factor()" << endl;
-        cerr << "-- Calling explicit_Q()" << endl;
-      }
-
-      // KokkosNodeTsqr isn't designed to be used by itself, so we
-      // have to help it along by filling the top ncols x ncols
-      // entries with the first ncols columns of the identity matrix.
-      {
-        mat_view_type Q_top =
-          actor.top_block (Q.view (), contiguousCacheBlocks);
-        mat_view_type Q_top_square (Q_top.extent(1), Q_top.extent(1),
-                                    Q_top.data(), Q_top.stride(1));
-        deep_copy (Q_top_square, Scalar {});
-        for (Ordinal j = 0; j < Q_top_square.extent(1); ++j) {
-          Q_top_square(j,j) = Scalar (1.0);
-        }
-      }
-      actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.stride(1),
-                        factor_output, numCols, Q.data(), Q.stride(1),
-                        contiguousCacheBlocks);
-      if (debug) {
-        cerr << "-- Finished explicit_Q()" << endl;
-      }
-
-      // "Un"-cache-block the output Q (the explicit Q factor), if
-      // contiguous cache blocks were used.  This is only necessary
-      // because local_verify() doesn't currently support contiguous
-      // cache blocks.
-      if (contiguousCacheBlocks) {
-        // Use A_copy as temporary storage for un-cache-blocking Q.
-        actor.un_cache_block (numRows, numCols, A_copy.data(),
-                              A_copy.stride(1), Q.data());
-        deep_copy (Q, A_copy);
-        if (debug) {
-          cerr << "-- Un-cache-blocked output Q factor" << endl;
-        }
-      }
-
-      // Print out the Q and R factors in debug mode.
-      if (debug) {
-        // Don't print the matrix if it's too big.
-        if (Q.extent(0) <= 30) {
-          cerr << endl << "-- Q factor:" << endl;
-          print_local_matrix (cerr, Q.extent(0), Q.extent(1),
-                              Q.data(), Q.stride(1));
-          cerr << endl << endl;
-        }
-        cerr << endl << "-- R factor:" << endl;
-        print_local_matrix (cerr, numCols, numCols, R.data(), R.stride(1));
-        cerr << endl;
-      }
-
-      // Validate the factorization
-      std::vector<magnitude_type> results =
-        local_verify (numRows, numCols, A.data(), lda,
-                      Q.data(), ldq, R.data(), ldr);
-      if (debug)
-        cerr << "-- Finished local_verify" << endl;
-
-      // Print the results
-      if (humanReadable) {
-        cout << "KokkosNodeTsqr:" << endl
-             << "Scalar type: " << scalarTypeName << endl
-             << "# rows: " << numRows << endl
-             << "# columns: " << numCols << endl
-             << "# partitions: " << numPartitions << endl
-             << "cache size hint (revised) in bytes: " << actor.cache_size_hint() << endl
-             << "contiguous cache blocks? " << contiguousCacheBlocks << endl
-             << "Absolute residual $\\|A - Q*R\\|_2$: "
-             << results[0] << endl
-             << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: "
-             << results[1] << endl
-             << "Test matrix norm $\\| A \\|_F$: "
-             << results[2] << endl
-             << endl;
-      }
-      else {
-        if (printFieldNames) {
-          const char prefix[] = "%";
-          cout << prefix
-               << "method"
-               << ",scalarType"
-               << ",numRows"
-               << ",numCols"
-               << ",numPartitions"
-               << ",cacheSizeHint"
-               << ",contiguousCacheBlocks"
-               << ",absFrobResid"
-               << ",absFrobOrthog"
-               << ",frobA"
-               << endl;
-        }
-        cout << "KokkosNodeTsqr"
-             << "," << scalarTypeName
-             << "," << numRows
-             << "," << numCols
-             << "," << numPartitions
-             << "," << actor.cache_size_hint()
-             << "," << contiguousCacheBlocks
-             << "," << results[0]
-             << "," << results[1]
-             << "," << results[2]
-             << endl;
-      }
-    }
-
-    /// \fn benchmarkKokkosNodeTsqr
-    /// \brief Test performance of KokkosNodeTsqr's QR factorization.
-    ///
-    /// Compare the performance of KokkosNodeTsqr's QR factorization
-    /// to that of LAPACK's QR factorization.  Print results to
-    /// stdout.
-    ///
-    /// \param numTrials [in] Number of times to run the benchmark;
-    ///   the timing result is cumulative over all trials.  Timing
-    ///   over larger numbers of trials improves certainty of the
-    ///   result.
-    /// \param numRows [in] Number of rows in the test matrix.
-    /// \param numCols [in] Number of columns in the test matrix.
-    /// \param numPartitions [in] Number of parallel partitions (must
-    ///   be a positive integer).
-    /// \param cacheSizeHint [in] Cache size hint, in bytes.  Zero
-    ///   means pick a reasonable default.
-    /// \param contiguousCacheBlocks [in] Whether cache blocks in the
-    ///   matrix to factor should be stored contiguously.
-    /// \param printFieldNames [in] If humanReadable is true, this is
-    ///   ignored; otherwise, whether to print a line of field names
-    ///   before the line of output.
-    /// \param humanReadable [in] Whether to print output that is easy
-    ///   for humans to read, or instead to print output that is easy
-    ///   for a script to parse.
-    template<class Ordinal, class Scalar>
-    void
-    benchmarkKokkosNodeTsqr (const int numTrials,
-                             const Ordinal numRows,
-                             const Ordinal numCols,
-                             const int numPartitions,
-                             const size_t cacheSizeHint,
-                             const bool contiguousCacheBlocks,
-                             const bool printFieldNames,
-                             const bool humanReadable)
-    {
-      using Teuchos::ParameterList;
-      using Teuchos::parameterList;
-      using Teuchos::RCP;
-      using Teuchos::TypeNameTraits;
-      using std::cerr;
-      using std::cout;
-      using std::endl;
-      using node_tsqr_type = TSQR::KokkosNodeTsqr<Ordinal, Scalar>;
-      typedef typename node_tsqr_type::FactorOutput factor_output_type;
-      typedef Teuchos::Time timer_type;
-      typedef Matrix<Ordinal, Scalar> matrix_type;
-
-      const std::string scalarTypeName = TypeNameTraits<Scalar>::name();
-
-      // Pseudorandom normal(0,1) generator.  Default seed is OK,
-      // because this is a benchmark, not an accuracy test.
-      TSQR::Random::NormalGenerator<Ordinal, Scalar> gen;
-
-      // Set up TSQR implementation.
-      RCP<ParameterList> params = parameterList ("Intranode TSQR");
-      params->set ("Cache Size Hint", cacheSizeHint);
-      params->set ("Num Tasks", numPartitions);
-      node_tsqr_type actor (params);
-
-      // Allocate space for test problem.
-      matrix_type A (numRows, numCols);
-      matrix_type A_copy (numRows, numCols);
-      matrix_type Q (numRows, numCols);
-      matrix_type R (numCols, numCols);
-
-      // Fill R with zeros, since the factorization may not overwrite
-      // the strict lower triangle of R.
-      deep_copy (R, Scalar {});
-
-      // Create a test problem
-      nodeTestProblem (gen, numRows, numCols, A.data(), A.stride(1), false);
-
-      // Copy A into A_copy, since TSQR overwrites the input.  If
-      // specified, rearrange the data in A_copy so that the data in
-      // each cache block is contiguously stored.
-      if (contiguousCacheBlocks) {
-        actor.cache_block (numRows, numCols, A_copy.data(), A.data(), A.stride(1));
-      } else {
-        deep_copy (A_copy, A);
-      }
-
-      // Do a few timing runs and throw away the results, just to warm
-      // up any libraries that do autotuning.
-      const int numWarmupRuns = 5;
-      for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) {
-        // Factor the matrix in-place in A_copy, and extract the
-        // resulting R factor into R.
-        factor_output_type factor_output =
-          actor.factor (numRows, numCols, A_copy.data(), A_copy.stride(1),
-                        R.data(), R.stride(1), contiguousCacheBlocks);
-        // Compute the explicit Q factor (which was stored
-        // implicitly in A_copy and factor_output) and store in Q.
-        // We don't need to un-cache-block the output, because we
-        // aren't verifying it here.
-        actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.stride(1),
-                          factor_output, numCols, Q.data(), Q.stride(1),
-                          contiguousCacheBlocks);
-      }
-
-      // Benchmark intranode TSQR for numTrials trials.
-      //
-      // Name of timer doesn't matter here; we only need the timing.
-      timer_type timer("KokkosNodeTsqr");
-      timer.start();
-      for (int trialNum = 0; trialNum < numTrials; ++trialNum) {
-        // Factor the matrix in-place in A_copy, and extract the
-        // resulting R factor into R.
-        factor_output_type factor_output =
-          actor.factor (numRows, numCols, A_copy.data(), A_copy.stride(1),
-                        R.data(), R.stride(1), contiguousCacheBlocks);
-        // Compute the explicit Q factor (which was stored
-        // implicitly in A_copy and factor_output) and store in Q.
-        // We don't need to un-cache-block the output, because we
-        // aren't verifying it here.
-        actor.explicit_Q (numRows, numCols, A_copy.data(), A_copy.stride(1),
-                          factor_output, numCols, Q.data(), Q.stride(1),
-                          contiguousCacheBlocks);
-      }
-      const double timing = timer.stop();
-
-      // Print the results
-      if (humanReadable) {
-        cout << "KokkosNodeTsqr cumulative timings:" << endl
-             << "Scalar type: " << scalarTypeName << endl
-             << "# rows = " << numRows << endl
-             << "# columns = " << numCols << endl
-             << "# partitions: " << numPartitions << endl
-             << "Cache size hint (in bytes) = " << actor.cache_size_hint() << endl
-             << "Contiguous cache blocks? " << contiguousCacheBlocks << endl
-             << "# trials = " << numTrials << endl
-             << "Total time (s) = " << timing << endl;
-      }
-      else {
-        if (printFieldNames) {
-          const char prefix[] = "%";
-          cout << prefix
-               << "method"
-               << ",scalarType"
-               << ",numRows"
-               << ",numCols"
-               << ",numPartitions"
-               << ",cacheSizeHint"
-               << ",contiguousCacheBlocks"
-               << ",numTrials"
-               << ",timing"
-               << endl;
-        }
-
-        // We don't include {min,max}_seq_apply_timing() here, because
-        // those times don't benefit from the accuracy of benchmarking
-        // for numTrials > 1.  Thus, it's misleading to include them
-        // with tbb_tsqr_timing, the total time over numTrials trials.
-        cout << "KokkosNodeTsqr"
-             << "," << scalarTypeName
-             << "," << numRows
-             << "," << numCols
-             << "," << numPartitions
-             << "," << actor.cache_size_hint()
-             << "," << contiguousCacheBlocks
-             << "," << numTrials
-             << "," << timing
-             << endl;
-      }
-    }
-  } // namespace Test
-} // namespace TSQR
-
-#endif // __TSQR_Test_KokkosNodeTsqrTest_hpp
diff --git a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp
index 2b3b8ddecd5d..46423863d970 100644
--- a/packages/tpetra/tsqr/src/Tsqr_MatView.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_MatView.hpp
@@ -37,15 +37,10 @@
 // ************************************************************************
 //@HEADER
 
-#ifndef __TSQR_Tsqr_MatView_hpp
-#define __TSQR_Tsqr_MatView_hpp
+#ifndef TSQR_MATVIEW_HPP
+#define TSQR_MATVIEW_HPP
 
-// Define for bounds checking and other safety features, undefine for speed.
-// #define TSQR_MATVIEW_DEBUG 1
-
-#ifdef TSQR_MATVIEW_DEBUG
-#  include <limits>
-#endif // TSQR_MATVIEW_DEBUG
+#include "Teuchos_TestForException.hpp"
 #include <sstream>
 #include <stdexcept>
 #include <type_traits>
@@ -91,50 +86,6 @@ namespace TSQR {
     return true;
   }
 
-#ifdef TSQR_MATVIEW_DEBUG
-  template<class Ordinal, class Scalar>
-  class MatViewVerify {
-  public:
-    static void
-    verify (const Ordinal num_rows,
-            const Ordinal num_cols,
-            const Scalar* const A,
-            const Ordinal leading_dim)
-    {
-      using std::endl;
-
-      bool good = true;
-      std::ostringstream os;
-      if (! std::numeric_limits<Ordinal>::is_integer) {
-        good = false;
-        os << "Error: Ordinal type must be an integer.";
-      }
-      if (std::numeric_limits<Ordinal>::is_signed) {
-        if (num_rows < 0) {
-          good = false;
-          os << "Error: num_rows (= " << num_rows << ") < 0.";
-        }
-        if (num_cols < 0) {
-          good = false;
-          os << "Error: num_cols (= " << num_cols << ") < 0.";
-        }
-        if (leading_dim < 0) {
-          good = false;
-          os << "Error: leading_dim (= " << leading_dim << ") < 0.";
-        }
-      }
-      if (leading_dim < num_rows) {
-        good = false;
-        os << "Error: leading_dim (= " << leading_dim << ") < num_rows (= "
-           << num_rows << ").";
-      }
-      if (! good) {
-        throw std::invalid_argument (os.str ());
-      }
-    }
-  };
-#endif // TSQR_MATVIEW_DEBUG
-
   // Forward declaration
   template<class Ordinal, class Scalar>
   class Matrix;
@@ -163,12 +114,7 @@ namespace TSQR {
       ncols_(num_cols),
       lda_(leading_dim),
       A_(A)
-    {
-#ifdef TSQR_MATVIEW_DEBUG
-      MatViewVerify<ordinal_type, non_const_value_type>::
-        verify (num_rows, num_cols, A, leading_dim);
-#endif // TSQR_MATVIEW_DEBUG
-    }
+    {}
 
     MatView (const MatView& view) = default;
     MatView& operator= (const MatView& view) = default;
@@ -197,155 +143,11 @@ namespace TSQR {
     operator() (const ordinal_type i,
                 const ordinal_type j) const
     {
-#ifdef TSQR_MATVIEW_DEBUG
-      if (std::numeric_limits<ordinal_type>::is_signed) {
-        if (i < 0 || i >= extent(0)) {
-          throw std::invalid_argument("Row range invalid");
-        }
-        else if (j < 0 || j >= extent(1)) {
-          throw std::invalid_argument("Column range invalid");
-        }
-      }
-      else {
-        if (i >= extent(0)) {
-          throw std::invalid_argument("Row range invalid");
-        }
-        else if (j >= extent(1)) {
-          throw std::invalid_argument("Column range invalid");
-        }
-      }
-      if (A_ == nullptr) {
-        throw std::logic_error("Attempt to reference NULL data");
-      }
-#endif // TSQR_MATVIEW_DEBUG
       return A_[i + j * this->stride(1)];
     }
 
     pointer data() const { return A_; }
 
-    bool empty() const { return extent(0) == 0 || extent(1) == 0; }
-
-    /// Return a "row block" (submatrix of consecutive rows in the
-    /// inclusive range [firstRow,lastRow]).
-    MatView row_block (const ordinal_type firstRow,
-                       const ordinal_type lastRow)
-    {
-#ifdef TSQR_MATVIEW_DEBUG
-      if (std::numeric_limits<ordinal_type>::is_signed) {
-        if (firstRow < 0 || firstRow > lastRow || lastRow >= extent(0)) {
-          throw std::invalid_argument ("Row range invalid");
-        }
-      }
-      else {
-        if (firstRow > lastRow || lastRow >= extent(0)) {
-          throw std::invalid_argument ("Row range invalid");
-        }
-      }
-#endif // TSQR_MATVIEW_DEBUG
-      return MatView (lastRow - firstRow + 1, extent(1), data() + firstRow, stride(1));
-    }
-
-    /// Split off and return the top cache block of nrows_top rows.
-    /// Modify *this to be the "rest" of the matrix.
-    ///
-    /// \note Only use this method to split off a single cache block.
-    ///   It breaks if you try to use it otherwise.
-    ///
-    /// \param nrows_top [in] Number of rows in the top block (which
-    ///   this method returns)
-    ///
-    /// \param b_contiguous_blocks [in] Whether or not the entries of
-    ///   the top block are stored contiguously in *this.  The default
-    ///   is no (false).
-    ///
-    /// \return The top block of nrows_top rows.  Data is a shallow
-    ///   copy of the data in *this.
-    MatView
-    split_top (const ordinal_type nrows_top,
-               const bool b_contiguous_blocks = false)
-    {
-#ifdef TSQR_MATVIEW_DEBUG
-      if (std::numeric_limits<ordinal_type>::is_signed && nrows_top < 0) {
-        std::ostringstream os;
-        os << "nrows_top (= " << nrows_top << ") < 0";
-        throw std::invalid_argument (os.str());
-      }
-      else if (nrows_top > extent(0)) {
-        std::ostringstream os;
-        os << "nrows_top (= " << nrows_top << ") > nrows (= " << extent(0) << ")";
-        throw std::invalid_argument (os.str());
-      }
-#endif // TSQR_MATVIEW_DEBUG
-
-      pointer const A_top_ptr = data();
-      pointer A_rest_ptr;
-      const ordinal_type nrows_rest = extent(0) - nrows_top;
-      ordinal_type lda_top, lda_rest;
-      if (b_contiguous_blocks) {
-        lda_top = nrows_top;
-        lda_rest = nrows_rest;
-        A_rest_ptr = A_top_ptr + nrows_top * extent(1);
-      }
-      else {
-        lda_top = stride(1);
-        lda_rest = stride(1);
-        A_rest_ptr = A_top_ptr + nrows_top;
-      }
-      MatView A_top (nrows_top, extent(1), data(), lda_top);
-      A_ = A_rest_ptr;
-      nrows_ = nrows_rest;
-      lda_ = lda_rest;
-
-      return A_top;
-    }
-
-    /// Split off and return the bottom block.  Modify *this to be the
-    /// "rest" of the matrix.
-    MatView
-    split_bottom (const ordinal_type nrows_bottom,
-                  const bool b_contiguous_blocks = false)
-    {
-#ifdef TSQR_MATVIEW_DEBUG
-      if (std::numeric_limits<ordinal_type>::is_signed && nrows_bottom < 0) {
-        throw std::invalid_argument ("nrows_bottom < 0");
-      }
-      if (nrows_bottom > extent(0)) {
-        throw std::invalid_argument ("nrows_bottom > nrows");
-      }
-#endif // TSQR_MATVIEW_DEBUG
-
-      pointer const A_rest_ptr = data();
-      pointer A_bottom_ptr;
-      const ordinal_type nrows_rest = extent(0) - nrows_bottom;
-      ordinal_type lda_bottom, lda_rest;
-      if (b_contiguous_blocks) {
-        lda_bottom = nrows_bottom;
-        lda_rest = extent(0) - nrows_bottom;
-        A_bottom_ptr = A_rest_ptr + nrows_rest * extent(1);
-      }
-      else {
-        lda_bottom = stride(1);
-        lda_rest = stride(1);
-        A_bottom_ptr = A_rest_ptr + nrows_rest;
-      }
-      MatView A_bottom (nrows_bottom, extent(1), A_bottom_ptr, lda_bottom);
-      A_ = A_rest_ptr;
-      nrows_ = nrows_rest;
-      lda_ = lda_rest;
-
-      return A_bottom;
-    }
-
-    bool operator== (const MatView& rhs) const {
-      return extent(0) == rhs.extent(0) && extent(1) == rhs.extent(1) &&
-        stride(1) == rhs.stride(1) && data() == rhs.data();
-    }
-
-    bool operator!= (const MatView& rhs) const {
-      return extent(0) != rhs.extent(0) || extent(1) != rhs.extent(1) ||
-        stride(1) != rhs.stride(1) || data() != rhs.data();
-    }
-
   private:
     ordinal_type nrows_ = 0;
     ordinal_type ncols_ = 0;
@@ -378,22 +180,25 @@ namespace TSQR {
   {
     const ptrdiff_t tgt_nrows (tgt.extent (0));
     const ptrdiff_t tgt_ncols (tgt.extent (1));
-    if (tgt_nrows != ptrdiff_t (src.extent (0)) ||
-        tgt_ncols != ptrdiff_t (src.extent (1))) {
-      std::ostringstream os;
-      os << "TSQR::deep_copy: dimensions of tgt (output matrix) and "
-        "src (input matrix) are not compatible.  tgt is "
-         << tgt.extent (0) << " x " << tgt.extent (1) << ", but src "
-        "is " << src.extent (0) << " x " << src.extent (1) << ".";
-      throw std::invalid_argument (os.str ());
-    }
-    for (ptrdiff_t j = 0; j < tgt_ncols; ++j) {
-      auto* const tgt_j = &tgt(0,j);
-      const auto* const src_j = &src(0,j);
-      for (ptrdiff_t i = 0; i < tgt_nrows; ++i) {
-        tgt_j[i] = src_j[i];
+
+    if (tgt_nrows == ptrdiff_t (src.extent (0)) &&
+        tgt_ncols == ptrdiff_t (src.extent (1))) { // BOTH extents must match
+      for (ptrdiff_t j = 0; j < tgt_ncols; ++j) {
+        auto* const tgt_j = &tgt(0,j);
+        const auto* const src_j = &src(0,j);
+        for (ptrdiff_t i = 0; i < tgt_nrows; ++i) {
+          tgt_j[i] = src_j[i];
+        }
       }
     }
+    else {
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (true, std::invalid_argument, "TSQR::deep_copy: dimensions "
+         "of tgt (output matrix) and src (input matrix) are not "
+         "compatible.  tgt is " << tgt.extent (0) << " x " <<
+         tgt.extent (1) << ", but src is " << src.extent (0) << " x "
+         << src.extent (1) << ".");
+    }
   }
 
   template<class MatViewType>
@@ -427,7 +232,130 @@ namespace TSQR {
     return {A_top, A_bot};
   }
 
-} // namespace TSQR
+  /// \brief Split A into a left block of ncols_left columns and a
+  ///   right block holding the remaining columns.
+  ///
+  /// \return {left, right}; both views keep A's column stride.
+  template<class MatViewType>
+  std::pair<MatViewType, MatViewType>
+  partition_1x2 (const MatViewType& A,
+                 const typename MatViewType::ordinal_type ncols_left)
+  {
+    using ordinal_type = typename MatViewType::ordinal_type;
+
+    const ordinal_type nrows = A.extent(0);
+    const ordinal_type ncols_right = A.extent(1) - ncols_left;
+    // Assumes column-major storage: the right block begins at column
+    // ncols_left, i.e. ncols_left column strides past A.data().
+    const auto right_offset = A.stride(1) * ncols_left;
+
+    MatViewType A_left (nrows, ncols_left, A.data(), A.stride(1));
+    MatViewType A_right (nrows, ncols_right, A.data() + right_offset, A.stride(1));
+    return {A_left, A_right};
+  }
+
+  /// \brief Peel the first nrows_top rows off the top of A.
+  ///
+  /// On return, A has been overwritten with a view of the rows that
+  /// remain below the block that was split off.
+  ///
+  /// \param A [in/out] On input: view of the whole matrix.  On
+  ///   output: view of the remaining rows (everything below the
+  ///   returned block).
+  ///
+  /// \param nrows_top [in] Row count of the block to split off.
+  ///
+  /// \param contiguousCacheBlocks [in] If true, each block's entries
+  ///   are taken to be stored contiguously, so each block's column
+  ///   stride equals its own row count.  Defaults to false.
+  ///
+  /// \return View of the top nrows_top rows of the input A.
+  template<class LO, class SC>
+  MatView<LO, SC>
+  split_top (MatView<LO, SC>& A,
+             const LO nrows_top,
+             const bool contiguousCacheBlocks = false)
+  {
+    using view_type = MatView<LO, SC>;
+    using pointer = typename view_type::pointer;
+
+    const LO numCols = A.extent(1);
+    const LO numRowsBelow = A.extent(0) - nrows_top;
+    pointer const topPtr = A.data();
+
+    // Column stride of each piece: with contiguous cache blocks it
+    // is the piece's own row count; otherwise both inherit A's.
+    const LO topStride = contiguousCacheBlocks ? nrows_top : A.stride(1);
+    const LO belowStride = contiguousCacheBlocks ? numRowsBelow : A.stride(1);
+    // Contiguous: the rest follows the whole nrows_top x numCols top
+    // block.  Otherwise: it starts nrows_top entries down column 0.
+    pointer const belowPtr =
+      topPtr + (contiguousCacheBlocks ? nrows_top * numCols : nrows_top);
+
+    view_type topBlock (nrows_top, numCols, topPtr, topStride);
+    A = view_type (numRowsBelow, numCols, belowPtr, belowStride);
+    return topBlock;
+  }
+
+  /// \brief Peel the last nrows_bottom rows off the bottom of A.
+  ///
+  /// \param A [in/out] On input: view of the whole matrix.  On
+  ///   output: view of the rows above the returned block.
+  /// \param nrows_bottom [in] Row count of the block to split off.
+  /// \param contiguousCacheBlocks [in] If true, each block's column
+  ///   stride is set to its own row count.  Defaults to false.
+  ///
+  /// \return View of the bottom nrows_bottom rows of the input A.
+  template<class LO, class SC>
+  MatView<LO, SC>
+  split_bottom (MatView<LO, SC>& A,
+                const LO nrows_bottom,
+                const bool contiguousCacheBlocks = false)
+  {
+    using view_type = MatView<LO, SC>;
+    using pointer = typename view_type::pointer;
+
+    const LO numCols = A.extent(1);
+    const LO numRowsAbove = A.extent(0) - nrows_bottom;
+    pointer const abovePtr = A.data();
+    const LO bottomStride = contiguousCacheBlocks ? nrows_bottom : A.stride(1);
+    const LO aboveStride = contiguousCacheBlocks ? numRowsAbove : A.stride(1);
+    pointer const bottomPtr =
+      abovePtr + (contiguousCacheBlocks ? numRowsAbove * numCols : numRowsAbove);
+
+    view_type bottomBlock (nrows_bottom, numCols, bottomPtr, bottomStride);
+    A = view_type (numRowsAbove, numCols, abovePtr, aboveStride);
+    return bottomBlock;
+  }
 
+  //! Whether the view A has zero rows or zero columns.
+  template<class LO, class SC>
+  bool empty (const MatView<LO, SC>& A)
+  { return ! (A.extent(0) != 0 && A.extent(1) != 0); }
+
+  /// \brief Copy the upper triangle of R_in into R_out.  If R_out has
+  ///   more columns than rows, both are split with partition_1x2.
+  template<class LO, class TargetScalar, class SourceScalar>
+  void
+  copy_upper_triangle (const MatView<LO, TargetScalar>& R_out,
+                       const MatView<LO, SourceScalar>& R_in)
+  {
+    const LO numRows = R_out.extent (0);
+    const LO numCols = R_out.extent (1);
+    if (numRows < numCols) {
+      auto outParts = partition_1x2 (R_out, numRows);
+      auto inParts = partition_1x2 (R_in, numRows);
+      copy_upper_triangle (outParts.first, inParts.first);
+      deep_copy (outParts.second, inParts.second);
+      return;
+    }
+    for (LO col = 0; col < numCols; ++col) {
+      for (LO row = 0; row <= col; ++row) {
+        R_out(row,col) = R_in(row,col);
+      }
+    }
+  }
+
+} // namespace TSQR
 
-#endif // __TSQR_Tsqr_MatView_hpp
+#endif // TSQR_MATVIEW_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp
index 2bb78584016e..24f7fac61afd 100644
--- a/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_Matrix.hpp
@@ -77,98 +77,12 @@ namespace TSQR {
     using mat_view_type = MatView<ordinal_type, non_const_value_type>;
     using const_mat_view_type = MatView<ordinal_type, const_value_type>;
 
-  private:
-    static bool
-    fits_in_size_t (const ordinal_type& ord)
-    {
-      const ordinal_type result = ordinal_type (size_t (ord));
-      return (ord == result);
-    }
-
-    /// Check whether num_rows*num_cols makes sense as an amount of
-    /// storage (for the num_rows by num_cols dense matrix).  Not
-    /// making sense includes negative values for either parameter (if
-    /// they are signed types), or overflow when computing their
-    /// product.  Throw an exception of the appropriate type for any
-    /// of these cases.  Otherwise, return num_rows*num_cols as a
-    /// size_t.
-    ///
-    /// \param num_rows [in] Number of rows in the matrix
-    /// \param num_cols [in] Number of columns in the matrix
-    /// \return num_rows*num_cols
-    size_t
-    verified_alloc_size (const ordinal_type num_rows,
-                         const ordinal_type num_cols) const
-    {
-      static_assert (std::numeric_limits<ordinal_type>::is_integer,
-                     "ordinal_type must be an integer type.");
-      // Quick exit also checks for zero num_cols (which prevents
-      // division by zero in the tests below).
-      if (num_rows == 0 || num_cols == 0) {
-        return size_t(0);
-      }
-
-      // If ordinal_type is signed, make sure that num_rows and num_cols
-      // are nonnegative.
-      if (std::numeric_limits<ordinal_type>::is_signed) {
-        if (num_rows < 0) {
-          std::ostringstream os;
-          os << "# rows (= " << num_rows << ") < 0";
-          throw std::logic_error (os.str());
-        }
-        else if (num_cols < 0) {
-          std::ostringstream os;
-          os << "# columns (= " << num_cols << ") < 0";
-          throw std::logic_error (os.str());
-        }
-      }
-
-      // If ordinal_type is bigger than a size_t, do special range
-      // checking.  The compiler warns (comparison of signed and
-      // unsigned) if ordinal_type is a signed type and we try to do
-      // "numeric_limits<size_t>::max() <
-      // std::numeric_limits<ordinal_type>::max()", so instead we cast each
-      // of num_rows and num_cols to size_t and back to ordinal_type again,
-      // and see if we get the same result.  If not, then we
-      // definitely can't return a size_t product of num_rows and
-      // num_cols.
-      if (! fits_in_size_t (num_rows)) {
-        std::ostringstream os;
-        os << "# rows (= " << num_rows << ") > max size_t value (= "
-           << std::numeric_limits<size_t>::max() << ")";
-        throw std::range_error (os.str());
-      }
-      else if (! fits_in_size_t (num_cols)) {
-        std::ostringstream os;
-        os << "# columns (= " << num_cols << ") > max size_t value (= "
-           << std::numeric_limits<size_t>::max() << ")";
-        throw std::range_error (os.str());
-      }
-
-      // Both num_rows and num_cols fit in a size_t, and are
-      // nonnegative.  Now check whether their product also fits in a
-      // size_t.
-      //
-      // Note: This may throw a SIGFPE (floating-point exception) if
-      // num_cols is zero.  Be sure to check first (above).
-      if (size_t (num_rows) >
-          std::numeric_limits<size_t>::max() / size_t (num_cols)) {
-        std::ostringstream os;
-        os << "num_rows (= " << num_rows << ") * num_cols (= "
-           << num_cols << ") > max size_t value (= "
-           << std::numeric_limits<size_t>::max() << ")";
-        throw std::range_error (os.str());
-      }
-      return size_t (num_rows) * size_t (num_cols);
-    }
-
-  public:
     //! Constructor with dimensions.
     Matrix (const ordinal_type num_rows,
             const ordinal_type num_cols) :
       nrows_ (num_rows),
       ncols_ (num_cols),
-      A_ (verified_alloc_size (num_rows, num_cols))
+      A_ (size_t (num_rows) * size_t (num_cols))
     {}
 
     //! Constructor with dimensions and fill datum.
@@ -177,7 +91,7 @@ namespace TSQR {
             const non_const_value_type& value) :
       nrows_ (num_rows),
       ncols_ (num_cols),
-      A_ (verified_alloc_size (num_rows, num_cols), value)
+      A_ (size_t (num_rows) * size_t (num_cols), value)
     {}
 
     /// \brief Copy constructor.
@@ -188,21 +102,17 @@ namespace TSQR {
     Matrix (const Matrix& in) :
       nrows_ (in.extent(0)),
       ncols_ (in.extent(1)),
-      A_ (verified_alloc_size (in.extent(0), in.extent(1)))
+      A_ (size_t (in.extent(0)) * size_t (in.extent(1)))
     {
-      if (! in.empty()) {
-        MatView<ordinal_type, non_const_value_type> this_view
-          (extent(0), extent(1), data(), stride(1));
-        MatView<ordinal_type, const_value_type> in_view
-          (in.extent(0), in.extent(1), in.data(), in.stride(1));
-        deep_copy (this_view, in_view);
-      }
+      MatView<ordinal_type, const_value_type> in_view
+        (in.extent(0), in.extent(1), in.data(), in.stride(1));
+      deep_copy (*this, in_view);
     }
 
     //! Default constructor (constructs an empty matrix).
     Matrix () = default;
 
-    /// \brief "Copy constructor" from a matrix view type.
+    /// \brief "Copy constructor" from a Matrix or MatrixView.
     ///
     /// This constructor allocates a new matrix and copies the
     /// elements of the input view into the resulting new matrix.
@@ -212,7 +122,7 @@ namespace TSQR {
     Matrix (const MatrixViewType& in) :
       nrows_ (in.extent(0)),
       ncols_ (in.extent(1)),
-      A_ (verified_alloc_size (in.extent(0), in.extent(1)))
+      A_ (size_t (in.extent(0)) * size_t (in.extent(1)))
     {
       if (A_.size() != 0) {
         MatView<ordinal_type, non_const_value_type> this_view
@@ -246,18 +156,6 @@ namespace TSQR {
       return A_[i];
     }
 
-    //! Equality: ONLY compares dimensions and pointers (shallow).
-    template<class MatrixViewType>
-    bool operator== (const MatrixViewType& B) const
-    {
-      if (data() != B.data() || extent(0) != B.extent(0) ||
-          extent(1) != B.extent(1) || stride(1) != B.stride(1)) {
-        return false;
-      } else {
-        return true;
-      }
-    }
-
     constexpr ordinal_type extent (const int r) const noexcept {
       return r == 0 ? nrows_ : (r == 1 ? ncols_ : ordinal_type(0));
     }
@@ -266,9 +164,6 @@ namespace TSQR {
       return r == 0 ? ordinal_type(1) : (r == 1 ? nrows_ : ordinal_type(0));
     }
 
-    //! Whether the matrix is empty (has either zero rows or zero columns).
-    bool empty() const { return extent(0) == 0 || extent(1) == 0; }
-
     //! A non-const pointer to the matrix data.
     pointer data()
     {
@@ -308,7 +203,7 @@ namespace TSQR {
       if (num_rows == extent(0) && num_cols == extent(1))
         return; // no need to reallocate or do anything else
 
-      const size_t alloc_size = verified_alloc_size (num_rows, num_cols);
+      const size_t alloc_size = size_t (num_rows) * size_t (num_cols);
       nrows_ = num_rows;
       ncols_ = num_cols;
       A_.resize (alloc_size);
@@ -327,6 +222,11 @@ namespace TSQR {
     std::vector<non_const_value_type> A_;
   };
 
+  //! Whether the matrix A has zero rows or zero columns.
+  template<class LO, class SC>
+  bool empty (const Matrix<LO, SC>& A)
+  { return ! (A.extent(0) != 0 && A.extent(1) != 0); }
+
   template<class LO, class SC, class SourceScalar>
   void
   deep_copy (Matrix<LO, SC>& tgt, const SourceScalar& src)
@@ -344,6 +244,23 @@ namespace TSQR {
     deep_copy (tgt.view(), src);
   }
 
+  //! Copy the upper triangle of R_in into the Matrix R_out.
+  template<class LO, class TargetScalar, class SourceScalar>
+  void copy_upper_triangle (Matrix<LO, TargetScalar>& R_out,
+                            const MatView<LO, SourceScalar>& R_in)
+  {
+    copy_upper_triangle (R_out.view (), R_in); // MatView overload does the work
+  }
+
+  //! Copy the upper triangle of the Matrix R_in into the Matrix R_out.
+  template<class LO, class TargetScalar, class SourceScalar>
+  void copy_upper_triangle (Matrix<LO, TargetScalar>& R_out,
+                            const Matrix<LO, SourceScalar>& R_in)
+  {
+    auto target = R_out.view ();
+    copy_upper_triangle (target, R_in.const_view ());
+  }
+
   template<class LO, class SC>
   std::pair<MatView<LO, SC>, MatView<LO, SC>>
   partition_2x1 (Matrix<LO, SC>& A,
diff --git a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp
index 6bd5406e13eb..e870193352ca 100644
--- a/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_MgsTest.hpp
@@ -42,9 +42,6 @@
 
 #include "Tsqr_ConfigDefs.hpp"
 #include "Tsqr_Mgs.hpp"
-#ifdef HAVE_KOKKOSTSQR_TBB
-#  include "TbbTsqr_TbbMgs.hpp"
-#endif // HAVE_KOKKOSTSQR_TBB
 #include "Tsqr_TestSetup.hpp"
 #include "Tsqr_GlobalVerify.hpp"
 #include "Tsqr_printGlobalMatrix.hpp"
@@ -68,13 +65,6 @@ namespace TSQR {
       if (which == "MpiSeqMGS") {
         return std::string ("MPI parallel / sequential MGS");
       }
-      else if (which == "MpiTbbMGS") {
-#ifdef HAVE_KOKKOSTSQR_TBB
-        return std::string ("MPI parallel / TBB parallel MGS");
-#else
-        throw std::logic_error("MGS not built with Intel TBB support");
-#endif // HAVE_KOKKOSTSQR_TBB
-      }
       else {
         throw std::logic_error("Unknown MGS implementation type \"" + which + "\"");
       }
@@ -184,16 +174,7 @@ namespace TSQR {
         }
       }
 
-      if (which == "MpiTbbMGS") {
-#ifdef HAVE_KOKKOSTSQR_TBB
-        typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type;
-        mgs_type mgser (scalarComm);
-        MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug);
-#else
-        throw std::logic_error("MGS not built with Intel TBB support");
-#endif // HAVE_KOKKOSTSQR_TBB
-      }
-      else if (which == "MpiSeqMGS") {
+      if (which == "MpiSeqMGS") {
         typedef MGS<Ordinal, Scalar> mgs_type;
         mgs_type mgser (scalarComm);
         MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug);
@@ -238,9 +219,6 @@ namespace TSQR {
                << "# rows = " << nrows_global << endl
                << "# columns = " << ncols << endl
                << "# MPI processes = " << nprocs << endl;
-          if (which == "MpiTbbTSQR") {
-            cout << "# cores per process = " << num_cores << endl;
-          }
           cout << "Absolute residual $\\|A - Q*R\\|_2: "
                << results[0] << endl
                << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: "
@@ -253,11 +231,8 @@ namespace TSQR {
           cout << which
                << "," << nrows_global
                << "," << ncols
-               << "," << nprocs;
-          if (which == "MpiTbbTSQR") {
-            cout << "," << num_cores << endl;
-          }
-          cout << "," << results[0]
+               << "," << nprocs
+               << "," << results[0]
                << "," << results[1]
                << "," << results[2]
                << endl;
@@ -384,17 +359,7 @@ namespace TSQR {
 
       // Set up MGS and run the benchmark.
       double mgs_timing; // Total run time in seconds of all ntrials trials
-      if (which == "MpiTbbMGS") {
-#ifdef HAVE_KOKKOSTSQR_TBB
-        typedef TSQR::TBB::TbbMgs<Ordinal, Scalar> mgs_type;
-        mgs_type mgser (scalarComm);
-        mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R,
-                                                              ntrials, human_readable);
-#else
-        throw std::logic_error("MGS not built with Intel TBB support");
-#endif // HAVE_KOKKOSTSQR_TBB
-      }
-      else if (which == "MpiSeqMGS") {
+      if (which == "MpiSeqMGS") {
         typedef MGS<Ordinal, Scalar> mgs_type;
         mgs_type mgser (scalarComm);
         mgs_timing = do_mgs_benchmark<mgs_type, TimerType> (mgser, Q_local, R,
@@ -428,11 +393,8 @@ namespace TSQR {
           cout << mgs_human_readable_name(which) << ":" << endl
                << "# rows = " << nrows_global << endl
                << "# columns = " << ncols << endl
-               << "# MPI processes = " << nprocs << endl;
-          if (which == "MpiTbbTSQR") {
-            cout << "# cores per process = " << num_cores << endl;
-          }
-          cout << "# trials = " << ntrials << endl
+               << "# MPI processes = " << nprocs << endl
+               << "# trials = " << ntrials << endl
                << "Min total time (s) over all MPI processes = "
                << min_mgs_timing << endl
                << "Max total time (s) over all MPI processes = "
@@ -443,11 +405,8 @@ namespace TSQR {
           cout << which
                << "," << nrows_global
                << "," << ncols
-               << "," << nprocs;
-          if (which == "MpiTbbTSQR") {
-            cout << "," << num_cores << endl;
-          }
-          cout << "," << ntrials
+               << "," << nprocs
+               << "," << ntrials
                << "," << min_mgs_timing
                << "," << max_mgs_timing
                << endl;
diff --git a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp
index 155081ca8d38..06c9c6ee1484 100644
--- a/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_NodeTsqr.hpp
@@ -48,11 +48,21 @@
 #include "Teuchos_as.hpp"
 #include "Teuchos_Describable.hpp"
 #include "Tsqr_Impl_Lapack.hpp"
+#include "Teuchos_ParameterList.hpp"
+#include "Teuchos_RCP.hpp"
 #include "Teuchos_ScalarTraits.hpp"
 #include "Teuchos_TypeNameTraits.hpp"
 #include <vector>
 
 namespace TSQR {
+  namespace Impl {
+    template<class Ordinal, class Scalar>
+    class NodeFactorOutput { // NodeTsqr::factor_output_type; factor() returns an RCP to it
+    public:
+      virtual ~NodeFactorOutput() = default; // virtual, so derived outputs can be destroyed via the base
+    };
+  } // namespace Impl
+
   /// \class NodeTsqr
   /// \brief Common interface and functionality for intranode TSQR.
   ///
@@ -63,37 +73,16 @@ namespace TSQR {
   /// \tparam Ordinal The (local) Ordinal type; the type of indices
   ///   into a matrix on a node
   /// \tparam Scalar Tthe type of elements stored in the matrix
-  /// \tparam FactorOutputType The type returned by factor().
-  ///
-  /// We template on FactorOutputType for compile-time polymorphism.
-  /// This lets subclasses define the \c factor() method, without
-  /// constraining them to inherit their particular FactorOutputType
-  /// from a common abstract base class.  FactorOutputType is meant to
-  /// be either just a simple composition of std::pair and
-  /// std::vector, or a simple struct.  Its contents are specific to
-  /// each intranode TSQR implementation.  and are not intended to be
-  /// polymorphic, so it would not make sense for all the different
-  /// FactorOutputType types to inherit from a common base class.
-  ///
-  /// Templating on FactorOutputType means that we can't use run-time
-  /// polymorphism to swap between NodeTsqr subclasses, since the
-  /// latter are really subclasses of different NodeTsqr
-  /// instantiations (i.e., different FactorOutputType types).
-  /// However, inheriting from different specializations of NodeTsqr
-  /// does enforce correct compile-time polymorphism in a syntactic
-  /// way.  It also avoids repeated code for common functionality.
-  /// Full run-time polymorphism of different NodeTsqr subclasses
-  /// would not be useful.  This is because ultimately each subclass
-  /// is bound to a Kokkos Node type, and those only use compile-time
-  /// polymorphism.
-  template<class Ordinal, class Scalar, class FactorOutputType>
+  template<class Ordinal, class Scalar>
   class NodeTsqr : public Teuchos::Describable {
   public:
-    typedef Ordinal ordinal_type;
-    typedef Scalar scalar_type;
-    typedef FactorOutputType factor_output_type;
-    typedef MatView<Ordinal, Scalar> mat_view_type;
-    typedef MatView<Ordinal, const Scalar> const_mat_view_type;
+    using ordinal_type = Ordinal;
+    using scalar_type = Scalar;
+    using magnitude_type =
+      typename Teuchos::ScalarTraits<Scalar>::magnitudeType;
+    using factor_output_type = Impl::NodeFactorOutput<Ordinal, Scalar>;
+    using mat_view_type = MatView<Ordinal, Scalar>;
+    using const_mat_view_type = MatView<Ordinal, const Scalar>;
 
     //! Constructor
     NodeTsqr() = default;
@@ -101,6 +90,17 @@ namespace TSQR {
     //! Virtual destructor, for memory safety of derived classes.
     virtual ~NodeTsqr() = default;
 
+    //! List of valid parameters for the NodeTsqr subclass.
+    virtual Teuchos::RCP<const Teuchos::ParameterList>
+    getValidParameters () const = 0;
+
+    //! Validate and read in parameters.
+    virtual void
+    setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& paramList) = 0;
+
+    //! Whether the subclass wants large arrays as GPU device memory.
+    virtual bool wants_device_memory () const { return false; }
+
     /// \brief Whether this object is ready to perform computations.
     ///
     /// Some NodeTsqr subclasses require additional initialization
@@ -162,7 +162,7 @@ namespace TSQR {
     ///
     /// \return Part of the implicit representation of the Q factor.
     ///   The other part is the A matrix on output.
-    virtual factor_output_type
+    virtual Teuchos::RCP<factor_output_type>
     factor (const Ordinal nrows,
             const Ordinal ncols,
             Scalar A[],
@@ -203,7 +203,7 @@ namespace TSQR {
            const Ordinal ncols_Q,
            const Scalar Q[],
            const Ordinal ldq,
-           const FactorOutputType& factorOutput,
+           const factor_output_type& factorOutput,
            const Ordinal ncols_C,
            Scalar C[],
            const Ordinal ldc,
@@ -248,6 +248,47 @@ namespace TSQR {
                 const Ordinal ldc,
                 const bool contiguousCacheBlocks) const = 0;
 
+    /// \brief Force the diagonal entries of the ncols by ncols R
+    ///   factor to be nonnegative, by negating each offending row
+    ///   of R and the matching column of the nrows by ncols Q
+    ///   (result of explicit_Q).  The product Q*R is unchanged.
+    virtual void
+    force_nonnegative_diagonal (const Ordinal nrows,
+                                const Ordinal ncols,
+                                Scalar Q[],
+                                const Ordinal ldq,
+                                Scalar R[],
+                                const Ordinal ldr) const
+    {
+      mat_view_type Q_view (nrows, ncols, Q, ldq);
+      mat_view_type R_view (ncols, ncols, R, ldr);
+
+      // Complex Scalar: do nothing, since _GEQR{2,F} for complex
+      // arithmetic already returns an R factor with nonnegative
+      // diagonal.  The code must compile for complex regardless.
+      using STS = Teuchos::ScalarTraits<Scalar>;
+      if (! STS::isComplex) {
+        using mag_type = typename STS::magnitudeType;
+        constexpr mag_type ZERO {};
+
+        for (Ordinal k = 0; k < ncols; ++k) {
+          if (STS::real (R_view(k,k)) < ZERO) {
+            // Negate column k of Q_view.
+            Scalar* const Q_k = &Q_view(0,k);
+            for (Ordinal i = 0; i < nrows; ++i) {
+              Q_k[i] = -Q_k[i];
+            }
+            // Negate row k of R_view.  R_view is upper triangular,
+            // so only entries at or right of the diagonal matter.
+            // j has type Ordinal, to match k and ncols.
+            for (Ordinal j = k; j < ncols; ++j) {
+              R_view(k,j) = -R_view(k,j);
+            }
+          }
+        }
+      }
+    }
+
     /// \brief Cache block A_in into A_out.
     ///
     /// \param nrows [in] Number of rows in A_in and A_out.
@@ -339,7 +380,9 @@ namespace TSQR {
     /// \endcode
     virtual const_mat_view_type
     const_top_block (const const_mat_view_type& C,
-                     const bool contiguousCacheBlocks) const = 0;
+                     const bool /* contiguousCacheBlocks */) const {
+      return C;
+    }
 
   public:
     /// \brief Return view of topmost cache block of C.
@@ -387,6 +430,49 @@ namespace TSQR {
                              C_top.stride(1));
     }
 
+    /// \brief Copy from "native" NodeTsqr device storage, to a packed
+    ///   host matrix.  This base version constructs a Matrix from C.
+    virtual Matrix<Ordinal, Scalar>
+    copy_to_host (const MatView<Ordinal, Scalar>& C) const
+    {
+      // FIXME (mfh 17 Dec 2019) Need to reimplement in
+      // CuSolverNodeTsqr, since C is device memory there.  The
+      // concerns noted for CuSolverNodeTsqr::extract_R, about
+      // Kokkos::deep_copy not wanting to copy from noncontiguous
+      // device memory to contiguous host memory, apply here too.
+      // NOTE(review): presumably Matrix(C) deep-copies C -- confirm.
+      return Matrix<Ordinal, Scalar> (C);
+    }
+
+    /// \brief Copy from a host matrix, to "native" NodeTsqr device
+    ///   storage.  This base version calls deep_copy(C_device, C_host).
+    /// \param C_device [out] Destination view.
+    /// \param C_host [in] Source view.
+    virtual void
+    copy_from_host (const MatView<Ordinal, Scalar>& C_device,
+                    const MatView<Ordinal, const Scalar>& C_host) const
+    {
+      // FIXME (mfh 17 Dec 2019) Need to reimplement in
+      // CuSolverNodeTsqr, since C_device is device memory there.  The
+      // concerns noted for CuSolverNodeTsqr::extract_R, about
+      // Kokkos::deep_copy not wanting to copy between noncontiguous
+      // device memory and contiguous host memory, apply here too.
+      deep_copy (C_device, C_host);
+    }
+
+    //! Overwrite C(j,j) with 1.0, for j = 0, ..., C.extent(1) - 1.
+    virtual void
+    set_diagonal_entries_to_one
+      (const MatView<Ordinal, Scalar>& C) const
+    {
+      // NOTE (mfh 17 Dec 2019) This version writes to C's entries
+      // directly.  Downstream classes for which C is device memory
+      // must reimplement it; see wants_device_memory.
+      const Scalar ONE (1.0);
+      for (Ordinal k = C.extent (1); k > 0; --k) {
+        C(k-1,k-1) = ONE;
+      }
+    }
+
     /// \brief Does factor() compute R with nonnegative diagonal?
     ///
     /// When using a QR factorization to orthogonalize a block of
@@ -454,9 +540,9 @@ namespace TSQR {
   };
 
 
-  template<class Ordinal, class Scalar, class FactorOutputType>
+  template<class Ordinal, class Scalar>
   Ordinal
-  NodeTsqr<Ordinal, Scalar, FactorOutputType>::
+  NodeTsqr<Ordinal, Scalar>::
   reveal_R_rank (const Ordinal ncols,
                  Scalar R[],
                  const Ordinal ldr,
@@ -467,7 +553,6 @@ namespace TSQR {
     using Teuchos::as;
     using Teuchos::TypeNameTraits;
     typedef Teuchos::ScalarTraits<Scalar> STS;
-    typedef typename STS::magnitudeType magnitude_type;
     typedef Teuchos::ScalarTraits<magnitude_type> STM;
 
     TEUCHOS_TEST_FOR_EXCEPTION(tol < 0, std::invalid_argument,
@@ -612,9 +697,9 @@ namespace TSQR {
     return rank;
   }
 
-  template<class Ordinal, class Scalar, class FactorOutputType>
+  template<class Ordinal, class Scalar>
   Ordinal
-  NodeTsqr<Ordinal, Scalar, FactorOutputType>::
+  NodeTsqr<Ordinal, Scalar>::
   reveal_rank (const Ordinal nrows,
                const Ordinal ncols,
                Scalar Q[],
diff --git a/packages/tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp b/packages/tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp
index 0d2d84b580a4..161c7e6cc377 100644
--- a/packages/tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_NodeTsqrFactory.hpp
@@ -37,108 +37,146 @@
 // ************************************************************************
 //@HEADER
 
-#ifndef __TSQR_NodeTsqrFactory_hpp
-#define __TSQR_NodeTsqrFactory_hpp
+/// \file Tsqr_NodeTsqrFactory.hpp
+/// \brief Declaration and definition of a factory for creating an
+///   instance of the right NodeTsqr subclass.
 
-#include "Tsqr_ConfigDefs.hpp"
-#include "Kokkos_DefaultNode.hpp"
+#ifndef TSQR_NODETSQRFACTORY_HPP
+#define TSQR_NODETSQRFACTORY_HPP
 
-#ifdef HAVE_KOKKOSTSQR_TBB
-#  include "TbbTsqr.hpp"
-#endif // HAVE_KOKKOSTSQR_TBB
-
-#include "Tsqr_KokkosNodeTsqr.hpp"
 #include "Tsqr_SequentialTsqr.hpp"
-
-#include "Teuchos_ParameterList.hpp"
-#include "Teuchos_ParameterListExceptions.hpp"
+#include "Tsqr_CombineNodeTsqr.hpp"
+#include "Tsqr_CuSolverNodeTsqr.hpp"
 #include "Teuchos_RCP.hpp"
-#include "Teuchos_ScalarTraits.hpp"
-#include "Teuchos_TypeNameTraits.hpp"
-
-#include <stdexcept>
-
+#include "Teuchos_TestForException.hpp"
+#ifdef HAVE_TPETRATSQR_COMPLEX
+#  include "Kokkos_Complex.hpp"
+#endif // HAVE_TPETRATSQR_COMPLEX
+#include <string>
+#include <vector>
 
 namespace TSQR {
-
   /// \class NodeTsqrFactory
-  /// \brief Factory for creating an instance of the right \c NodeTsqr subclass.
+  /// \brief Factory for creating an instance of the right NodeTsqr
+  ///   subclass.
   /// \author Mark Hoemmen
   ///
-  /// \tparam Node The Kokkos Node type
-  /// \tparam Scalar The type of entries in the matrices to factor
-  /// \tparam LocalOrdinal The type of local indices in the matrices to factor
+  /// \tparam Scalar The type of entries in the matrices to factor.
+  /// \tparam LocalOrdinal The type of local indices in the matrices
+  ///   to factor.
+  /// \tparam Device Kokkos::Device specialization used by the
+  ///   matrices to factor.
   ///
-  /// This class maps from a particular Kokkos \c Node type, to the
-  /// corresponding \c NodeTsqr subclass.  It lets you construct a
-  /// default ParameterList for that \c NodeTsqr subclass, as well as
-  /// an instance of the \c NodeTsqr subclass.  It also provides
-  /// typedefs for template metaprogramming.
+  /// This class maps from (Scalar, LocalOrdinal, Device), to the
+  /// corresponding NodeTsqr subclass.  It lets you construct a
+  /// default ParameterList for that NodeTsqr subclass, as well as an
+  /// instance of the NodeTsqr subclass.  It also provides type
+  /// aliases for template metaprogramming.
   ///
-  /// The "right" \c NodeTsqr subclass is a function of the \c Node
-  /// template parameter, and possibly also of the other template
-  /// parameters.
+  /// The "right" NodeTsqr subclass is a function of Device, and
+  /// possibly also of the other template parameters.
   ///
   /// \note If this class does <i>not</i> have a partial
-  ///   specialization for your \c Node type, it defaults to use
+  ///   specialization for your Device type, it defaults to use
   ///   SequentialTsqr.  That class does <i>not</i> use threads, and
   ///   only knows how to deal with host data; it cannot handle GPU
   ///   device-resident data.  Thus, it may perform poorly.
-  template<class Node, class Scalar, class LocalOrdinal>
+  template<class Scalar, class LocalOrdinal, class Device>
   class NodeTsqrFactory {
   public:
-    //! The Kokkos Node type.
-    typedef Node node_type;
-    //! Pointer (RCP) to node_type.
-    typedef Teuchos::RCP<node_type> node_ptr;
-
-    //! The NodeTsqr subclass corresponding to the Kokkos Node type.
-    typedef SequentialTsqr<LocalOrdinal, Scalar> node_tsqr_type;
+    using node_tsqr_type = NodeTsqr<LocalOrdinal, Scalar>;
 
-    /// \brief Default parameter list for intranode TSQR.
+    /// \brief Get the default implementation of NodeTsqr.
     ///
-    /// \note The default implementation returns an empty (not null)
-    ///   parameter list.  Each specialization for a specific Node
-    ///   type redefines this method to return a parameter list
-    ///   appropriate for that Node type's TSQR implementation.
-    static Teuchos::RCP<const Teuchos::ParameterList>
-    getDefaultParameters ()
+    /// The default implementation is a function of the template
+    /// parameters, especially Scalar and Device.
+    static Teuchos::RCP<node_tsqr_type>
+    getNodeTsqr ()
     {
-      using Teuchos::ParameterList;
-      using Teuchos::parameterList;
-      using Teuchos::RCP;
-
-      RCP<ParameterList> params = parameterList ("NodeTsqr");
-      // Create a temporary node_tsqr_type instance in order to get
-      // default parameters.  The empty input parameter list will get
-      // filled in with default values of missing parameters.
-      node_tsqr_type nodeTsqr (params);
-
-      return params;
+      using Teuchos::rcp;
+
+#if defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER)
+      using execution_space = typename Device::execution_space;
+      constexpr bool is_cuda =
+        std::is_same<execution_space, Kokkos::Cuda>::value;
+      if (is_cuda) {
+        return rcp (new CuSolverNodeTsqr<LocalOrdinal, Scalar>);
+      }
+      else {
+#endif
+
+        // NOTE (mfh 02 Dec 2019) SequentialTsqr does not currently
+        // give correct results for complex Scalar types, so we use
+        // CombineNodeTsqr in that case.
+#ifdef HAVE_TPETRATSQR_COMPLEX
+        constexpr bool is_complex =
+          std::is_same<Scalar, std::complex<double>>::value ||
+          std::is_same<Scalar, std::complex<float>>::value ||
+          std::is_same<Scalar, Kokkos::complex<double>>::value ||
+          std::is_same<Scalar, Kokkos::complex<float>>::value;
+#else
+        constexpr bool is_complex = false;
+#endif // HAVE_TPETRATSQR_COMPLEX
+        if (is_complex) {
+          return rcp (new CombineNodeTsqr<LocalOrdinal, Scalar>);
+        }
+        else {
+          return rcp (new SequentialTsqr<LocalOrdinal, Scalar>);
+        }
+
+#if defined(KOKKOS_ENABLE_CUDA) && defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER)        
+      }
+#endif
     }
 
-    /// \brief Return a pointer to the intranode TSQR implementation.
+    /// \brief Get a specific implementation of NodeTsqr.
     ///
-    /// \param node [in/out] Pointer to the Kokkos Node instance.
-    ///
-    /// \param plist [in/out] Parameter list for configuring the
-    ///   NodeTsqr implementation.
+    /// \param name [in] "SequentialTsqr" (or "Sequential"),
+    ///   "CombineNodeTsqr" (or "Combine"), or "Default".  If both
+    ///   HAVE_TPETRATSQR_CUBLAS and HAVE_TPETRATSQR_CUSOLVER are
+    ///   defined, "CuSolverNodeTsqr" (or "CuSolver") also works.
+    ///   "Default" means "return what the zero-argument overload of
+    ///   getNodeTsqr() returns."
+    /// \throw std::invalid_argument if name is none of the above.
     static Teuchos::RCP<node_tsqr_type>
-    makeNodeTsqr (const Teuchos::RCP<node_type>& node,
-                  const Teuchos::RCP<Teuchos::ParameterList>& plist)
+    getNodeTsqr (const std::string& name)
     {
-      (void) node;
-      return rcp (new node_tsqr_type (plist));
+      using Teuchos::rcp;
+      if (name == "SequentialTsqr" || name == "Sequential") {
+        return rcp (new SequentialTsqr<LocalOrdinal, Scalar>);
+      }
+      else if (name == "CombineNodeTsqr" || name == "Combine") {
+        return rcp (new CombineNodeTsqr<LocalOrdinal, Scalar>);
+      }
+#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER)
+      else if (name == "CuSolverNodeTsqr" || name == "CuSolver") {
+        return rcp (new CuSolverNodeTsqr<LocalOrdinal, Scalar>);
+      }
+#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER
+      else if (name == "Default") {
+        return getNodeTsqr ();
+      }
+      else {
+        // Plain (single-brace) list initialization: one string per name.
+        const std::vector<std::string> validNames {
+          "SequentialTsqr", "CombineNodeTsqr",
+#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER)
+          "CuSolverNodeTsqr",
+#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER
+          "Default"};
+        std::ostringstream os;
+        os << "TSQR::NodeTsqrFactory::getNodeTsqr: "
+           << "Invalid NodeTsqr subclass name \"" << name
+           << "\".  Valid names are: {";
+        for (size_t k = 0; k < validNames.size (); ++k) {
+          os << (k == 0 ? "" : ", ") << "\"" << validNames[k] << "\"";
+        }
+        os << "}.";
+        TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
+      }
     }
 
-    /// \brief Prepare the NodeTsqr instance for use.
-    ///
-    /// \pre <tt> ! nodeTsqr.is_null() </tt>
-    /// \post <tt> nodeTsqr->ready() </tt>
-    static void
-    prepareNodeTsqr (const Teuchos::RCP<node_tsqr_type>& /* nodeTsqr */)
-    {}
   };
 } // namespace TSQR
 
-#endif // __TSQR_NodeTsqrFactory_hpp
+#endif // TSQR_NODETSQRFACTORY_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp b/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp
deleted file mode 100644
index 530dba578814..000000000000
--- a/packages/tpetra/tsqr/src/Tsqr_ParTest.hpp
+++ /dev/null
@@ -1,781 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_Test_DistTest_hpp
-#define __TSQR_Test_DistTest_hpp
-
-#include "Tsqr_ConfigDefs.hpp"
-#include "Tsqr_Random_NormalGenerator.hpp"
-#include "Tsqr_verifyTimerConcept.hpp"
-#include "Tsqr_generateStack.hpp"
-#include "Tsqr_DistTsqr.hpp"
-#include "Tsqr_GlobalTimeStats.hpp"
-#include "Tsqr_GlobalVerify.hpp"
-#include "Tsqr_printGlobalMatrix.hpp"
-#include <algorithm>
-#include <iomanip>
-#include <iostream>
-#include <vector>
-
-namespace TSQR {
-  namespace Test {
-    /// \class DistTsqrVerifier
-    /// \brief Generic version of \c DistTsqr accuracy test.
-    template<class Ordinal, class Scalar>
-    class DistTsqrVerifier {
-      TSQR::Random::NormalGenerator<Ordinal, Scalar> gen_;
-      Teuchos::RCP<MessengerBase<Ordinal> > const ordinalComm_;
-      Teuchos::RCP<MessengerBase<Scalar> > const scalarComm_;
-      std::string scalarTypeName_;
-      std::ostream& out_;
-      std::ostream& err_;
-      const bool testFactorExplicit_, testFactorImplicit_;
-      const bool humanReadable_, printMatrices_, debug_;
-
-    public:
-      typedef Ordinal ordinal_type;
-      typedef Scalar scalar_type;
-      typedef typename Teuchos::ScalarTraits<scalar_type>::magnitudeType magnitude_type;
-      typedef typename std::vector<magnitude_type> result_type;
-      typedef Matrix<ordinal_type, scalar_type> matrix_type;
-
-      /// \brief Constructor, with custom seed value
-      ///
-      /// \param scalarComm [in/out] Communicator object over which to
-      ///   test.
-      /// \param seed [in] 4-element vector; the random seed input of
-      ///   TSQR::Random::NormalGenerator (which see, since there are
-      ///   restrictions on the set of valid seeds)
-      /// \param scalarTypeName [in] Human-readable name of the Scalar
-      ///   template type parameter
-      /// \param out [out] Output stream to which to write results
-      /// \param err [out] Output stream to which to write any
-      ///   debugging outputs (if applicable) or errors
-      /// \param testFactorExplicit [in] Whether to test
-      ///   DistTsqr::factorExplicit()
-      /// \param testFactorImplicit [in] Whether to test
-      ///   DistTsqr::factor() and DistTsqr::explicit_Q()
-      /// \param humanReadable [in] Whether printed results should be
-      ///   easy for humans to read (vs. easy for parsers to parse)
-      /// \param debug [in] Whether to write verbose debug output to
-      ///   err
-      DistTsqrVerifier (const Teuchos::RCP<MessengerBase<Ordinal> >& ordinalComm,
-                        const Teuchos::RCP<MessengerBase<Scalar> >& scalarComm,
-                        const std::vector<int>& seed,
-                        const std::string& scalarTypeName,
-                        std::ostream& out,
-                        std::ostream& err,
-                        const bool testFactorExplicit,
-                        const bool testFactorImplicit,
-                        const bool humanReadable,
-                        const bool printMatrices,
-                        const bool debug) :
-        gen_ (seed),
-        ordinalComm_ (ordinalComm),
-        scalarComm_ (scalarComm),
-        scalarTypeName_ (scalarTypeName),
-        out_ (out),
-        err_ (err),
-        testFactorExplicit_ (testFactorExplicit),
-        testFactorImplicit_ (testFactorImplicit),
-        humanReadable_ (humanReadable),
-        printMatrices_ (printMatrices),
-        debug_ (debug)
-      {}
-
-      /// \brief Constructor, with default seed value
-      ///
-      /// This constructor sets a default seed (for the pseudorandom
-      /// number generator), which is the same seed (0,0,0,1) each
-      /// time.
-      ///
-      /// \param scalarComm [in/out] Communicator object over which to
-      ///   test.
-      /// \param scalarTypeName [in] Human-readable name of the Scalar
-      ///   template type parameter
-      /// \param out [out] Output stream to which to write results
-      /// \param err [out] Output stream to which to write any
-      ///   debugging outputs (if applicable) or errors
-      /// \param testFactorExplicit [in] Whether to test
-      ///   DistTsqr::factorExplicit()
-      /// \param testFactorImplicit [in] Whether to test
-      ///   DistTsqr::factor() and DistTsqr::explicit_Q()
-      /// \param humanReadable [in] Whether printed results should be
-      ///   easy for humans to read (vs. easy for parsers to parse)
-      /// \param debug [in] Whether to write verbose debug output to
-      ///   err
-      DistTsqrVerifier (const Teuchos::RCP<MessengerBase<Ordinal> >& ordinalComm,
-                        const Teuchos::RCP<MessengerBase<Scalar> >& scalarComm,
-                        const std::string& scalarTypeName,
-                        std::ostream& out,
-                        std::ostream& err,
-                        const bool testFactorExplicit,
-                        const bool testFactorImplicit,
-                        const bool humanReadable,
-                        const bool printMatrices,
-                        const bool debug) :
-        ordinalComm_ (ordinalComm),
-        scalarComm_ (scalarComm),
-        scalarTypeName_ (scalarTypeName),
-        out_ (out),
-        err_ (err),
-        testFactorExplicit_ (testFactorExplicit),
-        testFactorImplicit_ (testFactorImplicit),
-        humanReadable_ (humanReadable),
-        printMatrices_ (printMatrices),
-        debug_ (debug)
-      {}
-
-      /// \brief Get seed vector for pseudorandom number generator
-      ///
-      /// Fill seed (changing size of vector as necessary) with the
-      /// seed vector used by the pseudorandom number generator.  You
-      /// can use this to resume the pseudorandom number stream from
-      /// where you last were.
-      void
-      getSeed (std::vector<int>& seed) const
-      {
-        gen_.getSeed (seed);
-      }
-
-      /// \brief Run the DistTsqr accuracy test
-      ///
-      /// \param numCols [in] Number of columns in the matrix to test.
-      ///   Number of rows := (# MPI processors) * ncols.
-      void
-      verify (const Ordinal numCols,
-              const std::string& additionalFieldNames,
-              const std::string& additionalData,
-              const bool printFieldNames)
-      {
-        using std::endl;
-
-        const int myRank = scalarComm_->rank();
-        if (debug_)
-          {
-            scalarComm_->barrier();
-            if (myRank == 0)
-              err_ << "Verifying DistTsqr:" << endl;
-            scalarComm_->barrier();
-          }
-
-        // Generate test problem.
-        Matrix< Ordinal, Scalar > A_local, Q_local, R;
-        testProblem (A_local, Q_local, R, numCols);
-        if (debug_)
-          {
-            scalarComm_->barrier();
-            if (myRank == 0)
-              err_ << "-- Generated test problem." << endl;
-            scalarComm_->barrier();
-          }
-
-        // Set up TSQR implementation.
-        DistTsqr<Ordinal, Scalar> par;
-        par.init (scalarComm_);
-        if (debug_)
-          {
-            scalarComm_->barrier();
-            if (myRank == 0)
-              err_ << "-- DistTsqr object initialized" << endl << endl;
-          }
-
-        // Whether we've printed field names (i.e., column headers)
-        // yet.  Only matters for non-humanReadable output.
-        bool printedFieldNames = false;
-
-        // Test DistTsqr::factor() and DistTsqr::explicit_Q().
-        if (testFactorImplicit_)
-          {
-            // Factor the matrix A (copied into R, which will be
-            // overwritten on output)
-            typedef typename DistTsqr<Ordinal, Scalar>::FactorOutput
-              factor_output_type;
-            factor_output_type factorOutput = par.factor (R.view());
-            if (debug_)
-              {
-                scalarComm_->barrier();
-                if (myRank == 0)
-                  err_ << "-- Finished DistTsqr::factor" << endl;
-              }
-            // Compute the explicit Q factor
-            par.explicit_Q (numCols, Q_local.data(), Q_local.stride(1), factorOutput);
-            if (debug_) {
-              scalarComm_->barrier();
-              if (myRank == 0) {
-                err_ << "-- Finished DistTsqr::explicit_Q" << endl;
-              }
-            }
-            // Verify the factorization
-            result_type result =
-              global_verify (numCols, numCols, A_local.data(), A_local.stride(1),
-                             Q_local.data(), Q_local.stride(1), R.data(), R.stride(1),
-                             scalarComm_.get());
-            if (debug_) {
-              scalarComm_->barrier();
-              if (myRank == 0) {
-                err_ << "-- Finished global_verify" << endl;
-              }
-            }
-            reportResults ("DistTsqr", numCols, result,
-                           additionalFieldNames, additionalData,
-                           printFieldNames && (! printedFieldNames));
-            if (printFieldNames && (! printedFieldNames))
-              printedFieldNames = true;
-          }
-
-        // Test DistTsqr::factorExplicit()
-        if (testFactorExplicit_) {
-          // Factor the matrix and compute the explicit Q factor, both
-          // in a single operation.
-          par.factorExplicit (R.view(), Q_local.view());
-          if (debug_) {
-            scalarComm_->barrier();
-            if (myRank == 0) {
-              err_ << "-- Finished DistTsqr::factorExplicit" << endl;
-            }
-          }
-
-          if (printMatrices_) {
-            if (myRank == 0) {
-              err_ << std::endl << "Computed Q factor:" << std::endl;
-            }
-            printGlobalMatrix (err_, Q_local, scalarComm_.get(), ordinalComm_.get());
-            if (myRank == 0) {
-              err_ << std::endl << "Computed R factor:" << std::endl;
-              print_local_matrix (err_, R.extent(0), R.extent(1), R.data(), R.stride(1));
-              err_ << std::endl;
-            }
-          }
-
-          // Verify the factorization
-          result_type result =
-            global_verify (numCols, numCols, A_local.data(), A_local.stride(1),
-                           Q_local.data(), Q_local.stride(1), R.data(), R.stride(1),
-                           scalarComm_.get());
-          if (debug_) {
-            scalarComm_->barrier();
-            if (myRank == 0) {
-              err_ << "-- Finished global_verify" << endl;
-            }
-          }
-          reportResults ("DistTsqrRB", numCols, result,
-                         additionalFieldNames, additionalData,
-                         printFieldNames && (! printedFieldNames));
-          if (printFieldNames && (! printedFieldNames)) {
-            printedFieldNames = true;
-          }
-        }
-      }
-
-    private:
-      /// Report verification results.  Call on ALL MPI processes, not
-      /// just Rank 0.
-      ///
-      /// \param method [in] String to print before reporting results
-      /// \param numCols [in] Number of columns in the matrix tested.
-      /// \param result [in] (relative residual, orthogonality)
-      void
-      reportResults (const std::string& method,
-                     const Ordinal numCols,
-                     const result_type& result,
-                     const std::string& additionalFieldNames,
-                     const std::string& additionalData,
-                     const bool printFieldNames)
-      {
-        using std::endl;
-
-        const int numProcs = scalarComm_->size();
-        const int myRank = scalarComm_->rank();
-
-        if (myRank == 0)
-          {
-            if (humanReadable_)
-              {
-                out_ << method << " accuracy results:" << endl
-                     << "Scalar type = " << scalarTypeName_ << endl
-                     << "Number of columns = " << numCols << endl
-                     << "Number of (MPI) processes = " << numProcs << endl
-                     << "Absolute residual $\\| A - Q R \\|_2: "
-                     << result[0] << endl
-                     << "Absolute orthogonality $\\| I - Q^* Q \\|_2$: "
-                     << result[1] << endl
-                     << "Test matrix norm $\\| A \\|_F$: "
-                     << result[2] << endl;
-              }
-            else
-              {
-                // Use scientific notation for floating-point numbers
-                out_ << std::scientific;
-
-                if (printFieldNames)
-                  {
-                    out_ << "%method,scalarType,numCols,numProcs"
-                      ",absFrobResid,absFrobOrthog,frobA";
-                    if (! additionalFieldNames.empty())
-                      out_ << "," << additionalFieldNames;
-                    out_ << endl;
-                  }
-
-                out_ << method
-                     << "," << scalarTypeName_
-                     << "," << numCols
-                     << "," << numProcs
-                     << "," << result[0]
-                     << "," << result[1]
-                     << "," << result[2];
-                if (! additionalData.empty())
-                  out_ << "," << additionalData;
-                out_ << endl;
-              }
-          }
-      }
-
-      void
-      testProblem (Matrix< Ordinal, Scalar >& A_local,
-                   Matrix< Ordinal, Scalar >& Q_local,
-                   Matrix< Ordinal, Scalar >& R,
-                   const Ordinal numCols)
-      {
-        const Ordinal numRowsLocal = numCols;
-
-        // A_local: Space for the matrix A to factor -- local to each
-        //   processor.
-        //
-        // A_global: Global matrix (only nonempty on Proc 0); only
-        //   used temporarily.
-        Matrix< Ordinal, Scalar > A_global;
-
-        // This modifies A_local on all procs, and A_global on Proc 0.
-        par_tsqr_test_problem (gen_, A_local, A_global, numCols, scalarComm_);
-
-        if (printMatrices_) {
-          const int myRank = scalarComm_->rank();
-          if (myRank == 0) {
-            err_ << "Input matrix A:" << std::endl;
-          }
-          printGlobalMatrix (err_, A_local, scalarComm_.get(), ordinalComm_.get());
-          if (myRank == 0) {
-            err_ << std::endl;
-          }
-        }
-
-        // Copy the test problem input into R, since the factorization
-        // will overwrite it in place with the final R factor.
-        R.reshape (numCols, numCols);
-        deep_copy (R, Scalar {});
-        deep_copy (R, A_local);
-
-        // Prepare space in which to construct the explicit Q factor
-        // (local component on this processor)
-        Q_local.reshape (numRowsLocal, numCols);
-        deep_copy (Q_local, Scalar {});
-      }
-    };
-
-
-    /// \class DistTsqrBenchmarker
-    /// \brief Generic version of \c DistTsqr performance test.
-    template< class Ordinal, class Scalar, class TimerType >
-    class DistTsqrBenchmarker {
-      TSQR::Random::NormalGenerator< Ordinal, Scalar > gen_;
-      Teuchos::RCP< MessengerBase< Scalar > > scalarComm_;
-      Teuchos::RCP< MessengerBase< double > > doubleComm_;
-      std::string scalarTypeName_;
-
-      std::ostream& out_;
-      std::ostream& err_;
-      const bool testFactorExplicit_, testFactorImplicit_;
-      const bool humanReadable_, debug_;
-
-    public:
-      typedef Ordinal ordinal_type;
-      typedef Scalar scalar_type;
-      typedef typename Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type;
-      typedef TimerType timer_type;
-
-      /// \brief Constructor, with custom seed value
-      ///
-      /// \param scalarComm [in/out] Communicator object over which
-      ///   to test.
-      /// \param doubleComm [in/out] Communicator object for doubles,
-      ///   used for finding the min and max of timing results over
-      ///   all the MPI processes.
-      /// \param seed [in] 4-element vector; the random seed input of
-      ///   TSQR::Random::NormalGenerator (which see, since there are
-      ///   restrictions on the set of valid seeds)
-      /// \param scalarTypeName [in] Human-readable name of the Scalar
-      ///   template type parameter
-      /// \param out [out] Output stream to which to write results
-      /// \param err [out] Output stream to which to write any
-      ///   debugging outputs (if applicable) or errors
-      /// \param testFactorExplicit [in] Whether to test
-      ///   DistTsqr::factorExplicit()
-      /// \param testFactorImplicit [in] Whether to test
-      ///   DistTsqr::factor() and DistTsqr::explicit_Q()
-      /// \param humanReadable [in] Whether printed results should be
-      ///   easy for humans to read (vs. easy for parsers to parse)
-      /// \param debug [in] Whether to write verbose debug output to
-      ///   err
-      DistTsqrBenchmarker (const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
-                           const Teuchos::RCP< MessengerBase< double > >& doubleComm,
-                           const std::vector<int>& seed,
-                           const std::string& scalarTypeName,
-                           std::ostream& out,
-                           std::ostream& err,
-                           const bool testFactorExplicit,
-                           const bool testFactorImplicit,
-                           const bool humanReadable,
-                           const bool debug) :
-        gen_ (seed),
-        scalarComm_ (scalarComm),
-        doubleComm_ (doubleComm),
-        scalarTypeName_ (scalarTypeName),
-        out_ (out),
-        err_ (err),
-        testFactorExplicit_ (testFactorExplicit),
-        testFactorImplicit_ (testFactorImplicit),
-        humanReadable_ (humanReadable),
-        debug_ (debug)
-      {}
-
-      /// \brief Constructor, with default seed value
-      ///
-      /// This constructor sets a default seed (for the pseudorandom
-      /// number generator), which is the same seed (0,0,0,1) each
-      /// time.
-      ///
-      /// \param scalarComm [in/out] Communicator object over which
-      ///   to test.
-      /// \param doubleComm [in/out] Communicator object for doubles,
-      ///   used for finding the min and max of timing results over
-      ///   all the MPI processes.
-      /// \param scalarTypeName [in] Human-readable name of the Scalar
-      ///   template type parameter
-      /// \param out [out] Output stream to which to write results
-      /// \param err [out] Output stream to which to write any
-      ///   debugging outputs (if applicable) or errors
-      /// \param testFactorExplicit [in] Whether to test
-      ///   DistTsqr::factorExplicit()
-      /// \param testFactorImplicit [in] Whether to test
-      ///   DistTsqr::factor() and DistTsqr::explicit_Q()
-      /// \param humanReadable [in] Whether printed results should be
-      ///   easy for humans to read (vs. easy for parsers to parse)
-      /// \param debug [in] Whether to write verbose debug output to
-      ///   err
-      DistTsqrBenchmarker (const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
-                           const Teuchos::RCP< MessengerBase< double > >& doubleComm,
-                           const std::string& scalarTypeName,
-                           std::ostream& out,
-                           std::ostream& err,
-                           const bool testFactorExplicit,
-                           const bool testFactorImplicit,
-                           const bool humanReadable,
-                           const bool debug) :
-        scalarComm_ (scalarComm),
-        doubleComm_ (doubleComm),
-        scalarTypeName_ (scalarTypeName),
-        out_ (out),
-        err_ (err),
-        testFactorExplicit_ (testFactorExplicit),
-        testFactorImplicit_ (testFactorImplicit),
-        humanReadable_ (humanReadable),
-        debug_ (debug)
-      {}
-
-      /// \brief Get seed vector for pseudorandom number generator
-      ///
-      /// Fill seed (changing size of vector as necessary) with the
-      /// seed vector used by the pseudorandom number generator.  You
-      /// can use this to resume the pseudorandom number stream from
-      /// where you last were.
-      void
-      getSeed (std::vector<int>& seed) const
-      {
-        gen_.getSeed (seed);
-      }
-
-      /// \brief Run the DistTsqr benchmark
-      ///
-      /// \param numTrials [in] Number of times to repeat the computation
-      ///   in a single timing run
-      /// \param numCols [in] Number of columns in the matrix to test.
-      ///   Number of rows := (# MPI processors) * ncols
-      void
-      benchmark (const int numTrials,
-                 const Ordinal numCols,
-                 const std::string& additionalFieldNames,
-                 const std::string& additionalData,
-                 const bool printFieldNames)
-      {
-        using std::endl;
-
-        // Set up test problem.
-        Matrix< Ordinal, Scalar > A_local, Q_local, R;
-        testProblem (A_local, Q_local, R, numCols);
-
-        // Set up TSQR implementation.
-        DistTsqr<Ordinal, Scalar> par;
-        par.init (scalarComm_);
-
-        // Whether we've printed field names (i.e., column headers)
-        // yet.  Only matters for non-humanReadable output.
-        bool printedFieldNames = false;
-
-        if (testFactorImplicit_)
-          {
-            std::string timerName ("DistTsqr");
-            typedef typename DistTsqr<Ordinal, Scalar>::FactorOutput
-              factor_output_type;
-
-            // Throw away some number of runs, because some MPI libraries
-            // (recent versions of OpenMPI at least) do autotuning for the
-            // first few collectives calls.
-            const int numThrowAwayRuns = 5;
-            for (int runNum = 0; runNum < numThrowAwayRuns; ++runNum)
-              {
-                // Factor the matrix A (copied into R, which will be
-                // overwritten on output)
-                factor_output_type factorOutput = par.factor (R.view());
-                // Compute the explicit Q factor
-                par.explicit_Q (numCols, Q_local.data(), Q_local.stride(1), factorOutput);
-              }
-
-            // Now do the actual timing runs.  Benchmark DistTsqr
-            // (factor() and explicit_Q()) for numTrials trials.
-            timer_type timer (timerName);
-            timer.start();
-            for (int trialNum = 0; trialNum < numTrials; ++trialNum)
-              {
-                // Factor the matrix A (copied into R, which will be
-                // overwritten on output)
-                factor_output_type factorOutput = par.factor (R.view());
-                // Compute the explicit Q factor
-                par.explicit_Q (numCols, Q_local.data(), Q_local.stride(1), factorOutput);
-              }
-            // Cumulative timing on this MPI process.
-            // "Cumulative" means the elapsed time of numTrials executions.
-            const double localCumulativeTiming = timer.stop();
-
-            // reportResults() must be called on all processes, since this
-            // figures out the min and max timings over all processes.
-            reportResults (timerName, numTrials, numCols, localCumulativeTiming,
-                           additionalFieldNames, additionalData,
-                           printFieldNames && (! printedFieldNames));
-            if (printFieldNames && (! printedFieldNames))
-              printedFieldNames = true;
-          }
-
-        if (testFactorExplicit_)
-          {
-            std::string timerName ("DistTsqrRB");
-
-            // Throw away some number of runs, because some MPI libraries
-            // (recent versions of OpenMPI at least) do autotuning for the
-            // first few collectives calls.
-            const int numThrowAwayRuns = 5;
-            for (int runNum = 0; runNum < numThrowAwayRuns; ++runNum)
-              {
-                par.factorExplicit (R.view(), Q_local.view());
-              }
-
-            // Benchmark DistTsqr::factorExplicit() for numTrials trials.
-            timer_type timer (timerName);
-            timer.start();
-            for (int trialNum = 0; trialNum < numTrials; ++trialNum)
-              {
-                par.factorExplicit (R.view(), Q_local.view());
-              }
-            // Cumulative timing on this MPI process.
-            // "Cumulative" means the elapsed time of numTrials executions.
-            const double localCumulativeTiming = timer.stop();
-
-            // Report cumulative (not per-invocation) timing results
-            reportResults (timerName, numTrials, numCols, localCumulativeTiming,
-                           additionalFieldNames, additionalData,
-                           printFieldNames && (! printedFieldNames));
-            if (printFieldNames && (! printedFieldNames))
-              printedFieldNames = true;
-
-            // Per-invocation timings (for factorExplicit() benchmark
-            // only).  localTimings were computed on this MPI process;
-            // globalTimings are statistical summaries of those over
-            // all MPI processes.  We only collect that data for
-            // factorExplicit().
-            std::vector< TimeStats > localTimings;
-            std::vector< TimeStats > globalTimings;
-            par.getFactorExplicitTimings (localTimings);
-            for (std::vector< TimeStats >::size_type k = 0; k < localTimings.size(); ++k)
-              globalTimings.push_back (globalTimeStats (*doubleComm_, localTimings[k]));
-            std::vector< std::string > timingLabels;
-            par.getFactorExplicitTimingLabels (timingLabels);
-
-            if (humanReadable_)
-              out_ << timerName << " per-invocation benchmark results:" << endl;
-
-            const std::string labelLabel ("label,scalarType");
-            for (std::vector< std::string >::size_type k = 0; k < timingLabels.size(); ++k)
-              {
-                // Only print column headers (i.e., field names) once, if at all.
-                const bool printHeaders = (k == 0) && printFieldNames;
-                globalTimings[k].print (out_, humanReadable_,
-                                        timingLabels[k] + "," + scalarTypeName_,
-                                        labelLabel, printHeaders);
-              }
-          }
-      }
-
-    private:
-      /// Report timing results to the given output stream
-      ///
-      /// \param method [in] String to print before reporting results
-      /// \param numTrials [in] Number of times to repeat the computation
-      ///   in a single timing run
-      /// \param numCols [in] Number of columns in the matrix to test.
-      ///   Number of rows := (# MPI processors) * ncols
-      /// \param timing [in] Total benchmark time, as measured on this
-      ///   MPI process.  This may differ on each process; we report
-      ///   the min and the max.
-      ///
-      /// \warning Call on ALL MPI processes, not just Rank 0!
-      void
-      reportResults (const std::string& method,
-                     const int numTrials,
-                     const ordinal_type numCols,
-                     const double localTiming,
-                     const std::string& additionalFieldNames,
-                     const std::string& additionalData,
-                     const bool printFieldNames)
-      {
-        using std::endl;
-
-        // Find min and max timing over all MPI processes
-        TimeStats localStats;
-        localStats.update (localTiming);
-        TimeStats globalStats = globalTimeStats (*doubleComm_, localStats);
-
-        // Only Rank 0 prints the final results.
-        const bool printResults = (doubleComm_->rank() == 0);
-        if (printResults)
-          {
-            const int numProcs = doubleComm_->size();
-            if (humanReadable_)
-              {
-                out_ << method << " cumulative benchmark results (total time over all trials):" << endl
-                     << "Scalar type = " << scalarTypeName_ << endl
-                     << "Number of columns = " << numCols << endl
-                     << "Number of (MPI) processes = " << numProcs << endl
-                     << "Number of trials = " << numTrials << endl
-                     << "Min timing (in seconds) = " << globalStats.min() << endl
-                     << "Mean timing (in seconds) = " << globalStats.mean() << endl
-                     << "Max timing (in seconds) = " << globalStats.max() << endl
-                     << endl;
-              }
-            else
-              {
-                // Use scientific notation for floating-point numbers
-                out_ << std::scientific;
-
-                if (printFieldNames)
-                  {
-                    out_ << "%method,scalarType,numCols,numProcs,numTrials"
-                         << ",minTiming,meanTiming,maxTiming";
-                    if (! additionalFieldNames.empty())
-                      out_ << "," << additionalFieldNames;
-                    out_ << endl;
-                  }
-
-                out_ << method
-                     << "," << scalarTypeName_
-                     << "," << numCols
-                     << "," << numProcs
-                     << "," << numTrials
-                     << "," << globalStats.min()
-                     << "," << globalStats.mean()
-                     << "," << globalStats.max();
-                if (! additionalData.empty())
-                  out_ << "," << additionalData;
-                out_ << endl;
-              }
-          }
-      }
-
-      void
-      testProblem (Matrix< Ordinal, Scalar >& A_local,
-                   Matrix< Ordinal, Scalar >& Q_local,
-                   Matrix< Ordinal, Scalar >& R,
-                   const Ordinal numCols)
-      {
-        const Ordinal numRowsLocal = numCols;
-
-        // A_local: Space for the matrix A to factor -- local to each
-        //   processor.
-        //
-        // A_global: Global matrix (only nonempty on Proc 0); only
-        //   used temporarily.
-        Matrix<Ordinal, Scalar> A_global;
-
-        // This modifies A_local on all procs, and A_global on Proc 0.
-        par_tsqr_test_problem (gen_, A_local, A_global, numCols, scalarComm_);
-
-        // Copy the test problem input into R, since the factorization
-        // will overwrite it in place with the final R factor.
-        R.reshape (numCols, numCols);
-        deep_copy (R, A_local);
-
-        // Prepare space in which to construct the explicit Q factor
-        // (local component on this processor)
-        Q_local.reshape (numRowsLocal, numCols);
-        deep_copy (Q_local, Scalar {});
-      }
-
-      /// Make sure that timer_type satisfies the TimerType concept.
-      ///
-      static void
-      conceptChecks ()
-      {
-        verifyTimerConcept<timer_type>();
-      }
-    };
-
-
-  } // namespace Test
-} // namespace TSQR
-
-#endif // __TSQR_Test_DistTest_hpp
diff --git a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp
index 399f13fa8fde..c3e24ac02569 100644
--- a/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_Random_MatrixGenerator.hpp
@@ -100,17 +100,12 @@ namespace TSQR {
         std::vector<Scalar> tau (std::min(nrows, ncols));
 
         // Workspace query
-        Scalar _lwork1, _lwork2;
-        lapack.compute_QR (nrows, ncols, Q, ldq, tau.data(), &_lwork1, -1);
-        lapack.compute_explicit_Q (nrows, ncols, ncols,
-                                   Q, ldq, tau.data(),
-                                   &_lwork2, -1);
-
-        // Allocate workspace.  abs() returns a magnitude_type, and we
-        // can compare those using std::max.  If Scalar is complex,
-        // you can't compare it using max.
-        const Ordinal lwork = checkedCast (std::max (STS::magnitude (_lwork1),
-                                                     STS::magnitude (_lwork2)));
+        const int lwork1 =
+          lapack.compute_QR_lwork (nrows, ncols, Q, ldq);
+        const int lwork2 =
+          lapack.compute_explicit_Q_lwork (nrows, ncols, ncols,
+                                           Q, ldq, tau.data ());
+        const Ordinal lwork = std::max (lwork1, lwork2);
         std::vector<Scalar> work (lwork);
 
         lapack.compute_QR (nrows, ncols, Q, ldq, tau.data(),
@@ -140,19 +135,12 @@ namespace TSQR {
         // Fill Q with random numbers
         this->fill_random (nrows, ncols, Q, ldq);
 
-        // Get ready for QR factorization
         Impl::Lapack<Scalar> lapack;
-
-        // Workspace query
-        Scalar _lwork1;
-        lapack.compute_QR (nrows, ncols, Q, ldq, tau, &_lwork1, -1);
-
-        // Allocate workspace.
-        const Ordinal lwork = checkedCast (STS::magnitude (_lwork1));
+        const int lwork =
+          lapack.compute_QR_lwork (nrows, ncols, Q, ldq);
         std::vector<Scalar> work (lwork);
-
         lapack.compute_QR (nrows, ncols, Q, ldq, tau,
-                           work.data(), lwork);
+                           work.data (), lwork);
       }
 
       template< class MatrixViewType >
@@ -192,25 +180,29 @@ namespace TSQR {
         implicit_Q (V, tau_V.data());
 
         // Workspace query for ORMQR.
-        Scalar _lwork1, _lwork2;
         Impl::Lapack<Scalar> lapack;
-        lapack.apply_Q_factor ('L', 'N', nrows, ncols, ncols,
-                               U.data(), U.stride(1), tau_U.data(),
-                               A, lda, &_lwork1, -1);
+        const int lwork1 =
+          lapack.apply_Q_factor_lwork ('L', 'N', nrows, ncols, ncols,
+                                       U.data (), U.stride (1),
+                                       tau_U.data (), A, lda);
+        int lwork2 = 0;
         if (STS::isComplex) {
-          lapack.apply_Q_factor ('R', 'C', nrows, ncols, ncols,
-                                 V.data(), V.stride(1), tau_V.data(),
-                                 A, lda, &_lwork2, -1);
+          lwork2 =
+            lapack.apply_Q_factor_lwork ('R', 'C',
+                                         nrows, ncols, ncols,
+                                         V.data (), V.stride (1),
+                                         tau_V.data (), A, lda);
         }
         else {
-          lapack.apply_Q_factor ('R', 'T', nrows, ncols, ncols,
-                                 V.data(), V.stride(1), tau_V.data(),
-                                 A, lda, &_lwork2, -1);
+          lwork2 =
+            lapack.apply_Q_factor_lwork ('R', 'T',
+                                         nrows, ncols, ncols,
+                                         V.data (), V.stride (1),
+                                         tau_V.data (), A, lda);
         }
 
         // Allocate workspace.
-        Ordinal lwork = checkedCast (std::max (STS::magnitude (_lwork1),
-                                               STS::magnitude (_lwork2)));
+        Ordinal lwork (std::max (lwork1, lwork2));
         std::vector<Scalar> work (lwork);
 
         // Apply U to the left side of A, and V^H to the right side of A.
@@ -258,16 +250,13 @@ namespace TSQR {
         std::vector<Scalar> tau (n);
 
         // Workspace size query for QR factorization.
-        Scalar _lwork1;
         Impl::Lapack<Scalar> lapack;
-        lapack.compute_QR (n, n, R, ldr, tau.data(), &_lwork1, -1);
-
-        // Allocate workspace
-        Ordinal lwork = checkedCast (STS::magnitude (_lwork1));
-        std::vector<Scalar> work (lwork);
+        const int lwork = lapack.compute_QR_lwork (n, n, R, ldr);
 
         // Compute QR factorization (implicit representation in place).
-        lapack.compute_QR (n, n, R, ldr, tau.data(), work.data(), lwork);
+        std::vector<Scalar> work (lwork);
+        lapack.compute_QR (n, n, R, ldr, tau.data (),
+                           work.data (), lwork);
 
         // Zero out the stuff below the diagonal of R, leaving just the R factor.
         for (Ordinal j = 0; j < n; ++j) {
diff --git a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp b/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp
deleted file mode 100644
index 727c50019482..000000000000
--- a/packages/tpetra/tsqr/src/Tsqr_SeqTest.cpp
+++ /dev/null
@@ -1,1112 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#include "Tsqr_SeqTest.hpp"
-#include "Tsqr_Random_NormalGenerator.hpp"
-#include "Tsqr_nodeTestProblem.hpp"
-#include "Tsqr_verifyTimerConcept.hpp"
-#include "Tsqr_LocalVerify.hpp"
-#include "Tsqr_Matrix.hpp"
-#include "Tsqr_SequentialTsqr.hpp"
-#include "Tsqr_Util.hpp"
-#include "Tsqr_Impl_Lapack.hpp"
-#include "Teuchos_Time.hpp"
-#include <algorithm>
-#include <cstring> // size_t definition
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-#include <stdexcept>
-#include <vector>
-
-
-namespace TSQR {
-  namespace Test {
-
-    template<class Ordinal, class Scalar>
-    static Ordinal
-    lworkQueryLapackQr (Impl::Lapack<Scalar>& lapack,
-                        const Ordinal nrows,
-                        const Ordinal ncols,
-                        const Ordinal lda)
-    {
-      using std::ostringstream;
-      using std::endl;
-      using STS = Teuchos::ScalarTraits<Scalar>;
-      using mag_type = typename STS::magnitudeType;
-
-      Scalar d_lwork_geqrf {};
-      lapack.compute_QR (nrows, ncols, nullptr, lda, nullptr,
-                         &d_lwork_geqrf, -1);
-
-      Scalar d_lwork_orgqr {};
-      // A workspace query appropriate for computing the explicit Q
-      // factor (nrows x ncols) in place, from the QR factorization of
-      // an nrows x ncols matrix with leading dimension lda.
-      lapack.compute_explicit_Q (nrows, ncols, ncols, nullptr, lda,
-                                 nullptr, &d_lwork_orgqr, -1);
-
-      // LAPACK workspace queries do return their results as a
-      // double-precision floating-point value, but LAPACK promises
-      // that that value will fit in an int.  Thus, we don't need to
-      // check for valid casts to int below.  I include the checks
-      // just to be "bulletproof" and also to show how to do the
-      // checks for later reference.
-      const mag_type lwork_geqrf_test =
-        static_cast<mag_type> (static_cast<Ordinal> (STS::magnitude (d_lwork_geqrf)));
-      if (lwork_geqrf_test != STS::magnitude (d_lwork_geqrf)) {
-        ostringstream os;
-        os << "LAPACK _GEQRF workspace query returned a result, "
-           << d_lwork_geqrf << ", bigger than the max Ordinal value, "
-           << std::numeric_limits<Ordinal>::max ();
-        throw std::range_error (os.str ());
-      }
-      const Scalar lwork_orgqr_test =
-        static_cast<mag_type> (static_cast<Ordinal> (STS::magnitude ((d_lwork_orgqr))));
-      if (lwork_orgqr_test != STS::magnitude (d_lwork_orgqr)) {
-        ostringstream os;
-        os << "LAPACK _UNGQR workspace query returned a result, "
-           << d_lwork_orgqr << ", bigger than the max Ordinal value, "
-           << std::numeric_limits<Ordinal>::max();
-        throw std::range_error (os.str());
-      }
-      return std::max (static_cast<Ordinal> (STS::magnitude (d_lwork_geqrf)),
-                       static_cast<Ordinal> (STS::magnitude (d_lwork_orgqr)));
-    }
-
-    /// Test the accuracy of sequential TSQR on an nrows by ncols
-    /// matrix (using the given cache block size (in bytes)), and
-    /// print the results to stdout.
-    template< class Ordinal, class Scalar >
-    static void
-    verifySeqTsqrTemplate (std::ostream& out,
-                           TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator,
-                           const std::string& datatype,
-                           const std::string& shortDatatype,
-                           const Ordinal nrows,
-                           const Ordinal ncols,
-                           const size_t cache_size_hint,
-                           const bool contiguous_cache_blocks,
-                           const bool save_matrices,
-                           const std::string& additionalFieldNames,
-                           const std::string& additionalData,
-                           const bool printFieldNames,
-                           const bool human_readable,
-                           const bool b_debug)
-    {
-      typedef Teuchos::ScalarTraits<Scalar> STS;
-      typedef typename STS::magnitudeType magnitude_type;
-      using std::cerr;
-      using std::endl;
-      using std::pair;
-      using std::string;
-      using std::vector;
-
-      SequentialTsqr<Ordinal, Scalar> actor (cache_size_hint);
-      Ordinal numCacheBlocks;
-
-      if (b_debug) {
-        cerr << "Sequential TSQR test problem:" << endl
-             << "* " << nrows << " x " << ncols << endl
-             << "* Cache size hint of " << actor.cache_size_hint() << " bytes" << endl;
-        if (contiguous_cache_blocks) {
-          cerr << "* Contiguous cache blocks" << endl;
-        }
-      }
-
-      Matrix<Ordinal, Scalar> A (nrows, ncols);
-      Matrix<Ordinal, Scalar> A_copy (nrows, ncols);
-      Matrix<Ordinal, Scalar> Q (nrows, ncols);
-      Matrix<Ordinal, Scalar> R (ncols, ncols);
-      if (std::numeric_limits<Scalar>::has_quiet_NaN) {
-        deep_copy (A, std::numeric_limits< Scalar>::quiet_NaN());
-        deep_copy (A_copy, std::numeric_limits<Scalar>::quiet_NaN());
-        deep_copy (Q, std::numeric_limits<Scalar>::quiet_NaN());
-        deep_copy (R, std::numeric_limits<Scalar>::quiet_NaN());
-      }
-      const Ordinal lda = nrows;
-      const Ordinal ldq = nrows;
-      const Ordinal ldr = ncols;
-
-      // Create a test problem
-      nodeTestProblem (generator, nrows, ncols, A.data(), A.stride(1), true);
-
-      if (save_matrices) {
-        string filename = "A_" + shortDatatype + ".txt";
-        if (b_debug) {
-          cerr << "-- Saving test problem to \"" << filename << "\"" << endl;
-        }
-        std::ofstream fileOut (filename.c_str());
-        print_local_matrix (fileOut, nrows, ncols, A.data(), A.stride(1));
-        fileOut.close();
-      }
-
-      if (b_debug) {
-        cerr << "-- Generated test problem" << endl;
-      }
-
-      // Copy A into A_copy, since TSQR overwrites the input.  If
-      // specified, rearrange the data in A_copy so that the data in
-      // each cache block is contiguously stored.
-      if (! contiguous_cache_blocks) {
-        deep_copy (A_copy, A);
-        if (b_debug) {
-          cerr << "-- Copied test problem from A into A_copy" << endl;
-        }
-      }
-      else {
-        actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.stride(1));
-        if (b_debug) {
-          cerr << "-- Reorganized test matrix to have contiguous "
-            "cache blocks" << endl;
-        }
-
-        // Verify cache blocking, when in debug mode.
-        if (b_debug) {
-          Matrix<Ordinal, Scalar> A2 (nrows, ncols);
-          if (std::numeric_limits<Scalar>::has_quiet_NaN) {
-            deep_copy (A2, std::numeric_limits<Scalar>::quiet_NaN ());
-          }
-          actor.un_cache_block (nrows, ncols, A2.data (), A2.stride (1),
-                                A_copy.data ());
-          if (matrix_equal (A, A2)) {
-            if (b_debug) {
-              cerr << "-- Cache blocking test succeeded!" << endl;
-            }
-          }
-          else {
-            throw std::logic_error ("Cache blocking failed");
-          }
-        }
-      }
-
-      // Fill R with zeros, since the factorization may not overwrite
-      // the strict lower triangle of R.
-      deep_copy (R, Scalar {});
-
-      // Count the number of cache blocks that factor() will use.
-      // This is only for diagnostic purposes.
-      numCacheBlocks =
-        actor.factor_num_cache_blocks (nrows, ncols, A_copy.data(),
-                                       A_copy.stride(1), contiguous_cache_blocks);
-      // In debug mode, report how many cache blocks factor() will use.
-      if (b_debug) {
-        cerr << "-- Number of cache blocks factor() will use: "
-             << numCacheBlocks << endl << endl;
-      }
-
-      // Factor the matrix and compute the explicit Q factor
-      typedef typename SequentialTsqr<Ordinal, Scalar>::FactorOutput
-        factor_output_type;
-      factor_output_type factorOutput =
-        actor.factor (nrows, ncols, A_copy.data(), A_copy.stride(1),
-                      R.data(), R.stride(1), contiguous_cache_blocks);
-      if (b_debug) {
-        cerr << "-- Finished SequentialTsqr::factor" << endl;
-      }
-      if (save_matrices) {
-        string filename = "R_" + shortDatatype + ".txt";
-        if (b_debug) {
-          cerr << "-- Saving R factor to \"" << filename << "\"" << endl;
-        }
-        std::ofstream fileOut (filename.c_str ());
-        print_local_matrix (fileOut, ncols, ncols, R.data (), R.stride (1));
-        fileOut.close ();
-      }
-
-      actor.explicit_Q (nrows, ncols, A_copy.data(), lda, factorOutput,
-                        ncols, Q.data(), Q.stride(1), contiguous_cache_blocks);
-      if (b_debug) {
-        cerr << "-- Finished SequentialTsqr::explicit_Q" << endl;
-      }
-
-      // "Un"-cache-block the output, if contiguous cache blocks were
-      // used.  This is only necessary because local_verify() doesn't
-      // currently support contiguous cache blocks.
-      if (contiguous_cache_blocks) {
-        // Use A_copy as temporary storage for un-cache-blocking Q.
-        actor.un_cache_block (nrows, ncols, A_copy.data(), A_copy.stride(1), Q.data());
-        deep_copy (Q, A_copy);
-        if (b_debug) {
-          cerr << "-- Un-cache-blocked output Q factor" << endl;
-        }
-      }
-
-      if (save_matrices) {
-        string filename = "Q_" + shortDatatype + ".txt";
-        if (b_debug) {
-          cerr << "-- Saving Q factor to \"" << filename << "\"" << endl;
-        }
-        std::ofstream fileOut (filename.c_str());
-        print_local_matrix (fileOut, nrows, ncols, Q.data(), Q.stride(1));
-        fileOut.close();
-      }
-
-      // Print out the R factor
-      if (false && b_debug) {
-        cerr << endl << "-- R factor:" << endl;
-        print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1));
-        cerr << endl;
-      }
-
-      // Validate the factorization
-      vector< magnitude_type > results =
-        local_verify (nrows, ncols, A.data(), lda, Q.data(), ldq, R.data(), ldr);
-      if (b_debug) {
-        cerr << "-- Finished local_verify" << endl;
-      }
-
-      // Print the results
-      if (human_readable) {
-        out << "Sequential cache-blocked TSQR:" << endl
-            << "Scalar type: " << datatype << endl
-            << "Matrix dimensions: " << nrows << " by " << ncols << endl
-            << "Cache size hint in bytes: " << actor.cache_size_hint() << endl
-            << "Number of cache blocks: " << numCacheBlocks << endl
-            << "Contiguous cache blocks? " << contiguous_cache_blocks << endl
-            << "Absolute residual $\\| A - QR \\|_F$: " << results[0] << endl
-            << "Absolute orthogonality $\\| I - Q^* Q \\|_F$: " << results[1] << endl
-            << "Test matrix norm $\\| A \\|_F$: " << results[2] << endl
-            << endl << endl;
-      }
-      else {
-        if (printFieldNames) {
-          const char prefix[] = "%";
-          out << prefix
-              << "method"
-              << ",scalarType"
-              << ",numRows"
-              << ",numCols"
-              << ",cacheSizeHint"
-              << ",contiguousCacheBlocks"
-              << ",absFrobResid"
-              << ",absFrobOrthog"
-              << ",frobA";
-          if (! additionalFieldNames.empty())
-            out << "," << additionalFieldNames;
-          out << endl;
-        }
-        out << "SeqTSQR"
-            << "," << datatype
-            << "," << nrows
-            << "," << ncols
-            << "," << actor.cache_size_hint()
-            << "," << contiguous_cache_blocks
-            << "," << results[0]
-            << "," << results[1]
-            << "," << results[2];
-        if (! additionalData.empty ()) {
-          out << "," << additionalData;
-        }
-        out << endl;
-      }
-    }
-
-
-    void
-    verifySeqTsqr (std::ostream& out,
-                   const int nrows,
-                   const int ncols,
-                   const size_t cache_size_hint,
-                   const bool test_complex_arithmetic,
-                   const bool save_matrices,
-                   const bool contiguous_cache_blocks,
-                   const std::string& additionalFieldNames,
-                   const std::string& additionalData,
-                   const bool printFieldNames,
-                   const bool human_readable,
-                   const bool b_debug)
-    {
-      using TSQR::Random::NormalGenerator;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      using std::complex;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-      using std::string;
-      using std::vector;
-
-      //
-      // We do tests one after another, using the seed from the
-      // previous test in the current test, so that the pseudorandom
-      // streams used by the tests are independent.
-      //
-
-      // On output: Seed for the next pseudorandom number generator.
-      vector< int > iseed(4);
-      string datatype; // name of the current datatype being tested
-      string shortDatatype; // one-letter version of datatype
-
-      // First test.  The PRNG seeds itself with a default value.
-      // This will be the same each time, so if you want
-      // nondeterministic behavior, you should pick the seed values
-      // yourself.  Only print field names (if at all) for the first
-      // data type tested; field names are only printed if output is
-      // not human_readable.
-      NormalGenerator< int, float > normgenS;
-      datatype = "float";
-      shortDatatype = "S";
-      verifySeqTsqrTemplate (out, normgenS, datatype, shortDatatype, nrows, ncols,
-                             cache_size_hint, contiguous_cache_blocks,
-                             save_matrices, additionalFieldNames, additionalData,
-                             printFieldNames, human_readable, b_debug);
-      // Fetch the pseudorandom seed from the previous test.
-      normgenS.getSeed (iseed);
-      NormalGenerator< int, double > normgenD (iseed);
-      // Next test.
-      datatype = "double";
-      shortDatatype = "D";
-      verifySeqTsqrTemplate (out, normgenD, datatype, shortDatatype, nrows, ncols,
-                             cache_size_hint, contiguous_cache_blocks,
-                             save_matrices, additionalFieldNames, additionalData,
-                             printFieldNames, human_readable, b_debug);
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      if (test_complex_arithmetic) {
-        normgenD.getSeed (iseed);
-        NormalGenerator< int, complex<float> > normgenC (iseed);
-        datatype = "complex<float>";
-        shortDatatype = "C";
-        verifySeqTsqrTemplate (out, normgenC, datatype, shortDatatype, nrows, ncols,
-                               cache_size_hint, contiguous_cache_blocks,
-                               save_matrices, additionalFieldNames, additionalData,
-                               printFieldNames, human_readable, b_debug);
-        normgenC.getSeed (iseed);
-        NormalGenerator< int, complex<double> > normgenZ (iseed);
-        datatype = "complex<double>";
-        shortDatatype = "Z";
-        verifySeqTsqrTemplate (out, normgenZ, datatype, shortDatatype, nrows, ncols,
-                               cache_size_hint, contiguous_cache_blocks,
-                               save_matrices, additionalFieldNames, additionalData,
-                               printFieldNames, human_readable, b_debug);
-      }
-#else // HAVE_KOKKOSTSQR_COMPLEX
-      if (test_complex_arithmetic) {
-        throw std::logic_error ("Trilinos was not built with "
-                                "complex arithmetic support");
-      }
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-    }
-
-
-
-    template< class Ordinal, class Scalar >
-    static void
-    verifyLapackTemplate (std::ostream& out,
-                          TSQR::Random::NormalGenerator<Ordinal, Scalar>& generator,
-                          const std::string& datatype,
-                          const Ordinal nrows,
-                          const Ordinal ncols,
-                          const std::string& additionalFieldNames,
-                          const std::string& additionalData,
-                          const bool printFieldNames,
-                          const bool human_readable,
-                          const bool b_debug)
-    {
-      typedef Teuchos::ScalarTraits<Scalar> STS;
-      typedef typename STS::magnitudeType magnitude_type;
-      using std::ostringstream;
-      using std::cerr;
-      using std::endl;
-
-      Impl::Lapack<Scalar> lapack;
-
-      if (b_debug) {
-        cerr << "LAPACK test problem:" << endl
-             << "* " << nrows << " x " << ncols << endl;
-      }
-
-      Matrix<Ordinal, Scalar> A (nrows, ncols);
-      Matrix<Ordinal, Scalar> A_copy (nrows, ncols);
-      Matrix<Ordinal, Scalar> Q (nrows, ncols);
-      Matrix<Ordinal, Scalar> R (ncols, ncols);
-      if (std::numeric_limits<Scalar>::has_quiet_NaN) {
-        deep_copy (A, std::numeric_limits< Scalar>::quiet_NaN());
-        deep_copy (A_copy, std::numeric_limits<Scalar>::quiet_NaN());
-        deep_copy (Q, std::numeric_limits<Scalar>::quiet_NaN());
-        deep_copy (R, std::numeric_limits<Scalar>::quiet_NaN());
-      }
-      const Ordinal lda = nrows;
-      const Ordinal ldq = nrows;
-      const Ordinal ldr = ncols;
-
-      // Create a test problem
-      nodeTestProblem (generator, nrows, ncols,
-                       A.data (), A.stride (1), true);
-      if (b_debug) {
-        cerr << "-- Generated test problem" << endl;
-      }
-
-      // Copy A into A_copy, since LAPACK QR overwrites the input.
-      deep_copy (A_copy, A);
-      if (b_debug) {
-        cerr << "-- Copied test problem from A into A_copy" << endl;
-      }
-
-      // Now determine the required workspace for the factorization.
-      const Ordinal lwork =
-        lworkQueryLapackQr (lapack, nrows, ncols, A_copy.stride (1));
-      std::vector<Scalar> work (lwork);
-      std::vector<Scalar> tau (ncols);
-
-      // Fill R with zeros, since the factorization may not overwrite
-      // the strict lower triangle of R.
-      deep_copy (R, Scalar {});
-
-      lapack.compute_QR (nrows, ncols, A_copy.data(), A_copy.stride(1),
-                         tau.data(), work.data(), lwork);
-      // Copy out the R factor from A_copy (where we computed the QR
-      // factorization in place) into R.
-      copy_upper_triangle (ncols, ncols, R.data(), ldr, A_copy.data(), lda);
-
-      if (b_debug) {
-        cerr << endl << "-- R factor:" << endl;
-        print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1));
-        cerr << endl;
-      }
-
-      // The explicit Q factor will be computed in place, so copy the
-      // result of the factorization into Q.
-      deep_copy (Q, A_copy);
-
-      lapack.compute_explicit_Q (nrows, ncols, ncols, Q.data(), ldq,
-                                 tau.data(), work.data(), lwork);
-
-      // Validate the factorization
-      std::vector<magnitude_type> results =
-        local_verify (nrows, ncols, A.data(), lda, Q.data(), ldq,
-                      R.data(), ldr);
-
-      // Print the results
-      if (human_readable) {
-        out << "LAPACK QR (DGEQRF and DUNGQR):" << endl
-            << "Scalar type: " << datatype << endl
-            << "Absolute residual $\\| A - QR \\|_F$: " << results[0] << endl
-            << "Absolute orthogonality $\\| I - Q^* Q \\|_F$: " << results[1] << endl
-            << "Test matrix norm $\\| A \\|_F$: " << results[2] << endl
-            << endl << endl;
-      }
-      else {
-        if (printFieldNames) {
-          const char prefix[] = "%";
-          out << prefix
-              << "method"
-              << ",scalarType"
-              << ",numRows"
-              << ",numCols"
-              << ",cacheSizeHint"
-              << ",contiguousCacheBlocks"
-              << ",absFrobResid"
-              << ",absFrobOrthog"
-              << ",frobA";
-          if (! additionalFieldNames.empty ()) {
-            out << "," << additionalFieldNames;
-          }
-          out << endl;
-        }
-        out << "LAPACK"
-            << "," << datatype
-            << "," << nrows
-            << "," << ncols
-            << "," << size_t(0) // cache_size_hint
-            << "," << false     // contiguous_cache_blocks
-            << "," << results[0]
-            << "," << results[1]
-            << "," << results[2];
-        if (! additionalData.empty ()) {
-          out << "," << additionalData;
-        }
-        out << endl;
-      }
-    }
-
-
-    void
-    verifyLapack (std::ostream& out,
-                  const int nrows,
-                  const int ncols,
-                  const bool test_complex_arithmetic,
-                  const std::string& additionalFieldNames,
-                  const std::string& additionalData,
-                  const bool printFieldNames,
-                  const bool human_readable,
-                  const bool b_debug)
-    {
-      using TSQR::Random::NormalGenerator;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      using std::complex;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-      using std::string;
-      using std::vector;
-
-      //
-      // We do tests one after another, using the seed from the
-      // previous test in the current test, so that the pseudorandom
-      // streams used by the tests are independent.
-      //
-
-      // On output: Seed for the next pseudorandom number generator.
-      vector< int > iseed(4);
-      string datatype; // name of the current datatype being tested
-
-      // First test.  The PRNG seeds itself with a default value.
-      // This will be the same each time, so if you want
-      // nondeterministic behavior, you should pick the seed values
-      // yourself.
-      NormalGenerator< int, float > normgenS;
-      datatype = "float";
-      verifyLapackTemplate (out, normgenS, datatype, nrows, ncols,
-                            additionalFieldNames, additionalData,
-                            printFieldNames, human_readable, b_debug);
-      // Fetch the pseudorandom seed from the previous test.
-      normgenS.getSeed (iseed);
-      NormalGenerator< int, double > normgenD (iseed);
-      // Next test.
-      datatype = "double";
-      verifyLapackTemplate (out, normgenD, datatype, nrows, ncols,
-                            additionalFieldNames, additionalData,
-                            false, human_readable, b_debug);
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      if (test_complex_arithmetic) {
-        normgenD.getSeed (iseed);
-        NormalGenerator< int, complex<float> > normgenC (iseed);
-        datatype = "complex<float>";
-        verifyLapackTemplate (out, normgenC, datatype, nrows, ncols,
-                              additionalFieldNames, additionalData,
-                              false, human_readable, b_debug);
-        normgenC.getSeed (iseed);
-        NormalGenerator< int, complex<double> > normgenZ (iseed);
-        datatype = "complex<double>";
-        verifyLapackTemplate (out, normgenZ, datatype, nrows, ncols,
-                              additionalFieldNames, additionalData,
-                              false, human_readable, b_debug);
-      }
-#else // HAVE_KOKKOSTSQR_COMPLEX
-      if (test_complex_arithmetic) {
-        throw std::logic_error ("Trilinos was not built with "
-                                "complex arithmetic support");
-      }
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-    }
-
-    /// \class LapackBenchmarker
-    /// \brief Template version of LAPACK QR benchmark
-    ///
-    /// LAPACK QR benchmark, templated on Ordinal, Scalar, and
-    /// TimerType.
-    template< class Ordinal, class Scalar, class TimerType >
-    class LapackBenchmarker {
-    public:
-      typedef Ordinal ordinal_type;
-      typedef Scalar scalar_type;
-
-      /// \brief Constructor
-      ///
-      /// \param scalarTypeName [in] Human-readable name of the Scalar
-      ///   type.
-      /// \param out [out] Reference to the output stream (e.g.,
-      ///   std::cout) to which to write benchmark results.
-      /// \param humanReadable [in] Whether to print results to out in
-      ///   a verbose human-readable way, or in a way that is easy to
-      ///   parse with a script.  In either case, the results will be
-      ///   printed in ASCII format.
-      LapackBenchmarker (const std::string& scalarTypeName,
-                         std::ostream& out = std::cout,
-                         const bool humanReadable = false) :
-        scalarTypeName_ (scalarTypeName),
-        out_ (out),
-        humanReadable_ (humanReadable)
-      {
-        TSQR::Test::verifyTimerConcept< TimerType >();
-      }
-
-      void
-      benchmark (const int numTrials,
-                 const Ordinal numRows,
-                 const Ordinal numCols,
-                 const std::string& additionalFieldNames,
-                 const std::string& additionalData,
-                 const bool printFieldNames)
-      {
-        Matrix<Ordinal, Scalar> A (numRows, numCols);
-        Matrix<Ordinal, Scalar> Q (numRows, numCols);
-        Matrix<Ordinal, Scalar> R (numCols, numCols);
-        const Ordinal lda = numRows;
-        const Ordinal ldq = numRows;
-        const Ordinal ldr = numCols;
-
-        // Create a test problem
-        nodeTestProblem (gen_, numRows, numCols, A.data(), lda, false);
-
-        // Copy A into Q, since LAPACK QR overwrites the input.  We only
-        // need Q because LAPACK's computation of the explicit Q factor
-        // occurs in place.  This doesn't work with TSQR.  To give
-        // LAPACK QR the fullest possible advantage over TSQR, we don't
-        // allocate an A_copy here (as we would when benchmarking TSQR).
-        deep_copy (Q, A);
-
-        // Determine the required workspace for the factorization
-        const Ordinal lwork = lworkQueryLapackQr (lapack_, numRows, numCols, lda);
-        std::vector<Scalar> work (lwork);
-        std::vector<Scalar> tau (numCols);
-
-        // Benchmark LAPACK's QR factorization for numTrials trials.
-        //
-        // Name of timer doesn't matter here; we only need the timing.
-        TimerType timer("LAPACK");
-        timer.start();
-        for (int trialNum = 0; trialNum < numTrials; ++trialNum) {
-          lapack_.compute_QR (numRows, numCols,
-                              Q.data(), ldq, tau.data(),
-                              work.data(), lwork);
-          // Extract the upper triangular factor R from Q (where it
-          // was computed in place by GEQRF), since UNGQR will
-          // overwrite all of Q with the explicit Q factor.
-          copy_upper_triangle (numRows, numCols, R.data(), ldr,
-                               Q.data(), ldq);
-          lapack_.compute_explicit_Q (numRows, numCols, numCols,
-                                      Q.data(), ldq, tau.data(),
-                                      work.data(), lwork);
-        }
-        const double lapackTiming = timer.stop();
-        reportResults (numTrials, numRows, numCols, lapackTiming,
-                       additionalFieldNames, additionalData, printFieldNames);
-      }
-
-
-    private:
-      //! Wrapper around LAPACK routines.
-      Impl::Lapack<Scalar> lapack_;
-
-      /// \brief Pseudorandom normal(0,1) generator.
-      ///
-      /// Default seed is OK, because this is a benchmark, not an
-      /// accuracy test.
-      TSQR::Random::NormalGenerator< ordinal_type, scalar_type > gen_;
-
-      //! Human-readable string representation of the Scalar type.
-      std::string scalarTypeName_;
-
-      //! Output stream to which to print benchmark results.
-      std::ostream& out_;
-
-      /// \brief Whether results should be printed in a human-readable way,
-      ///
-      /// rather than a way easily parsed by a script.
-      bool humanReadable_;
-
-      /// \brief Report benchmark results to out_
-      void
-      reportResults (const int numTrials,
-                     const Ordinal numRows,
-                     const Ordinal numCols,
-                     const double lapackTiming,
-                     const std::string& additionalFieldNames,
-                     const std::string& additionalData,
-                     const bool printFieldNames)
-      {
-        using std::endl;
-        if (humanReadable_) {
-          out_ << "LAPACK\'s QR factorization (_GEQRF + _UNGQR):" << endl
-               << "Scalar type = " << scalarTypeName_ << endl
-               << "# rows = " << numRows << endl
-               << "# columns = " << numCols << endl
-               << "# trials = " << numTrials << endl
-               << "Total time (s) = " << lapackTiming << endl
-               << endl;
-        }
-        else {
-          if (printFieldNames) {
-            const char prefix[] = "%";
-            out_ << prefix
-                 << "method"
-                 << ",scalarType"
-                 << ",numRows"
-                 << ",numCols"
-                 << ",cacheSizeHint"
-                 << ",contiguousCacheBlocks"
-                 << ",numTrials"
-                 << ",timing";
-            if (! additionalFieldNames.empty ()) {
-              out_ << "," << additionalFieldNames;
-            }
-            out_ << endl;
-          }
-          // "0" refers to the cache size hint, which is not
-          // applicable in this case; we retain it for easy
-          // comparison of results with SequentialTsqr (so that the
-          // number of fields is the same in both cases).  "false"
-          // (that follows 0) refers to whether or not contiguous
-          // cache blocks were used (see TSQR::SequentialTsqr); this
-          // is also not applicable in this case.
-          out_ << "LAPACK"
-               << "," << scalarTypeName_
-               << "," << numRows
-               << "," << numCols
-               << "," << 0
-               << "," << false
-               << "," << numTrials
-               << "," << lapackTiming;
-          if (! additionalData.empty ()) {
-            out_ << "," << additionalData;
-          }
-          out_ << endl;
-        }
-      }
-    };
-
-
-    void
-    benchmarkLapack (std::ostream& out,
-                     const int numRows,
-                     const int numCols,
-                     const int numTrials,
-                     const bool testComplex,
-                     const std::string& additionalFieldNames,
-                     const std::string& additionalData,
-                     const bool printFieldNames,
-                     const bool humanReadable)
-    {
-      typedef Teuchos::Time timer_type;
-      const bool testReal = true;
-      using std::string;
-
-      // Only print field names (if at all) for the first data type tested.
-      bool printedFieldNames = false;
-
-      if (testReal) {
-        { // Scalar=float
-          typedef LapackBenchmarker< int, float, timer_type > benchmark_type;
-          string scalarTypeName ("float");
-          benchmark_type widget (scalarTypeName, out, humanReadable);
-          widget.benchmark (numTrials, numRows, numCols,
-                            additionalFieldNames, additionalData,
-                            printFieldNames && ! printedFieldNames);
-          if (printFieldNames && ! printedFieldNames) {
-            printedFieldNames = true;
-          }
-        }
-        { // Scalar=double
-          typedef LapackBenchmarker< int, double, timer_type > benchmark_type;
-          string scalarTypeName ("double");
-          benchmark_type widget (scalarTypeName, out, humanReadable);
-          widget.benchmark (numTrials, numRows, numCols,
-                            additionalFieldNames, additionalData,
-                            printFieldNames && ! printedFieldNames);
-          if (printFieldNames && ! printedFieldNames) {
-            printedFieldNames = true;
-          }
-        }
-      }
-
-      if (testComplex) {
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-        using std::complex;
-        { // Scalar=complex<float>
-          typedef LapackBenchmarker< int, complex<float>, timer_type > benchmark_type;
-          string scalarTypeName ("complex<float>");
-          benchmark_type widget (scalarTypeName, out, humanReadable);
-          widget.benchmark (numTrials, numRows, numCols,
-                            additionalFieldNames, additionalData,
-                            printFieldNames && ! printedFieldNames);
-          if (printFieldNames && ! printedFieldNames) {
-            printedFieldNames = true;
-          }
-        }
-        { // Scalar=complex<double>
-          typedef LapackBenchmarker<int, complex<double>, timer_type> benchmark_type;
-          string scalarTypeName ("complex<double>");
-          benchmark_type widget (scalarTypeName, out, humanReadable);
-          widget.benchmark (numTrials, numRows, numCols,
-                            additionalFieldNames, additionalData,
-                            printFieldNames && ! printedFieldNames);
-          if (printFieldNames && ! printedFieldNames) {
-            printedFieldNames = true;
-          }
-        }
-#else // Don't HAVE_KOKKOSTSQR_COMPLEX
-        throw std::logic_error ("Trilinos was not built with "
-                                "complex arithmetic support");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-      }
-    }
-
-
-
-    /// \class SeqTsqrBenchmarker
-    /// \brief Template version of SequentialTsqr benchmark.
-    ///
-    /// SequentialTsqr benchmark, templated on Ordinal, Scalar, and
-    /// TimerType.
-    template<class Ordinal, class Scalar, class TimerType>
-    class SeqTsqrBenchmarker {
-    public:
-      typedef Ordinal ordinal_type;
-      typedef Scalar scalar_type;
-
-      /// \brief Constructor
-      ///
-      /// \param scalarTypeName [in] Human-readable name of the Scalar
-      ///   type.
-      /// \param out [out] Reference to the output stream (e.g.,
-      ///   std::cout) to which to write benchmark results.
-      /// \param humanReadable [in] Whether to print results to out in
-      ///   a verbose human-readable way, or in a way that is easy to
-      ///   parse with a script.  In either case, the results will be
-      ///   printed in ASCII format.
-      SeqTsqrBenchmarker (const std::string& scalarTypeName,
-                          std::ostream& out = std::cout,
-                          const bool humanReadable = false) :
-        scalarTypeName_ (scalarTypeName),
-        out_ (out),
-        humanReadable_ (humanReadable)
-      {
-        // Make sure that TimerType satisfies the required interface.
-        TSQR::Test::verifyTimerConcept<TimerType>();
-      }
-
-      void
-      benchmark (const int numTrials,
-                 const Ordinal numRows,
-                 const Ordinal numCols,
-                 const size_t cacheSizeHint,
-                 const bool contiguousCacheBlocks,
-                 const std::string& additionalFieldNames,
-                 const std::string& additionalData,
-                 const bool printFieldNames)
-      {
-        SequentialTsqr<Ordinal, Scalar> actor (cacheSizeHint);
-
-        Matrix<Ordinal, Scalar> A (numRows, numCols);
-        Matrix<Ordinal, Scalar> A_copy (numRows, numCols);
-        Matrix<Ordinal, Scalar> Q (numRows, numCols);
-        Matrix<Ordinal, Scalar> R (numCols, numCols);
-        const Ordinal lda = numRows;
-        const Ordinal ldq = numRows;
-
-        // Create a test problem
-        nodeTestProblem (gen_, numRows, numCols, A.data(), lda, false);
-
-        // Copy A into A_copy, since TSQR overwrites the input
-        deep_copy (A_copy, A);
-
-        // Benchmark sequential TSQR for numTrials trials.
-        //
-        // Name of timer doesn't matter here; we only need the timing.
-        TimerType timer("SeqTSQR");
-        timer.start();
-        for (int trialNum = 0; trialNum < numTrials; ++trialNum) {
-          // Factor the matrix and extract the resulting R factor
-          auto factorOutput =
-            actor.factor (numRows, numCols, A_copy.data(), lda,
-                          R.data(), R.stride(1), contiguousCacheBlocks);
-          // Compute the explicit Q factor.  Unlike with LAPACK QR,
-          // this doesn't happen in place: the implicit Q factor is
-          // stored in A_copy, and the explicit Q factor is written to
-          // Q.
-          actor.explicit_Q (numRows, numCols, A_copy.data(), lda, factorOutput,
-                            numCols, Q.data(), ldq, contiguousCacheBlocks);
-        }
-        const double seqTsqrTiming = timer.stop();
-        reportResults (numTrials, numRows, numCols, actor.cache_size_hint(),
-                       contiguousCacheBlocks, seqTsqrTiming,
-                       additionalFieldNames, additionalData, printFieldNames);
-      }
-
-
-    private:
-      /// \brief Pseudorandom normal(0,1) generator.
-      ///
-      /// Default seed is OK, because this is a benchmark, not an
-      /// accuracy test.
-      TSQR::Random::NormalGenerator<ordinal_type, scalar_type> gen_;
-
-      //! Human-readable string representation of the Scalar type.
-      std::string scalarTypeName_;
-
-      //! Output stream to which to print benchmark results.
-      std::ostream& out_;
-
-      /// \brief Whether results should be printed in a human-readable way,
-      ///
-      /// as opposed to a way easily parsed by a script.
-      bool humanReadable_;
-
-      //! Report benchmark results to out_
-      void
-      reportResults (const int numTrials,
-                     const Ordinal numRows,
-                     const Ordinal numCols,
-                     const size_t actualCacheSizeHint,
-                     const bool contiguousCacheBlocks,
-                     const double seqTsqrTiming,
-                     const std::string& additionalFieldNames,
-                     const std::string& additionalData,
-                     const bool printFieldNames)
-      {
-        using std::endl;
-        if (humanReadable_) {
-          out_ << "Sequential (cache-blocked) TSQR:" << endl
-               << "Scalar type = " << scalarTypeName_ << endl
-               << "# rows = " << numRows << endl
-               << "# columns = " << numCols << endl
-               << "cache size hint in bytes = " << actualCacheSizeHint << endl
-               << "contiguous cache blocks? " << contiguousCacheBlocks << endl
-               << "# trials = " << numTrials << endl
-               << "Total time (s) = " << seqTsqrTiming << endl
-               << endl;
-        }
-        else {
-          if (printFieldNames) {
-            const char prefix[] = "%";
-            out_ << prefix
-                 << "method"
-                 << ",scalarType"
-                 << ",numRows"
-                 << ",numCols"
-                 << ",cacheSizeHint"
-                 << ",contiguousCacheBlocks"
-                 << ",numTrials"
-                 << ",timing";
-            if (! additionalFieldNames.empty ()) {
-              out_ << "," << additionalFieldNames;
-            }
-            out_ << endl;
-          }
-          out_ << "SeqTSQR"
-               << "," << scalarTypeName_
-               << "," << numRows
-               << "," << numCols
-               << "," << actualCacheSizeHint
-               << "," << contiguousCacheBlocks
-               << "," << numTrials
-               << "," << seqTsqrTiming;
-          if (! additionalData.empty ()) {
-            out_ << "," << additionalData;
-          }
-          out_ << endl;
-        }
-      }
-    };
-
-
-    void
-    benchmarkSeqTsqr (std::ostream& out,
-                      const int numRows,
-                      const int numCols,
-                      const int numTrials,
-                      const size_t cacheSizeHint,
-                      const bool contiguousCacheBlocks,
-                      const bool testComplex,
-                      const std::string& additionalFieldNames,
-                      const std::string& additionalData,
-                      const bool printFieldNames,
-                      const bool humanReadable)
-    {
-      typedef Teuchos::Time timer_type;
-      const bool testReal = true;
-      using std::string;
-
-      // Only print field names (if at all) for the first data type tested.
-      bool printedFieldNames = false;
-
-      if (testReal) {
-        { // Scalar=float
-          typedef SeqTsqrBenchmarker<int, float, timer_type> benchmark_type;
-          string scalarTypeName ("float");
-          benchmark_type widget (scalarTypeName, out, humanReadable);
-          widget.benchmark (numTrials, numRows, numCols, cacheSizeHint,
-                            contiguousCacheBlocks,
-                            additionalFieldNames, additionalData,
-                            printFieldNames && ! printedFieldNames);
-          if (printFieldNames && ! printedFieldNames) {
-            printedFieldNames = true;
-          }
-        }
-        { // Scalar=double
-          typedef SeqTsqrBenchmarker< int, double, timer_type > benchmark_type;
-          string scalarTypeName ("double");
-          benchmark_type widget (scalarTypeName, out, humanReadable);
-          widget.benchmark (numTrials, numRows, numCols, cacheSizeHint,
-                            contiguousCacheBlocks,
-                            additionalFieldNames, additionalData,
-                            printFieldNames && ! printedFieldNames);
-          if (printFieldNames && ! printedFieldNames) {
-            printedFieldNames = true;
-          }
-        }
-      }
-
-      if (testComplex) {
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-        using std::complex;
-        { // Scalar=complex<float>
-          typedef SeqTsqrBenchmarker< int, complex<float>, timer_type > benchmark_type;
-          string scalarTypeName ("complex<float>");
-          benchmark_type widget (scalarTypeName, out, humanReadable);
-          widget.benchmark (numTrials, numRows, numCols, cacheSizeHint,
-                            contiguousCacheBlocks,
-                            additionalFieldNames, additionalData,
-                            printFieldNames && ! printedFieldNames);
-          if (printFieldNames && ! printedFieldNames) {
-            printedFieldNames = true;
-          }
-        }
-        { // Scalar=complex<double>
-          typedef SeqTsqrBenchmarker< int, complex<double>, timer_type > benchmark_type;
-          string scalarTypeName ("complex<double>");
-          benchmark_type widget (scalarTypeName, out, humanReadable);
-          widget.benchmark (numTrials, numRows, numCols, cacheSizeHint,
-                            contiguousCacheBlocks,
-                            additionalFieldNames, additionalData,
-                            printFieldNames && ! printedFieldNames);
-          if (printFieldNames && ! printedFieldNames) {
-            printedFieldNames = true;
-          }
-        }
-#else // Don't HAVE_KOKKOSTSQR_COMPLEX
-        throw std::logic_error ("Trilinos was not built with "
-                                "complex arithmetic support");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-      }
-    }
-
-
-
-  } // namespace Test
-} // namespace TSQR
diff --git a/packages/tpetra/tsqr/src/Tsqr_SeqTest.hpp b/packages/tpetra/tsqr/src/Tsqr_SeqTest.hpp
deleted file mode 100644
index 9f290c2e9c53..000000000000
--- a/packages/tpetra/tsqr/src/Tsqr_SeqTest.hpp
+++ /dev/null
@@ -1,133 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_Test_SeqTest_hpp
-#define __TSQR_Test_SeqTest_hpp
-
-#include "Tsqr_ConfigDefs.hpp"
-#include <cstring> // size_t definition
-#include <string>
-#include <iostream>
-
-namespace TSQR {
-  namespace Test {
-    /// \brief Test accuracy of SequentialTsqr.
-    ///
-    /// Test the accuracy of our sequential TSQR implementation
-    /// (SequentialTsqr), on an nrows by ncols matrix, using the given
-    /// cache size hint (in bytes).  Print the results to the given
-    /// output stream out.
-    void
-    verifySeqTsqr (std::ostream& out,
-                   const int nrows,
-                   const int ncols,
-                   const size_t cache_size_hint,
-                   const bool test_complex_arithmetic,
-                   const bool save_matrices,
-                   const bool contiguous_cache_blocks,
-                   const std::string& additionalFieldNames,
-                   const std::string& additionalData,
-                   const bool printFieldNames,
-                   const bool human_readable = false,
-                   const bool b_debug = false);
-
-    /// \brief Test accuracy of LAPACK's QR factorization.
-    ///
-    /// Test the accuracy of LAPACK's QR factorization (_GEQRF +
-    /// _ORGQR) on an nrows by ncols matrix, and print the results to
-    /// the given output stream out.
-    void
-    verifyLapack (std::ostream& out,
-                  const int nrows,
-                  const int ncols,
-                  const bool test_complex_arithmetic,
-                  const std::string& additionalFieldNames,
-                  const std::string& additionalData,
-                  const bool printFieldNames,
-                  const bool human_readable,
-                  const bool b_debug = false);
-
-    /// \brief Test performance of SequentialTsqr.
-    ///
-    /// Test the run time over ntrials trials of sequential TSQR, on
-    /// an nrows by ncols matrix (using the given cache block size (in
-    /// bytes)), and print the results to the given output stream out.
-    ///
-    /// \param human_readable [in] If true, print the benchmark
-    ///   results to stdout in human-readable format.  Otherwise,
-    ///   print them as two rows of comma-delimited ASCII, in an
-    ///   abbreviated format suitable for automatic processing.
-    void
-    benchmarkSeqTsqr (std::ostream& out,
-                      const int numRows,
-                      const int numCols,
-                      const int numTrials,
-                      const size_t cacheSizeHint,
-                      const bool contiguousCacheBlocks,
-                      const bool testComplex,
-                      const std::string& additionalFieldNames,
-                      const std::string& additionalData,
-                      const bool printFieldNames,
-                      const bool humanReadable);
-
-    /// \brief Test performance of LAPACK's QR factorization.
-    ///
-    /// Test the run time over numTrials trials of LAPACK QR (_GEQRF +
-    /// _ORGQR), on a numRows by numCols matrix, and print the results
-    /// to the given output stream out.
-    ///
-    /// \param humanReadable [in] If true, print the benchmark results
-    ///   to out in human-readable format.  Otherwise, print them as
-    ///   two rows of comma-delimited ASCII, in an abbreviated format
-    ///   suitable for automatic processing.
-    void
-    benchmarkLapack (std::ostream& out,
-                     const int numRows,
-                     const int numCols,
-                     const int numTrials,
-                     const bool testComplex,
-                     const std::string& additionalFieldNames,
-                     const std::string& additionalData,
-                     const bool printFieldNames,
-                     const bool humanReadable);
-
-  } // namespace Test
-} // namespace TSQR
-
-#endif // __TSQR_Test_SeqTest_hpp
diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp
index f768fe5ae898..aa305064776b 100644
--- a/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_SequentialCholeskyQR.hpp
@@ -147,7 +147,7 @@ namespace TSQR {
                    Scalar (1), A_cur.data (), A_cur.stride (1), A_cur.data (),
                    A_cur.stride (1), Scalar (0), ATA.data (), ATA.stride (1));
         // Process the remaining cache blocks in order.
-        while (! A_rest.empty ()) {
+        while (! empty (A_rest)) {
           A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
           // ATA := ATA + A_cur^T * A_cur
           //
@@ -178,7 +178,7 @@ namespace TSQR {
       {
         mat_view_type R_out (ncols, ncols, R, ldr);
         deep_copy (R_out, Scalar {});
-        copy_upper_triangle (ncols, ncols, R, ldr, ATA.data(), ATA.stride(1));
+        copy_upper_triangle (R, ATA);
       }
 
       // Compute A := A * R^{-1}.  We do this in place in A, using
@@ -202,7 +202,7 @@ namespace TSQR {
                    A_cur.data (), A_cur.stride (1));
 
         // Process the remaining cache blocks in order.
-        while (! A_rest.empty ()) {
+        while (! empty (A_rest)) {
           A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
           blas.TRSM (RIGHT_SIDE, UPPER_TRI, NO_TRANS, NON_UNIT_DIAG,
                      A_cur.extent (0), ncols,
@@ -225,25 +225,25 @@ namespace TSQR {
                 const LocalOrdinal ncols_C,
                 Scalar C[],
                 const LocalOrdinal ldc,
-                const bool contiguous_cache_blocks = false)
+                const bool contiguousCacheBlocks = false)
     {
       if (ncols_Q != ncols_C)
         throw std::logic_error("SequentialCholeskyQR::explicit_Q() "
                                "does not work if ncols_C != ncols_Q");
       const LocalOrdinal ncols = ncols_Q;
 
-      if (contiguous_cache_blocks) {
+      if (contiguousCacheBlocks) {
         CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols,
                                                     strategy_);
         mat_view_type C_rest (nrows, ncols, C, ldc);
         const_mat_view_type Q_rest (nrows, ncols, Q, ldq);
 
         mat_view_type C_cur =
-          blocker.split_top_block (C_rest, contiguous_cache_blocks);
+          blocker.split_top_block (C_rest, contiguousCacheBlocks);
         const_mat_view_type Q_cur =
-          blocker.split_top_block (Q_rest, contiguous_cache_blocks);
+          blocker.split_top_block (Q_rest, contiguousCacheBlocks);
 
-        while (! C_rest.empty ()) {
+        while (! empty (C_rest)) {
           deep_copy (Q_cur, C_cur);
         }
       }
@@ -253,7 +253,6 @@ namespace TSQR {
       }
     }
 
-
     /// Cache-block the given A_in matrix, writing the results to A_out.
     void
     cache_block (const LocalOrdinal nrows,
@@ -262,11 +261,10 @@ namespace TSQR {
                  const Scalar A_in[],
                  const LocalOrdinal lda_in) const
     {
-      CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
+      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
       blocker.cache_block (nrows, ncols, A_out, A_in, lda_in);
     }
 
-
     /// "Un"-cache-block the given A_in matrix, writing the results to A_out.
     void
     un_cache_block (const LocalOrdinal nrows,
diff --git a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp
index 0390be6c05f5..78cd2e91a84f 100644
--- a/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_SequentialTsqr.hpp
@@ -40,15 +40,14 @@
 /// \file Tsqr_SequentialTsqr.hpp
 /// \brief Implementation of the sequential cache-blocked part of TSQR.
 
-#ifndef __TSQR_Tsqr_SequentialTsqr_hpp
-#define __TSQR_Tsqr_SequentialTsqr_hpp
+#ifndef TSQR_SEQUENTIALTSQR_HPP
+#define TSQR_SEQUENTIALTSQR_HPP
 
 #include "Tsqr_ApplyType.hpp"
 #include "Tsqr_Matrix.hpp"
 #include "Tsqr_CacheBlockingStrategy.hpp"
 #include "Tsqr_CacheBlocker.hpp"
-#include "Tsqr_Combine.hpp"
-#include "Tsqr_LocalVerify.hpp"
+#include "Tsqr_Impl_CombineUser.hpp"
 #include "Tsqr_NodeTsqr.hpp"
 #include "Tsqr_Util.hpp"
 #include "Tsqr_Impl_SystemBlas.hpp"
@@ -60,10 +59,34 @@
 #include <limits>
 #include <sstream>
 #include <string>
-#include <utility> // std::pair
 #include <vector>
 
 namespace TSQR {
+  namespace Impl {
+    /// \brief SequentialTsqr::factor output: one tau array per cache block.
+    template<class LocalOrdinal, class Scalar>
+    class SequentialTsqrFactorOutput :
+      public NodeFactorOutput<LocalOrdinal, Scalar>
+    {
+    private:
+      using my_data_type = std::vector<std::vector<Scalar>>;
+      my_data_type data_;
+    public:
+      SequentialTsqrFactorOutput () = default;
+      ~SequentialTsqrFactorOutput () override = default;
+      //! Append tau by moving from it (no copy); tau is left moved-from.
+      void add_and_consume (std::vector<Scalar>&& tau) {
+        data_.emplace_back (std::move (tau));
+      }
+      typename my_data_type::const_iterator begin() const {
+        return data_.begin();
+      }
+      typename my_data_type::const_reverse_iterator rbegin() const {
+        return data_.rbegin();
+      }
+    };
+  } // namespace Impl
+
   /// \class SequentialTsqr
   /// \brief Sequential cache-blocked TSQR factorization.
   /// \author Mark Hoemmen
@@ -91,11 +114,12 @@ namespace TSQR {
   /// may be different on different architectures.
   ///
   /// SequentialTsqr is designed to be used as the "intranode TSQR"
-  /// part of the full TSQR implementation in \c Tsqr.  The \c Tsqr
-  /// class can use any of various intranode TSQR implementations.
+  /// part of the full TSQR implementation in Tsqr.  The Tsqr class
+  /// can use any of various intranode TSQR implementations.
   /// SequentialTsqr is an appropriate choice when running in MPI-only
-  /// mode.  Other intranode TSQR implementations, such as \c TbbTsqr,
-  /// are appropriate for hybrid parallelism (MPI + threads).
+  /// mode.  Other intranode TSQR implementations, such as TbbTsqr
+  /// (which has been removed temporarily), are appropriate for hybrid
+  /// parallelism (MPI + threads).
   ///
   /// SequentialTsqr is unlikely to benefit from a multithreaded BLAS
   /// implementation.  In fact, implementations of LAPACK's QR
@@ -103,28 +127,48 @@ namespace TSQR {
   /// multithreading when factoring tall skinny matrices.  (See our
   /// Supercomputing 2009 paper and my IPDPS 2011 paper.)  This is why
   /// we built other intranode TSQR factorizations that do effectively
-  /// exploit thread-level parallelism, such as \c TbbTsqr.
+  /// exploit thread-level parallelism, such as TbbTsqr.
   ///
-  /// \note To implementers: SequentialTsqr cannot currently be a \c
+  /// \note To implementers: SequentialTsqr cannot currently be a
   ///   Teuchos::ParameterListAcceptorDefaultBase, because the latter
   ///   uses RCP, and RCPs (more specifically, their reference counts)
-  ///   are not currently thread safe.  \c TbbTsqr uses SequentialTsqr
-  ///   in parallel to implement each thread's cache-blocked TSQR.
-  ///   This can be fixed as soon as RCPs are made thread safe.
+  ///   are not currently thread safe.  TbbTsqr uses SequentialTsqr in
+  ///   parallel to implement each thread's cache-blocked TSQR.  This
+  ///   can be fixed as soon as RCPs are made thread safe.
   template<class LocalOrdinal, class Scalar>
   class SequentialTsqr :
-    public NodeTsqr<LocalOrdinal, Scalar, std::vector<std::vector<Scalar>>>
+    public NodeTsqr<LocalOrdinal, Scalar>,
+    private Impl::CombineUser<LocalOrdinal, Scalar>
   {
+  private:
+    using base_type = NodeTsqr<LocalOrdinal, Scalar>;
+    using my_factor_output_type =
+      Impl::SequentialTsqrFactorOutput<LocalOrdinal, Scalar>;
+
   public:
-    using ordinal_type = LocalOrdinal;
-    using scalar_type = Scalar;
-    using mat_view_type = MatView<LocalOrdinal, Scalar>;
-    using const_mat_view_type = MatView<LocalOrdinal, const Scalar>;
-    using magnitude_type = typename Teuchos::ScalarTraits<Scalar>::magnitudeType;
-    using FactorOutput = typename NodeTsqr<LocalOrdinal, Scalar,
-      std::vector<std::vector<Scalar>>>::factor_output_type;
+    using ordinal_type = typename base_type::ordinal_type;
+    using scalar_type = typename base_type::scalar_type;
+    using mat_view_type = typename base_type::mat_view_type;
+    using const_mat_view_type =
+      typename base_type::const_mat_view_type;
+    using magnitude_type = typename base_type::magnitude_type;
+    using factor_output_type = typename base_type::factor_output_type;
 
   private:
+    Combine<ordinal_type, scalar_type>&
+    getMyCombine (const ordinal_type /* maxNumCols */) const
+    {
+      // FIXME (mfh 20 Dec 2019) If SequentialTsqr has more than one
+      // cache block, it only passes tests if you use CombineNative.
+      // This likely explains why it fails with complex Scalar types,
+      // since CombineNative just uses CombineDefault in that case.  I
+      // tried making SequentialTsqr's implementation of
+      // QR_produces_R_factor_with_nonnegative_diagonal always return
+      // false, but that didn't help, so the issue likely is
+      // CombineDefault.
+      return this->getCombine ("CombineNative");
+    }
+
     /// \brief Factor the first cache block of the matrix.
     ///
     /// Compute the QR factorization of the first cache block A_top.
@@ -154,59 +198,14 @@ namespace TSQR {
     ///   R factor.
     mat_view_type
     factor_first_block (Combine<LocalOrdinal, Scalar>& combine,
-                        mat_view_type& A_top,
+                        const mat_view_type& A_top,
                         std::vector<Scalar>& tau,
-                        std::vector<Scalar>& work) const
-    {
-      const LocalOrdinal ncols = A_top.extent(1);
-      combine.factor_first (A_top, tau.data(), work.data());
-      return mat_view_type(ncols, ncols, A_top.data(), A_top.stride(1));
-    }
-
-    /// Apply the Q factor of the first (topmost) cache blocks, as
-    /// computed by factor_first_block() and stored implicitly in
-    /// Q_first and tau, to the first (topmost) block C_first of the
-    /// matrix C.
-    void
-    apply_first_block (Combine<LocalOrdinal, Scalar>& combine,
-                       const ApplyType& applyType,
-                       const const_mat_view_type& Q_first,
-                       const std::vector<Scalar>& tau,
-                       mat_view_type& C_first,
-                       std::vector<Scalar>& work) const
+                        Scalar work[],
+                        const LocalOrdinal lwork) const
     {
-      combine.apply_first (applyType, Q_first, tau.data(),
-                           C_first, work.data());
-    }
-
-    void
-    combine_apply (Combine<LocalOrdinal, Scalar>& combine,
-                   const ApplyType& apply_type,
-                   const const_mat_view_type& Q_cur,
-                   const std::vector<Scalar>& tau,
-                   mat_view_type& C_top,
-                   mat_view_type& C_cur,
-                   std::vector<Scalar>& work) const
-    {
-      const LocalOrdinal nrows_local = Q_cur.extent(0);
-      const LocalOrdinal ncols_Q = Q_cur.extent(1);
-      const LocalOrdinal ncols_C = C_cur.extent(1);
-
-      combine.apply_inner (apply_type,
-                           nrows_local, ncols_C, ncols_Q,
-                           Q_cur.data(), C_cur.stride(1), tau.data(),
-                           C_top.data(), C_top.stride(1),
-                           C_cur.data(), C_cur.stride(1), work.data());
-    }
-
-    void
-    combine_factor (Combine<LocalOrdinal, Scalar>& combine,
-                    mat_view_type& R,
-                    mat_view_type& A_cur,
-                    std::vector<Scalar>& tau,
-                    std::vector<Scalar>& work) const
-    {
-      combine.factor_inner (R, A_cur, tau.data(), work.data());
+      combine.factor_first (A_top, tau.data (), work, lwork);
+      const LocalOrdinal ncols = A_top.extent (1);
+      return partition_2x1 (A_top, ncols).first;
     }
 
   public:
@@ -276,14 +275,14 @@ namespace TSQR {
       setParameterList (params);
     }
 
-    /// \brief Valid default parameters for SequentialTsqr.
+    /// \brief List of valid parameters for SequentialTsqr.
     ///
     /// \note This object has to create a new parameter list each
     ///   time, since it cannot cache an RCP (due to thread safety --
     ///   TbbTsqr invokes multiple instances of SequentialTsqr in
     ///   parallel).
     Teuchos::RCP<const Teuchos::ParameterList>
-    getValidParameters () const
+    getValidParameters () const override
     {
       using Teuchos::ParameterList;
       using Teuchos::parameterList;
@@ -315,7 +314,7 @@ namespace TSQR {
     /// For a list of currently understood parameters, see the
     /// parameter list returned by \c getValidParameters().
     void
-    setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist)
+    setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist) override
     {
       using Teuchos::Exceptions::InvalidParameter;
       using Teuchos::ParameterList;
@@ -360,7 +359,7 @@ namespace TSQR {
     /// This implements Teuchos::Describable::description().  For now,
     /// SequentialTsqr uses the default implementation of
     /// Teuchos::Describable::describe().
-    std::string description () const {
+    std::string description () const override {
       std::ostringstream os;
       os << "Intranode Tall Skinny QR (TSQR): sequential cache-blocked "
         "implementation with cache size hint " << this->cache_size_hint()
@@ -369,16 +368,20 @@ namespace TSQR {
     }
 
     //! Whether this object is ready to perform computations.
-    bool ready() const {
+    bool ready() const override {
       return true;
     }
 
-    /// \brief Does factor() compute R with nonnegative diagonal?
-    ///
-    /// See the \c NodeTsqr documentation for details.
-    bool QR_produces_R_factor_with_nonnegative_diagonal () const {
-      using combine_type = Combine<LocalOrdinal, Scalar>;
-      return combine_type::QR_produces_R_factor_with_nonnegative_diagonal();
+    //! Whether factor() promises to compute R with a nonnegative diagonal.
+    bool
+    QR_produces_R_factor_with_nonnegative_diagonal () const override
+    {
+      // FIXME (19 Dec 2019) If the combine type is dynamic, we can't
+      // answer this question without knowing the number of columns.
+      // Just guess for now.
+      constexpr LocalOrdinal fakeNumCols = 10;
+      auto& c = this->getMyCombine (fakeNumCols);
+      return c.QR_produces_R_factor_with_nonnegative_diagonal ();
     }
 
     /// \brief Cache size hint (in bytes) used for the factorization.
@@ -386,74 +389,10 @@ namespace TSQR {
     /// This may be different than the cache size hint argument
     /// specified in the constructor.  SequentialTsqr treats that as a
     /// hint, not a command.
-    size_t cache_size_hint () const {
+    size_t cache_size_hint () const override {
       return strategy_.cache_size_hint();
     }
 
-    /// \brief Compute QR factorization (implicitly stored Q factor) of A.
-    ///
-    /// Compute the QR factorization in place of the nrows by ncols
-    /// matrix A, with nrows >= ncols.  The matrix A is stored either
-    /// in column-major order (the default) or with contiguous
-    /// column-major cache blocks, with leading dimension lda >=
-    /// nrows.  Write the resulting R factor to the top block of A (in
-    /// place).  (You can get a view of this via the top_block()
-    /// method.)  Everything below the upper triangle of A is
-    /// overwritten with part of the implicit representation of the Q
-    /// factor.  The other part of that representation is returned.
-    ///
-    /// \param nrows [in] Number of rows in the matrix A.
-    /// \param ncols [in] Number of columns in the matrix A.
-    /// \param A [in/out] On input: the nrows by ncols matrix to
-    ///   factor.  On output: part of the representation of the
-    ///   implicitly stored Q factor.
-    /// \param lda [in] Leading dimension of A, if A is stored in
-    ///   column-major order.  Otherwise its value doesn't matter.
-    /// \param contiguous_cache_blocks [in] Whether the matrix A is
-    ///   stored in a contiguously cache-blocked format.
-    ///
-    /// \return Part of the representation of the implicitly stored Q
-    ///   factor.  The complete representation includes A (on output).
-    ///   The FactorOutput and A go together.
-    FactorOutput
-    factor (const LocalOrdinal nrows,
-            const LocalOrdinal ncols,
-            Scalar A[],
-            const LocalOrdinal lda,
-            const bool contiguous_cache_blocks) const
-    {
-      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
-      Combine<LocalOrdinal, Scalar> combine;
-      std::vector<Scalar> work (ncols);
-      FactorOutput tau_arrays;
-
-      // We say "A_rest" because it points to the remaining part of
-      // the matrix left to factor; at the beginning, the "remaining"
-      // part is the whole matrix, but that will change as the
-      // algorithm progresses.
-      //
-      // Note: if the cache blocks are stored contiguously, lda won't
-      // be the correct leading dimension of A, but it won't matter:
-      // we only ever operate on A_cur here, and A_cur's leading
-      // dimension is set correctly by A_rest.split_top().
-      mat_view_type A_rest (nrows, ncols, A, lda);
-      // This call modifies A_rest.
-      mat_view_type A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
-
-      // Factor the topmost block of A.
-      std::vector<Scalar> tau_first (ncols);
-      mat_view_type R_view = factor_first_block (combine, A_cur, tau_first, work);
-      tau_arrays.push_back (tau_first);
-
-      while (! A_rest.empty()) {
-        A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
-        std::vector<Scalar> tau (ncols);
-        combine_factor (combine, R_view, A_cur, tau, work);
-        tau_arrays.push_back (tau);
-      }
-      return tau_arrays;
-    }
-
     /// \brief Extract R factor from \c factor() results.
     ///
     /// The five-argument version of \c factor() leaves the R factor
@@ -480,7 +419,7 @@ namespace TSQR {
       deep_copy (R_view, Scalar {});
 
       // Copy out the upper triangle of the R factor from A into R.
-      copy_upper_triangle (ncols, ncols, R, ldr, A_top.data(), A_top.stride(1));
+      copy_upper_triangle (R, A_top);
     }
 
     /// \brief Compute the QR factorization of the matrix A.
@@ -490,20 +429,23 @@ namespace TSQR {
     /// when using SequentialTsqr as the intranode TSQR implementation
     /// in \c Tsqr.  The five-argument version is more useful when
     /// using SequentialTsqr inside of another intranode TSQR
-    /// implementation, such as \c TbbTsqr.
-    FactorOutput
+    /// implementation, such as TbbTsqr.
+    Teuchos::RCP<factor_output_type>
     factor (const LocalOrdinal nrows,
             const LocalOrdinal ncols,
             Scalar A[],
             const LocalOrdinal lda,
             Scalar R[],
             const LocalOrdinal ldr,
-            const bool contiguous_cache_blocks) const
+            const bool contigCacheBlocks) const override
     {
-      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
-      Combine<LocalOrdinal, Scalar> combine;
-      std::vector<Scalar> work (ncols);
-      FactorOutput tau_arrays;
+      using LO = LocalOrdinal;
+      CacheBlocker<LO, Scalar> blocker (nrows, ncols, strategy_);
+      auto& combine = this->getMyCombine (ncols);
+      const LO lwork = combine.work_size (nrows, ncols, ncols);
+      std::vector<Scalar> work (lwork);
+      Teuchos::RCP<my_factor_output_type> tau_arrays
+        (new my_factor_output_type);
 
       // We say "A_rest" because it points to the remaining part of
       // the matrix left to factor; at the beginning, the "remaining"
@@ -513,21 +455,25 @@ namespace TSQR {
       // Note: if the cache blocks are stored contiguously, lda won't
       // be the correct leading dimension of A, but it won't matter:
       // we only ever operate on A_cur here, and A_cur's leading
-      // dimension is set correctly by A_rest.split_top().
+      // dimension is set correctly by split_top_block.
       mat_view_type A_rest (nrows, ncols, A, lda);
       // This call modifies A_rest.
-      mat_view_type A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
+      mat_view_type A_cur =
+        blocker.split_top_block (A_rest, contigCacheBlocks);
 
       // Factor the topmost block of A.
       std::vector<Scalar> tau_first (ncols);
-      mat_view_type R_view = factor_first_block (combine, A_cur, tau_first, work);
-      tau_arrays.push_back (tau_first);
+      mat_view_type R_view =
+        factor_first_block (combine, A_cur, tau_first,
+                            work.data (), lwork);
+      tau_arrays->add_and_consume (std::move (tau_first));
 
-      while (! A_rest.empty()) {
-        A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
+      while (! empty (A_rest)) {
+        A_cur = blocker.split_top_block (A_rest, contigCacheBlocks);
         std::vector<Scalar> tau (ncols);
-        combine_factor (combine, R_view, A_cur, tau, work);
-        tau_arrays.push_back (tau);
+        combine.factor_inner (R_view, A_cur, tau.data (),
+                              work.data (), lwork);
+        tau_arrays->add_and_consume (std::move (tau));
       }
 
       // Copy the R factor resulting from the factorization out of
@@ -535,7 +481,7 @@ namespace TSQR {
       // output argument.
       mat_view_type R_out (ncols, ncols, R, ldr);
       deep_copy (R_out, Scalar {});
-      copy_upper_triangle (ncols, ncols, R, ldr, R_view.data(), R_view.stride(1));
+      copy_upper_triangle (R_out, R_view);
       return tau_arrays;
     }
 
@@ -554,7 +500,7 @@ namespace TSQR {
     /// \param lda [in] If the matrix A is stored in column-major
     ///   order: the leading dimension (a.k.a. stride) of A.
     ///   Otherwise, the value of this parameter doesn't matter.
-    /// \param contiguous_cache_blocks [in] Whether the cache blocks
+    /// \param contigCacheBlocks [in] Whether the cache blocks
     ///   in the matrix A are stored contiguously.
     ///
     /// \return Number of cache blocks in the matrix A: a positive integer.
@@ -563,21 +509,22 @@ namespace TSQR {
                              const LocalOrdinal ncols,
                              const Scalar A[],
                              const LocalOrdinal lda,
-                             const bool contiguous_cache_blocks) const
+                             const bool contigCacheBlocks) const
     {
-      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
-      LocalOrdinal count = 0;
+      using LO = LocalOrdinal;
+      CacheBlocker<LO, Scalar> blocker (nrows, ncols, strategy_);
+      LO count = 0;
 
       const_mat_view_type A_rest (nrows, ncols, A, lda);
-      if (A_rest.empty()) {
+      if (empty (A_rest)) {
         return count;
       }
-
-      const_mat_view_type A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
+      const_mat_view_type A_cur =
+        blocker.split_top_block (A_rest, contigCacheBlocks);
       ++count; // first factor step
 
-      while (! A_rest.empty()) {
-        A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
+      while (! empty (A_rest)) {
+        A_cur = blocker.split_top_block (A_rest, contigCacheBlocks);
         ++count; // next factor step
       }
       return count;
@@ -592,37 +539,63 @@ namespace TSQR {
            const LocalOrdinal ncols_Q,
            const Scalar Q[],
            const LocalOrdinal ldq,
-           const FactorOutput& factor_output,
+           const factor_output_type& factor_output,
            const LocalOrdinal ncols_C,
            Scalar C[],
            const LocalOrdinal ldc,
-           const bool contiguous_cache_blocks) const
+           const bool contigCacheBlocks) const override
     {
+      using LO = LocalOrdinal;
+      const char prefix[] = "TSQR::SequentialTsqr::apply: ";
+
       // Quick exit and error tests
       if (ncols_Q == 0 || ncols_C == 0 || nrows == 0) {
         return;
       }
       else if (ldc < nrows) {
         std::ostringstream os;
-        os << "SequentialTsqr::apply: ldc (= " << ldc << ") < nrows (= " << nrows << ")";
+        os << prefix << "ldc (= " << ldc << ") < nrows (= "
+           << nrows << ")";
         throw std::invalid_argument (os.str());
       }
       else if (ldq < nrows) {
         std::ostringstream os;
-        os << "SequentialTsqr::apply: ldq (= " << ldq << ") < nrows (= " << nrows << ")";
+        os << prefix << "ldq (= " << ldq << ") < nrows (= "
+           << nrows << ")";
         throw std::invalid_argument (os.str());
       }
 
+      const my_factor_output_type& tau_arrays = [&] () {
+        const my_factor_output_type* tau_arrays_ptr =
+          dynamic_cast<const my_factor_output_type*> (&factor_output);
+        if (tau_arrays_ptr == nullptr) {
+          using Teuchos::demangleName;
+          using Teuchos::TypeNameTraits;
+          using Teuchos::typeName;
+          std::ostringstream os;
+          os << prefix << "Input factor_output_type object was not "
+            "created by the same type of SequentialTsqr object as "
+            "this one.  This object has type " << typeName (*this) <<
+            " and its subclass of factor_output_type has type " <<
+            TypeNameTraits<my_factor_output_type>::name () << ", but "
+            "the input factor_output_type object has dynamic type "
+            << demangleName (typeid (factor_output).name ());
+          throw std::invalid_argument (os.str ());
+        }
+        return *tau_arrays_ptr;
+      } ();
+
       // If contiguous cache blocks are used, then we have to use the
       // same convention as we did for factor().  Otherwise, we are
       // free to choose the cache block dimensions as we wish in
       // apply(), independently of what we did in factor().
-      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols_Q, strategy_);
-      Combine<LocalOrdinal, Scalar> combine;
+      CacheBlocker<LO, Scalar> blocker (nrows, ncols_Q, strategy_);
+      auto& combine =
+        this->getMyCombine (std::max (ncols_Q, ncols_C));
+      const LO lwork = combine.work_size (nrows, ncols_Q, ncols_C);
+      std::vector<Scalar> work (lwork);
 
-      const bool transposed = apply_type.transposed();
-      const FactorOutput& tau_arrays = factor_output; // rename for encapsulation
-      std::vector<Scalar> work (ncols_C);
+      const bool transposed = apply_type.transposed ();
 
       // We say "*_rest" because it points to the remaining part of
       // the matrix left to factor; at the beginning, the "remaining"
@@ -640,37 +613,51 @@ namespace TSQR {
 
       // Identify the top ncols_C by ncols_C block of C.  C_rest is
       // not modified.
-      mat_view_type C_top = blocker.top_block (C_rest, contiguous_cache_blocks);
+      mat_view_type C_top =
+        blocker.top_block (C_rest, contigCacheBlocks);
 
       if (transposed) {
-        const_mat_view_type Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks);
-        mat_view_type C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks);
+        const_mat_view_type Q_cur =
+          blocker.split_top_block (Q_rest, contigCacheBlocks);
+        mat_view_type C_cur =
+          blocker.split_top_block (C_rest, contigCacheBlocks);
 
         // Apply the topmost block of Q.
         auto tau_iter = tau_arrays.begin();
-        const std::vector<Scalar>& tau = *tau_iter++;
-        apply_first_block (combine, apply_type, Q_cur, tau, C_cur, work);
-
-        while (! Q_rest.empty()) {
-          Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks);
-          C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks);
-          combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work);
+        const std::vector<Scalar>& tau_first = *tau_iter++;
+        combine.apply_first (apply_type, Q_cur, tau_first.data (),
+                             C_cur, work.data (), lwork);
+        while (! empty (Q_rest)) {
+          Q_cur = blocker.split_top_block (Q_rest, contigCacheBlocks);
+          C_cur = blocker.split_top_block (C_rest, contigCacheBlocks);
+          const Scalar* tau = tau_iter->data ();
+          combine.apply_inner (apply_type, Q_cur, tau, C_top, C_cur,
+                               work.data (), lwork);
+          tau_iter++;
         }
       }
       else {
-        // Start with the last local Q factor and work backwards up the matrix.
-        auto tau_iter = tau_arrays.rbegin();
-
-        const_mat_view_type Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks);
-        mat_view_type C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks);
-
-        while (! Q_rest.empty()) {
-          combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work);
-          Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks);
-          C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks);
+        // Start with the last local Q factor and work backwards up
+        // the matrix.
+        auto tau_iter = tau_arrays.rbegin ();
+        const_mat_view_type Q_cur =
+          blocker.split_bottom_block (Q_rest, contigCacheBlocks);
+        mat_view_type C_cur =
+          blocker.split_bottom_block (C_rest, contigCacheBlocks);
+        while (! empty (Q_rest)) {
+          const Scalar* tau = tau_iter->data ();
+          combine.apply_inner (apply_type, Q_cur, tau, C_top, C_cur,
+                               work.data (), lwork);
+          tau_iter++;
+          Q_cur =
+            blocker.split_bottom_block (Q_rest, contigCacheBlocks);
+          C_cur =
+            blocker.split_bottom_block (C_rest, contigCacheBlocks);
         }
         // Apply to last (topmost) cache block.
-        apply_first_block (combine, apply_type, Q_cur, *tau_iter++, C_cur, work);
+        const std::vector<Scalar>& tau_first = *tau_iter++;
+        combine.apply_first (apply_type, Q_cur, tau_first.data (),
+                             C_cur, work.data (), lwork);
       }
     }
 
@@ -682,38 +669,27 @@ namespace TSQR {
                 const LocalOrdinal ncols_Q,
                 const Scalar Q[],
                 const LocalOrdinal ldq,
-                const FactorOutput& factor_output,
+                const factor_output_type& factor_output,
                 const LocalOrdinal ncols_C,
                 Scalar C[],
                 const LocalOrdinal ldc,
-                const bool contiguous_cache_blocks) const
+                const bool contigCacheBlocks) const override
     {
-      // Identify top ncols_C by ncols_C block of C.  C_view is not
-      // modified.  top_block() will set C_top to have the correct
-      // leading dimension, whether or not cache blocks are stored
-      // contiguously.
       mat_view_type C_view (nrows, ncols_C, C, ldc);
-      mat_view_type C_top = this->top_block (C_view, contiguous_cache_blocks);
-
-      // Fill C with zeros, and then fill the topmost block of C with
-      // the first ncols_C columns of the identity matrix, so that C
-      // itself contains the first ncols_C columns of the identity
-      // matrix.
-      fill_with_zeros (nrows, ncols_C, C, ldc, contiguous_cache_blocks);
-      for (LocalOrdinal j = 0; j < ncols_C; ++j) {
-        C_top(j, j) = Scalar(1.0);
-      }
-
-      // Apply the Q factor to C, to extract the first ncols_C columns
-      // of Q in explicit form.
+      deep_copy (C_view, Scalar {});
+      // Don't just call set_diagonal_entries_to_one(C_view), because
+      // that doesn't respect contigCacheBlocks.
+      auto C_top = this->top_block (C_view, contigCacheBlocks);
+      deep_copy (C_top, Scalar {});
+      this->set_diagonal_entries_to_one (C_top);
       apply (ApplyType::NoTranspose,
              nrows, ncols_Q, Q, ldq, factor_output,
-             ncols_C, C, ldc, contiguous_cache_blocks);
+             ncols_C, C, ldc, contigCacheBlocks);
     }
 
     /// \brief Compute Q := Q*B.
     ///
-    /// See the \c NodeTsqr documentation for details.
+    /// See the NodeTsqr documentation for details.
     void
     Q_times_B (const LocalOrdinal nrows,
                const LocalOrdinal ncols,
@@ -721,12 +697,10 @@ namespace TSQR {
                const LocalOrdinal ldq,
                const Scalar B[],
                const LocalOrdinal ldb,
-               const bool contiguous_cache_blocks) const
+               const bool contigCacheBlocks) const override
     {
       using Teuchos::NO_TRANS;
-
-      // We don't do any other error checking here (e.g., matrix
-      // dimensions), though it would be a good idea to do so.
+      using LO = LocalOrdinal;
 
       // Take the easy exit if available.
       if (ncols == 0 || nrows == 0) {
@@ -739,14 +713,13 @@ namespace TSQR {
       // computation is completely independent of the others; a slight
       // restructuring of this code would parallelize nicely using
       // OpenMP.
-      CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
+      CacheBlocker<LO, Scalar> blocker (nrows, ncols, strategy_);
       Impl::SystemBlas<Scalar> blas;
       mat_view_type Q_rest (nrows, ncols, Q, ldq);
-      Matrix<LocalOrdinal, Scalar>
-        Q_cur_copy (LocalOrdinal(0), LocalOrdinal(0)); // will be resized
-      while (! Q_rest.empty ()) {
+      Matrix<LO, Scalar> Q_cur_copy (0, 0); // will be resized
+      while (! empty (Q_rest)) {
         mat_view_type Q_cur =
-          blocker.split_top_block (Q_rest, contiguous_cache_blocks);
+          blocker.split_top_block (Q_rest, contigCacheBlocks);
 
         // GEMM doesn't like aliased arguments, so we use a copy.
         // We only copy the current cache block, rather than all of
@@ -754,9 +727,13 @@ namespace TSQR {
         Q_cur_copy.reshape (Q_cur.extent (0), ncols);
         deep_copy (Q_cur_copy, Q_cur);
         // Q_cur := Q_cur_copy * B.
-        blas.GEMM (NO_TRANS, NO_TRANS, Q_cur.extent (0), ncols, ncols,
-                   Scalar (1.0), Q_cur_copy.data (), Q_cur_copy.stride (1),
-                   B, ldb, Scalar {}, Q_cur.data (), Q_cur.stride (1));
+        constexpr Scalar ZERO {};
+        constexpr Scalar ONE (1.0);
+        blas.GEMM (NO_TRANS, NO_TRANS,
+                   Q_cur.extent (0), ncols, ncols,
+                   ONE, Q_cur_copy.data (), Q_cur_copy.stride (1),
+                   B, ldb,
+                   ZERO, Q_cur.data (), Q_cur.stride (1));
       }
     }
 
@@ -775,9 +752,10 @@ namespace TSQR {
                  const LocalOrdinal ncols,
                  Scalar A_out[],
                  const Scalar A_in[],
-                 const LocalOrdinal lda_in) const
+                 const LocalOrdinal lda_in) const override
     {
-      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
+      CacheBlocker<LocalOrdinal, Scalar> blocker
+        (nrows, ncols, strategy_);
       blocker.cache_block (nrows, ncols, A_out, A_in, lda_in);
     }
 
@@ -802,9 +780,10 @@ namespace TSQR {
                     const LocalOrdinal ncols,
                     Scalar A_out[],
                     const LocalOrdinal lda_out,
-                    const Scalar A_in[]) const
+                    const Scalar A_in[]) const override
     {
-      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
+      CacheBlocker<LocalOrdinal, Scalar> blocker
+        (nrows, ncols, strategy_);
       blocker.un_cache_block (nrows, ncols, A_out, lda_out, A_in);
     }
 
@@ -818,17 +797,19 @@ namespace TSQR {
     /// \param A [out] nrows by ncols column-major-order dense matrix
     ///   with leading dimension lda
     /// \param lda [in] Leading dimension of A: lda >= nrows
-    /// \param contiguous_cache_blocks [in] Whether the cache blocks
+    /// \param contigCacheBlocks [in] Whether the cache blocks
     ///   in A are stored contiguously.
     void
     fill_with_zeros (const LocalOrdinal nrows,
                      const LocalOrdinal ncols,
                      Scalar A[],
                      const LocalOrdinal lda,
-                     const bool contiguous_cache_blocks) const
+                     const bool contigCacheBlocks) const override
     {
-      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
-      blocker.fill_with_zeros (nrows, ncols, A, lda, contiguous_cache_blocks);
+      CacheBlocker<LocalOrdinal, Scalar> blocker
+        (nrows, ncols, strategy_);
+      blocker.fill_with_zeros (nrows, ncols, A, lda,
+                               contigCacheBlocks);
     }
 
   protected:
@@ -840,29 +821,27 @@ namespace TSQR {
     ///
     /// \param C [in] View of a matrix, with at least as many rows as
     ///   columns.
-    /// \param contiguous_cache_blocks [in] Whether the cache blocks
-    ///   of C are stored contiguously.
+    /// \param contigCacheBlocks [in] Whether the cache blocks of C
+    ///   are stored contiguously.
     ///
     /// \return View of the topmost cache block of the matrix C.
     const_mat_view_type
     const_top_block (const const_mat_view_type& C,
-                     const bool contiguous_cache_blocks) const
+                     const bool contigCacheBlocks) const override
     {
       // The CacheBlocker object knows how to construct a view of the
       // top cache block of C.  This is complicated because cache
       // blocks (in C) may or may not be stored contiguously.  If they
       // are stored contiguously, the CacheBlocker knows the right
       // layout, based on the cache blocking strategy.
-      typedef CacheBlocker<LocalOrdinal, Scalar> blocker_type;
-      blocker_type blocker (C.extent(0), C.extent(1), strategy_);
-
-      // C_top_block is a view of the topmost cache block of C.
-      // C_top_block should have >= ncols rows, otherwise either cache
-      // blocking is broken or the input matrix C itself had fewer
-      // rows than columns.
-      const_mat_view_type C_top_block =
-        blocker.top_block (C, contiguous_cache_blocks);
-      return C_top_block;
+      using blocker_type = CacheBlocker<LocalOrdinal, Scalar>;
+      blocker_type blocker (C.extent (0), C.extent (1), strategy_);
+
+      // Return a view of the topmost cache block of C.  That view
+      // should have >= ncols rows; otherwise, either cache blocking
+      // is broken or the input matrix C itself had fewer rows than
+      // columns.
+      return blocker.top_block (C, contigCacheBlocks);
     }
 
   private:
@@ -872,4 +851,4 @@ namespace TSQR {
 
 } // namespace TSQR
 
-#endif // __TSQR_Tsqr_SequentialTsqr_hpp
+#endif // TSQR_SEQUENTIALTSQR_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp
deleted file mode 100644
index ad86d8c3d206..000000000000
--- a/packages/tpetra/tsqr/src/Tsqr_TbbTest.hpp
+++ /dev/null
@@ -1,423 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_Test_TbbTest_hpp
-#define __TSQR_Test_TbbTest_hpp
-
-#include "Tsqr_nodeTestProblem.hpp"
-#include "Tsqr_verifyTimerConcept.hpp"
-#include "Tsqr_Random_NormalGenerator.hpp"
-
-#include "Tsqr_LocalVerify.hpp"
-#include "Tsqr_Matrix.hpp"
-#include "Tsqr_Util.hpp"
-#include "TbbTsqr.hpp"
-
-#include "Teuchos_LAPACK.hpp"
-#include "Teuchos_Time.hpp"
-
-#include <algorithm>
-#include <cstring> // size_t definition
-#include <iostream>
-#include <limits>
-#include <stdexcept>
-#include <vector>
-
-using std::make_pair;
-using std::pair;
-using std::vector;
-
-using std::cerr;
-using std::cout;
-using std::endl;
-
-namespace TSQR {
-  namespace Test {
-    /// Test the accuracy of Intel TBB TSQR on an nrows by ncols
-    /// matrix (using the given number of cores and the given cache
-    /// block size (in bytes)), and print the results to stdout.
-    template<class Ordinal, class Scalar>
-    void
-    verifyTbbTsqr (const std::string& scalarTypeName,
-                   TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator,
-                   const Ordinal nrows,
-                   const Ordinal ncols,
-                   const int num_cores,
-                   const size_t cache_size_hint,
-                   const bool contiguous_cache_blocks,
-                   const bool printFieldNames,
-                   const bool human_readable,
-                   const bool b_debug = false)
-    {
-      typedef Teuchos::Time timer_type;
-      typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar, timer_type > node_tsqr_type;
-      typedef typename node_tsqr_type::FactorOutput factor_output_type;
-      typedef Teuchos::ScalarTraits<Scalar> STS;
-      typedef typename STS::magnitudeType magnitude_type;
-      using std::cerr;
-      using std::cout;
-      using std::endl;
-
-      node_tsqr_type actor (num_cores, cache_size_hint);
-
-      if (b_debug) {
-        cerr << "Intel TBB TSQR test problem:" << endl
-             << "* " << nrows << " x " << ncols << endl
-             << "* # cores: " << num_cores << endl
-             << "* Cache size hint in bytes: " << actor.cache_size_hint() << endl;
-        if (contiguous_cache_blocks) {
-          cerr << "* Contiguous cache blocks" << endl;
-        }
-      }
-
-      Matrix< Ordinal, Scalar > A (nrows, ncols);
-      Matrix< Ordinal, Scalar > A_copy (nrows, ncols);
-      Matrix< Ordinal, Scalar > Q (nrows, ncols);
-      Matrix< Ordinal, Scalar > R (ncols, ncols);
-      if (std::numeric_limits< Scalar >::has_quiet_NaN) {
-        deep_copy (A, std::numeric_limits< Scalar>::quiet_NaN());
-        deep_copy (A_copy, std::numeric_limits< Scalar >::quiet_NaN());
-        deep_copy (Q, std::numeric_limits< Scalar >::quiet_NaN());
-        deep_copy (R, std::numeric_limits< Scalar >::quiet_NaN());
-      }
-      const Ordinal lda = nrows;
-      const Ordinal ldq = nrows;
-      const Ordinal ldr = ncols;
-
-      // Create a test problem
-      nodeTestProblem (generator, nrows, ncols, A.data(), A.stride(1), true);
-
-      if (b_debug) {
-        cerr << "-- Generated test problem" << endl;
-      }
-
-      // Copy A into A_copy, since TSQR overwrites the input.  If
-      // specified, rearrange the data in A_copy so that the data in
-      // each cache block is contiguously stored.
-      if (! contiguous_cache_blocks) {
-        deep_copy (A_copy, A);
-        if (b_debug) {
-          cerr << "-- Copied test problem from A into A_copy" << endl;
-        }
-      }
-      else {
-        actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.stride(1));
-        if (b_debug) {
-          cerr << "-- Reorganized test matrix to have contiguous "
-            "cache blocks" << endl;
-        }
-        // Verify cache blocking, when in debug mode.
-        if (b_debug) {
-          Matrix< Ordinal, Scalar > A2 (nrows, ncols);
-          if (std::numeric_limits< Scalar >::has_quiet_NaN) {
-            deep_copy (A2, std::numeric_limits< Scalar >::quiet_NaN());
-          }
-          actor.un_cache_block (nrows, ncols, A2.data(), A2.stride(1), A_copy.data());
-          if (matrix_equal (A, A2)) {
-            if (b_debug) {
-              cerr << "-- Cache blocking test succeeded!" << endl;
-            }
-          }
-          else {
-            throw std::logic_error ("Cache blocking failed");
-          }
-        }
-      }
-
-      // Fill R with zeros, since the factorization may not overwrite
-      // the strict lower triangle of R.
-      deep_copy (R, Scalar {});
-
-      // Factor the matrix and compute the explicit Q factor
-      factor_output_type factor_output =
-        actor.factor (nrows, ncols, A_copy.data(), A_copy.stride(1), R.data(),
-                      R.stride(1), contiguous_cache_blocks);
-      if (b_debug) {
-        cerr << "-- Finished TbbTsqr::factor" << endl;
-      }
-      actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.stride(1), factor_output,
-                        ncols, Q.data(), Q.stride(1), contiguous_cache_blocks);
-      if (b_debug) {
-        cerr << "-- Finished TbbTsqr::explicit_Q" << endl;
-      }
-
-      // "Un"-cache-block the output Q (the explicit Q factor), if
-      // contiguous cache blocks were used.  This is only necessary
-      // because local_verify() doesn't currently support contiguous
-      // cache blocks.
-      if (contiguous_cache_blocks) {
-        // Use A_copy as temporary storage for un-cache-blocking Q.
-        actor.un_cache_block (nrows, ncols, A_copy.data(), A_copy.stride(1), Q.data());
-        deep_copy (Q, A_copy);
-        if (b_debug) {
-          cerr << "-- Un-cache-blocked output Q factor" << endl;
-        }
-      }
-
-      // Print out the R factor
-      if (b_debug) {
-        cerr << endl << "-- R factor:" << endl;
-        print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1));
-        cerr << endl;
-      }
-
-      // Validate the factorization
-      std::vector< magnitude_type > results =
-        local_verify (nrows, ncols, A.data(), lda, Q.data(), ldq, R.data(), ldr);
-      if (b_debug) {
-        cerr << "-- Finished local_verify" << endl;
-      }
-
-      // Print the results
-      if (human_readable) {
-        cout << "Parallel (via Intel\'s Threading Building Blocks) / cache-blocked) TSQR:" << endl
-             << "Scalar type: " << scalarTypeName << endl
-             << "# rows: " << nrows << endl
-             << "# columns: " << ncols << endl
-             << "# cores: " << num_cores << endl
-             << "Cache size hint in bytes: " << actor.cache_size_hint() << endl
-             << "Contiguous cache blocks? " << contiguous_cache_blocks << endl
-             << "Absolute residual $\\|A - Q*R\\|_2$: "
-             << results[0] << endl
-             << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: "
-             << results[1] << endl
-             << "Test matrix norm $\\| A \\|_F$: "
-             << results[2] << endl
-             << endl;
-      }
-      else {
-        if (printFieldNames) {
-          const char prefix[] = "%";
-          cout << prefix
-               << "method"
-               << ",scalarType"
-               << ",numRows"
-               << ",numCols"
-               << ",numThreads"
-               << ",cacheSizeHint"
-               << ",contiguousCacheBlocks"
-               << ",absFrobResid"
-               << ",absFrobOrthog"
-               << ",frobA"
-               << endl;
-        }
-        cout << "TbbTsqr"
-             << "," << scalarTypeName
-             << "," << nrows
-             << "," << ncols
-             << "," << num_cores
-             << "," << actor.cache_size_hint()
-             << "," << contiguous_cache_blocks
-             << "," << results[0]
-             << "," << results[1]
-             << "," << results[2]
-             << endl;
-      }
-    }
-
-    /// \brief Benchmark Intel TBB TSQR vs. LAPACK's QR, and print the
-    ///   results to stdout.
-    ///
-    /// \note c++0x support is need in order to have a default
-    /// template parameter argument for a template function, otherwise
-    /// we would have templated this function on TimerType and made
-    /// Teuchos::Time the default.
-    template< class Ordinal, class Scalar >
-    void
-    benchmarkTbbTsqr (const std::string& scalarTypeName,
-                      const int ntrials,
-                      const Ordinal nrows,
-                      const Ordinal ncols,
-                      const int num_cores,
-                      const size_t cache_size_hint,
-                      const bool contiguous_cache_blocks,
-                      const bool printFieldNames,
-                      const bool human_readable)
-    {
-      using TSQR::TBB::TbbTsqr;
-      using std::cerr;
-      using std::cout;
-      using std::endl;
-
-      typedef Teuchos::Time timer_type;
-      typedef Ordinal ordinal_type;
-      typedef Scalar scalar_type;
-      typedef Matrix< ordinal_type, scalar_type > matrix_type;
-      typedef TbbTsqr< ordinal_type, scalar_type, timer_type > node_tsqr_type;
-
-      // Pseudorandom normal(0,1) generator.  Default seed is OK,
-      // because this is a benchmark, not an accuracy test.
-      TSQR::Random::NormalGenerator< ordinal_type, scalar_type > generator;
-
-      // Set up TSQR implementation.
-      node_tsqr_type actor (num_cores, cache_size_hint);
-
-      matrix_type A (nrows, ncols);
-      matrix_type A_copy (nrows, ncols);
-      matrix_type Q (nrows, ncols);
-      matrix_type R (ncols, ncols, scalar_type(0));
-
-      // Fill R with zeros, since the factorization may not overwrite
-      // the strict lower triangle of R.
-      deep_copy (R, scalar_type {});
-
-      // Create a test problem
-      nodeTestProblem (generator, nrows, ncols, A.data(), A.stride(1), false);
-
-      // Copy A into A_copy, since TSQR overwrites the input.  If
-      // specified, rearrange the data in A_copy so that the data in
-      // each cache block is contiguously stored.
-      if (contiguous_cache_blocks) {
-        actor.cache_block (nrows, ncols, A_copy.data(), A.data(), A.stride(1));
-      }
-      else {
-        deep_copy (A_copy, A);
-      }
-
-      // Do a few timing runs and throw away the results, just to warm
-      // up any libraries that do autotuning.
-      const int numWarmupRuns = 5;
-      for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) {
-        // Factor the matrix in-place in A_copy, and extract the
-        // resulting R factor into R.
-        typedef typename node_tsqr_type::FactorOutput factor_output_type;
-        factor_output_type factor_output =
-          actor.factor (nrows, ncols, A_copy.data(), A_copy.stride(1),
-                        R.data(), R.stride(1), contiguous_cache_blocks);
-        // Compute the explicit Q factor (which was stored
-        // implicitly in A_copy and factor_output) and store in Q.
-        // We don't need to un-cache-block the output, because we
-        // aren't verifying it here.
-        actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.stride(1),
-                          factor_output, ncols, Q.data(), Q.stride(1),
-                          contiguous_cache_blocks);
-      }
-
-      // Benchmark TBB-based TSQR for ntrials trials.
-      //
-      // Name of timer doesn't matter here; we only need the timing.
-      timer_type timer("TbbTsqr");
-      timer.start();
-      for (int trial_num = 0; trial_num < ntrials; ++trial_num) {
-        // Factor the matrix in-place in A_copy, and extract the
-        // resulting R factor into R.
-        typedef typename node_tsqr_type::FactorOutput factor_output_type;
-        factor_output_type factor_output =
-          actor.factor (nrows, ncols, A_copy.data(), A_copy.stride(1),
-                        R.data(), R.stride(1), contiguous_cache_blocks);
-        // Compute the explicit Q factor (which was stored
-        // implicitly in A_copy and factor_output) and store in Q.
-        // We don't need to un-cache-block the output, because we
-        // aren't verifying it here.
-        actor.explicit_Q (nrows, ncols, A_copy.data(), A_copy.stride(1),
-                          factor_output, ncols, Q.data(), Q.stride(1),
-                          contiguous_cache_blocks);
-      }
-      const double tbb_tsqr_timing = timer.stop();
-
-      // Print the results
-      if (human_readable) {
-        cout << "(Intel TBB / cache-blocked) TSQR cumulative timings:" << endl
-             << "Scalar type: " << scalarTypeName << endl
-             << "# rows: " << nrows << endl
-             << "# columns: " << ncols << endl
-             << "# cores: " << num_cores << endl
-             << "Cache size hint in bytes: " << actor.cache_size_hint() << endl
-             << "Contiguous cache blocks? " << contiguous_cache_blocks << endl
-             << "# trials: " << ntrials << endl
-             << "Total time (s) = " << tbb_tsqr_timing << endl
-             << "Total time (s) in factor() (min over all tasks): "
-             << (ntrials * actor.min_seq_factor_timing()) << endl
-             << "Total time (s) in factor() (max over all tasks): "
-             << (ntrials * actor.max_seq_factor_timing()) << endl
-             << "Total time (s) in apply() (min over all tasks): "
-             << (ntrials * actor.min_seq_apply_timing()) << endl
-             << "Total time (s) in apply() (max over all tasks): "
-             << (ntrials * actor.max_seq_apply_timing()) << endl
-             << endl << endl;
-        cout << "(Intel TBB / cache-blocked) TSQR per-invocation timings:" << endl;
-
-        std::vector<TimeStats> stats;
-        actor.getStats (stats);
-        std::vector<std::string> labels;
-        actor.getStatsLabels (labels);
-
-        const std::string labelLabel ("label");
-        for (std::vector<std::string>::size_type k = 0; k < labels.size(); ++k) {
-          const bool printHeaders = (k == 0);
-          if (stats[k].count() > 0)
-            stats[k].print (cout, human_readable, labels[k], labelLabel, printHeaders);
-        }
-      }
-      else {
-        if (printFieldNames) {
-          const char prefix[] = "%";
-          cout << prefix
-               << "method"
-               << ",scalarType"
-               << ",numRows"
-               << ",numCols"
-               << ",numThreads"
-               << ",cacheSizeHint"
-               << ",contiguousCacheBlocks"
-               << ",numTrials"
-               << ",timing"
-               << endl;
-        }
-
-        // We don't include {min,max}_seq_apply_timing() here, because
-        // those times don't benefit from the accuracy of benchmarking
-        // for ntrials > 1.  Thus, it's misleading to include them
-        // with tbb_tsqr_timing, the total time over ntrials trials.
-        cout << "TbbTsqr"
-             << "," << scalarTypeName
-             << "," << nrows
-             << "," << ncols
-             << "," << num_cores
-             << "," << actor.cache_size_hint()
-             << "," << contiguous_cache_blocks
-             << "," << ntrials
-             << "," << tbb_tsqr_timing
-             << endl;
-      }
-    }
-  } // namespace Test
-} // namespace TSQR
-
-#endif // __TSQR_Test_TbbTest_hpp
diff --git a/packages/tpetra/tsqr/src/Tsqr_Test_MpiAndKokkosScope.cpp b/packages/tpetra/tsqr/src/Tsqr_Test_MpiAndKokkosScope.cpp
new file mode 100644
index 000000000000..ba99ac49332a
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_Test_MpiAndKokkosScope.cpp
@@ -0,0 +1,84 @@
+#include "Tsqr_Test_MpiAndKokkosScope.hpp"
+#include "Kokkos_Core.hpp"
+#include "Teuchos_oblackholestream.hpp"
+#include "Teuchos_CommHelpers.hpp"
+#ifdef HAVE_MPI
+#  include "Teuchos_DefaultMpiComm.hpp"
+#  include "Teuchos_Assert.hpp"
+#else
+#  include "Teuchos_DefaultSerialComm.hpp"
+#endif // HAVE_MPI
+#include <iostream>
+#include <sstream>
+
+namespace TSQR {
+namespace Test {
+
+#ifdef HAVE_MPI
+MpiScope::MpiScope(int* argc, char*** argv) {
+  (void) MPI_Init(argc, argv);
+
+  int rawSize = 0;
+  (void) MPI_Comm_size(MPI_COMM_WORLD, &rawSize);
+
+  std::ostringstream os;
+  os << "MpiScope: Result of MPI_Comm_size on MPI_COMM_WORLD: "
+     << rawSize << std::endl;
+  std::cerr << os.str();
+}
+MpiScope::~MpiScope() {
+  (void) MPI_Finalize();
+}
+#else
+MpiScope::MpiScope(int*, char***) {
+  std::cerr << "MpiScope: HAVE_MPI is NOT defined" << std::endl;
+}
+MpiScope::~MpiScope() {}
+#endif // HAVE_MPI
+
+Teuchos::RCP<const Teuchos::Comm<int>>
+MpiAndKokkosScope::getDefaultComm()
+{
+#ifdef HAVE_MPI
+  int initialized = 0;
+  (void) MPI_Initialized(&initialized);
+  TEUCHOS_ASSERT( initialized == 1 );
+
+  using comm_type = Teuchos::MpiComm<int>;
+  const auto comm = Teuchos::rcp(new comm_type(MPI_COMM_WORLD));
+#else
+  using comm_type = Teuchos::SerialComm<int>;
+  const auto comm = Teuchos::rcp(new comm_type);
+#endif // HAVE_MPI
+
+  return comm;
+}
+
+MpiAndKokkosScope::
+MpiAndKokkosScope(int* argc, char*** argv) :
+  mpiScope_(argc, argv),
+  blackHole_(new Teuchos::oblackholestream),
+  comm_(getDefaultComm()),
+  kokkosScope_(new Kokkos::ScopeGuard(*argc, *argv))
+{}
+
+Teuchos::RCP<const Teuchos::Comm<int>>
+MpiAndKokkosScope::getComm() const {
+  return comm_;
+}
+
+std::ostream& MpiAndKokkosScope::outStream() const {
+  // Only Process 0 gets to write to cout and cerr.  The other MPI
+  // processes send their output to a "black hole" (something that
+  // acts like /dev/null).
+  return comm_->getRank() == 0 ? std::cout :
+    static_cast<std::ostream&>(*blackHole_);
+}
+
+std::ostream& MpiAndKokkosScope::errStream() const {
+  return comm_->getRank() == 0 ? std::cerr :
+    static_cast<std::ostream&>(*blackHole_);
+}
+
+} // namespace Test
+} // namespace TSQR
diff --git a/packages/tpetra/tsqr/src/Tsqr_Test_MpiAndKokkosScope.hpp b/packages/tpetra/tsqr/src/Tsqr_Test_MpiAndKokkosScope.hpp
new file mode 100644
index 000000000000..fc317fbc9f55
--- /dev/null
+++ b/packages/tpetra/tsqr/src/Tsqr_Test_MpiAndKokkosScope.hpp
@@ -0,0 +1,58 @@
+#ifndef TSQR_TEST_MPIANDKOKKOSSCOPE_HPP
+#define TSQR_TEST_MPIANDKOKKOSSCOPE_HPP
+
+#include "Teuchos_RCP.hpp"
+#include <memory>
+#include <ostream>
+
+namespace Kokkos {
+class ScopeGuard;
+} // namespace Kokkos
+
+namespace Teuchos {
+template<class OrdinalType> class Comm;
+} // namespace Teuchos
+
+namespace TSQR {
+namespace Test {
+
+// Scope guard for MPI alone.  In an MPI build (HAVE_MPI defined) the
+// constructor calls MPI_Init and the destructor calls MPI_Finalize; in
+// a non-MPI build the constructor only prints a notice to std::cerr.
+class MpiScope {
+public:
+  MpiScope(int* argc, char*** argv);
+  ~MpiScope();
+
+  // Not copyable: every copy's destructor would call MPI_Finalize
+  // again, and MPI may be finalized only once per process.
+  MpiScope(const MpiScope&) = delete;
+  MpiScope& operator=(const MpiScope&) = delete;
+};
+
+// Scope guard for TSQR's tests, that automatically initializes and
+// finalizes both MPI (if building with MPI enabled) and Kokkos.
+class MpiAndKokkosScope {
+public:
+  MpiAndKokkosScope(int* argc, char*** argv);
+
+  Teuchos::RCP<const Teuchos::Comm<int>> getComm() const;
+  std::ostream& outStream() const; // std::cout on rank 0, a discarding stream on other ranks
+  std::ostream& errStream() const; // std::cerr on rank 0, a discarding stream on other ranks
+
+private:
+  static Teuchos::RCP<const Teuchos::Comm<int>> getDefaultComm();
+
+  MpiScope mpiScope_; // declared first: constructed before, and destroyed after, all other members
+  std::unique_ptr<std::ostream> blackHole_;
+  Teuchos::RCP<const Teuchos::Comm<int>> comm_;
+  // Held by pointer only for implementation hiding ("pImpl" idiom).
+  // NOTE(review): ScopeGuard is incomplete here and the destructor is
+  // implicit, so code destroying this object needs the Kokkos header.
+  std::unique_ptr<Kokkos::ScopeGuard> kokkosScope_;
+};
+
+} // namespace Test
+} // namespace TSQR
+
+#endif // TSQR_TEST_MPIANDKOKKOSSCOPE_HPP
diff --git a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp b/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp
deleted file mode 100644
index dea7317ad040..000000000000
--- a/packages/tpetra/tsqr/src/Tsqr_TsqrTest.hpp
+++ /dev/null
@@ -1,801 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef __TSQR_Test_TsqrTest_hpp
-#define __TSQR_Test_TsqrTest_hpp
-
-#include "Tsqr.hpp"
-#ifdef HAVE_KOKKOSTSQR_TBB
-#  include "TbbTsqr.hpp"
-#endif // HAVE_KOKKOSTSQR_TBB
-#include "Tsqr_TestSetup.hpp"
-#include "Tsqr_GlobalVerify.hpp"
-#include "Tsqr_printGlobalMatrix.hpp"
-#include "Tsqr_verifyTimerConcept.hpp"
-#include "Teuchos_ScalarTraits.hpp"
-#include <cstring> // size_t
-#include <iostream>
-#include <stdexcept>
-#include <string>
-
-namespace TSQR {
-  namespace Test {
-    template<class TsqrType>
-    class TsqrVerifier {
-    public:
-      using tsqr_type = TsqrType;
-      using scalar_type = typename tsqr_type::scalar_type;
-      using ordinal_type = typename tsqr_type::ordinal_type;
-      using matrix_type = Matrix<ordinal_type, scalar_type>;
-      using factor_output_type = typename tsqr_type::FactorOutput;
-      using messenger_type = MessengerBase<scalar_type>;
-      using messenger_ptr = Teuchos::RCP<messenger_type>;
-
-      static void
-      verify (tsqr_type& tsqr,
-              const messenger_ptr& scalarComm,
-              const matrix_type& A_local,
-              matrix_type& A_copy,
-              matrix_type& Q_local,
-              matrix_type& R,
-              const bool contiguousCacheBlocks,
-              const bool b_debug = false)
-      {
-        using std::cerr;
-        using std::endl;
-
-        const ordinal_type nrows_local = A_local.extent(0);
-        const ordinal_type ncols = A_local.extent(1);
-
-        // If specified, rearrange cache blocks in the copy.
-        if (contiguousCacheBlocks) {
-          tsqr.cache_block (nrows_local, ncols, A_copy.data(),
-                            A_local.data(), A_local.stride(1));
-          if (b_debug) {
-            scalarComm->barrier ();
-            if (scalarComm->rank () == 0)
-              cerr << "-- Cache-blocked input matrix to factor." << endl;
-          }
-        }
-        else {
-          deep_copy (A_copy, A_local);
-        }
-
-        const bool testFactorExplicit = true;
-        if (testFactorExplicit) {
-          tsqr.factorExplicit (A_copy.view(), Q_local.view(),
-                               R.view(), contiguousCacheBlocks);
-          if (b_debug) {
-            scalarComm->barrier ();
-            if (scalarComm->rank () == 0) {
-              cerr << "-- Finished Tsqr::factorExplicit" << endl;
-            }
-          }
-        }
-        else {
-          // Factor the (copy of the) matrix.
-          factor_output_type factorOutput =
-            tsqr.factor (nrows_local, ncols,
-                         A_copy.data(), A_copy.stride(1),
-                         R.data(), R.stride(1),
-                         contiguousCacheBlocks);
-          if (b_debug) {
-            scalarComm->barrier ();
-            if (scalarComm->rank () == 0) {
-              cerr << "-- Finished Tsqr::factor" << endl;
-            }
-          }
-
-          // Compute the explicit Q factor in Q_local
-          tsqr.explicit_Q (nrows_local,
-                           ncols, A_copy.data(), A_copy.stride(1),
-                           factorOutput,
-                           ncols, Q_local.data(), Q_local.stride(1),
-                           contiguousCacheBlocks);
-          if (b_debug) {
-            scalarComm->barrier ();
-            if (scalarComm->rank () == 0) {
-              cerr << "-- Finished Tsqr::explicit_Q" << endl;
-            }
-          }
-        }
-
-        // "Un"-cache-block the output, if contiguous cache blocks were
-        // used.  This is only necessary because global_verify() doesn't
-        // currently support contiguous cache blocks.
-        if (contiguousCacheBlocks) {
-          // We can use A_copy as scratch space for un-cache-blocking
-          // Q_local, since we're done using A_copy for other things.
-          tsqr.un_cache_block (nrows_local, ncols, A_copy.data(),
-                               A_copy.stride(1), Q_local.data());
-          // Overwrite Q_local with the un-cache-blocked Q factor.
-          deep_copy (Q_local, A_copy);
-
-          if (b_debug) {
-            scalarComm->barrier ();
-            if (scalarComm->rank () == 0) {
-              cerr << "-- Un-cache-blocked output Q factor" << endl;
-            }
-          }
-        }
-      }
-    };
-
-    /// \function verifyTsqr
-    /// \brief Test and print to stdout the accuracy of parallel TSQR
-    ///
-    /// \param which [in] Valid values: "MpiTbbTSQR" (for TBB-parallel
-    ///   node-level TSQR underneath MPI-parallel TSQR), "MpiSeqTSQR"
-    ///   (for cache-blocked sequential node-level TSQR underneath
-    ///   MPI-parallel TSQR)
-    ///
-    /// \param scalarTypeName [in] Name of the Scalar type
-    ///
-    /// \param generator [in/out] Normal(0,1) (pseudo)random number
-    ///   generator.  Only touched on MPI process 0.  Used to generate
-    ///   random test matrices for the factorization.
-    ///
-    /// \param nrows_global [in] Number of rows in the entire test
-    ///   matrix (over all processes) to generate.  The matrix will be
-    ///   divided up in blocks of contiguous rows among the processes.
-    ///
-    /// \param ncols [in] Number of columns in the test matrix to
-    ///   generate.
-    ///
-    /// \param ordinalComm [in/out] Object for communicating Ordinal
-    ///   (integer index) objects among the processes
-    ///
-    /// \param scalarComm [in/out] Object for communicating Scalar
-    ///   (matrix data) objects among the processes
-    ///
-    /// \param num_cores [in] Number of cores to use per MPI process
-    ///   for Intel TBB parallelism within that process
-    ///
-    /// \param cache_size_hint [in] Cache size hint (per core) in
-    ///   bytes.  If zero, a sensible default is used.
-    ///
-    /// \param contiguousCacheBlocks [in] Whether cache blocks
-    ///   should be stored contiguously
-    ///
-    /// \param printFieldNames [in] Whether to print field names (only
-    ///   appliable if not human_readable)
-    ///
-    /// \param human_readable [in] Whether output should be human
-    ///   readable, or machine parseable
-    ///
-    /// \param b_debug [in] Whether to print debug output
-    ///
-    template<class Ordinal, class Scalar, class Generator>
-    void
-    verifyTsqr (const std::string& which,
-                const std::string& scalarTypeName,
-                Generator& generator,
-                const Ordinal nrows_global,
-                const Ordinal ncols,
-                const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
-                const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
-                const int num_cores = 1,
-                const size_t cache_size_hint = 0,
-                const bool contiguousCacheBlocks,
-                const bool printFieldNames,
-                const bool human_readable = false,
-                const bool b_debug = false)
-    {
-      typedef typename Teuchos::ScalarTraits<Scalar>::magnitudeType magnitude_type;
-      using std::cerr;
-      using std::cout;
-      using std::endl;
-
-      const bool b_extra_debug = false;
-      const int nprocs = scalarComm->size();
-      const int my_rank = scalarComm->rank();
-      if (b_debug) {
-        scalarComm->barrier ();
-        if (my_rank == 0) {
-          cerr << "tsqr_verify:" << endl;
-        }
-        scalarComm->barrier ();
-      }
-      const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
-
-      // Set up storage for the test problem.
-      Matrix< Ordinal, Scalar > A_local (nrows_local, ncols);
-      Matrix< Ordinal, Scalar > Q_local (nrows_local, ncols);
-      if (std::numeric_limits<Scalar>::has_quiet_NaN) {
-        deep_copy (A_local, std::numeric_limits<Scalar>::quiet_NaN ());
-        deep_copy (Q_local, std::numeric_limits<Scalar>::quiet_NaN ());
-      }
-      Matrix<Ordinal, Scalar> R (ncols, ncols, Scalar(0));
-
-      // Generate the test problem.
-      distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
-      if (b_debug) {
-        scalarComm->barrier ();
-        if (my_rank == 0) {
-          cerr << "-- Generated test problem." << endl;
-        }
-      }
-
-      // Make sure that the test problem (the matrix to factor) was
-      // distributed correctly.
-      if (b_extra_debug && b_debug) {
-        if (my_rank == 0) {
-          cerr << "Test matrix A:" << endl;
-        }
-        scalarComm->barrier ();
-        printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
-        scalarComm->barrier ();
-      }
-
-      // Factoring the matrix stored in A_local overwrites it, so we
-      // make a copy of A_local.  Initialize with NaNs to make sure
-      // that cache blocking works correctly (if applicable).
-      Matrix<Ordinal, Scalar> A_copy (nrows_local, ncols);
-      if (std::numeric_limits<Scalar>::has_quiet_NaN) {
-        deep_copy (A_copy, std::numeric_limits< Scalar >::quiet_NaN ());
-      }
-
-      // actual_cache_size_hint: "cache_size_hint" is just a
-      // suggestion.  TSQR determines the cache size hint itself;
-      // this remembers it so we can print it out later.
-      size_t actual_cache_size_hint;
-
-      if (which == "MpiTbbTSQR") {
-#ifdef HAVE_KOKKOSTSQR_TBB
-        using Teuchos::RCP;
-        typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type;
-        typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
-        typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
-
-        RCP< node_tsqr_type > node_tsqr (new node_tsqr_type (num_cores, cache_size_hint));
-        RCP< dist_tsqr_type > dist_tsqr (new dist_tsqr_type (scalarComm));
-        tsqr_type tsqr (node_tsqr, dist_tsqr);
-
-        // Compute the factorization and explicit Q factor.
-        TsqrVerifier< tsqr_type >::verify (tsqr, scalarComm, A_local, A_copy,
-                                           Q_local, R, contiguousCacheBlocks,
-                                           b_debug);
-        // Save the "actual" cache block size
-        actual_cache_size_hint = tsqr.cache_size_hint();
-#else
-        throw std::logic_error("TSQR not built with Intel TBB support");
-#endif // HAVE_KOKKOSTSQR_TBB
-      }
-      else if (which == "MpiSeqTSQR") {
-        using Teuchos::RCP;
-        typedef SequentialTsqr< Ordinal, Scalar > node_tsqr_type;
-        typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
-        typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
-
-        RCP< node_tsqr_type > node_tsqr (new node_tsqr_type (cache_size_hint));
-        RCP< dist_tsqr_type > dist_tsqr (new dist_tsqr_type (scalarComm));
-        tsqr_type tsqr (node_tsqr, dist_tsqr);
-
-        // Compute the factorization and explicit Q factor.
-        TsqrVerifier< tsqr_type >::verify (tsqr, scalarComm, A_local, A_copy,
-                                           Q_local, R, contiguousCacheBlocks,
-                                           b_debug);
-        // Save the "actual" cache block size
-        actual_cache_size_hint = tsqr.cache_size_hint();
-      }
-      else {
-        throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
-      }
-
-      // Print out the Q and R factors
-      if (b_extra_debug && b_debug) {
-        if (my_rank == 0) {
-          cerr << endl << "Q factor:" << endl;
-        }
-        scalarComm->barrier ();
-        printGlobalMatrix (cerr, Q_local, scalarComm.get (), ordinalComm.get ());
-        scalarComm->barrier ();
-        if (my_rank == 0) {
-          cerr << endl << "R factor:" << endl;
-          print_local_matrix (cerr, ncols, ncols, R.data(), R.stride(1));
-          cerr << endl;
-        }
-        scalarComm->barrier ();
-      }
-
-      // Test accuracy of the resulting factorization
-      std::vector< magnitude_type > results =
-        global_verify (nrows_local, ncols, A_local.data(), A_local.stride(1),
-                       Q_local.data(), Q_local.stride(1), R.data(), R.stride(1),
-                       scalarComm.get());
-      if (b_debug) {
-        scalarComm->barrier ();
-        if (my_rank == 0) {
-          cerr << "-- Finished global_verify" << endl;
-        }
-      }
-
-      // Print the results on Proc 0.
-      if (my_rank == 0) {
-        if (human_readable) {
-          std::string human_readable_name;
-
-          if (which == "MpiSeqTSQR") {
-            human_readable_name = "MPI parallel / cache-blocked TSQR";
-          }
-          else if (which == "MpiTbbTSQR") {
-#ifdef HAVE_KOKKOSTSQR_TBB
-            human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR";
-#else
-            throw std::logic_error("TSQR not built with Intel TBB support");
-#endif // HAVE_KOKKOSTSQR_TBB
-          }
-          else {
-            throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
-          }
-
-          cout << human_readable_name << ":" << endl
-               << "Scalar type: " << scalarTypeName << endl
-               << "# rows: " << nrows_global << endl
-               << "# columns: " << ncols << endl
-               << "# MPI processes: " << nprocs << endl;
-#ifdef HAVE_KOKKOSTSQR_TBB
-          if (which == "MpiTbbTSQR")
-            cout << "# cores per process = " << num_cores << endl;
-#endif // HAVE_KOKKOSTSQR_TBB
-          cout << "Cache size hint in bytes: " << actual_cache_size_hint << endl
-               << "Contiguous cache blocks? " << contiguousCacheBlocks << endl
-               << "Absolute residual $\\| A - Q R \\|_2: "
-               << results[0] << endl
-               << "Absolute orthogonality $\\| I - Q^* Q \\|_2$: "
-               << results[1] << endl
-               << "Test matrix norm $\\| A \\|_F$: "
-               << results[2] << endl
-               << endl;
-        }
-        else {
-          if (printFieldNames) {
-            cout << "%"
-                 << "method"
-                 << ",scalarType"
-                 << ",globalNumRows"
-                 << ",numCols"
-                 << ",numProcs"
-                 << ",numCores"
-                 << ",cacheSizeHint"
-                 << ",contiguousCacheBlocks"
-                 << ",absFrobResid"
-                 << ",absFrobOrthog"
-                 << ",frobA" << endl;
-          }
-
-          cout << which
-               << "," << scalarTypeName
-               << "," << nrows_global
-               << "," << ncols
-               << "," << nprocs;
-#ifdef HAVE_KOKKOSTSQR_TBB
-          if (which == "MpiTbbTSQR") {
-            cout << "," << num_cores;
-          } else {
-            cout << ",1";
-          }
-#else
-          cout << ",1" << endl;
-#endif // HAVE_KOKKOSTSQR_TBB
-          cout << "," << actual_cache_size_hint
-               << "," << contiguousCacheBlocks
-               << "," << results[0]
-               << "," << results[1]
-               << "," << results[2]
-               << endl;
-        }
-      }
-    }
-
-
-    template<class TsqrBase, class TimerType>
-    double
-    do_tsqr_benchmark (const std::string& which,
-                       TsqrBase& tsqr,
-                       const Teuchos::RCP<MessengerBase<typename TsqrBase::scalar_type>>& messenger,
-                       const Matrix<typename TsqrBase::ordinal_type,
-                                    typename TsqrBase::scalar_type>& A_local,
-                       Matrix<typename TsqrBase::ordinal_type,
-                              typename TsqrBase::scalar_type>& A_copy,
-                       Matrix<typename TsqrBase::ordinal_type,
-                              typename TsqrBase::scalar_type>& Q_local,
-                       Matrix<typename TsqrBase::ordinal_type,
-                              typename TsqrBase::scalar_type>& R,
-                       const int ntrials,
-                       const bool contiguousCacheBlocks,
-                       const bool human_readable,
-                       const bool b_debug = false)
-    {
-      typedef typename TsqrBase::FactorOutput factor_output_type;
-      typedef typename TsqrBase::ordinal_type ordinal_type;
-      using std::cerr;
-      using std::cout;
-      using std::endl;
-
-      const ordinal_type nrows_local = A_local.extent(0);
-      const ordinal_type ncols = A_local.extent(1);
-
-      if (contiguousCacheBlocks) {
-        tsqr.cache_block (nrows_local, ncols, A_copy.data(),
-                          A_local.data(), A_local.stride(1));
-        if (b_debug) {
-          messenger->barrier ();
-          if (messenger->rank () == 0) {
-            cerr << "-- Cache-blocked input matrix to factor." << endl;
-          }
-        }
-      }
-      else {
-        deep_copy (A_copy, A_local);
-      }
-
-      if (b_debug) {
-        messenger->barrier ();
-        if (messenger->rank () == 0) {
-          cerr << "-- Starting timing loop" << endl;
-        }
-      }
-
-      // Benchmark TSQR for ntrials trials.  The answer (the numerical
-      // results of the factorization) is only valid if ntrials == 1,
-      // but this is a benchmark and not a verification routine.  Call
-      // tsqr_verify() if you want to determine whether TSQR computes
-      // the right answer.
-      //
-      // Name of timer doesn't matter here; we only need the timing.
-      TSQR::Test::verifyTimerConcept< TimerType >();
-      TimerType timer (which);
-
-
-      const bool testFactorExplicit = true;
-      double tsqr_timing;
-      if (testFactorExplicit) {
-        timer.start();
-        for (int trial_num = 0; trial_num < ntrials; ++trial_num)
-          tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(),
-                               contiguousCacheBlocks);
-        tsqr_timing = timer.stop();
-      }
-      else {
-        timer.start();
-        for (int trial_num = 0; trial_num < ntrials; ++trial_num) {
-          // Factor the matrix and compute the explicit Q factor.
-          // Don't worry about the fact that we're overwriting the
-          // input; this is a benchmark, not a numerical verification
-          // test.  (We have the latter implemented as tsqr_verify()
-          // in this file.)  For the same reason, don't worry about
-          // un-cache-blocking the output (when cache blocks are
-          // stored contiguously).
-          factor_output_type factor_output =
-            tsqr.factor (nrows_local, ncols, A_copy.data(), A_copy.stride(1),
-                         R.data(), R.stride(1), contiguousCacheBlocks);
-          tsqr.explicit_Q (nrows_local,
-                           ncols, A_copy.data(), A_copy.stride(1), factor_output,
-                           ncols, Q_local.data(), Q_local.stride(1),
-                           contiguousCacheBlocks);
-          // Timings in debug mode likely won't make sense, because
-          // Proc 0 is outputting the debug messages to cerr.
-          // Nevertheless, we don't put any "if(b_debug)" calls in the
-          // timing loop.
-        }
-        // Compute the resulting total time (in seconds) to execute
-        // ntrials runs of Tsqr::factor() and Tsqr::explicit_Q().  The
-        // time may differ on different MPI processes.
-        tsqr_timing = timer.stop();
-      }
-
-      if (b_debug) {
-        messenger->barrier();
-        if (messenger->rank() == 0)
-          cerr << "-- Finished timing loop" << endl;
-      }
-      return tsqr_timing;
-    }
-
-    /// \function benchmarkTsqr
-    /// \brief Benchmark parallel TSQR and report timings to stdout
-    ///
-    /// Benchmark the MPI-parallel TSQR implementation specified by
-    /// the "which" parameter (either with cache-blocked TSQR or
-    /// TBB-parallel cache-blocked TSQR as the node-level
-    /// implementation), for "ntrials" trials.  Print the stdout the
-    /// cumulative run time (in seconds) for all ntrials trials.
-    ///
-    /// \param which [in] Valid values: "MpiTbbTSQR" (for TBB-parallel
-    ///   node-level TSQR underneath MPI-parallel TSQR), "MpiSeqTSQR"
-    ///   (for cache-blocked sequential node-level TSQR underneath
-    ///   MPI-parallel TSQR)
-    ///
-    /// \param scalarTypeName [in] Name of the Scalar type
-    ///
-    /// \param generator [in/out] Normal(0,1) (pseudo)random number
-    ///   generator.  Only touched on MPI process 0.  Used to generate
-    ///   random test matrices for the factorization.
-    ///
-    /// \param ntrials [in] Number of trials to use in the benchmark.
-    ///   Reported timings are cumulative over all trials.
-    ///
-    /// \param nrows_global [in] Number of rows in the entire test
-    ///   matrix (over all processes) to generate.  The matrix will be
-    ///   divided up in blocks of contiguous rows among the processes.
-    ///
-    /// \param ncols [in] Number of columns in the test matrix to
-    ///   generate.
-    ///
-    /// \param ordinalComm [in/out] Object for communicating Ordinal
-    ///   (integer index) objects among the processes
-    ///
-    /// \param scalarComm [in/out] Object for communicating Scalar
-    ///   (matrix data) objects among the processes
-    ///
-    /// \param num_cores [in] Number of cores to use per MPI process
-    ///   for Intel TBB parallelism within that process
-    ///
-    /// \param cache_size_hint [in] Cache block size (per core) in
-    ///   bytes.  If zero, a sensible default is used.
-    ///
-    /// \param contiguousCacheBlocks [in] Whether cache blocks
-    ///   should be stored contiguously
-    ///
-    /// \param printFieldNames [in] Whether to print field names (only
-    ///   appliable if not human_readable)
-    ///
-    /// \param human_readable [in] Whether output should be human
-    ///   readable, or machine parseable
-    ///
-    /// \param b_debug [in] Whether to print debug output
-    ///
-    template<class Ordinal, class Scalar, class Generator, class TimerType>
-    void
-    benchmarkTsqr (const std::string& which,
-                   const std::string& scalarTypeName,
-                   Generator& generator,
-                   const int ntrials,
-                   const Ordinal nrows_global,
-                   const Ordinal ncols,
-                   const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
-                   const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
-                   const Ordinal num_cores,
-                   const size_t cache_size_hint,
-                   const bool contiguousCacheBlocks,
-                   const bool printFieldNames,
-                   const bool human_readable,
-                   const bool b_debug)
-    {
-      using std::cerr;
-      using std::cout;
-      using std::endl;
-
-      TSQR::Test::verifyTimerConcept< TimerType >();
-      const bool b_extra_debug = false;
-      const int nprocs = scalarComm->size();
-      const int my_rank = scalarComm->rank();
-      if (b_debug)
-        {
-          scalarComm->barrier();
-          if (my_rank == 0)
-            cerr << "tsqr_benchmark:" << endl;
-          scalarComm->barrier();
-        }
-      const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
-
-      // Set up storage for the test problem.
-      Matrix<Ordinal, Scalar> A_local (nrows_local, ncols);
-      Matrix<Ordinal, Scalar> Q_local (nrows_local, ncols);
-      if (std::numeric_limits<Scalar>::has_quiet_NaN) {
-        deep_copy (A_local, std::numeric_limits<Scalar>::quiet_NaN());
-        deep_copy (Q_local, std::numeric_limits<Scalar>::quiet_NaN());
-      }
-      Matrix<Ordinal, Scalar> R (ncols, ncols, Scalar {});
-
-      // Generate the test problem.
-      distributedTestProblem (generator, A_local, ordinalComm.get(),
-                              scalarComm.get());
-      if (b_debug) {
-        scalarComm->barrier();
-        if (my_rank == 0) {
-          cerr << "-- Generated test problem." << endl;
-        }
-      }
-
-      // Make sure that the test problem (the matrix to factor) was
-      // distributed correctly.
-      if (b_extra_debug && b_debug) {
-        if (my_rank == 0) {
-          cerr << "Test matrix A:" << endl;
-        }
-        scalarComm->barrier ();
-        printGlobalMatrix (cerr, A_local, scalarComm.get(),
-                           ordinalComm.get());
-        scalarComm->barrier ();
-      }
-
-      // Factoring the matrix stored in A_local overwrites it, so we
-      // make a copy of A_local.  If specified, rearrange cache blocks
-      // in the copy.  Initialize with NaNs to make sure that cache
-      // blocking worked correctly.
-      Matrix<Ordinal, Scalar> A_copy (nrows_local, ncols);
-      if (std::numeric_limits<Scalar>::has_quiet_NaN) {
-        deep_copy (A_copy, std::numeric_limits< Scalar >::quiet_NaN());
-      }
-
-      // actual_cache_size_hint: "cache_size_hint" is just a
-      // suggestion.  TSQR determines the cache block size itself;
-      // this remembers it so we can print it out later.
-      size_t actual_cache_size_hint;
-      // Run time (in seconds, as a double-precision floating-point
-      // value) for TSQR on this MPI node.
-      double tsqr_timing;
-
-      if (which == "MpiTbbTSQR") {
-#ifdef HAVE_KOKKOSTSQR_TBB
-        using Teuchos::RCP;
-        typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type;
-        typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
-        typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
-
-        RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (num_cores, cache_size_hint));
-        RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm));
-        tsqr_type tsqr (nodeTsqr, distTsqr);
-
-        // Run the benchmark.
-        tsqr_timing =
-          do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local,
-                                                     A_copy, Q_local, R, ntrials,
-                                                     contiguousCacheBlocks,
-                                                     human_readable, b_debug);
-
-        // Save the "actual" cache block size
-        actual_cache_size_hint = tsqr.cache_size_hint();
-#else
-        throw std::logic_error("TSQR not built with Intel TBB support");
-#endif // HAVE_KOKKOSTSQR_TBB
-      }
-      else if (which == "MpiSeqTSQR") {
-        using Teuchos::RCP;
-        using node_tsqr_type = SequentialTsqr<Ordinal, Scalar>;
-        using dist_tsqr_type = TSQR::DistTsqr<Ordinal, Scalar>;
-        using tsqr_type = typedef Tsqr<Ordinal, Scalar, node_tsqr_type, dist_tsqr_type>;
-
-        // Set up TSQR.
-        RCP<node_tsqr_type> nodeTsqr (new node_tsqr_type (cache_size_hint));
-        RCP<dist_tsqr_type> distTsqr (new dist_tsqr_type (scalarComm));
-        tsqr_type tsqr (nodeTsqr, distTsqr);
-
-        // Run the benchmark.
-        tsqr_timing =
-          do_tsqr_benchmark<tsqr_type, TimerType> (which, tsqr, scalarComm, A_local,
-                                                   A_copy, Q_local, R, ntrials,
-                                                   contiguousCacheBlocks,
-                                                   human_readable, b_debug);
-        // Save the "actual" cache block size
-        actual_cache_size_hint = tsqr.cache_size_hint();
-      }
-      else {
-        throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
-      }
-
-      // Find the min and max TSQR timing on all processors.
-      const double min_tsqr_timing = scalarComm->globalMin (tsqr_timing);
-      const double max_tsqr_timing = scalarComm->globalMax (tsqr_timing);
-
-      // Print the results on Proc 0.
-      if (my_rank == 0) {
-        if (human_readable) {
-          std::string human_readable_name;
-
-          if (which == "MpiSeqTSQR") {
-            human_readable_name = "MPI parallel / cache-blocked TSQR";
-          }
-          else if (which == "MpiTbbTSQR") {
-#ifdef HAVE_KOKKOSTSQR_TBB
-            human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR";
-#else
-            throw std::logic_error("TSQR not built with Intel TBB support");
-#endif // HAVE_KOKKOSTSQR_TBB
-          }
-          else {
-            throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
-          }
-
-          cout << human_readable_name << ":" << endl
-               << "Scalar type: " << scalarTypeName << endl
-               << "# rows: " << nrows_global << endl
-               << "# columns: " << ncols << endl
-               << "# MPI processes: " << nprocs << endl;
-
-#ifdef HAVE_KOKKOSTSQR_TBB
-          if (which == "MpiTbbTSQR")
-            cout << "# cores per process: " << num_cores << endl;
-#endif // HAVE_KOKKOSTSQR_TBB
-
-          cout << "Cache size hint in bytes: " << actual_cache_size_hint << endl
-               << "contiguous cache blocks? " << contiguousCacheBlocks << endl
-               << "# trials: " << ntrials << endl
-               << "Min total time (s) over all MPI processes: "
-               << min_tsqr_timing << endl
-               << "Max total time (s) over all MPI processes: "
-               << max_tsqr_timing << endl
-               << endl;
-        }
-        else {
-          if (printFieldNames) {
-            cout << "%"
-                 << "method"
-                 << ",scalarType"
-                 << ",globalNumRows"
-                 << ",numCols"
-                 << ",numProcs"
-                 << ",numCores"
-                 << ",cacheSizeHint"
-                 << ",contiguousCacheBlocks"
-                 << ",numTrials"
-                 << ",minTiming"
-                 << ",maxTiming"
-                 << endl;
-          }
-          cout << which
-               << "," << scalarTypeName
-               << "," << nrows_global
-               << "," << ncols
-               << "," << nprocs;
-#ifdef HAVE_KOKKOSTSQR_TBB
-          if (which == "MpiTbbTSQR") {
-            cout << "," << num_cores;
-          }
-          else {
-            cout << ",1";
-          }
-#else
-          cout << ",1";
-#endif // HAVE_KOKKOSTSQR_TBB
-          cout << "," << actual_cache_size_hint
-               << "," << contiguousCacheBlocks
-               << "," << ntrials
-               << "," << min_tsqr_timing
-               << "," << max_tsqr_timing
-               << endl;
-        }
-      }
-    }
-  } // namespace Test
-} // namespace TSQR
-
-#endif // __TSQR_Test_TsqrTest_hpp
diff --git a/packages/tpetra/tsqr/src/Tsqr_Util.hpp b/packages/tpetra/tsqr/src/Tsqr_Util.hpp
index ddbe59f4f062..9cd657594977 100644
--- a/packages/tpetra/tsqr/src/Tsqr_Util.hpp
+++ b/packages/tpetra/tsqr/src/Tsqr_Util.hpp
@@ -40,14 +40,15 @@
 /// \file Tsqr_Util.hpp
 /// \brief Utilities for TSQR (the Tall Skinny QR factorization)
 
-#ifndef __TSQR_Tsqr_Util_hpp
-#define __TSQR_Tsqr_Util_hpp
+#ifndef TSQR_UTIL_HPP
+#define TSQR_UTIL_HPP
 
 #include "Teuchos_ScalarTraits.hpp"
+#include "Tsqr_MatView.hpp"
 
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
+#ifdef HAVE_TPETRATSQR_COMPLEX
 #  include <complex>
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+#endif // HAVE_TPETRATSQR_COMPLEX
 
 #include <algorithm>
 #include <ostream>
@@ -143,36 +144,6 @@ namespace TSQR {
     }
   }
 
-  template< class Ordinal, class Scalar >
-  void
-  copy_upper_triangle (const Ordinal nrows,
-                       const Ordinal ncols,
-                       Scalar* const R_out,
-                       const Ordinal ldr_out,
-                       const Scalar* const R_in,
-                       const Ordinal ldr_in)
-  {
-    if (nrows >= ncols) {
-      for (Ordinal j = 0; j < ncols; ++j) {
-        Scalar* const A_j = &R_out[j*ldr_out];
-        const Scalar* const B_j = &R_in[j*ldr_in];
-        for (Ordinal i = 0; i <= j; ++i) {
-          A_j[i] = B_j[i];
-        }
-      }
-    }
-    else {
-      copy_upper_triangle (nrows, nrows, R_out, ldr_out, R_in, ldr_in);
-      for (Ordinal j = nrows; j < ncols; j++) {
-        Scalar* const A_j = &R_out[j*ldr_out];
-        const Scalar* const B_j = &R_in[j*ldr_in];
-        for (Ordinal i = 0; i < nrows; i++)
-          A_j[i] = B_j[i];
-      }
-    }
-  }
-
-
   template< class Scalar >
   class SumSquare {
   public:
@@ -181,7 +152,7 @@ namespace TSQR {
     }
   };
 
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
+#ifdef HAVE_TPETRATSQR_COMPLEX
   // Specialization for complex numbers
   template<class Scalar>
   class SumSquare<std::complex<Scalar> >  {
@@ -192,58 +163,8 @@ namespace TSQR {
       return result + absval * absval;
     }
   };
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
-  template<class Ordinal, class Scalar>
-  void
-  pack_R_factor (const Ordinal nrows,
-                 const Ordinal ncols,
-                 const Scalar R_in[],
-                 const Ordinal ldr_in,
-                 Scalar buffer[])
-  {
-    Ordinal count = 0; // current position in output buffer
-    if (nrows >= ncols) {
-      for (Ordinal j = 0; j < ncols; ++j) {
-        for (Ordinal i = 0; i <= j; ++i) {
-          buffer[count++] = R_in[i + j*ldr_in];
-        }
-      }
-    }
-    else {
-      for (Ordinal j = 0; j < nrows; ++j) {
-        for (Ordinal i = 0; i <= j; ++i) {
-          buffer[count++] = R_in[i + j*ldr_in];
-        }
-      }
-    }
-  }
-
-  template< class Ordinal, class Scalar >
-  void
-  unpack_R_factor (const Ordinal nrows,
-                   const Ordinal ncols,
-                   Scalar R_out[],
-                   const Ordinal ldr_out,
-                   const Scalar buffer[])
-  {
-    Ordinal count = 0; // current position in input buffer
-    if (nrows >= ncols) {
-      for (Ordinal j = 0; j < ncols; ++j) {
-        for (Ordinal i = 0; i <= j; ++i) {
-          R_out[i + j*ldr_out] = buffer[count++];
-        }
-      }
-    }
-    else {
-      for (Ordinal j = 0; j < nrows; ++j) {
-        for (Ordinal i = 0; i <= j; ++i) {
-          R_out[i + j*ldr_out] = buffer[count++];
-        }
-      }
-    }
-  }
+#endif // HAVE_TPETRATSQR_COMPLEX
 
 } // namespace TSQR
 
-#endif // __TSQR_Tsqr_Util_hpp
+#endif // TSQR_UTIL_HPP
diff --git a/packages/tpetra/tsqr/test/CMakeLists.txt b/packages/tpetra/tsqr/test/CMakeLists.txt
index 26bc2e6a0cb6..5bcdb5a21905 100644
--- a/packages/tpetra/tsqr/test/CMakeLists.txt
+++ b/packages/tpetra/tsqr/test/CMakeLists.txt
@@ -1,104 +1,135 @@
-# It's not necessary to run the first five tests in an MPI build
-# ("COMM mpi"), since none of them need to run on more than one MPI
-# process.  However, it's useful to have the tests around in an MPI
-# build, so we also build the tests there.  In an MPI build, only
-# Process 0 in MPI_COMM_WORLD runs the tests; the other ranks are
-# quieted.
+# It's not necessary to run most of the tests below in an MPI build
+# ("COMM mpi"), since only two of them (DistTsqr and FullTsqr) need to
+# run on more than one MPI process.  However, it's useful to have the
+# tests around in an MPI build, so we also build the tests there.  In
+# an MPI build, only Process 0 in MPI_COMM_WORLD runs the tests; the
+# other ranks are quieted.
+
+ASSERT_DEFINED(TPL_ENABLE_CUDA)
+ASSERT_DEFINED(Kokkos_ENABLE_Cuda)
+ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUBLAS)
+ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUSOLVER)
+# Default to OFF; CUDA tests are enabled only if all four options above are ON.
+SET (TpetraTSQR_ENABLE_CUDA_TESTS OFF)
+IF (TPL_ENABLE_CUDA AND Kokkos_ENABLE_Cuda AND ${PACKAGE_NAME}_ENABLE_CUBLAS AND ${PACKAGE_NAME}_ENABLE_CUSOLVER)
+  SET (TpetraTSQR_ENABLE_CUDA_TESTS ON)
+ENDIF ()
+
+IF (TpetraTSQR_ENABLE_CUDA_TESTS)
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  CuSolver
+  SOURCES CuSolver.cpp
+  COMM serial mpi
+  ARGS ""
+  STANDARD_PASS_OUTPUT
+  NUM_MPI_PROCS 1
+)
+ENDIF (TpetraTSQR_ENABLE_CUDA_TESTS)
 
 # Performance and accuracy test suite for TSQR::Combine (which factors
 # cache blocks and combines triangular factors).
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+
+TRIBITS_ADD_EXECUTABLE(
   Combine
   SOURCES Tsqr_TestCombine.cpp
   COMM serial mpi
-  ARGS "--verify --testReal"
+  )
+
+TRIBITS_ADD_TEST(
+  Combine
+  NAME Combine_100rows_5cols
+  COMM serial mpi
+  ARGS "--verify --numRows=100 --numCols=5"
   STANDARD_PASS_OUTPUT
   NUM_MPI_PROCS 1
   )
 
-# Test TSQR::SequentialTsqr (sequential cache-blocked TSQR).
-TRIBITS_ADD_EXECUTABLE(
-  SequentialTsqr
-  SOURCES Tsqr_TestSeqTsqr.cpp
+TRIBITS_ADD_TEST(
+  Combine
+  NAME Combine_100rows_50cols
   COMM serial mpi
+  ARGS "--verify --numRows=100 --numCols=50"
+  STANDARD_PASS_OUTPUT
+  NUM_MPI_PROCS 1
   )
 
 TRIBITS_ADD_TEST(
-  SequentialTsqr
-  NAME SequentialTsqr_contiguousCacheBlocks
+  Combine
+  NAME Combine_10000rows_11cols
   COMM serial mpi
-  ARGS "--verify --nrows=100000 --ncols=10 --cache-block-size=5000 --contiguous-cache-blocks"
+  ARGS "--verify --numRows=10000 --numCols=11"
   STANDARD_PASS_OUTPUT
   NUM_MPI_PROCS 1
   )
 
+# This executable can test any NodeTsqr subclass that
+# TSQR::NodeTsqrFactory can create.  It can check accuracy (--verify)
+# and/or timing (--benchmark).  For both of these, it can compare with
+# LAPACK.  Thus, this can serve as a check for your LAPACK
+# implementation as well.  Run the executable with --help to see all
+# the options.  It builds with or without MPI, but only runs with one
+# MPI process.
+
+TRIBITS_ADD_EXECUTABLE(
+  NodeTsqr
+  SOURCES Tsqr_TestNodeTsqr.cpp
+  COMM serial mpi
+  )
+
+SET(TSQR_SEQUENTIALTSQR_COMPLEX_BROKEN ON)
+SET(TSQR_SEQUENTIALTSQR_BASE_ARGS "--verify --NodeTsqr=SequentialTsqr")
+IF(TSQR_SEQUENTIALTSQR_COMPLEX_BROKEN)
+  SET(TSQR_SEQUENTIALTSQR_BASE_ARGS "${TSQR_SEQUENTIALTSQR_BASE_ARGS} --noTestComplex")
+ELSE()
+  SET(TSQR_SEQUENTIALTSQR_BASE_ARGS "${TSQR_SEQUENTIALTSQR_BASE_ARGS} --testComplex")
+ENDIF()
+
 TRIBITS_ADD_TEST(
-  SequentialTsqr
-  NAME SequentialTsqr_noncontiguousCacheBlocks
+  NodeTsqr
+  NAME SequentialTsqr_contiguousCacheBlocks
   COMM serial mpi
-  ARGS "--verify --nrows=100000 --ncols=10 --cache-block-size=5000"
+  ARGS "${TSQR_SEQUENTIALTSQR_BASE_ARGS} --numRows=100000 --numCols=10 --cacheBlockSize=5000 --contiguousCacheBlocks"
   STANDARD_PASS_OUTPUT
   NUM_MPI_PROCS 1
   )
 
-# Performance and accuracy test suite for TSQR::KokkosNodeTsqr
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  KokkosHostTsqr
-  SOURCES Tsqr_TestKokkosNodeTsqr.cpp
+TRIBITS_ADD_TEST(
+  NodeTsqr
+  NAME SequentialTsqr_noncontiguousCacheBlocks
   COMM serial mpi
-  ARGS "--verify --numRows=100000 --numCols=10"
+  ARGS "${TSQR_SEQUENTIALTSQR_BASE_ARGS} --numRows=100000 --numCols=10 --cacheBlockSize=5000"
   STANDARD_PASS_OUTPUT
   NUM_MPI_PROCS 1
   )
 
-# This test uses LAPACK's QR factorization to get a reference for
-# performance and accuracy.  It doesn't run any parts of the TSQR
-# algorithm, but it does depend on some TSQR test code (for generating
-# the test matrix and measuring accuracy).
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  Lapack
-  SOURCES Tsqr_TestLapack.cpp
+TRIBITS_ADD_TEST(
+  NodeTsqr
+  NAME CombineNodeTsqr
   COMM serial mpi
-  ARGS "--verify --nrows=1000 --ncols=10 --ntrials=10"
+  ARGS "--verify --NodeTsqr=CombineNodeTsqr --numRows=1000 --numCols=15"
   STANDARD_PASS_OUTPUT
   NUM_MPI_PROCS 1
   )
 
-# Performance and accuracy test suite for TSQR::TBB::TbbTsqr
-# (shared-memory parallel cache-blocked TSQR, parallelized via Intel's
-# Threading Building Blocks library).
-#
-# Only build TBB-enabled TSQR if (surprise!) TBB is enabled.
-IF (KokkosTSQR_ENABLE_TBB)
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
-    TbbTsqr
-    SOURCES Tsqr_TestTbbTsqr.cpp
+IF (TpetraTSQR_ENABLE_CUDA_TESTS)
+  TRIBITS_ADD_TEST(
+    NodeTsqr
+    NAME CuSolverNodeTsqr_11_5
     COMM serial mpi
-    ARGS "--verify --nrows=100000 --ncols=10 --cache-block-size=50000 --contiguous-cache-blocks"
+    ARGS "--verify --NodeTsqr=CuSolverNodeTsqr --numRows=11 --numCols=5"
     STANDARD_PASS_OUTPUT
     NUM_MPI_PROCS 1
     )
-ENDIF()
 
-# mfh 22 Dec 2014: Disable this test, since KokkosNodeTsqr no longer
-# works with the new Kokkos Node types.
-#
-# Performance and accuracy test suite for TSQR::KokkosNodeTsqr
-# ("generic" intranode parallel TSQR).  We pick an odd number of
-# partitions to ensure correct results in that case, not just for
-# powers of two (which everybody tests first).  The number of
-# partitions is the maximum parallelism available in the algorithm,
-# but it's up to the Kokkos Node implementation to decide what
-# hardware resources to use (e.g., how many CPU cores, how many
-# threads, ...).
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
-#  KokkosNodeTsqr
-#  SOURCES Tsqr_TestKokkosNodeTsqr.cpp
-#  COMM serial mpi
-#  ARGS "--verify --numRows=100000 --numCols=10 --numPartitions=7 --cacheSizeHint=50000 --contiguousCacheBlocks"
-#  STANDARD_PASS_OUTPUT
-#  NUM_MPI_PROCS 1
-#  )
+  TRIBITS_ADD_TEST(
+    NodeTsqr
+    NAME CuSolverNodeTsqr_5000_20
+    COMM serial mpi
+    ARGS "--verify --NodeTsqr=CuSolverNodeTsqr --numRows=5000 --numCols=20"
+    STANDARD_PASS_OUTPUT
+    NUM_MPI_PROCS 1
+    )
+ENDIF ()
 
 #
 # Tests for the distributed-memory (MPI) part of TSQR.
@@ -106,21 +137,99 @@ ENDIF()
 
 # Performance and accuracy test suite for TSQR::DistTsqr (which
 # combines triangular factors from different MPI processes).
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  DistTsqr_Accuracy
+
+# Build one DistTsqr executable; the tests below run it on 1 and 4 MPI processes.
+TRIBITS_ADD_EXECUTABLE(
+  DistTsqr
   SOURCES Tsqr_TestDistTsqr.cpp
-  COMM mpi
+  COMM serial mpi
+  )
+
+TRIBITS_ADD_TEST(
+  DistTsqr
+  NAME DistTsqr_1_proc
+  COMM serial mpi
   ARGS "--verify --ncols=5 --explicit --implicit --real"
   STANDARD_PASS_OUTPUT
   NUM_MPI_PROCS 1
 )
 
+TRIBITS_ADD_TEST(
+  DistTsqr
+  NAME DistTsqr_4_proc
+  COMM mpi
+  ARGS "--verify --ncols=5 --explicit --implicit --real"
+  STANDARD_PASS_OUTPUT
+  NUM_MPI_PROCS 4
+)
+
 # Accuracy test for TSQR::Tsqr (the full TSQR implementation).
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  FullTsqr_Accuracy
+TRIBITS_ADD_EXECUTABLE(
+  FullTsqr
   SOURCES Tsqr_TestFullTsqr.cpp
   COMM mpi
-  ARGS "--numRowsLocal=100 --numCols=5 --testFactorExplicit --testReal"
+  )
+
+SET(TSQR_FULL_BASE_ARGS "--testFactorExplicit")
+
+TRIBITS_ADD_TEST(
+  FullTsqr
+  NAME FullTsqr_Accuracy_100rows_5cols
+  COMM mpi
+  ARGS "--numRowsLocal=100 --numCols=5 ${TSQR_FULL_BASE_ARGS}"
+  STANDARD_PASS_OUTPUT
+  NUM_MPI_PROCS 4
+)
+
+TRIBITS_ADD_TEST(
+  FullTsqr
+  NAME FullTsqr_Accuracy_100rows_20cols
+  COMM mpi
+  ARGS "--numRowsLocal=100 --numCols=20 ${TSQR_FULL_BASE_ARGS}"
+  STANDARD_PASS_OUTPUT
+  NUM_MPI_PROCS 4
+)
+
+TRIBITS_ADD_TEST(
+  FullTsqr
+  NAME FullTsqr_Accuracy_10000rows_5cols
+  COMM mpi
+  ARGS "--numRowsLocal=10000 --numCols=5 ${TSQR_FULL_BASE_ARGS}"
+  STANDARD_PASS_OUTPUT
+  NUM_MPI_PROCS 4
+)
+
+TRIBITS_ADD_TEST(
+  FullTsqr
+  NAME FullTsqr_Accuracy_10000rows_20cols
+  COMM mpi
+  ARGS "--numRowsLocal=10000 --numCols=20 ${TSQR_FULL_BASE_ARGS}"
+  STANDARD_PASS_OUTPUT
+  NUM_MPI_PROCS 4
+)
+
+IF (TpetraTSQR_ENABLE_CUDA_TESTS)
+  TRIBITS_ADD_TEST(
+    FullTsqr
+    NAME FullTsqr_Accuracy_1000rows_15cols_CuSolver
+    COMM mpi
+    ARGS "--numRowsLocal=1000 --numCols=15 --NodeTsqr=CuSolverNodeTsqr ${TSQR_FULL_BASE_ARGS}"
+    STANDARD_PASS_OUTPUT
+    NUM_MPI_PROCS 4
+  )
+ENDIF ()
+
+IF(TSQR_SEQUENTIALTSQR_COMPLEX_BROKEN)
+  SET(TSQR_FULL_BASE_ARGS_SEQ "--noTestComplex")
+ELSE()
+  SET(TSQR_FULL_BASE_ARGS_SEQ "--testComplex")
+ENDIF()
+
+TRIBITS_ADD_TEST(
+  FullTsqr
+  NAME FullTsqr_Accuracy_5000rows_100cols_Sequential
+  COMM mpi
+  ARGS "--numRowsLocal=5000 --numCols=100 --NodeTsqr=SequentialTsqr ${TSQR_FULL_BASE_ARGS_SEQ}"
   STANDARD_PASS_OUTPUT
   NUM_MPI_PROCS 4
 )
diff --git a/packages/tpetra/tsqr/test/CuSolver.cpp b/packages/tpetra/tsqr/test/CuSolver.cpp
new file mode 100644
index 000000000000..c9e801e393ec
--- /dev/null
+++ b/packages/tpetra/tsqr/test/CuSolver.cpp
@@ -0,0 +1,161 @@
+//@HEADER
+// ************************************************************************
+//
+//          Kokkos: Node API and Parallel Node Kernels
+//              Copyright (2008) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ************************************************************************
+//@HEADER
+
+#include "Tsqr_Impl_CuBlasHandle.hpp"
+#include "Tsqr_Impl_CuSolverHandle.hpp"
+#include "Tsqr_Impl_CuBlas.hpp"
+#include "Tsqr_Impl_CuSolver.hpp"
+#include "Tsqr_Impl_CuTypes.hpp"
+#include "Teuchos_StandardCatchMacros.hpp"
+#include "Teuchos_UnitTestHarness.hpp"
+#include "Kokkos_Core.hpp"
+#include <iostream>
+#include <type_traits>
+
+namespace { // (anonymous)
+// Real-type smoke test: fetch both handles, then build a CuSolver and a CuBlas.
+template<class RealType>
+void
+verifyReal (std::ostream& out, bool& success)
+{
+  using TSQR::Impl::CuSolver;
+  using TSQR::Impl::CuSolverHandle;
+  using TSQR::Impl::CudaValue;
+  using std::endl;
+  // The cuSOLVER handle singleton must be non-null.
+  CuSolverHandle s = CuSolverHandle::getSingleton ();
+  TEST_ASSERT( s.getHandle () != nullptr );
+  // One-int View in CudaSpace; CuSolver is given its raw pointer.
+  Kokkos::View<int, Kokkos::CudaSpace> info ("info");
+  CuSolver<RealType> solver (s, info.data ());
+  // For a real type, CudaValue must map RealType to itself.
+  using IST = typename CudaValue<RealType>::type;
+  static_assert (std::is_same<RealType, IST>::value,
+                 "CudaValue::type is wrong.");
+  const RealType x (666.0);
+  out << "Original x: " << x << ": Converted x: "
+      << CudaValue<RealType>::makeValue (x) << endl;
+  // Same for cuBLAS: non-null handle singleton, then construct a CuBlas.
+  using TSQR::Impl::CuBlas;
+  using TSQR::Impl::CuBlasHandle;
+  CuBlasHandle b = CuBlasHandle::getSingleton ();
+  TEST_ASSERT( b.getHandle () != nullptr );
+  // Constructing the object is the whole check; blas is not used further.
+  CuBlas<RealType> blas (b);
+}
+// Complex-type smoke test; compiled only if HAVE_TPETRATSQR_COMPLEX is defined.
+#ifdef HAVE_TPETRATSQR_COMPLEX
+template<class ComplexType>
+void
+verifyComplex (std::ostream& out, bool& success)
+{
+  using TSQR::Impl::CuSolver;
+  using TSQR::Impl::CuSolverHandle;
+  using TSQR::Impl::CudaValue;
+  using std::endl;
+  // The cuSOLVER handle singleton must be non-null.
+  CuSolverHandle s = CuSolverHandle::getSingleton ();
+  TEST_ASSERT( s.getHandle () != nullptr );
+  // One-int View in CudaSpace; CuSolver is given its raw pointer.
+  Kokkos::View<int, Kokkos::CudaSpace> info ("info");
+  CuSolver<ComplexType> solver (s, info.data ());
+  // IST is the type that CudaValue maps ComplexType to.
+  using IST = typename CudaValue<ComplexType>::type;
+  // Expect cuDoubleComplex for std::complex<double>, else cuFloatComplex.
+  using expected_z_IST = cuDoubleComplex;
+  using expected_c_IST = cuFloatComplex;
+  constexpr bool is_z =
+    std::is_same<ComplexType, std::complex<double>>::value;
+  using expected_IST = typename std::conditional<
+    is_z,
+    expected_z_IST,
+    expected_c_IST>::type;
+  static_assert (std::is_same<expected_IST, IST>::value,
+                 "CudaValue::type is wrong.");
+  const ComplexType x (666.0, 418.0);
+  const IST x_out = CudaValue<ComplexType>::makeValue (x);
+  out << "Original x: " << x << ": Converted x: ("
+      << x_out.x << "," << x_out.y << ")" << endl;
+  // Same for cuBLAS: non-null handle singleton, then construct a CuBlas.
+  using TSQR::Impl::CuBlas;
+  using TSQR::Impl::CuBlasHandle;
+  CuBlasHandle b = CuBlasHandle::getSingleton ();
+  TEST_ASSERT( b.getHandle () != nullptr );
+  // Constructing the object is the whole check; blas is not used further.
+  CuBlas<ComplexType> blas (b);
+}
+#endif // HAVE_TPETRATSQR_COMPLEX
+// Run the real-type checks, plus the complex-type checks when enabled.
+void
+verify (std::ostream& out, bool& success)
+{
+  verifyReal<double> (out, success);
+  verifyReal<float> (out, success);
+
+#ifdef HAVE_TPETRATSQR_COMPLEX
+  verifyComplex<std::complex<double>> (out, success);
+  verifyComplex<std::complex<float>> (out, success);
+#endif // HAVE_TPETRATSQR_COMPLEX
+}
+
+} // namespace (anonymous)
+// Test driver: set up Kokkos, run verify(), and print the pass/fail line.
+int
+main (int argc, char *argv[])
+{
+  using std::cout;
+  using std::endl;
+
+  cout << "Test cuBLAS and cuSOLVER handle creation" << endl;
+  // Passed by reference down to the TEST_ASSERT checks in verifyReal / verifyComplex.
+  bool success = true;
+  try {
+    Kokkos::ScopeGuard kokkosScope (argc, argv);
+    verify (cout, success);
+    // The Trilinos test framework expects a message like this.
+    if (success) {
+      cout << "\nEnd Result: TEST PASSED" << endl;
+    }
+    else {
+      cout << "\nEnd Result: TEST FAILED" << endl;
+    }
+  }
+  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
+  return ( success ? EXIT_SUCCESS : EXIT_FAILURE );
+}
diff --git a/packages/tpetra/tsqr/test/Tsqr_TestCombine.cpp b/packages/tpetra/tsqr/test/Tsqr_TestCombine.cpp
index 9e1344065d38..eab1f261cf03 100644
--- a/packages/tpetra/tsqr/test/Tsqr_TestCombine.cpp
+++ b/packages/tpetra/tsqr/test/Tsqr_TestCombine.cpp
@@ -37,30 +37,21 @@
 // ************************************************************************
 //@HEADER
 
-#include "Tsqr_ConfigDefs.hpp"
-#include "Teuchos_ConfigDefs.hpp" // HAVE_MPI
-#include "Teuchos_Tuple.hpp"
-#ifdef HAVE_MPI
-#  include "Teuchos_GlobalMPISession.hpp"
-#  include "Teuchos_oblackholestream.hpp"
-#endif // HAVE_MPI
 #include "Teuchos_CommandLineProcessor.hpp"
-#include "Teuchos_DefaultComm.hpp"
-#include "Teuchos_Time.hpp"
 #include "Teuchos_StandardCatchMacros.hpp"
+#include "Teuchos_Time.hpp"
 #include "Tsqr_CombineBenchmark.hpp"
 #include "Tsqr_CombineTest.hpp"
 
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
+#ifdef HAVE_TPETRATSQR_COMPLEX
 #  include <complex>
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+#endif // HAVE_TPETRATSQR_COMPLEX
 
-#include <fstream>
+#include "Kokkos_Core.hpp"
 #include <sstream>
 #include <stdexcept>
 #include <vector>
 
-
 namespace {
   using Teuchos::RCP;
 
@@ -76,67 +67,52 @@ namespace {
   // parameters.
   //
   struct TestParameters {
-    TestParameters () :
-      verify (false),
-      benchmark (false),
-      numRows (100),
-      numCols (5),
-      numTrials (3),
-      calibrate (false),
-      averageTimings (true),
-      testReal (true),
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      testComplex (true),
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-      printFieldNames (true),
-      printTrilinosTestStuff (true),
-      strictPerfTests (false),
-      allowance (1.2),
-      verbose (true),
-      debug (false)
-      {}
-
     // Whether to run the accuracy test.
-    bool verify;
+    bool verify = true;
     // Whether to run the performance test.
-    bool benchmark;
+    bool benchmark = false;
     // Number of rows in the test matrix.
-    int numRows;
+    int numRows = 100;
     // Number of columns in the test matrix.
-    int numCols;
+    int numCols = 5;
     // Number of trials (benchmark only).
-    int numTrials;
+    int numTrials = 3;
     // Whether to pick the number of trials automatically, using an
     // iterative calibration process (benchmark only).
-    bool calibrate;
-    // Whether to print averaged timings over all trials (true), or the cumulative timing over all trials (false).
-    bool averageTimings;
+    bool calibrate = false;
+    // Whether to print averaged timings over all trials (true), or
+    // the cumulative timing over all trials (false).
+    bool averageTimings = true;
     // Whether to test real-arithmetic routines.
-    bool testReal;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    // Whether to test complex-arithmetic routines.  We don't let this
-    // option exist unless TSQR was built with complex arithmetic
-    // support.
-    bool testComplex;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+    bool testReal = true;
+    // Whether to test complex-arithmetic routines.  If TSQR was not
+    // built with complex arithmetic support, then this must always be
+    // false.
+#ifdef HAVE_TPETRATSQR_COMPLEX
+    bool testComplex = true;
+#else
+    bool testComplex = false;
+#endif // HAVE_TPETRATSQR_COMPLEX
     // Whether to print column (field) names.
-    bool printFieldNames;
+    bool printFieldNames = true;
     // Whether to print output that the Trilinos test framework
     // expects, in order to judge a test as passed or failed.
-    bool printTrilinosTestStuff;
+    bool printTrilinosTestStuff = true;
     // Whether the benchmark should fail if performance of
-    // TSQR::CombineNative (and TSQR::CombineFortran, if applicable)
-    // relative to that of TSQR::CombineDefault is not good enough.
-    bool strictPerfTests;
+    // TSQR::CombineNative relative to that of TSQR::CombineDefault is
+    // not good enough.
+    bool strictPerfTests = false;
     // If strictPerfTests is true: how much slower CombineNative (and
     // CombineFortran, if applicable) is allowed to be, relative to
     // CombineDefault.
-    double allowance;
+    double allowance = 1.2;
     // Whether to print verbose status output.
-    bool verbose;
+    bool verbose = true;
     // Whether to print debugging output to stderr.
-    bool debug;
-    std::string additionalFieldNames, additionalData;
+    bool debug = false;
+
+    std::string additionalFieldNames;
+    std::string additionalData;
   };
 
   // Benchmark TSQR::Combine.
@@ -148,76 +124,60 @@ namespace {
   //   the following fields: numRows, numCols, numTrials,
   //   testReal, testComplex.
   //
-  // Warning: Call only on (MPI) rank 0.  Otherwise, you'll run the
-  //   test routine on every MPI rank simultaneously, but only report
-  //   results on rank 0.
+  // Warning: Call only on (MPI) Process 0.  Otherwise, you'll run the
+  //   test routine on every MPI process simultaneously, but only
+  //   report results on Process 0.
   void
-    benchmark (std::ostream& out,
-        const TestParameters& params)
-    {
-      std::vector<int> seed(4);
-      const bool useSeedValues = false; // Fill in seed with defaults.
-
-      using TSQR::Test::benchmarkCombine;
-      typedef Teuchos::Time timer_type;
-
-      TSQR::Test::CombineBenchmarkParameters testParams;
-      testParams.numRows = params.numRows;
-      testParams.numCols = params.numCols;
-      testParams.testReal = params.testReal;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      testParams.testComplex = params.testComplex;
-#else
-      testParams.testComplex = false;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-      testParams.numTrials = params.numTrials;
-      testParams.calibrate = params.calibrate;
-      testParams.averageTimings = params.averageTimings;
-      testParams.strictPerfTests = params.strictPerfTests;
-      testParams.allowance = params.allowance;
-      testParams.seed = seed;
-      testParams.useSeedValues = useSeedValues;
-      testParams.additionalFieldNames = params.additionalFieldNames;
-      testParams.additionalData = params.additionalData;
-      testParams.printFieldNames = params.printFieldNames;
-      testParams.debug = params.debug;
-
-      benchmarkCombine<timer_type> (out, testParams);
-    }
+  benchmark(std::ostream& out,
+            const TestParameters& params)
+  {
+    std::vector<int> seed(4);
+    const bool useSeedValues = false; // Fill in seed with defaults.
+
+    TSQR::Test::CombineBenchmarkParameters testParams;
+    testParams.numRows = params.numRows;
+    testParams.numCols = params.numCols;
+    testParams.testReal = params.testReal;
+    testParams.testComplex = params.testComplex;
+    testParams.numTrials = params.numTrials;
+    testParams.calibrate = params.calibrate;
+    testParams.averageTimings = params.averageTimings;
+    testParams.strictPerfTests = params.strictPerfTests;
+    testParams.allowance = params.allowance;
+    testParams.seed = seed;
+    testParams.useSeedValues = useSeedValues;
+    testParams.additionalFieldNames = params.additionalFieldNames;
+    testParams.additionalData = params.additionalData;
+    testParams.printFieldNames = params.printFieldNames;
+    testParams.debug = params.debug;
+
+    using timer_type = Teuchos::Time;
+    TSQR::Test::benchmarkCombine<timer_type>(out, testParams);
+  }
 
   // Test accuracy of TSQR::Combine.
   //
-  // out [out] output stream for benchmark results.
-  //   It will only be used on rank 0.
+  // out [out] output stream.  This function does not currently write
+  //   to it; verifyCombine is called without a stream argument.
   //
-  // params [in] test parameter struct.  This method reads
-  //   the following fields: numRows, numCols, numTrials,
-  //   testReal, testComplex.
+  // params [in] test parameter struct.  This method reads the
+  //   following fields: numRows, numCols, testReal, testComplex,
+  //   printFieldNames.
   //
-  // Warning: Call only on (MPI) rank 0.  Otherwise, you'll run
-  //   the test routine on every MPI rank simultaneously, but
-  //   only report results on rank 0.
+  // Warning: Call only on (MPI) Process 0.  Otherwise, you'll run the
+  //   test routine on every MPI process simultaneously, but only
+  //   report results on Process 0.
   void
-    verify (std::ostream& out,
-        const TestParameters& params)
-    {
-      typedef int ordinal_type;
-
-      const ordinal_type numRows = params.numRows;
-      const ordinal_type numCols = params.numCols;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      const bool testComplex = params.testComplex;
-#else
-      const bool testComplex = false;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-      const bool printFieldNames = params.printFieldNames;
-      const bool simulateSequentialTsqr = false;
-      const bool debug = false;
-
-      using TSQR::Test::verifyCombine;
-      verifyCombine (numRows, numCols, params.testReal, testComplex,
-          printFieldNames, simulateSequentialTsqr, debug);
-    }
+  verify(std::ostream& out, const TestParameters& params)
+  {
+    constexpr bool simulateSequentialTsqr = false;
+    constexpr bool debug = false;
+
+    using TSQR::Test::verifyCombine;
+    verifyCombine(params.numRows, params.numCols, params.testReal,
+                  params.testComplex, params.printFieldNames,
+                  simulateSequentialTsqr, debug);
+  }
 
   // \brief Parse command-line options for this test
   //
@@ -232,197 +192,178 @@ namespace {
   //
   // Return: Encapsulation of command-line options.
   TestParameters
-    parseOptions (int argc,
-        char* argv[],
-        const bool allowedToPrint,
-        bool& printedHelp)
-    {
-      using std::cerr;
-      using std::endl;
-
-      printedHelp = false;
-
-      // Command-line parameters, set to their default values.
-      TestParameters params;
-      try {
-        using Teuchos::CommandLineProcessor;
-
-        CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true,
-            /* recognizeAllOptions=*/ true);
-        cmdLineProc.setDocString (docString);
-        cmdLineProc.setOption ("verify",
-            "noverify",
-            &params.verify,
-            "Test accuracy of TSQR::Combine implementations.");
-        cmdLineProc.setOption ("benchmark",
-            "nobenchmark",
-            &params.benchmark,
-            "Test performance of TSQR::Combine implementations.");
-        cmdLineProc.setOption ("debug",
-            "nodebug",
-            &params.debug,
-            "Print copious debugging information to stderr.");
-        cmdLineProc.setOption ("numRows",
-            &params.numRows,
-            "Number of rows in the cache block test.");
-        cmdLineProc.setOption ("numCols",
-            &params.numCols,
-            "Number of columns in the cache block test, and "
-            "number of rows and columns in each upper triangular "
-            "matrix in the pair test.");
-        cmdLineProc.setOption ("numTrials",
-            &params.numTrials,
-            "For benchmarks: Number of trials.  "
-            "Ignored if --calibrate option is set.");
-        cmdLineProc.setOption ("calibrate",
-            "noCalibrate",
-            &params.calibrate,
-            "For benchmarks: ignore numTrials, and calibrate "
-            "the number of trials based on computed timer "
-            "resolution and problem size (numRows and "
-            "numCols).");
-        cmdLineProc.setOption ("meanTimings",
-            "sumTimings",
-            &params.averageTimings,
-            "For benchmarks: whether timings should be "
-            "computed as an arithmetic mean (true) or as a "
-            "sum (false) over all trials.");
-        cmdLineProc.setOption ("testReal",
-            "noTestReal",
-            &params.testReal,
-            "Test real-arithmetic routines.");
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-        cmdLineProc.setOption ("testComplex",
-            "noTestComplex",
-            &params.testComplex,
-            "Test complex-arithmetic routines.  This option "
-            "may only be set if Trilinos was built with "
-            "complex arithmetic support.");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-        cmdLineProc.setOption ("strictPerfTests",
-            "noStrictPerfTests",
-            &params.strictPerfTests,
-            "For benchmarks: whether the test should fail if "
-            "run time of TSQR::CombineNative / run time of "
-            "TSQR::CombineDefault (both for the cache block "
-            "benchmark) is greater than the given slowdown "
-            "allowance.  Ditto for TSQR::CombineFortran, if "
-            "TSQR was built with Fortran support.");
-        cmdLineProc.setOption ("allowance",
-            &params.allowance,
-            "For benchmarks: if strictPerfTests is true: "
-            "allowed slowdown factor.  If exceeded, the test "
-            "fails.");
-        cmdLineProc.setOption ("additionalFieldNames",
-            &params.additionalFieldNames,
-            "Any additional field name(s) (comma-delimited "
-            "string) to add to the benchmark output.  Empty "
-            "by default.  Good for things known when invoking "
-            "the benchmark executable, but not (easily) known "
-            "inside the benchmark -- e.g., environment "
-            "variables.");
-        cmdLineProc.setOption ("additionalData",
-            &params.additionalData,
-            "Any additional data to add to the output, "
-            "corresponding to the above field name(s). "
-            "Empty by default.");
-        cmdLineProc.setOption ("printFieldNames",
-            "noPrintFieldNames",
-            &params.printFieldNames,
-            "Print field names for benchmark output (including "
-            "any arguments to --fieldNames).");
-        cmdLineProc.setOption ("printTrilinosTestStuff",
-            "noPrintTrilinosTestStuff",
-            &params.printTrilinosTestStuff,
-            "Print output that makes the Trilinos test "
-            "framework happy (but makes benchmark results "
-            "parsing scripts unhappy)");
-        cmdLineProc.parse (argc, argv);
-      }
-      catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) {
-        if (allowedToPrint)
-          cerr << "Unrecognized command-line option: " << e.what() << endl;
-        throw e;
-      }
-      catch (Teuchos::CommandLineProcessor::HelpPrinted& e) {
-        printedHelp = true;
-        return params; // Don't verify parameters in this case
-      }
-
-      // Validate.  TODO (mfh 08 Jul 2010) Figure out how to do this with
-      // ParameterList validators.
-      if (params.numRows <= 0)
-        throw std::invalid_argument ("Number of rows must be positive");
-      else if (params.numCols <= 0)
-        throw std::invalid_argument ("Number of columns must be positive");
-      else if (params.numRows < params.numCols)
-        throw std::invalid_argument ("Number of rows must be >= number of columns");
-      else if (params.benchmark && params.numTrials < 1)
-        throw std::invalid_argument ("Benchmark requires numTrials >= 1");
-
-      return params;
+  parseOptions(int argc,
+               char* argv[],
+               std::ostream& err,
+               bool& printedHelp)
+  {
+    using std::endl;
+
+    printedHelp = false;
+
+    // Command-line parameters, set to their default values.
+    TestParameters params {};
+    try {
+      constexpr bool throwExceptions = true;
+      constexpr bool recognizeAllOptions = true;
+      using CLP = Teuchos::CommandLineProcessor;
+      CLP cmdLineProc(throwExceptions, recognizeAllOptions);
+      cmdLineProc.setDocString(docString);
+      cmdLineProc.setOption("verify",
+                            "noverify",
+                            &params.verify,
+                            "Test accuracy of TSQR::Combine implementations.");
+      cmdLineProc.setOption("benchmark",
+                            "nobenchmark",
+                            &params.benchmark,
+                            "Test performance of TSQR::Combine implementations.");
+      cmdLineProc.setOption("debug",
+                            "nodebug",
+                            &params.debug,
+                            "Print copious debugging information to stderr.");
+      cmdLineProc.setOption("numRows",
+                            &params.numRows,
+                            "Number of rows in the cache block test.");
+      cmdLineProc.setOption("numCols",
+                            &params.numCols,
+                            "Number of columns in the cache block test, and "
+                            "number of rows and columns in each upper triangular "
+                            "matrix in the pair test.");
+      cmdLineProc.setOption("numTrials",
+                            &params.numTrials,
+                            "For benchmarks: Number of trials.  "
+                            "Ignored if --calibrate option is set.");
+      cmdLineProc.setOption("calibrate",
+                            "noCalibrate",
+                            &params.calibrate,
+                            "For benchmarks: ignore numTrials, and calibrate "
+                            "the number of trials based on computed timer "
+                            "resolution and problem size (numRows and "
+                            "numCols).");
+      cmdLineProc.setOption("meanTimings",
+                            "sumTimings",
+                            &params.averageTimings,
+                            "For benchmarks: whether timings should be "
+                            "computed as an arithmetic mean (true) or as a "
+                            "sum (false) over all trials.");
+      cmdLineProc.setOption("testReal",
+                            "noTestReal",
+                            &params.testReal,
+                            "Test real-arithmetic routines.");
+      cmdLineProc.setOption("testComplex",
+                            "noTestComplex",
+                            &params.testComplex,
+                            "Test complex-arithmetic routines.  This option "
+                            "may only be true if Trilinos was built with "
+                            "complex arithmetic support.");
+      cmdLineProc.setOption("strictPerfTests",
+                            "noStrictPerfTests",
+                            &params.strictPerfTests,
+                            "For benchmarks: whether the test should fail if "
+                            "run time of TSQR::CombineNative / run time of "
+                            "TSQR::CombineDefault (both for the cache block "
+                            "benchmark) is greater than the given slowdown "
+                            "allowance.  Ditto for TSQR::CombineFortran, if "
+                            "TSQR was built with Fortran support.");
+      cmdLineProc.setOption("allowance",
+                            &params.allowance,
+                            "For benchmarks: if strictPerfTests is true: "
+                            "allowed slowdown factor.  If exceeded, the test "
+                            "fails.");
+      cmdLineProc.setOption("additionalFieldNames",
+                            &params.additionalFieldNames,
+                            "Any additional field name(s) (comma-delimited "
+                            "string) to add to the benchmark output.  Empty "
+                            "by default.  Good for things known when invoking "
+                            "the benchmark executable, but not (easily) known "
+                            "inside the benchmark -- e.g., environment "
+                            "variables.");
+      cmdLineProc.setOption("additionalData",
+                            &params.additionalData,
+                            "Any additional data to add to the output, "
+                            "corresponding to the above field name(s). "
+                            "Empty by default.");
+      cmdLineProc.setOption("printFieldNames",
+                            "noPrintFieldNames",
+                            &params.printFieldNames,
+                            "Print field names for benchmark output (including "
+                            "any arguments to --fieldNames).");
+      cmdLineProc.setOption("printTrilinosTestStuff",
+                            "noPrintTrilinosTestStuff",
+                            &params.printTrilinosTestStuff,
+                            "Print output that makes the Trilinos test "
+                            "framework happy (but makes benchmark results "
+                            "parsing scripts unhappy)");
+      cmdLineProc.parse(argc, argv);
+    }
+    catch(Teuchos::CommandLineProcessor::UnrecognizedOption& e) {
+      err << "Unrecognized command-line option: " << e.what() << endl;
+      throw e;
+    }
+    catch(Teuchos::CommandLineProcessor::HelpPrinted& e) {
+      printedHelp = true;
+      return params; // Don't verify parameters in this case
     }
-} // namespace (anonymous)
-
 
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (params.numRows <= 0, std::invalid_argument, "Number of "
+       "rows must be positive, but you set --numRows=" <<
+       params.numRows << ".");
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (params.numCols <= 0, std::invalid_argument, "Number of "
+       "columns must be positive, but you set --numCols=" <<
+       params.numCols << ".");
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (params.numRows < params.numCols, std::invalid_argument,
+       "Number of rows must be >= number of columns, but "
+       "--numRows=" << params.numRows << " and --numCols=" <<
+       params.numCols << ".");
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (params.benchmark && params.numTrials < 1,
+       std::invalid_argument, "If you set --benchmark, then the "
+       "number of trials must be positive, but you set --numTrials="
+       << params.numTrials << ".");
+#ifndef HAVE_TPETRATSQR_COMPLEX
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (params.testComplex, std::invalid_argument, "Complex "
+       "arithmetic support was not enabled at configure time, "
+       "but you set --testComplex.");
+#endif // HAVE_TPETRATSQR_COMPLEX
+    return params;
+  }
+} // namespace (anonymous)
 
-  int
-main (int argc, char *argv[])
+int
+main(int argc, char *argv[])
 {
-  using Teuchos::RCP;
-
-#ifdef HAVE_MPI
-  typedef RCP< const Teuchos::Comm<int> > comm_ptr;
-
-  Teuchos::oblackholestream blackhole;
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole);
-  comm_ptr comm = Teuchos::DefaultComm<int>::getComm();
-  const int myRank = comm->getRank();
-  // Only Rank 0 gets to write to stdout.  The other MPI process ranks
-  // send their output to something that looks like /dev/null (and
-  // likely is, on Unix-y operating systems).
-  std::ostream& out = (myRank == 0) ? std::cout : blackhole;
-  // Only Rank 0 performs the tests.
-  const bool performingTests = (myRank == 0);
-  const bool allowedToPrint = (myRank == 0);
-
-#else // Don't HAVE_MPI: single-node test
-
-  const bool performingTests = true;
-  const bool allowedToPrint = true;
-  std::ostream& out = std::cout;
-#endif // HAVE_MPI
+  using std::cout;
+  using std::cerr;
+  using std::endl;
 
   // Fetch command-line parameters.
   bool printedHelp = false;
-  TestParameters params =
-    parseOptions (argc, argv, allowedToPrint, printedHelp);
-  if (printedHelp)
-    return 0;
-
+  auto params = parseOptions(argc, argv, cerr, printedHelp);
+  if(printedHelp) {
+    return EXIT_SUCCESS;
+  }
   bool success = false;
-  bool verbose = false;
+  constexpr bool actually_print_caught_exceptions = true;
   try {
-    if (performingTests)
-    {
-      using std::endl;
-
-      if (params.benchmark)
-        benchmark (out, params);
-
-      // We allow the same run to do both benchmark and verify.
-      if (params.verify)
-        verify (out, params);
-
-      success = true;
-
-      if (params.printTrilinosTestStuff)
-        // The Trilinos test framework expects a message like this.
-        out << "\nEnd Result: TEST PASSED" << endl;
+    Kokkos::ScopeGuard kokkosScope(argc, argv);
+    if(params.benchmark) {
+      benchmark(cout, params);
+    }
+    // We allow the same run to do both benchmark and verify.
+    if(params.verify) {
+      verify(cout, params);
+    }
+    success = true;
+    if(params.printTrilinosTestStuff) {
+      // The Trilinos test framework expects a message like this.
+      cout << "\nEnd Result: TEST PASSED" << endl;
     }
   }
-  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
+  TEUCHOS_STANDARD_CATCH_STATEMENTS
+    (actually_print_caught_exceptions, cerr, success);
   return ( success ? EXIT_SUCCESS : EXIT_FAILURE );
 }
diff --git a/packages/tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp
index 33210c6c81f4..a02891745b3f 100644
--- a/packages/tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp
+++ b/packages/tpetra/tsqr/test/Tsqr_TestDistTsqr.cpp
@@ -37,163 +37,854 @@
 // ************************************************************************
 //@HEADER
 
-#include "Tsqr_ConfigDefs.hpp"
-
-#ifdef HAVE_MPI
-#  include "Teuchos_GlobalMPISession.hpp"
-#  include "Teuchos_oblackholestream.hpp"
-#endif // HAVE_MPI
+#include "Tsqr_Random_NormalGenerator.hpp"
+#include "Tsqr_generateStack.hpp"
+#include "Tsqr_DistTsqr.hpp"
+#include "Tsqr_GlobalTimeStats.hpp"
+#include "Tsqr_GlobalVerify.hpp"
+#include "Tsqr_printGlobalMatrix.hpp"
 
+#include "Tsqr_Test_MpiAndKokkosScope.cpp"
+#include "Tsqr_TeuchosMessenger.hpp"
 #include "Teuchos_CommandLineProcessor.hpp"
-#include "Teuchos_DefaultComm.hpp"
-#include "Teuchos_RCP.hpp"
 #include "Teuchos_Time.hpp"
 #include "Teuchos_StandardCatchMacros.hpp"
 
-#include "Tsqr_ParTest.hpp"
-#include "Tsqr_TeuchosMessenger.hpp"
-
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
+#include <algorithm>
+#ifdef HAVE_TPETRATSQR_COMPLEX
 #  include <complex>
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
+#endif // HAVE_TPETRATSQR_COMPLEX
+#include <iomanip>
+#include <iostream>
 #include <sstream>
 #include <stdexcept>
 #include <vector>
 
-using TSQR::MessengerBase;
-using TSQR::TeuchosMessenger;
-using TSQR::Test::DistTsqrVerifier;
-using TSQR::Test::DistTsqrBenchmarker;
+namespace TSQR {
+  namespace Test {
+    /// \class DistTsqrVerifier
+    /// \brief Generic version of \c DistTsqr accuracy test.
+    template<class Ordinal, class Scalar>
+    class DistTsqrVerifier {
+      TSQR::Random::NormalGenerator<Ordinal, Scalar> gen_;
+      Teuchos::RCP<MessengerBase<Ordinal> > const ordinalComm_;
+      Teuchos::RCP<MessengerBase<Scalar> > const scalarComm_;
+      std::string scalarTypeName_;
+      std::ostream& out_;
+      std::ostream& err_;
+      const bool testFactorExplicit_, testFactorImplicit_;
+      const bool humanReadable_, printMatrices_, debug_;
 
-using Teuchos::RCP;
-using Teuchos::rcp;
-using Teuchos::rcp_implicit_cast;
-using Teuchos::Tuple;
+    public:
+      using ordinal_type = Ordinal;
+      using scalar_type = Scalar;
+      using mag_type =
+        typename Teuchos::ScalarTraits<scalar_type>::magnitudeType;
+      using result_type = std::vector<mag_type>;
 
+      /// \brief Constructor, with custom seed value
+      ///
+      /// \param ordinalComm [in/out] Communicator for Ordinal values.
+      /// \param scalarComm [in/out] Communicator object over which to test.
+      /// \param seed [in] 4-element vector; the random seed input of
+      ///   TSQR::Random::NormalGenerator (which see, since there are
+      ///   restrictions on the set of valid seeds)
+      /// \param scalarTypeName [in] Human-readable name of the Scalar
+      ///   template type parameter
+      /// \param out [out] Output stream to which to write results
+      /// \param err [out] Output stream to which to write any
+      ///   debugging outputs (if applicable) or errors
+      /// \param testFactorExplicit [in] Whether to test
+      ///   DistTsqr::factorExplicit()
+      /// \param testFactorImplicit [in] Whether to test
+      ///   DistTsqr::factor() and DistTsqr::explicit_Q()
+      /// \param humanReadable [in] Whether printed results should be
+      ///   easy for humans to read (vs. easy for parsers to parse)
+      /// \param printMatrices [in] Whether to print the matrices to err
+      /// \param debug [in] Whether to write verbose debug output to err
+      DistTsqrVerifier(const Teuchos::RCP<MessengerBase<Ordinal> >& ordinalComm,
+                       const Teuchos::RCP<MessengerBase<Scalar> >& scalarComm,
+                       const std::vector<int>& seed,
+                       const std::string& scalarTypeName,
+                       std::ostream& out,
+                       std::ostream& err,
+                       const bool testFactorExplicit,
+                       const bool testFactorImplicit,
+                       const bool humanReadable,
+                       const bool printMatrices,
+                       const bool debug) :
+        gen_(seed),
+        ordinalComm_(ordinalComm),
+        scalarComm_(scalarComm),
+        scalarTypeName_(scalarTypeName),
+        out_(out),
+        err_(err),
+        testFactorExplicit_(testFactorExplicit),
+        testFactorImplicit_(testFactorImplicit),
+        humanReadable_(humanReadable),
+        printMatrices_(printMatrices),
+        debug_(debug)
+      {}
 
-template< class Ordinal, class Scalar >
-class MessengerPairMaker {
-  public:
-    typedef int ordinal_type;
-    typedef Scalar scalar_type;
+      /// \brief Constructor, with default seed value
+      ///
+      /// This constructor sets a default seed (for the pseudorandom
+      /// number generator), which is the same seed (0,0,0,1) each
+      /// time.
+      ///
+      /// \param ordinalComm [in/out] Communicator for Ordinal values.
+      /// \param scalarComm [in/out] Communicator object over which to test.
+      /// \param scalarTypeName [in] Human-readable name of the Scalar
+      ///   template type parameter
+      /// \param out [out] Output stream to which to write results
+      /// \param err [out] Output stream to which to write any
+      ///   debugging outputs (if applicable) or errors
+      /// \param testFactorExplicit [in] Whether to test
+      ///   DistTsqr::factorExplicit()
+      /// \param testFactorImplicit [in] Whether to test
+      ///   DistTsqr::factor() and DistTsqr::explicit_Q()
+      /// \param humanReadable [in] Whether printed results should be
+      ///   easy for humans to read (vs. easy for parsers to parse)
+      /// \param printMatrices [in] Whether to print the matrices to err
+      /// \param debug [in] Whether to write verbose debug output to err
+      DistTsqrVerifier(const Teuchos::RCP<MessengerBase<Ordinal> >& ordinalComm,
+                       const Teuchos::RCP<MessengerBase<Scalar> >& scalarComm,
+                       const std::string& scalarTypeName,
+                       std::ostream& out,
+                       std::ostream& err,
+                       const bool testFactorExplicit,
+                       const bool testFactorImplicit,
+                       const bool humanReadable,
+                       const bool printMatrices,
+                       const bool debug) :
+        ordinalComm_(ordinalComm),
+        scalarComm_(scalarComm),
+        scalarTypeName_(scalarTypeName),
+        out_(out),
+        err_(err),
+        testFactorExplicit_(testFactorExplicit),
+        testFactorImplicit_(testFactorImplicit),
+        humanReadable_(humanReadable),
+        printMatrices_(printMatrices),
+        debug_(debug)
+      {}
 
-    typedef std::pair<RCP<MessengerBase<ordinal_type> >, RCP<MessengerBase<scalar_type> > > pair_type;
+      /// \brief Get seed vector for pseudorandom number generator
+      ///
+      /// Fill seed (changing size of vector as necessary) with the
+      /// seed vector used by the pseudorandom number generator.  You
+      /// can use this to resume the pseudorandom number stream from
+      /// where you last were.
+      void
+      getSeed(std::vector<int>& seed) const
+      {
+        gen_.getSeed(seed);
+      }
 
-    static pair_type
-      makePair (const RCP< const Teuchos::Comm<int> >& comm)
+      /// \brief Run the DistTsqr accuracy test
+      ///
+      /// \param numCols [in] Number of columns in the matrix to test.
+      ///   Number of rows := (# MPI processes) * numCols.
+      void
+      verify(const Ordinal numCols,
+             const std::string& additionalFieldNames,
+             const std::string& additionalData,
+             const bool printFieldNames)
       {
-        RCP<TeuchosMessenger<ordinal_type> > derivedOrdinalComm =
-          rcp (new TeuchosMessenger<ordinal_type> (comm));
-        RCP<MessengerBase<ordinal_type> > ordinalComm =
-          rcp_implicit_cast<MessengerBase<ordinal_type> > (derivedOrdinalComm);
-        RCP<TeuchosMessenger<scalar_type> > derivedScalarComm =
-          rcp (new TeuchosMessenger<scalar_type> (comm));
-        RCP<MessengerBase<scalar_type> > scalarComm =
-          rcp_implicit_cast<MessengerBase<scalar_type> > (derivedScalarComm);
-
-        return std::make_pair (ordinalComm, scalarComm);
+        using std::endl;
+
+        const int myRank = scalarComm_->rank();
+        if(debug_) {
+          scalarComm_->barrier();
+          if(myRank == 0) {
+            err_ << "Verifying DistTsqr:" << endl;
+          }
+          scalarComm_->barrier();
+        }
+
+        // Generate test problem.
+        Matrix<Ordinal, Scalar> A_local, Q_local, R;
+        testProblem(A_local, Q_local, R, numCols);
+        if(debug_) {
+          scalarComm_->barrier();
+          if(myRank == 0) {
+            err_ << "-- Generated test problem." << endl;
+          }
+          scalarComm_->barrier();
+        }
+
+        // Set up TSQR implementation.
+        DistTsqr<Ordinal, Scalar> par;
+        par.init (scalarComm_);
+        if(debug_) {
+          scalarComm_->barrier();
+          if(myRank == 0) {
+            err_ << "-- DistTsqr object initialized" << endl << endl;
+          }
+        }
+
+        // Whether we've printed field names (i.e., column headers)
+        // yet.  Only matters for non-humanReadable output.
+        bool printedFieldNames = false;
+
+        // Test DistTsqr::factor() and DistTsqr::explicit_Q().
+        if(testFactorImplicit_) {
+          // Factor the matrix A (copied into R, which will be
+          // overwritten on output)
+          typedef typename DistTsqr<Ordinal, Scalar>::FactorOutput
+            factor_output_type;
+          factor_output_type factorOutput = par.factor (R.view());
+          if(debug_) {
+            scalarComm_->barrier();
+            if(myRank == 0) {
+              err_ << "-- Finished DistTsqr::factor" << endl;
+            }
+          }
+          // Compute the explicit Q factor
+          par.explicit_Q(numCols, Q_local.data(), Q_local.stride(1),
+                         factorOutput);
+          if(debug_) {
+            scalarComm_->barrier();
+            if(myRank == 0) {
+              err_ << "-- Finished DistTsqr::explicit_Q" << endl;
+            }
+          }
+          // Verify the factorization
+          auto result =
+            global_verify(numCols, numCols, A_local.data(),
+                          A_local.stride(1), Q_local.data(),
+                          Q_local.stride(1), R.data(), R.stride(1),
+                          scalarComm_.get());
+          if(debug_) {
+            scalarComm_->barrier();
+            if(myRank == 0) {
+              err_ << "-- Finished global_verify" << endl;
+            }
+          }
+          reportResults("DistTsqr", numCols, result,
+                        additionalFieldNames, additionalData,
+                        printFieldNames && (! printedFieldNames));
+          if(printFieldNames && (! printedFieldNames)) {
+            printedFieldNames = true;
+          }
+        }
+
+        // Test DistTsqr::factorExplicit()
+        if(testFactorExplicit_) {
+          // Factor the matrix and compute the explicit Q factor, both
+          // in a single operation.
+          par.factorExplicit(R.view(), Q_local.view());
+          if(debug_) {
+            scalarComm_->barrier();
+            if(myRank == 0) {
+              err_ << "-- Finished DistTsqr::factorExplicit" << endl;
+            }
+          }
+
+          if(printMatrices_) {
+            if(myRank == 0) {
+              err_ << std::endl << "Computed Q factor:" << std::endl;
+            }
+            printGlobalMatrix(err_, Q_local, scalarComm_.get(),
+                              ordinalComm_.get());
+            if(myRank == 0) {
+              err_ << std::endl << "Computed R factor:" << std::endl;
+              print_local_matrix (err_, R.extent(0), R.extent(1),
+                                  R.data(), R.stride(1));
+              err_ << std::endl;
+            }
+          }
+
+          // Verify the factorization
+          result_type result =
+            global_verify(numCols, numCols, A_local.data(),
+                          A_local.stride(1), Q_local.data(),
+                          Q_local.stride(1), R.data(), R.stride(1),
+                          scalarComm_.get());
+          if(debug_) {
+            scalarComm_->barrier();
+            if(myRank == 0) {
+              err_ << "-- Finished global_verify" << endl;
+            }
+          }
+          reportResults("DistTsqrRB", numCols, result,
+                        additionalFieldNames, additionalData,
+                        printFieldNames && (! printedFieldNames));
+          if(printFieldNames && (! printedFieldNames)) {
+            printedFieldNames = true;
+          }
+        }
       }
-};
 
+    private:
+      /// Report verification results.  Call on ALL MPI processes, not
+      /// just Process 0.
+      ///
+      /// \param method [in] String to print before reporting results
+      /// \param numCols [in] Number of columns in the matrix tested.
+      /// \param result [in] (abs. residual, abs. orthogonality, norm of A)
+      void
+      reportResults (const std::string& method,
+                     const Ordinal numCols,
+                     const result_type& result,
+                     const std::string& additionalFieldNames,
+                     const std::string& additionalData,
+                     const bool printFieldNames)
+      {
+        using std::endl;
+
+        const int numProcs = scalarComm_->size();
+        const int myRank = scalarComm_->rank();
+
+        if(myRank == 0) {
+          if(humanReadable_) {
+            out_ << method << " accuracy results:" << endl
+                 << "Scalar: " << scalarTypeName_ << endl
+                 << "numCols: " << numCols << endl
+                 << "Number of (MPI) processes: " << numProcs << endl
+                 << "Absolute residual $\\| A - Q R \\|_2: "
+                 << result[0] << endl
+                 << "Absolute orthogonality $\\| I - Q^* Q \\|_2$: "
+                 << result[1] << endl
+                 << "Test matrix norm $\\| A \\|_F$: "
+                 << result[2] << endl;
+          }
+          else {
+            // Use scientific notation for floating-point numbers
+            out_ << std::scientific;
+
+            if(printFieldNames) {
+              out_ << "%method,scalarType,numCols,numProcs"
+                ",absFrobResid,absFrobOrthog,frobA";
+              if(! additionalFieldNames.empty())
+                out_ << "," << additionalFieldNames;
+              out_ << endl;
+            }
+
+            out_ << method
+                 << "," << scalarTypeName_
+                 << "," << numCols
+                 << "," << numProcs
+                 << "," << result[0]
+                 << "," << result[1]
+                 << "," << result[2];
+            if(! additionalData.empty()) {
+              out_ << "," << additionalData;
+            }
+            out_ << endl;
+          }
+        }
+      }
+
+      void
+      testProblem(Matrix<Ordinal, Scalar>& A_local,
+                  Matrix<Ordinal, Scalar>& Q_local,
+                  Matrix<Ordinal, Scalar>& R,
+                  const Ordinal numCols)
+      {
+        const Ordinal numRowsLocal = numCols;
+
+        // A_local: Space for the matrix A to factor -- local to each
+        //   processor.
+        //
+        // A_global: Global matrix (only nonempty on Proc 0); only
+        //   used temporarily.
+        Matrix<Ordinal, Scalar> A_global;
+
+        // This modifies A_local on all procs, and A_global on Proc 0.
+        par_tsqr_test_problem(gen_, A_local, A_global, numCols, scalarComm_);
+
+        if(printMatrices_) {
+          const int myRank = scalarComm_->rank();
+          if(myRank == 0) {
+            err_ << "Input matrix A:" << std::endl;
+          }
+          printGlobalMatrix(err_, A_local, scalarComm_.get(),
+                            ordinalComm_.get());
+          if(myRank == 0) {
+            err_ << std::endl;
+          }
+        }
+
+        // Copy the test problem input into R, since the factorization
+        // will overwrite it in place with the final R factor.
+        R.reshape(numCols, numCols);
+        deep_copy(R, Scalar{});
+        deep_copy(R, A_local);
+
+        // Prepare space in which to construct the explicit Q factor
+        // (local component on this processor)
+        Q_local.reshape(numRowsLocal, numCols);
+        deep_copy(Q_local, Scalar {});
+      }
+    };
+
+    /// \class DistTsqrBenchmarker
+    /// \brief Generic version of DistTsqr performance test.
+    template< class Ordinal, class Scalar>
+    class DistTsqrBenchmarker {
+      TSQR::Random::NormalGenerator<Ordinal, Scalar> gen_;
+      Teuchos::RCP<MessengerBase<Scalar>> scalarComm_;
+      Teuchos::RCP<MessengerBase<double>> doubleComm_;
+      std::string scalarTypeName_;
+
+      std::ostream& out_;
+      std::ostream& err_;
+      const bool testFactorExplicit_;
+      const bool testFactorImplicit_;
+      const bool humanReadable_;
+      const bool debug_;
+
+    public:
+      using ordinal_type = Ordinal;
+      using scalar_type = Scalar;
+      using timer_type = Teuchos::Time;
+
+      /// \brief Constructor, with custom seed value
+      ///
+      /// \param scalarComm [in/out] Communicator object over which
+      ///   to test.
+      /// \param doubleComm [in/out] Communicator object for doubles,
+      ///   used for finding the min and max of timing results over
+      ///   all the MPI processes.
+      /// \param seed [in] 4-element vector; the random seed input of
+      ///   TSQR::Random::NormalGenerator (which see, since there are
+      ///   restrictions on the set of valid seeds)
+      /// \param scalarTypeName [in] Human-readable name of the Scalar
+      ///   template type parameter
+      /// \param out [out] Output stream to which to write results
+      /// \param err [out] Output stream to which to write any
+      ///   debugging outputs (if applicable) or errors
+      /// \param testFactorExplicit [in] Whether to test
+      ///   DistTsqr::factorExplicit()
+      /// \param testFactorImplicit [in] Whether to test
+      ///   DistTsqr::factor() and DistTsqr::explicit_Q()
+      /// \param humanReadable [in] Whether printed results should be
+      ///   easy for humans to read (vs. easy for parsers to parse)
+      /// \param debug [in] Whether to write verbose debug output to
+      ///   err
+      DistTsqrBenchmarker(const Teuchos::RCP<MessengerBase<Scalar>>& scalarComm,
+                          const Teuchos::RCP<MessengerBase<double>>& doubleComm,
+                          const std::vector<int>& seed,
+                          const std::string& scalarTypeName,
+                          std::ostream& out,
+                          std::ostream& err,
+                          const bool testFactorExplicit,
+                          const bool testFactorImplicit,
+                          const bool humanReadable,
+                          const bool debug) :
+        gen_(seed),
+        scalarComm_(scalarComm),
+        doubleComm_(doubleComm),
+        scalarTypeName_(scalarTypeName),
+        out_(out),
+        err_(err),
+        testFactorExplicit_(testFactorExplicit),
+        testFactorImplicit_(testFactorImplicit),
+        humanReadable_(humanReadable),
+        debug_(debug)
+      {}
+
+      /// \brief Constructor, with default seed value
+      ///
+      /// This constructor sets a default seed (for the pseudorandom
+      /// number generator), which is the same seed (0,0,0,1) each
+      /// time.
+      ///
+      /// \param scalarComm [in/out] Communicator object over which
+      ///   to test.
+      /// \param doubleComm [in/out] Communicator object for doubles,
+      ///   used for finding the min and max of timing results over
+      ///   all the MPI processes.
+      /// \param scalarTypeName [in] Human-readable name of the Scalar
+      ///   template type parameter
+      /// \param out [out] Output stream to which to write results
+      /// \param err [out] Output stream to which to write any
+      ///   debugging outputs (if applicable) or errors
+      /// \param testFactorExplicit [in] Whether to test
+      ///   DistTsqr::factorExplicit()
+      /// \param testFactorImplicit [in] Whether to test
+      ///   DistTsqr::factor() and DistTsqr::explicit_Q()
+      /// \param humanReadable [in] Whether printed results should be
+      ///   easy for humans to read (vs. easy for parsers to parse)
+      /// \param debug [in] Whether to write verbose debug output to
+      ///   err
+      DistTsqrBenchmarker(const Teuchos::RCP<MessengerBase<Scalar>>& scalarComm,
+                          const Teuchos::RCP<MessengerBase<double>>& doubleComm,
+                          const std::string& scalarTypeName,
+                          std::ostream& out,
+                          std::ostream& err,
+                          const bool testFactorExplicit,
+                          const bool testFactorImplicit,
+                          const bool humanReadable,
+                          const bool debug) :
+        scalarComm_(scalarComm),
+        doubleComm_(doubleComm),
+        scalarTypeName_(scalarTypeName),
+        out_(out),
+        err_(err),
+        testFactorExplicit_(testFactorExplicit),
+        testFactorImplicit_(testFactorImplicit),
+        humanReadable_(humanReadable),
+        debug_(debug)
+      {}
+
+      /// \brief Get seed vector for pseudorandom number generator
+      ///
+      /// Fill seed (changing size of vector as necessary) with the
+      /// seed vector used by the pseudorandom number generator.  You
+      /// can use this to resume the pseudorandom number stream from
+      /// where you last were.
+      void
+      getSeed(std::vector<int>& seed) const
+      {
+        gen_.getSeed(seed);
+      }
+
+      /// \brief Run the DistTsqr benchmark
+      ///
+      /// \param numTrials [in] Number of times to repeat the computation
+      ///   in a single timing run
+      /// \param numCols [in] Number of columns in the matrix to test.
+      ///   Number of rows := (# MPI processors) * ncols
+      void
+      benchmark(const int numTrials,
+                const Ordinal numCols,
+                const std::string& additionalFieldNames,
+                const std::string& additionalData,
+                const bool printFieldNames)
+      {
+        using std::endl;
+
+        // Set up test problem.
+        Matrix<Ordinal, Scalar> A_local, Q_local, R;
+        testProblem(A_local, Q_local, R, numCols);
+
+        // Set up TSQR implementation.
+        DistTsqr<Ordinal, Scalar> par;
+        par.init(scalarComm_);
+
+        // Whether we've printed field names (i.e., column headers)
+        // yet.  Only matters for non-humanReadable output.
+        bool printedFieldNames = false;
+
+        if(testFactorImplicit_) {
+          std::string timerName("DistTsqr");
+
+          // Throw away some number of runs, because some MPI libraries
+          // (recent versions of OpenMPI at least) do autotuning for the
+          // first few collectives calls.
+          const int numThrowAwayRuns = 5;
+          for(int runNum = 0; runNum < numThrowAwayRuns; ++runNum) {
+            auto factorOutput = par.factor(R.view());
+            par.explicit_Q(numCols, Q_local.data(),
+                           Q_local.stride(1), factorOutput);
+          }
+
+          // Now do the actual timing runs.  Benchmark DistTsqr
+          // (factor() and explicit_Q()) for numTrials trials.
+          timer_type timer (timerName);
+          timer.start();
+          for(int trialNum = 0; trialNum < numTrials; ++trialNum) {
+            auto factorOutput = par.factor(R.view());
+            par.explicit_Q(numCols, Q_local.data(),
+                           Q_local.stride(1), factorOutput);
+          }
+          // Cumulative timing on this MPI process.  "Cumulative"
+          // means the elapsed time of numTrials executions.
+          const double localCumulativeTiming = timer.stop();
+
+          // reportResults() must be called on all processes, since this
+          // figures out the min and max timings over all processes.
+          reportResults(timerName, numTrials, numCols,
+                        localCumulativeTiming, additionalFieldNames,
+                        additionalData,
+                        printFieldNames && (! printedFieldNames));
+          if(printFieldNames && (! printedFieldNames)) {
+            printedFieldNames = true;
+          }
+        }
+
+        if(testFactorExplicit_) {
+          std::string timerName ("DistTsqrRB");
+
+          // Throw away some number of runs, because some MPI libraries
+          // (recent versions of OpenMPI at least) do autotuning for the
+          // first few collectives calls.
+          const int numThrowAwayRuns = 5;
+          for(int runNum = 0; runNum < numThrowAwayRuns; ++runNum) {
+            par.factorExplicit(R.view(), Q_local.view());
+          }
+
+          // Benchmark DistTsqr::factorExplicit() for numTrials trials.
+          timer_type timer(timerName);
+          timer.start();
+          for(int trialNum = 0; trialNum < numTrials; ++trialNum) {
+            par.factorExplicit(R.view(), Q_local.view());
+          }
+          // Cumulative timing on this MPI process.
+          // "Cumulative" means the elapsed time of numTrials executions.
+          const double localCumulativeTiming = timer.stop();
+
+          // Report cumulative (not per-invocation) timing results
+          reportResults(timerName, numTrials, numCols, localCumulativeTiming,
+                        additionalFieldNames, additionalData,
+                        printFieldNames && (! printedFieldNames));
+          if(printFieldNames && (! printedFieldNames)) {
+            printedFieldNames = true;
+          }
+
+          // Per-invocation timings (for factorExplicit() benchmark
+          // only).  localTimings were computed on this MPI process;
+          // globalTimings are statistical summaries of those over
+          // all MPI processes.  We only collect that data for
+          // factorExplicit().
+          std::vector<TimeStats> localTimings;
+          std::vector<TimeStats> globalTimings;
+          par.getFactorExplicitTimings(localTimings);
+          for(size_t k = 0; k < localTimings.size(); ++k) {
+            globalTimings.push_back
+              (globalTimeStats(*doubleComm_, localTimings[k]));
+          }
+          std::vector<std::string> timingLabels;
+          par.getFactorExplicitTimingLabels(timingLabels);
+
+          if(humanReadable_) {
+            out_ << timerName << " per-invocation benchmark results:" << endl;
+          }
+          const std::string labelLabel("label,scalarType");
+          for (size_t k = 0; k < timingLabels.size(); ++k) {
+            // Only print column headers (i.e., field names) once, if at all.
+            const bool printHeaders = (k == 0) && printFieldNames;
+            globalTimings[k].print (out_, humanReadable_,
+                                    timingLabels[k] + "," + scalarTypeName_,
+                                    labelLabel, printHeaders);
+          }
+        }
+      }
+
+    private:
+      /// Report timing results to the given output stream
+      ///
+      /// \param method [in] String to print before reporting results
+      /// \param numTrials [in] Number of times to repeat the computation
+      ///   in a single timing run
+      /// \param numCols [in] Number of columns in the matrix to test.
+      ///   Number of rows := (# MPI processors) * ncols
+      /// \param localTiming [in] Total benchmark time, as measured on
+      ///   this MPI process.  This may differ on each process; we
+      ///   report the min, mean, and max over all processes.
+      ///
+      /// \warning Call on ALL MPI processes, not just Rank 0!
+      void
+      reportResults(const std::string& method,
+                    const int numTrials,
+                    const ordinal_type numCols,
+                    const double localTiming,
+                    const std::string& additionalFieldNames,
+                    const std::string& additionalData,
+                    const bool printFieldNames)
+      {
+        using std::endl;
+
+        // Find min and max timing over all MPI processes
+        TimeStats localStats;
+        localStats.update (localTiming);
+        TimeStats globalStats = globalTimeStats (*doubleComm_, localStats);
+
+        // Only Rank 0 prints the final results.
+        const bool printResults = (doubleComm_->rank() == 0);
+        if(printResults) {
+          const int numProcs = doubleComm_->size();
+          if(humanReadable_) {
+            out_ << method << " cumulative benchmark results "
+                 << "(total time over all trials):" << endl
+                 << "Scalar: " << scalarTypeName_ << endl
+                 << "numCols: " << numCols << endl
+                 << "MPI comm size: " << numProcs << endl
+                 << "numTrials: " << numTrials << endl
+                 << "Min timing (s): " << globalStats.min() << endl
+                 << "Mean timing (s): " << globalStats.mean() << endl
+                 << "Max timing (s): " << globalStats.max() << endl
+                 << endl;
+          }
+          else {
+            // Use scientific notation for floating-point numbers
+            out_ << std::scientific;
+
+            if(printFieldNames) {
+              out_ << "%method,scalarType,numCols,numProcs,numTrials"
+                   << ",minTiming,meanTiming,maxTiming";
+              if(! additionalFieldNames.empty()) {
+                out_ << "," << additionalFieldNames;
+              }
+              out_ << endl;
+            }
+
+            out_ << method
+                 << "," << scalarTypeName_
+                 << "," << numCols
+                 << "," << numProcs
+                 << "," << numTrials
+                 << "," << globalStats.min()
+                 << "," << globalStats.mean()
+                 << "," << globalStats.max();
+            if(! additionalData.empty()) {
+              out_ << "," << additionalData;
+            }
+            out_ << endl;
+          }
+        }
+      }
+
+      void
+      testProblem(Matrix<Ordinal, Scalar>& A_local,
+                  Matrix<Ordinal, Scalar>& Q_local,
+                  Matrix<Ordinal, Scalar>& R,
+                  const Ordinal numCols)
+      {
+        const Ordinal numRowsLocal = numCols;
+
+        // A_local: Space for the matrix A to factor -- local to each
+        //   (MPI) process.
+        //
+        // A_global: Global matrix (only nonempty on Proc 0); only
+        //   used temporarily.
+        Matrix<Ordinal, Scalar> A_global;
+
+        // This modifies A_local on all procs, and A_global on Proc 0.
+        par_tsqr_test_problem(gen_, A_local, A_global, numCols,
+                              scalarComm_);
+
+        // Copy the test problem input into R, since the factorization
+        // will overwrite it in place with the final R factor.
+        R.reshape(numCols, numCols);
+        deep_copy(R, A_local);
+
+        // Prepare space in which to construct the explicit Q factor
+        // (local component on this processor)
+        Q_local.reshape(numRowsLocal, numCols);
+        deep_copy(Q_local, Scalar {});
+      }
+    };
+  } // namespace Test
+} // namespace TSQR
+
+/// \brief Builds the (ordinal, scalar) pair of TSQR messengers
+///   that wrap a single Teuchos communicator.
+template<class Ordinal, class Scalar>
+class MessengerPairMaker {
+public:
+  using ordinal_type = Ordinal;
+  using scalar_type = Scalar;
+
+  /// Ordinal messenger first, scalar messenger second.
+  using pair_type = std::pair<
+    Teuchos::RCP<TSQR::MessengerBase<ordinal_type>>,
+    Teuchos::RCP<TSQR::MessengerBase<scalar_type>>
+    >;
+
+  /// \brief Wrap \c comm in one TeuchosMessenger per type.
+  /// \param comm [in] Communicator passed to both messengers.
+  /// \return Both messengers, cast to their MessengerBase type.
+  static pair_type
+  makePair(const Teuchos::RCP<const Teuchos::Comm<int>>& comm)
+  {
+    using Teuchos::rcp;
+    using Teuchos::rcp_implicit_cast;
+    using ordinal_base = TSQR::MessengerBase<ordinal_type>;
+    using scalar_base = TSQR::MessengerBase<scalar_type>;
+
+    auto ordinalMessenger = rcp_implicit_cast<ordinal_base>
+      (rcp(new TSQR::TeuchosMessenger<ordinal_type>(comm)));
+    auto scalarMessenger = rcp_implicit_cast<scalar_base>
+      (rcp(new TSQR::TeuchosMessenger<scalar_type>(comm)));
+    return pair_type(ordinalMessenger, scalarMessenger);
+  }
+};
 
 #define TSQR_TEST_DIST_TSQR( ScalarType, typeString )                   \
   do {                                                                  \
-    typedef int ordinal_type;                                           \
-    typedef ScalarType scalar_type;                                     \
-    typedef MessengerPairMaker<ordinal_type, scalar_type>::pair_type pair_type; \
-    typedef DistTsqrVerifier<int, scalar_type> verifier_type;           \
-    \
+    using TSQR::Test::DistTsqrVerifier;                                 \
+    using LO = int;                                                     \
+    using SC = ScalarType;                                              \
+    using verifier_type = DistTsqrVerifier<LO, SC>;                     \
+                                                                        \
     std::string scalarTypeName (typeString);                            \
-    pair_type messPair = MessengerPairMaker< ordinal_type, scalar_type >::makePair (comm); \
+    auto messPair = MessengerPairMaker<LO, SC>::makePair (comm);        \
     verifier_type verifier (messPair.first, messPair.second, seed,      \
-        scalarTypeName, out, err,                       \
-        testFactorExplicit, testFactorImplicit, \
-        humanReadable, printMatrices, debug);           \
+                            scalarTypeName, out, err,                   \
+                            testFactorExplicit, testFactorImplicit,     \
+                            humanReadable, printMatrices, debug);       \
     verifier.verify (numCols, params.additionalFieldNames,              \
-        params.additionalData, params.printFieldNames); \
+                     params.additionalData, params.printFieldNames);    \
     verifier.getSeed (seed);                                            \
-  } while(false)
+  } while (false)
 
 
 #define TSQR_BENCHMARK_DIST_TSQR( theType, typeString )                 \
   do {                                                                  \
-    typedef theType scalar_type;                                                \
-    typedef MessengerBase< scalar_type > base_messenger_type;           \
-    typedef RCP< base_messenger_type > base_messenger_ptr;              \
-    typedef TeuchosMessenger< scalar_type > derived_messenger_type;       \
-    typedef RCP< derived_messenger_type > derived_messenger_ptr;                \
-    typedef DistTsqrBenchmarker<int, scalar_type, timer_type>           \
-    benchmarker_type;                                                   \
-    \
+    using TSQR::Test::DistTsqrBenchmarker;                              \
+    using Teuchos::RCP;                                                 \
+    using SC = theType;                                                 \
+    using base_messenger_type = TSQR::MessengerBase<SC>;                \
+    using base_messenger_ptr = RCP<base_messenger_type>;                \
+    using derived_messenger_type = TSQR::TeuchosMessenger<SC>;          \
+    using derived_messenger_ptr = RCP<derived_messenger_type>;          \
+    using benchmarker_type = DistTsqrBenchmarker<int, SC>;              \
+                                                                        \
     std::string scalarTypeName (typeString);                            \
-    derived_messenger_ptr scalarCommDerived (new derived_messenger_type (comm)); \
-    base_messenger_ptr scalarComm =                                     \
-    rcp_implicit_cast< base_messenger_type > (scalarCommDerived);       \
+    derived_messenger_ptr scalarCommDerived                             \
+      (new derived_messenger_type (comm));                              \
+    auto scalarComm =                                                   \
+      rcp_implicit_cast<base_messenger_type> (scalarCommDerived);       \
     benchmarker_type benchmarker (scalarComm, doubleComm, seed,         \
-        scalarTypeName, out, err,               \
-        testFactorExplicit, testFactorImplicit, \
-        humanReadable, debug);                  \
+                                  scalarTypeName, out, err,             \
+                                  testFactorExplicit,                   \
+                                  testFactorImplicit,                   \
+                                  humanReadable, debug);                \
     benchmarker.benchmark (numTrials, numCols,                          \
-        params.additionalFieldNames,                    \
-        params.additionalData,                          \
-        params.printFieldNames);                        \
+                           params.additionalFieldNames,                 \
+                           params.additionalData,                       \
+                           params.printFieldNames);                     \
     benchmarker.getSeed (seed);                                         \
-  } while(false)
-
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
+  } while (false)
 
 /// \class DistTsqrTestParameters
 /// \brief Encapsulates values of command-line parameters
-///
 struct DistTsqrTestParameters {
-  DistTsqrTestParameters () :
-    numCols (10),
-    numTrials (10),
-    verify (false),
-    benchmark (false),
-    testReal (true),
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    testComplex (true),
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-    testFactorExplicit (true),
-    testFactorImplicit (true),
-    printFieldNames (true),
-    printTrilinosTestStuff (true),
-    humanReadable (false),
-    printMatrices (false),
-    debug (false)
-    {}
-
-  std::string additionalFieldNames, additionalData;
-  int numCols, numTrials;
-  bool verify, benchmark;
-  bool testReal;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-  bool testComplex;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-  bool testFactorExplicit, testFactorImplicit;
-  bool printFieldNames, printTrilinosTestStuff;
-  bool humanReadable, printMatrices, debug;
+  int numCols = 10;
+  int numTrials = 10;
+  bool verify = true;
+  bool benchmark = false;
+  bool testReal = true;
+#ifdef HAVE_TPETRATSQR_COMPLEX
+  bool testComplex = true;
+#else
+  bool testComplex = false;
+#endif // HAVE_TPETRATSQR_COMPLEX
+  bool testFactorExplicit = true;
+  bool testFactorImplicit = true;
+  bool printFieldNames = true;
+  bool printTrilinosTestStuff = true;
+  bool humanReadable = false;
+  bool printMatrices = false;
+  bool debug = false;
+
+  std::string additionalFieldNames;
+  std::string additionalData;
 };
 
-  static void
-verify (RCP< const Teuchos::Comm<int> > comm,
-    const DistTsqrTestParameters& params,
-    std::ostream& out,
-    std::ostream& err,
-    std::vector<int>& seed,
-    const bool useSeed)
+static void
+verify(Teuchos::RCP<const Teuchos::Comm<int>> comm,
+       const DistTsqrTestParameters& params,
+       std::ostream& out,
+       std::ostream& err,
+       std::vector<int>& seed,
+       const bool useSeed)
 {
   const bool testReal = params.testReal;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
   const bool testComplex = params.testComplex;
-#else // Don't HAVE_KOKKOSTSQR_COMPLEX
-  const bool testComplex = false;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
   const int numCols = params.numCols;
   const bool testFactorExplicit = params.testFactorExplicit;
   const bool testFactorImplicit = params.testFactorImplicit;
@@ -201,52 +892,44 @@ verify (RCP< const Teuchos::Comm<int> > comm,
   const bool printMatrices = params.printMatrices;
   const bool debug = params.debug;
 
-  if (! useSeed)
-  {
-    seed.resize (4);
+  if(! useSeed) {
+    seed.resize(4);
     seed[0] = 0;
     seed[1] = 0;
     seed[2] = 0;
     seed[3] = 1;
   }
-  if (testReal)
-  {
+  if(testReal) {
     TSQR_TEST_DIST_TSQR( float, "float" );
     TSQR_TEST_DIST_TSQR( double, "double" );
   }
-  if (testComplex)
-  {
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
+  if(testComplex) {
+#ifdef HAVE_TPETRATSQR_COMPLEX
     using std::complex;
 
     TSQR_TEST_DIST_TSQR( complex<float>, "complex<float>" );
     TSQR_TEST_DIST_TSQR( complex<double>, "complex<double>" );
 
-#else // Don't HAVE_KOKKOSTSQR_COMPLEX
+#else // Don't HAVE_TPETRATSQR_COMPLEX
     throw std::logic_error("TSQR was not built with complex "
-        "arithmetic support");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+                           "arithmetic support");
+#endif // HAVE_TPETRATSQR_COMPLEX
   }
 }
 
 
-  static void
-benchmark (RCP< const Teuchos::Comm<int> > comm,
-    const DistTsqrTestParameters& params,
-    std::ostream& out,
-    std::ostream& err,
-    std::vector<int>& seed,
-    const bool useSeed)
+static void
+benchmark(Teuchos::RCP<const Teuchos::Comm<int>> comm,
+          const DistTsqrTestParameters& params,
+          std::ostream& out,
+          std::ostream& err,
+          std::vector<int>& seed,
+          const bool useSeed)
 {
-  typedef Teuchos::Time timer_type;
+  using timer_type = Teuchos::Time;
 
   const bool testReal = params.testReal;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
   const bool testComplex = params.testComplex;
-#else // Don't HAVE_KOKKOSTSQR_COMPLEX
-  const bool testComplex = false;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
   const int numCols = params.numCols;
   const int numTrials = params.numTrials;
   const bool testFactorExplicit = params.testFactorExplicit;
@@ -254,34 +937,36 @@ benchmark (RCP< const Teuchos::Comm<int> > comm,
   const bool humanReadable = params.humanReadable;
   const bool debug = params.debug;
 
-  if (! useSeed)
-  {
-    seed.resize (4);
+  if(! useSeed) {
+    seed.resize(4);
     seed[0] = 0;
     seed[1] = 0;
     seed[2] = 0;
     seed[3] = 1;
   }
-  RCP< MessengerBase< double > > doubleComm =
-    rcp_implicit_cast< MessengerBase< double > > (RCP< TeuchosMessenger< double > > (new TeuchosMessenger< double > (comm)));
+  using Teuchos::rcp;
+  auto doubleCommSub =
+    rcp(new TSQR::TeuchosMessenger<double>(comm));
+  using TSQR::MessengerBase;
+  using Teuchos::rcp_implicit_cast;
+  auto doubleComm =
+    rcp_implicit_cast<MessengerBase<double>>(doubleCommSub);
 
-  if (testReal)
-  {
+  if(testReal) {
     TSQR_BENCHMARK_DIST_TSQR( float, "float" );
     TSQR_BENCHMARK_DIST_TSQR( double, "double" );
   }
-  if (testComplex)
-  {
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
+  if(testComplex) {
+#ifdef HAVE_TPETRATSQR_COMPLEX
     using std::complex;
 
     TSQR_BENCHMARK_DIST_TSQR( complex<float>, "complex<float>" );
     TSQR_BENCHMARK_DIST_TSQR( complex<double>, "complex<double>" );
 
-#else // Don't HAVE_KOKKOSTSQR_COMPLEX
+#else // Don't HAVE_TPETRATSQR_COMPLEX
     throw std::logic_error("TSQR was not built with complex "
-        "arithmetic support");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+                           "arithmetic support");
+#endif // HAVE_TPETRATSQR_COMPLEX
   }
 }
 
@@ -296,44 +981,44 @@ benchmark (RCP< const Teuchos::Comm<int> > comm,
 ///   "help" display (summary of command-line options)
 ///
 /// \return Encapsulation of command-line options
-  static DistTsqrTestParameters
-parseOptions (int argc,
-    char* argv[],
-    const bool allowedToPrint,
-    bool& printedHelp)
+static DistTsqrTestParameters
+parseOptions(int argc,
+             char* argv[],
+             std::ostream& err,
+             bool& printedHelp)
 {
-  using std::cerr;
   using std::endl;
-
   printedHelp = false;
 
   // Command-line parameters, set to their default values.
-  DistTsqrTestParameters params;
+  DistTsqrTestParameters params {};
   try {
-    Teuchos::CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true,
-        /* recognizeAllOptions=*/ true);
+    constexpr bool throwExceptions = true;
+    constexpr bool recognizeAllOptions = true;
+    using CLP = Teuchos::CommandLineProcessor;
+    CLP cmdLineProc(throwExceptions, recognizeAllOptions);
 
     const char docString[] = "This program tests TSQR::DistTsqr, which "
       "implements the internode-parallel part of TSQR (TSQR::Tsqr).  "
       "Accuracy and performance tests are included.";
-    cmdLineProc.setDocString (docString);
-    cmdLineProc.setOption ("verify",
+    cmdLineProc.setDocString(docString);
+    cmdLineProc.setOption("verify",
         "noverify",
         &params.verify,
         "Test accuracy");
-    cmdLineProc.setOption ("benchmark",
+    cmdLineProc.setOption("benchmark",
         "nobenchmark",
         &params.benchmark,
         "Test performance");
-    cmdLineProc.setOption ("implicit",
+    cmdLineProc.setOption("implicit",
         "noimplicit",
         &params.testFactorImplicit,
         "Test DistTsqr\'s factor() and explicit_Q()");
-    cmdLineProc.setOption ("explicit",
+    cmdLineProc.setOption("explicit",
         "noexplicit",
         &params.testFactorExplicit,
         "Test DistTsqr\'s factorExplicit()");
-    cmdLineProc.setOption ("field-names",
+    cmdLineProc.setOption("field-names",
         &params.additionalFieldNames,
         "Any additional field name(s) (comma-delimited "
         "string) to add to the benchmark output.  Empty "
@@ -341,55 +1026,54 @@ parseOptions (int argc,
         "the benchmark executable, but not (easily) known "
         "inside the benchmark -- e.g., environment "
         "variables.");
-    cmdLineProc.setOption ("output-data",
+    cmdLineProc.setOption("output-data",
         &params.additionalData,
         "Any additional data to add to the output, "
         "corresponding to the above field name(s). "
         "Empty by default.");
-    cmdLineProc.setOption ("print-field-names",
+    cmdLineProc.setOption("print-field-names",
         "no-print-field-names",
         &params.printFieldNames,
         "Print field names (for machine-readable output only)");
-    cmdLineProc.setOption ("print-trilinos-test-stuff",
+    cmdLineProc.setOption("print-trilinos-test-stuff",
         "no-print-trilinos-test-stuff",
         &params.printTrilinosTestStuff,
         "Print output that makes the Trilinos test "
         "framework happy (but makes benchmark results "
         "parsing scripts unhappy)");
-    cmdLineProc.setOption ("print-matrices",
+    cmdLineProc.setOption("print-matrices",
         "no-print-matrices",
         &params.printMatrices,
         "Print global test matrices and computed results to stderr");
-    cmdLineProc.setOption ("debug",
+    cmdLineProc.setOption("debug",
         "nodebug",
         &params.debug,
         "Print debugging information");
-    cmdLineProc.setOption ("human-readable",
+    cmdLineProc.setOption("human-readable",
         "machine-readable",
         &params.humanReadable,
         "If set, make output easy to read by humans "
         "(but hard to parse)");
-    cmdLineProc.setOption ("ncols",
+    cmdLineProc.setOption("ncols",
         &params.numCols,
         "Number of columns in the test matrix");
-    cmdLineProc.setOption ("ntrials",
+    cmdLineProc.setOption("ntrials",
         &params.numTrials,
         "Number of trials (only used when \"--benchmark\"");
-    cmdLineProc.setOption ("real",
+    cmdLineProc.setOption("real",
         "noreal",
         &params.testReal,
         "Test real arithmetic routines");
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    cmdLineProc.setOption ("complex",
+    cmdLineProc.setOption("complex",
         "nocomplex",
         &params.testComplex,
-        "Test complex arithmetic routines");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+        "Test complex arithmetic routines (only set to true if "
+        "complex arithmetic support was enabled at configure "
+        "time)");
     cmdLineProc.parse (argc, argv);
   }
   catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) {
-    if (allowedToPrint)
-      cerr << "Unrecognized command-line option: " << e.what() << endl;
+    err << "Unrecognized command-line option: " << e.what() << endl;
     throw e;
   }
   catch (Teuchos::CommandLineProcessor::HelpPrinted& e) {
@@ -398,73 +1082,60 @@ parseOptions (int argc,
 
   // Validate command-line options.  We provide default values
   // for unset options, so we don't have to validate those.
-  if (params.numCols <= 0)
-    throw std::invalid_argument ("Number of columns must be positive");
-  else if (params.benchmark && params.numTrials < 1)
-    throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1");
-
+  TEUCHOS_TEST_FOR_EXCEPTION
+    (params.numCols <= 0, std::invalid_argument,
+     "You set --ncols=" << params.numCols << ".  The number of "
+     "columns in the matrix to test must be positive.");
+  TEUCHOS_TEST_FOR_EXCEPTION
+    (params.benchmark && params.numTrials < 1, std::invalid_argument,
+     "\"--benchmark\" option requires positive --ntrials, but you "
+     "set --ntrials=" << params.numTrials << ".");
+#ifndef HAVE_TPETRATSQR_COMPLEX
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (params.testComplex, std::invalid_argument, "Complex "
+       "arithmetic support was not enabled at configure time, "
+       "but you set --complex.");
+#endif // HAVE_TPETRATSQR_COMPLEX
   return params;
 }
 
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-  int
-main (int argc, char *argv[])
+int
+main(int argc, char *argv[])
 {
-#ifdef HAVE_MPI
-  typedef RCP< const Teuchos::Comm<int> > comm_ptr;
-
-  Teuchos::oblackholestream blackhole;
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole);
-  comm_ptr comm = Teuchos::DefaultComm<int>::getComm();
-  const int myRank = comm->getRank();
-  // Only Rank 0 gets to write to cout and cerr.  The other MPI
-  // process ranks send their output to a "black hole" (something that
-  // acts like /dev/null, and may be /dev/null).
-  const bool allowedToPrint = (myRank == 0);
-  std::ostream& out = allowedToPrint ? std::cout : blackhole;
-  std::ostream& err = allowedToPrint ? std::cerr : blackhole;
-
-#else // Don't HAVE_MPI: single-node test
-
-  const bool allowedToPrint = true;
-  std::ostream& out = std::cout;
-  std::ostream& err = std::cerr;
-#endif // HAVE_MPI
+  TSQR::Test::MpiAndKokkosScope testScope(&argc, &argv);
+  auto comm = testScope.getComm();
+  std::ostream& out = testScope.outStream();
+  std::ostream& err = testScope.errStream();
 
   // Fetch command-line parameters.
   bool printedHelp = false;
-  DistTsqrTestParameters params =
-    parseOptions (argc, argv, allowedToPrint, printedHelp);
-  if (printedHelp)
-    return 0;
-
+  auto params = parseOptions(argc, argv, err, printedHelp);
+  if(printedHelp) {
+    return EXIT_SUCCESS;
+  }
   bool success = false;
-  bool verbose = false;
+  constexpr bool actually_print_caught_exceptions = true;
   try {
-    if (params.verify)
-    {
+    if(params.verify) {
       std::vector<int> seed(4);
       const bool useSeed = false;
-      verify (comm, params, out, err, seed, useSeed);
+      verify(comm, params, out, err, seed, useSeed);
     }
 
-    if (params.benchmark)
-    {
+    if(params.benchmark) {
       std::vector<int> seed(4);
       const bool useSeed = false;
-      benchmark (comm, params, out, err, seed, useSeed);
+      benchmark(comm, params, out, err, seed, useSeed);
     }
 
     success = true;
 
-    if (allowedToPrint && params.printTrilinosTestStuff)
+    if(params.printTrilinosTestStuff) {
       // The Trilinos test framework expects a message like this.
       out << "\nEnd Result: TEST PASSED" << std::endl;
+    }
   }
-  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
-  return ( success ? EXIT_SUCCESS : EXIT_FAILURE );
+  TEUCHOS_STANDARD_CATCH_STATEMENTS
+    (actually_print_caught_exceptions, err, success);
+  return success ? EXIT_SUCCESS : EXIT_FAILURE;
 }
-
-
diff --git a/packages/tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp
index c60d652fc651..6b14b977b01f 100644
--- a/packages/tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp
+++ b/packages/tpetra/tsqr/test/Tsqr_TestFullTsqr.cpp
@@ -38,65 +38,65 @@
 //@HEADER
 
 #include "Tsqr_FullTsqrTest.hpp"
-
-#ifdef HAVE_MPI
-#  include "Teuchos_GlobalMPISession.hpp"
-#  include "Teuchos_oblackholestream.hpp"
-#endif // HAVE_MPI
+#include "Tsqr_Test_MpiAndKokkosScope.cpp"
 #include "Teuchos_CommandLineProcessor.hpp"
-#include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_StandardCatchMacros.hpp"
 
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
+#ifdef HAVE_TPETRATSQR_COMPLEX
 #  include <complex>
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+#endif // HAVE_TPETRATSQR_COMPLEX
 
 namespace {
-  //
-  // Documentation string to print out if --help is a command-line argument.
-  //
-  const char docString[] = "This program tests correctness and accuracy of "
-    "TSQR::Tsqr, which is the full implementation of TSQR.";
+  using Teuchos::parameterList;
+
+  // Documentation string to print out if --help is a command-line
+  // argument.
+  const char docString[] = "This program tests correctness and "
+    "accuracy of TSQR::Tsqr, which is the full implementation of "
+    "TSQR.";
 
-  //
   // Encapsulation of all command-line parameters.
-  //
   struct CmdLineOptions {
-    //
-    // Given a default valid parameter list from FullTsqrVerifierCaller,
-    // fill in the command-line options with their default values.
-    //
-    CmdLineOptions (const Teuchos::RCP<const Teuchos::ParameterList>& testParams) :
-      cacheSizeHint (testParams->get<size_t> ("cacheSizeHint")),
-      numRowsLocal (testParams->get<int> ("numRowsLocal")),
-      numCols (testParams->get<int> ("numCols")),
-      contiguousCacheBlocks (testParams->get<bool> ("contiguousCacheBlocks")),
-      testFactorExplicit (testParams->get<bool> ("testFactorExplicit")),
-      testRankRevealing (testParams->get<bool> ("testRankRevealing")),
-      printFieldNames (testParams->get<bool> ("printFieldNames")),
-      printResults (testParams->get<bool> ("printResults")),
-      failIfInaccurate (testParams->get<bool> ("failIfInaccurate")),
-      debug (testParams->get<bool> ("debug")),
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      testComplex (false),
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-      testReal (false) // default is not to test _anything_
-      {}
+    // Given a default valid parameter list from
+    // FullTsqrVerifierCaller, fill in the command-line options with
+    // their default values.
+    CmdLineOptions(const Teuchos::RCP<const Teuchos::ParameterList>& testParams) :
+      cacheSizeHint(testParams->get<size_t>("Cache Size Hint")),
+      numRowsLocal(testParams->get<int>("numRowsLocal")),
+      numCols(testParams->get<int>("numCols")),
+      contiguousCacheBlocks(testParams->get<bool>("contiguousCacheBlocks")),
+      testFactorExplicit(testParams->get<bool>("testFactorExplicit")),
+      testRankRevealing(testParams->get<bool>("testRankRevealing")),
+      printFieldNames(testParams->get<bool>("printFieldNames")),
+      printResults(testParams->get<bool>("printResults")),
+      failIfInaccurate(testParams->get<bool>("failIfInaccurate")),
+      nodeTsqr(testParams->get<std::string>("NodeTsqr")),
+#ifdef HAVE_TPETRATSQR_COMPLEX
+      testComplex(true),
+#else
+      testComplex(false),
+#endif // HAVE_TPETRATSQR_COMPLEX
+      testReal(true),
+      verbose(testParams->get<bool>("verbose"))
+    {}
 
-    size_t cacheSizeHint;
-    int numRowsLocal;
-    int numCols;
-    bool contiguousCacheBlocks;
-    bool testFactorExplicit;
-    bool testRankRevealing;
-    bool printFieldNames;
-    bool printResults;
-    bool failIfInaccurate;
-    bool debug;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    bool testComplex;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-    bool testReal;
+    size_t cacheSizeHint = 0;
+    int numRowsLocal = 10000;
+    int numCols = 5;
+    bool contiguousCacheBlocks = false;
+    bool testFactorExplicit = true;
+    bool testRankRevealing = true;
+    bool printFieldNames = true;
+    bool printResults = true;
+    bool failIfInaccurate = true;
+    std::string nodeTsqr {"Default"};
+#ifdef HAVE_TPETRATSQR_COMPLEX
+    bool testComplex = true;
+#else
+    bool testComplex = false;
+#endif // HAVE_TPETRATSQR_COMPLEX
+    bool testReal = true;
+    bool verbose = false;
 
     // \brief Read command-line options.
     //
@@ -108,100 +108,116 @@ namespace {
     //
     // \param argv [in] As usual in C(++).
     //
-    // \param allowedToPrint [in] Whether this (MPI) process is allowed
-    //   to print to stdout/stderr.  Different per (MPI) process.
-    //
-    // \param printedHelp [out] Whether this (MPI) process printed the
-    //   "help" display (summary of command-line options)
-    //
     // \param testParams [in] List of test parameters for the
     //   FullTsqrVerifierCaller.
     //
+    // \param err [out] Output stream to which to print error
+    //   messages.  Different per (MPI) process.
+    //
     // \return Whether help was printed.
     bool
-      read (int argc,
-          char* argv[],
-          const Teuchos::RCP<const Teuchos::ParameterList>& defaultParams,
-          const bool allowedToPrint)
-      {
-        using std::cerr;
-        using std::endl;
-
-        try {
-          Teuchos::CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true,
-              /* recognizeAllOptions=*/ true);
-          cmdLineProc.setDocString (docString);
-          cmdLineProc.setOption ("testReal",
-              "noTestReal",
-              &testReal,
-              "Test real Scalar types");
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-          cmdLineProc.setOption ("testComplex",
-              "noTestComplex",
-              &testComplex,
-              "Test complex Scalar types");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-          // CommandLineProcessor takes int arguments, but not size_t
-          // arguments, so we have to read in the argument as an int and
-          // convert back to size_t later.
-          int cacheSizeHintAsInt = cacheSizeHint;
-          cmdLineProc.setOption ("cacheSizeHint",
-              &cacheSizeHintAsInt,
-              defaultParams->getEntry("cacheSizeHint").docString().c_str());
-          cmdLineProc.setOption ("numRowsLocal",
-              &numRowsLocal,
-              defaultParams->getEntry("numRowsLocal").docString().c_str());
-          cmdLineProc.setOption ("numCols",
-              &numCols,
-              defaultParams->getEntry("numCols").docString().c_str());
-          cmdLineProc.setOption ("contiguousCacheBlocks",
-              "noContiguousCacheBlocks",
-              &contiguousCacheBlocks,
-              defaultParams->getEntry("contiguousCacheBlocks").docString().c_str());
-          cmdLineProc.setOption ("testFactorExplicit",
-              "noTestFactorExplicit",
-              &testFactorExplicit,
-              defaultParams->getEntry("testFactorExplicit").docString().c_str());
-          cmdLineProc.setOption ("testRankRevealing",
-              "noTestRankRevealing",
-              &testRankRevealing,
-              defaultParams->getEntry("testRankRevealing").docString().c_str());
-          cmdLineProc.setOption ("printFieldNames",
-              "noPrintFieldNames",
-              &printFieldNames,
-              defaultParams->getEntry("printFieldNames").docString().c_str());
-          cmdLineProc.setOption ("printResults",
-              "noPrintResults",
-              &printResults,
-              defaultParams->getEntry("printResults").docString().c_str());
-          cmdLineProc.setOption ("failIfInaccurate",
-              "noFailIfInaccurate",
-              &failIfInaccurate,
-              defaultParams->getEntry("failIfInaccurate").docString().c_str());
-          cmdLineProc.setOption ("debug",
-              "nodebug",
-              &debug,
-              defaultParams->getEntry("debug").docString().c_str());
-          cmdLineProc.parse (argc, argv);
-          cacheSizeHint = static_cast<size_t> (cacheSizeHintAsInt);
-        }
-        catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) {
-          if (allowedToPrint)
-            cerr << "Unrecognized command-line option: " << e.what() << endl;
-          throw e;
-        }
-        catch (Teuchos::CommandLineProcessor::HelpPrinted& e) {
-          return true;
-        }
+    read(int argc,
+         char* argv[],
+         const Teuchos::RCP<const Teuchos::ParameterList>& defaultParams,
+         std::ostream& err)
+    {
+      using Teuchos::CommandLineProcessor;
+      using std::endl;
 
-        // Validate command-line options.  We provide default values
-        // for unset options, so we don't have to validate those.
-        TEUCHOS_TEST_FOR_EXCEPTION(numRowsLocal <= 0, std::invalid_argument,
-            "Number of rows per process must be positive.");
-        TEUCHOS_TEST_FOR_EXCEPTION(numCols <= 0, std::invalid_argument,
-            "Number of columns must be positive.");
-        return false; // Did not print help
+      try {
+        const bool throwExceptions = true;
+        const bool recognizeAllOptions = true;
+        CommandLineProcessor cmdLineProc(throwExceptions,
+                                         recognizeAllOptions);
+        cmdLineProc.setDocString(docString);
+        cmdLineProc.setOption("testReal",
+                              "noTestReal",
+                              &testReal,
+                              "Test real Scalar types");
+        cmdLineProc.setOption("testComplex",
+                              "noTestComplex",
+                              &testComplex,
+                              "Test complex Scalar types; must be "
+                              "false if complex Scalar types were "
+                              "disabled at configure (pre-build) "
+                              "time");
+        // CommandLineProcessor takes int arguments, but not size_t
+        // arguments, so we have to read in the argument as an int and
+        // convert back to size_t later.
+        int cacheSizeHintAsInt = cacheSizeHint;
+        cmdLineProc.setOption("cacheSizeHint",
+                              &cacheSizeHintAsInt,
+                              defaultParams->getEntry
+                              ("Cache Size Hint").docString().c_str());
+        cmdLineProc.setOption("numRowsLocal",
+                              &numRowsLocal,
+                              defaultParams->getEntry
+                              ("numRowsLocal").docString().c_str());
+        cmdLineProc.setOption("numCols",
+                              &numCols,
+                              defaultParams->getEntry
+                              ("numCols").docString().c_str());
+        cmdLineProc.setOption("contiguousCacheBlocks",
+                              "noContiguousCacheBlocks",
+                              &contiguousCacheBlocks,
+                              defaultParams->getEntry
+                              ("contiguousCacheBlocks").docString().c_str());
+        cmdLineProc.setOption("testFactorExplicit",
+                              "noTestFactorExplicit",
+                              &testFactorExplicit,
+                              defaultParams->getEntry
+                              ("testFactorExplicit").docString().c_str());
+        cmdLineProc.setOption("testRankRevealing",
+                              "noTestRankRevealing",
+                              &testRankRevealing,
+                              defaultParams->getEntry
+                              ("testRankRevealing").docString().c_str());
+        cmdLineProc.setOption("printFieldNames",
+                              "noPrintFieldNames",
+                              &printFieldNames,
+                              defaultParams->getEntry
+                              ("printFieldNames").docString().c_str());
+        cmdLineProc.setOption("printResults",
+                              "noPrintResults",
+                              &printResults,
+                              defaultParams->getEntry
+                              ("printResults").docString().c_str());
+        cmdLineProc.setOption("failIfInaccurate",
+                              "noFailIfInaccurate",
+                              &failIfInaccurate,
+                              defaultParams->getEntry
+                              ("failIfInaccurate").docString().c_str());
+        cmdLineProc.setOption("NodeTsqr",
+                              &nodeTsqr,
+                              defaultParams->getEntry
+                              ("NodeTsqr").docString().c_str());
+        cmdLineProc.setOption("verbose",
+                              "quiet",
+                              &verbose,
+                              defaultParams->getEntry
+                              ("verbose").docString().c_str());
+        cmdLineProc.parse(argc, argv);
+        cacheSizeHint = size_t(cacheSizeHintAsInt);
+      }
+      catch(const Teuchos::CommandLineProcessor::UnrecognizedOption& e) {
+        err << "Unrecognized command-line option: " << e.what()
+            << endl;
+        throw; // rethrow the original exception object (no copy, no slicing)
       }
+      catch(const Teuchos::CommandLineProcessor::HelpPrinted&) {
+        return true;
+      }
+
+      // Validate command-line options.  We provide default values
+      // for unset options, so we don't have to validate those.
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (numRowsLocal <= 0, std::invalid_argument,
+         "Number of rows per process must be positive.");
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (numCols <= 0, std::invalid_argument,
+         "Number of columns must be positive.");
+      return false; // Did not print help
+    }
   };
 
   //
@@ -210,37 +226,34 @@ namespace {
   // the command line), return a parameter list describing the test.
   //
   Teuchos::RCP<Teuchos::ParameterList>
-    testParameters (const Teuchos::RCP<const Teuchos::ParameterList>& validParams,
-        const CmdLineOptions& options)
-    {
-      using Teuchos::ParameterList;
-      using Teuchos::parameterList;
-      using Teuchos::RCP;
-
-      RCP<ParameterList> testParams = parameterList ("FullTsqrVerifier");
-      testParams->set ("cacheSizeHint", options.cacheSizeHint);
-      testParams->set ("numRowsLocal", options.numRowsLocal);
-      testParams->set ("numCols", options.numCols);
-      testParams->set ("testFactorExplicit", options.testFactorExplicit);
-      testParams->set ("testRankRevealing", options.testRankRevealing);
-      testParams->set ("contiguousCacheBlocks", options.contiguousCacheBlocks);
-      testParams->set ("printFieldNames", options.printFieldNames);
-      testParams->set ("printResults", options.printResults);
-      testParams->set ("failIfInaccurate", options.failIfInaccurate);
-      testParams->set ("debug", options.debug);
+  testParameters(const Teuchos::RCP<const Teuchos::ParameterList>& validParams,
+                 const CmdLineOptions& options)
+  {
+    auto testParams = parameterList ("FullTsqrVerifier");
+    testParams->set("Cache Size Hint", options.cacheSizeHint);
+    testParams->set("numRowsLocal", options.numRowsLocal);
+    testParams->set("numCols", options.numCols);
+    testParams->set("testFactorExplicit",
+                    options.testFactorExplicit);
+    testParams->set("testRankRevealing", options.testRankRevealing);
+    testParams->set("contiguousCacheBlocks",
+                    options.contiguousCacheBlocks);
+    testParams->set("printFieldNames", options.printFieldNames);
+    testParams->set("printResults", options.printResults);
+    testParams->set("failIfInaccurate", options.failIfInaccurate);
+    testParams->set("NodeTsqr", options.nodeTsqr);
+    testParams->set("verbose", options.verbose);
 
-      testParams->validateParametersAndSetDefaults (*validParams);
-      return testParams;
-    }
+    testParams->validateParametersAndSetDefaults(*validParams);
+    return testParams;
+  }
 
-  //
   // Return true if all tests were successful, else false.
-  //
   bool
-  test (int argc,
-        char* argv[],
-        const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
-        const bool allowedToPrint)
+  test(int argc,
+       char* argv[],
+       const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
+       std::ostream& err)
   {
     using TSQR::Test::NullCons;
     using TSQR::Test::Cons;
@@ -249,41 +262,35 @@ namespace {
     using Teuchos::parameterList;
     using Teuchos::RCP;
     using Teuchos::rcp;
-    //
-    // Get a default random seed, and set up the Caller (that iterates
-    // the test over all Scalar types of interest).
-    //
-    typedef TSQR::Test::FullTsqrVerifierCaller caller_type;
-    std::vector<int> randomSeed = caller_type::defaultRandomSeed ();
-    caller_type caller (comm, randomSeed);
 
-    //
+    // The Caller iterates the test over all Scalar types.
+    using caller_type = TSQR::Test::FullTsqrVerifierCaller;
+    caller_type caller(comm, caller_type::defaultRandomSeed ());
+
     // Read command-line options
-    //
-    RCP<const ParameterList> defaultParams = caller.getValidParameterList();
-    CmdLineOptions cmdLineOpts (defaultParams);
-    const bool printedHelp = cmdLineOpts.read (argc, argv, defaultParams, allowedToPrint);
+    auto defaultParams = caller.getValidParameterList();
+    CmdLineOptions cmdLineOpts(defaultParams);
+    const bool printedHelp =
+      cmdLineOpts.read(argc, argv, defaultParams, err);
     // Don't run the tests (and do succeed) if help was printed.
-    if (printedHelp)
+    if(printedHelp) {
       return true;
+    }
 
     //
     // Use read-in command-line options to set up test parameters.
     //
-    RCP<ParameterList> testParams = testParameters (defaultParams, cmdLineOpts);
+    auto testParams = testParameters(defaultParams, cmdLineOpts);
     defaultParams = null; // save a little space
 
-    //
     // Define lists of Scalar types to test.  We keep separate lists
     // for real and complex types, since callers can control whether
     // each of these is tested independently on the command line.
-    //
-    typedef Cons<float, Cons<double, NullCons> > real_type_list;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    typedef Cons<std::complex<float>, Cons<std::complex<double>, NullCons> > complex_type_list;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+    using real_type_list = Cons<float, Cons<double, NullCons>>;
+#ifdef HAVE_TPETRATSQR_COMPLEX
+    using complex_type_list = Cons<std::complex<float>, Cons<std::complex<double>, NullCons>>;
+#endif // HAVE_TPETRATSQR_COMPLEX
 
-    //
     // Run the tests.  If the tests are set up to fail on
     // insufficiently inaccurate results, run() will throw an
     // exception in that case.  Otherwise, the tests return nothing,
@@ -292,15 +299,18 @@ namespace {
     // The testReal and testComplex options are read in at the command
     // line, but since they do not apply to all Scalar types, they
     // don't belong in testParams.
-    //
-    if (cmdLineOpts.testReal)
-      caller.run<real_type_list> (testParams);
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    if (cmdLineOpts.testComplex)
-      caller.run<complex_type_list> (testParams);
-#endif // HAVE_KOKKOSTSQR_COMPLEX
+    const bool realResult = cmdLineOpts.testReal ?
+      caller.run<real_type_list>(testParams) :
+      true;
+#ifdef HAVE_TPETRATSQR_COMPLEX
+    const bool complexResult = cmdLineOpts.testComplex ?
+      caller.run<complex_type_list>(testParams) :
+      true;
+#else
+    const bool complexResult = true;
+#endif // HAVE_TPETRATSQR_COMPLEX
 
-    return true; // for success
+    return realResult && complexResult;
   }
 } // namespace (anonymous)
 
@@ -308,47 +318,22 @@ namespace {
 int
 main (int argc, char* argv[])
 {
-  using TSQR::Test::NullCons;
-  using TSQR::Test::Cons;
-  using Teuchos::null;
-  using Teuchos::ParameterList;
-  using Teuchos::parameterList;
-  using Teuchos::RCP;
-  using Teuchos::rcp;
   using std::endl;
+  TSQR::Test::MpiAndKokkosScope testScope(&argc, &argv);
+  auto comm = testScope.getComm();
+  std::ostream& out = testScope.outStream();
+  std::ostream& err = testScope.errStream();
 
-#ifdef HAVE_MPI
-  typedef RCP<const Teuchos::Comm<int> > comm_ptr;
-
-  Teuchos::oblackholestream blackhole;
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole);
-  comm_ptr comm = Teuchos::DefaultComm<int>::getComm();
-  const int myRank = comm->getRank();
-  // Only Rank 0 gets to write to cout and cerr.  The other MPI
-  // process ranks send their output to a "black hole" (something that
-  // acts like /dev/null, and may be /dev/null).
-  const bool allowedToPrint = (myRank == 0);
-  std::ostream& out = allowedToPrint ? std::cout : blackhole;
-  std::ostream& err = allowedToPrint ? std::cerr : blackhole;
-  // Make sure that err gets "used"
-  (void) err;
-
-#else // Don't HAVE_MPI: single-process test
-
-  const bool allowedToPrint = true;
-  std::ostream& out = std::cout;
-  std::ostream& err = std::cerr;
-#endif // HAVE_MPI
-
-  bool success = false;
-  bool verbose = false;
+  constexpr bool actually_print_caught_exceptions = true;
+  bool success = false; // hopefully this will be true later
   try {
-    success = test (argc, argv, comm, allowedToPrint);
-    if (allowedToPrint && success) {
+    success = test(argc, argv, comm, err);
+    if(success) {
       // The Trilinos test framework expects a message like this.
       out << "\nEnd Result: TEST PASSED" << endl;
     }
   }
-  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
+  TEUCHOS_STANDARD_CATCH_STATEMENTS
+    (actually_print_caught_exceptions, err, success);
   return ( success ? EXIT_SUCCESS : EXIT_FAILURE );
 }
diff --git a/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp
deleted file mode 100644
index d47000f68846..000000000000
--- a/packages/tpetra/tsqr/test/Tsqr_TestKokkosNodeTsqr.cpp
+++ /dev/null
@@ -1,373 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#include "Teuchos_CommandLineProcessor.hpp"
-#include "Teuchos_DefaultComm.hpp"
-#include "Teuchos_StandardCatchMacros.hpp"
-#include "Tsqr_KokkosNodeTsqrTest.hpp"
-#include "Kokkos_Core.hpp"
-
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-#  include <complex>
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
-namespace {
-  //
-  // The documentation string for this test executable to print out at
-  // the command line on request.
-  //
-  const char docString[] = "This program tests TSQR::KokkosNodeTsqr, "
-    "which implements an intranode parallel version of TSQR for "
-    "Kokkos::DefaultHostExecutionSpace.  Accuracy and performance "
-    "tests are included.";
-
-  //
-  // TestParameters encapsulates values of command-line parameters, as
-  // well as state that may change from one benchmark / verify
-  // invocation to the next.
-  //
-  class TestParameters {
-  public:
-    TestParameters () = default;
-    TestParameters (const std::vector<int> /* theSeed */);
-
-    bool verify = true;
-    bool benchmark = false;
-    int numRows = 100000;
-    int numCols = 10;
-    int numTrials = 1;
-    bool testReal = true;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    bool testComplex = true;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-    int numPartitions = 16;
-    int cacheSizeHint = 0;
-    bool contiguousCacheBlocks = false;
-    bool printFieldNames = true;
-    bool humanReadable = true;
-    bool debug = false;
-  };
-
-  // Run the test(s) for a particular Scalar type T.
-  // Used by Cons, which in turn is used by runTests().
-  template<class T>
-  class Dispatcher {
-  public:
-    typedef T dispatch_type;
-
-    static void
-    benchmark (std::vector<int>&,
-               const TestParameters& params,
-               bool& printFieldNames)
-    {
-      using TSQR::Test::benchmarkKokkosNodeTsqr;
-      benchmarkKokkosNodeTsqr<int, T> (params.numTrials,
-                                       params.numRows,
-                                       params.numCols,
-                                       params.numPartitions,
-                                       params.cacheSizeHint,
-                                       params.contiguousCacheBlocks,
-                                       printFieldNames,
-                                       params.humanReadable);
-      printFieldNames = false;
-    }
-
-    static void
-    verify (std::vector<int>& seed,
-            const TestParameters& params,
-            bool& printFieldNames)
-    {
-      TSQR::Random::NormalGenerator<int, T> gen (seed);
-      using TSQR::Test::verifyKokkosNodeTsqr;
-      verifyKokkosNodeTsqr<int, T> (gen,
-                                    params.numRows,
-                                    params.numCols,
-                                    params.numPartitions,
-                                    params.cacheSizeHint,
-                                    params.contiguousCacheBlocks,
-                                    printFieldNames,
-                                    params.humanReadable,
-                                    params.debug);
-      printFieldNames = false;
-      // Save the seed for next time, since we can't use the same
-      // NormalGenerator for a different Scalar type T.
-      gen.getSeed (seed);
-    }
-  };
-
-  //
-  // Class for executing a template function over a compile-time
-  // fixed-length list of types.  See runTests() for an example.
-  //
-  template<class CarType, class CdrType>
-  class Cons {
-  public:
-    static void
-    verify (std::vector<int>& seed,
-            const TestParameters& params,
-            bool& printFieldNames)
-    {
-      Dispatcher<CarType>::verify (seed, params, printFieldNames);
-      CdrType::verify (seed, params, printFieldNames);
-    }
-
-    static void
-    benchmark (std::vector<int>& seed,
-               const TestParameters& params,
-               bool& printFieldNames)
-    {
-      Dispatcher<CarType>::benchmark (seed, params, printFieldNames);
-      CdrType::benchmark (seed, params, printFieldNames);
-    }
-  };
-
-  // Base case for Cons template recursion.
-  class NullCons {
-  public:
-    static void
-    verify (std::vector<int>&,
-            const TestParameters&,
-            bool& printFieldNames) {}
-
-    static void
-    benchmark (std::vector<int>&,
-               const TestParameters&,
-               bool& printFieldNames) {}
-  };
-
-  // Run the tests for all types of interest.
-  void
-  runTests (const TestParameters& params)
-  {
-    using real_tests = Cons<float, Cons<double, NullCons>>;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    using complex_tests =
-      Cons<std::complex<float>, Cons<std::complex<double>, NullCons>>;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
-    // Length-4 seed for the pseudorandom number generator.  The last
-    // entry must be an odd number.  There are other restrictions on
-    // these values; see the LAPACK documentation for details.  (0, 0,
-    // 0, 1) is a typical initial seed if you want reproducible
-    // results, but don't actually care much about randomness.
-    std::vector<int> seed {{0, 0, 0, 1}};
-
-    bool printFieldNames = params.printFieldNames;
-    if (params.verify) {
-      if (params.testReal) {
-        real_tests::verify (seed, params, printFieldNames);
-      }
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      if (params.testComplex) {
-        complex_tests::verify (seed, params, printFieldNames);
-      }
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-    }
-    // Reset this, since the first call of verify() sets it to false.
-    printFieldNames = params.printFieldNames;
-    if (params.benchmark) {
-      if (params.testReal) {
-        real_tests::benchmark (seed, params, printFieldNames);
-      }
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      if (params.testComplex) {
-        complex_tests::benchmark (seed, params, printFieldNames);
-      }
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-    }
-  }
-
-  // Parse command-line options for this test.
-  //
-  // argc [in] As usual in C(++)
-  //
-  // argv [in] As usual in C(++)
-  //
-  // allowedToPrint [in] Whether this (MPI) process is allowed
-  //   to print to stdout/stderr.  Different per (MPI) process.
-  //
-  // printedHelp [out] Whether this (MPI) process printed the
-  //   "help" display (summary of command-line options).
-  //
-  // Return an encapsulation of the command-line options.
-  TestParameters
-  parseOptions (int argc,
-                char* argv[],
-                const bool allowedToPrint,
-                bool& printedHelp)
-  {
-    using std::cerr;
-    using std::endl;
-
-    printedHelp = false;
-
-    // Command-line parameters, set to their default values.
-    TestParameters params;
-    /// We really want the cache size hint as a size_t, but
-    /// Teuchos::CommandLineProcessor doesn't offer that option.  So
-    /// we read it in as an int, which means negative inputs are
-    /// possible.  We check for those below in the input validation
-    /// phase.
-    //
-    // Fetch default value of cacheSizeHint.
-    int cacheSizeHint = params.cacheSizeHint;
-    try {
-      using Teuchos::CommandLineProcessor;
-
-      CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true,
-                                        /* recognizeAllOptions=*/ true);
-      cmdLineProc.setDocString (docString);
-      cmdLineProc.setOption ("verify",
-                             "noverify",
-                             &params.verify,
-                             "Test accuracy");
-      cmdLineProc.setOption ("benchmark",
-                             "nobenchmark",
-                             &params.benchmark,
-                             "Test performance");
-      cmdLineProc.setOption ("numRows",
-                             &params.numRows,
-                             "Number of rows in the test matrix");
-      cmdLineProc.setOption ("numCols",
-                             &params.numCols,
-                             "Number of columns in the test matrix");
-      cmdLineProc.setOption ("numTrials",
-                             &params.numTrials,
-                             "Number of trials (only used when \"--benchmark\"");
-      cmdLineProc.setOption ("testReal",
-                             "noTestReal",
-                             &params.testReal,
-                             "Test real arithmetic");
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      cmdLineProc.setOption ("testComplex",
-                             "noTestComplex",
-                             &params.testComplex,
-                             "Test complex arithmetic");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-      params.numPartitions = Kokkos::DefaultHostExecutionSpace::concurrency();
-      cmdLineProc.setOption ("numPartitions",
-                             &params.numPartitions,
-                             "Number of partitions to use (max available parallelism)");
-      cmdLineProc.setOption ("cacheSizeHint",
-                             &cacheSizeHint,
-                             "Cache size hint in bytes (0 means pick a reasonable default)");
-      cmdLineProc.setOption ("contiguousCacheBlocks",
-                             "noncontiguousCacheBlocks",
-                             &params.contiguousCacheBlocks,
-                             "Whether cache blocks should be stored contiguously");
-      cmdLineProc.setOption ("printFieldNames",
-                             "noPrintFieldNames",
-                             &params.printFieldNames,
-                             "Print field names (for machine-readable output only)");
-      cmdLineProc.setOption ("humanReadable",
-                             "machineReadable",
-                             &params.humanReadable,
-                             "If set, make output easy to read by humans "
-                             "(but hard to parse)");
-      cmdLineProc.setOption ("debug",
-                             "noDebug",
-                             &params.debug,
-                             "Print debugging information");
-      cmdLineProc.parse (argc, argv);
-    }
-    catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) {
-      if (allowedToPrint)
-        cerr << "Unrecognized command-line option: " << e.what() << endl;
-      throw e;
-    }
-    catch (Teuchos::CommandLineProcessor::HelpPrinted& e) {
-      printedHelp = true;
-      return params; // Don't verify parameters in this case
-    }
-
-    // Validate command-line options.  We provide default values
-    // for unset options, so we don't have to validate those.
-    if (params.numRows <= 0) {
-      throw std::invalid_argument ("Number of rows must be positive");
-    } else if (params.numCols <= 0) {
-      throw std::invalid_argument ("Number of columns must be positive");
-    } else if (params.numRows < params.numCols) {
-      throw std::invalid_argument ("Number of rows must be >= number of columns");
-    } else if (params.benchmark && params.numTrials < 1) {
-      throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1");
-    } else if (params.numPartitions < 1) {
-      throw std::invalid_argument ("\"--numPartitions\" option must be >= 1");
-    } else if (params.cacheSizeHint < 0) {
-      throw std::invalid_argument ("Cache size hint must be nonnegative");
-    }
-    return params;
-  }
-} // namespace (anonymous)
-
-//
-// The "main" test driver.
-//
-int
-main (int argc, char *argv[])
-{
-  using Teuchos::ParameterList;
-  using Teuchos::RCP;
-  using Teuchos::rcp;
-
-  bool performingTests = true;
-  const bool allowedToPrint = true;
-  std::ostream& out = std::cout;
-
-  // Fetch command-line parameters.
-  bool printedHelp = false;
-  TestParameters params =
-    parseOptions (argc, argv, allowedToPrint, printedHelp);
-  if (printedHelp) {
-    return EXIT_SUCCESS;
-  }
-
-  bool success = false;
-  bool verbose = false;
-  try {
-    if (performingTests) {
-      Kokkos::ScopeGuard kokkosScope (argc, argv);
-      runTests (params);
-      success = true;
-      // The Trilinos test framework expects a message like this.
-      out << "\nEnd Result: TEST PASSED" << std::endl;
-    }
-  }
-  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
-  return success ? EXIT_SUCCESS : EXIT_FAILURE;
-}
diff --git a/packages/tpetra/tsqr/test/Tsqr_TestLapack.cpp b/packages/tpetra/tsqr/test/Tsqr_TestLapack.cpp
deleted file mode 100644
index 3c4da413287b..000000000000
--- a/packages/tpetra/tsqr/test/Tsqr_TestLapack.cpp
+++ /dev/null
@@ -1,320 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#include "Tsqr_ConfigDefs.hpp"
-#include "Teuchos_ConfigDefs.hpp" // HAVE_MPI
-#include "Teuchos_Tuple.hpp"
-#ifdef HAVE_MPI
-#  include "Teuchos_GlobalMPISession.hpp"
-#  include "Teuchos_oblackholestream.hpp"
-#endif // HAVE_MPI
-#include "Teuchos_CommandLineProcessor.hpp"
-#include "Teuchos_DefaultComm.hpp"
-#include "Teuchos_StandardCatchMacros.hpp"
-#include "Tsqr_SeqTest.hpp"
-
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-#  include <complex>
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
-#include <sstream>
-#include <stdexcept>
-#include <vector>
-
-
-namespace TSQR {
-  namespace Trilinos {
-    namespace Test {
-
-      const char docString[] = "This program compares LAPACK\'s QR factorization"
-        " (with TSQR).  Accuracy and performance tests are included.";
-
-      using Teuchos::RCP;
-      using Teuchos::Tuple;
-
-      /// \class LapackTestParameters
-      /// \brief Encapsulates values of command-line parameters
-      ///
-      struct LapackTestParameters {
-        LapackTestParameters () :
-          verify (false),
-          benchmark (false),
-          numRows (1000),
-          numCols (10),
-          numTrials (10),
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-          testComplex (true),
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-          printFieldNames (true),
-          printTrilinosTestStuff (true),
-          humanReadable (false),
-          debug (false)
-          {}
-
-        bool verify, benchmark;
-        int numRows, numCols, numTrials;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-        bool testComplex;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-        std::string additionalFieldNames, additionalData;
-        bool printFieldNames, printTrilinosTestStuff, humanReadable, debug;
-      };
-
-      static void
-        benchmark (std::ostream& out,
-            const LapackTestParameters& params)
-        {
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-          const bool testComplex = params.testComplex;
-#else
-          const bool testComplex = false;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
-          using TSQR::Test::benchmarkLapack;
-          benchmarkLapack (out,
-              params.numRows,
-              params.numCols,
-              params.numTrials,
-              testComplex,
-              params.additionalFieldNames,
-              params.additionalData,
-              params.printFieldNames,
-              params.humanReadable);
-        }
-
-      static void
-        verify (std::ostream& out,
-            const LapackTestParameters& params)
-        {
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-          const bool testComplex = params.testComplex;
-#else
-          const bool testComplex = false;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
-          using TSQR::Test::verifyLapack;
-          verifyLapack (out,
-              params.numRows,
-              params.numCols,
-              testComplex,
-              params.additionalFieldNames,
-              params.additionalData,
-              params.printFieldNames,
-              params.humanReadable,
-              params.debug);
-        }
-
-      /// \brief Parse command-line options for this test
-      ///
-      /// \param argc [in] As usual in C(++)
-      /// \param argv [in] As usual in C(++)
-      /// \param allowedToPrint [in] Whether this (MPI) process is allowed
-      ///   to print to stdout/stderr.  Different per (MPI) process.
-      /// \param printedHelp [out] Whether this (MPI) process printed the
-      ///   "help" display (summary of command-line options)
-      ///
-      /// \return Encapsulation of command-line options
-      static LapackTestParameters
-        parseOptions (int argc,
-            char* argv[],
-            const bool allowedToPrint,
-            bool& printedHelp)
-        {
-          using std::cerr;
-          using std::endl;
-
-          printedHelp = false;
-
-          // Command-line parameters, set to their default values.
-          LapackTestParameters params;
-
-          try {
-            using Teuchos::CommandLineProcessor;
-
-            CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true,
-                /* recognizeAllOptions=*/ true);
-            cmdLineProc.setDocString (docString);
-            cmdLineProc.setOption ("verify",
-                "noverify",
-                &params.verify,
-                "Test accuracy");
-            cmdLineProc.setOption ("benchmark",
-                "nobenchmark",
-                &params.benchmark,
-                "Test performance");
-            cmdLineProc.setOption ("nrows",
-                &params.numRows,
-                "Number of rows in the test matrix");
-            cmdLineProc.setOption ("ncols",
-                &params.numCols,
-                "Number of columns in the test matrix");
-            cmdLineProc.setOption ("ntrials",
-                &params.numTrials,
-                "Number of trials (only used when \"--benchmark\"");
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-            cmdLineProc.setOption ("complex",
-                "nocomplex",
-                &params.testComplex,
-                "Test complex arithmetic, as well as real");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-            cmdLineProc.setOption ("field-names",
-                &params.additionalFieldNames,
-                "Any additional field name(s) (comma-delimited "
-                "string) to add to the benchmark output.  Empty "
-                "by default.  Good for things known when invoking "
-                "the benchmark executable, but not (easily) known "
-                "inside the benchmark -- e.g., environment "
-                "variables.");
-            cmdLineProc.setOption ("output-data",
-                &params.additionalData,
-                "Any additional data to add to the output, "
-                "corresponding to the above field name(s). "
-                "Empty by default.");
-            cmdLineProc.setOption ("print-field-names",
-                "no-print-field-names",
-                &params.printFieldNames,
-                "Print field names for benchmark output (including "
-                "any arguments to --field-names).");
-            cmdLineProc.setOption ("print-trilinos-test-stuff",
-                "no-print-trilinos-test-stuff",
-                &params.printTrilinosTestStuff,
-                "Print output that makes the Trilinos test "
-                "framework happy (but makes benchmark results "
-                "parsing scripts unhappy)");
-            cmdLineProc.setOption ("human-readable",
-                "machine-readable",
-                &params.humanReadable,
-                "If set, make output easy to read by humans "
-                "(but hard to parse)");
-            cmdLineProc.setOption ("debug",
-                "nodebug",
-                &params.debug,
-                "Print debugging information");
-            cmdLineProc.parse (argc, argv);
-          }
-          catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) {
-            if (allowedToPrint)
-              cerr << "Unrecognized command-line option: " << e.what() << endl;
-            throw e;
-          }
-          catch (Teuchos::CommandLineProcessor::HelpPrinted& e) {
-            printedHelp = true;
-            return params; // Don't verify parameters in this case
-          }
-
-          // Validate command-line options.  We provide default values
-          // for unset options, so we don't have to validate those.
-          if (params.numRows <= 0)
-            throw std::invalid_argument ("Number of rows must be positive");
-          else if (params.numCols <= 0)
-            throw std::invalid_argument ("Number of columns must be positive");
-          else if (params.numRows < params.numCols)
-            throw std::invalid_argument ("Number of rows must be >= number of columns");
-          else if (params.benchmark && params.numTrials < 1)
-            throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1");
-          return params;
-        }
-
-    } // namespace Test
-  } // namespace Trilinos
-} // namespace TSQR
-
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-  int
-main (int argc, char *argv[])
-{
-  using Teuchos::RCP;
-  using TSQR::Trilinos::Test::LapackTestParameters;
-  using TSQR::Trilinos::Test::parseOptions;
-  using std::endl;
-
-#ifdef HAVE_MPI
-  typedef RCP< const Teuchos::Comm<int> > comm_ptr;
-
-  Teuchos::oblackholestream blackhole;
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole);
-  comm_ptr comm = Teuchos::DefaultComm<int>::getComm();
-  const int myRank = comm->getRank();
-  // Only Rank 0 gets to write to stdout.  The other MPI process ranks
-  // send their output to something that looks like /dev/null (and
-  // likely is, on Unix-y operating systems).
-  std::ostream& out = (myRank == 0) ? std::cout : blackhole;
-  // Only Rank 0 performs the tests.
-  const bool performingTests = (myRank == 0);
-  const bool allowedToPrint = (myRank == 0);
-
-#else // Don't HAVE_MPI: single-node test
-
-  const bool performingTests = true;
-  const bool allowedToPrint = true;
-  std::ostream& out = std::cout;
-#endif // HAVE_MPI
-
-  // Fetch command-line parameters.
-  bool printedHelp = false;
-  LapackTestParameters params =
-    parseOptions (argc, argv, allowedToPrint, printedHelp);
-  if (printedHelp)
-    return 0;
-
-  bool success = false;
-  bool verbose = false;
-  try {
-    if (performingTests)
-    {
-      if (params.benchmark)
-        TSQR::Trilinos::Test::benchmark (out, params);
-
-      // We allow the same run to do both benchmark and verify.
-      if (params.verify)
-        TSQR::Trilinos::Test::verify (out, params);
-
-      success = true;
-
-      if (params.printTrilinosTestStuff)
-        // The Trilinos test framework expects a message like this.
-        out << "\nEnd Result: TEST PASSED" << endl;
-    }
-  }
-  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
-  return ( success ? EXIT_SUCCESS : EXIT_FAILURE );
-}
diff --git a/packages/tpetra/tsqr/test/Tsqr_TestNodeTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestNodeTsqr.cpp
new file mode 100644
index 000000000000..85a96725c507
--- /dev/null
+++ b/packages/tpetra/tsqr/test/Tsqr_TestNodeTsqr.cpp
@@ -0,0 +1,1532 @@
+//@HEADER
+// ************************************************************************
+//
+//          Kokkos: Node API and Parallel Node Kernels
+//              Copyright (2008) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ************************************************************************
+//@HEADER
+
+#include "Teuchos_CommandLineProcessor.hpp"
+#include "Teuchos_StandardCatchMacros.hpp"
+#include "Teuchos_Time.hpp"
+
+#include "Tsqr_Impl_Lapack.hpp"
+#include "Tsqr_Random_NormalGenerator.hpp"
+#include "Tsqr_LocalVerify.hpp"
+#include "Tsqr_Matrix.hpp"
+#include "Tsqr_NodeTsqrFactory.hpp"
+#include "Tsqr_nodeTestProblem.hpp"
+#include "Tsqr_Util.hpp"
+
+#include <algorithm>
+#include <complex>
+#include <cstring> // size_t definition
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+namespace TSQR {
+  namespace Test {
+
+    using execution_space = Kokkos::DefaultExecutionSpace; // Kokkos' default execution space
+    using memory_space = execution_space::memory_space; // memory space belonging to that execution space
+    using device_type =
+      Kokkos::Device<execution_space, memory_space>; // the two above, paired as a Kokkos::Device
+
+    // Command-line arguments and other test parameters; the in-class initializers are the defaults.
+    struct NodeTestParameters {
+      NodeTestParameters() = default;
+
+      std::string nodeTsqrType {"Default"}; // --NodeTsqr: NodeTsqr subclass type
+      bool verify = true; // --verify / --noverify: test accuracy
+      bool benchmark = false; // --benchmark / --nobenchmark: test performance
+      int numRows = 10000; // number of rows in the test matrix
+      int numCols = 10; // number of columns in the test matrix
+      int numTrials = 10; // number of trials; only range-checked when benchmark is set
+      bool testReal = true; // test real arithmetic
+#ifdef HAVE_TPETRATSQR_COMPLEX
+      bool testComplex = true; // test complex arithmetic
+#else
+      bool testComplex = false; // defaults off when HAVE_TPETRATSQR_COMPLEX is undefined
+#endif // HAVE_TPETRATSQR_COMPLEX
+      size_t cacheSizeHint = 0; // in bytes; 0 means pick a reasonable default
+      bool contiguousCacheBlocks = false; // whether cache blocks should be stored contiguously
+      bool printFieldNames = true; // print field names (machine-readable output only)
+      bool printTrilinosTestStuff = true; // print output meant for the Trilinos test framework
+      bool humanReadable = false; // human-friendly output that is harder to parse
+      bool verbose = false; // print verbose debugging information
+      bool saveMatrices = false; // dump matrices to files
+    };
+
+    void
+    printNodeTestParameters(std::ostream& out,
+                            const NodeTestParameters& p,
+                            const std::string& prefix)
+    { // Writes one "<prefix><name>: <value>" line per field to out; p.verify and p.benchmark are not printed.
+      using std::endl;
+      out << prefix << "NodeTsqr: " << p.nodeTsqrType << endl
+          << prefix << "numRows: " << p.numRows << endl
+          << prefix << "numCols: " << p.numCols << endl
+          << prefix << "numTrials: " << p.numTrials << endl
+          << prefix << "testReal: "
+          << (p.testReal ? "true" : "false") << endl
+          << prefix << "testComplex: "
+          << (p.testComplex ? "true" : "false") << endl
+          << prefix << "cacheSizeHint: " << p.cacheSizeHint << endl
+          << prefix << "contiguousCacheBlocks: "
+          << (p.contiguousCacheBlocks ? "true" : "false") << endl
+          << prefix << "printFieldNames: "
+          << (p.printFieldNames ? "true" : "false") << endl
+          << prefix << "printTrilinosTestStuff: "
+          << (p.printTrilinosTestStuff ? "true" : "false") << endl
+          << prefix << "humanReadable: "
+          << (p.humanReadable ? "true" : "false") << endl
+          << prefix << "verbose: "
+          << (p.verbose ? "true" : "false") << endl
+          << prefix << "saveMatrices: "
+          << (p.saveMatrices ? "true" : "false") << endl;
+    }
+
+    void
+    setBoolCmdLineOpt(Teuchos::CommandLineProcessor& cmdLineProc,
+                      bool* variable,
+                      const char trueString[],
+                      const char falseString[],
+                      const char docString[])
+    { // Thin wrapper: same arguments as the Boolean setOption call below, but with the target variable listed first.
+      cmdLineProc.setOption(trueString, falseString, variable,
+                            docString);
+    }
+
+    // \brief Parse and validate command-line options for this test.
+    //
+    // \param argc [in] As usual in C(++).
+    // \param argv [in] As usual in C(++).
+    // \param printedHelp [out] Whether this function printed the
+    //   "help" display; if so, options are returned unvalidated.
+    // \throw std::invalid_argument if a validation check fails.
+    // \return Encapsulation of command-line options
+    static NodeTestParameters
+    parseOptions(int argc,
+                 char* argv[],
+                 bool& printedHelp)
+    {
+      using std::cerr;
+      using std::endl;
+
+      printedHelp = false;
+
+      // Command-line parameters, set to their default values.
+      NodeTestParameters params;
+      // We really want the cache block size as a size_t, but
+      // Teuchos::CommandLineProcessor doesn't offer that option.
+      // So we read it in as an int, which means negative inputs
+      // are possible.  We check for those below in the input
+      // validation phase.
+      //
+      // Fetch default value of cacheSizeHint.
+      int cacheSizeHintAsInt = static_cast<int>(params.cacheSizeHint);
+      try {
+        const bool throwExceptions = true;
+        const bool recognizeAllOptions = false;
+        using Teuchos::CommandLineProcessor;
+        CommandLineProcessor cmdLineProc(throwExceptions,
+                                         recognizeAllOptions);
+        const char docString[] = "This program tests TSQR::NodeTsqr, "
+          "which implements the intraprocess part of TSQR.  "
+          "Accuracy and performance tests are included.";
+        cmdLineProc.setDocString(docString);
+
+        setBoolCmdLineOpt(cmdLineProc, &params.verify,
+                          "verify",
+                          "noverify",
+                          "Test accuracy");
+        setBoolCmdLineOpt(cmdLineProc, &params.benchmark,
+                          "benchmark",
+                          "nobenchmark",
+                          "Test performance");
+        cmdLineProc.setOption("numRows",
+                              &params.numRows,
+                              "Number of rows in the test matrix");
+        cmdLineProc.setOption("numCols",
+                              &params.numCols,
+                              "Number of columns in the test matrix");
+        cmdLineProc.setOption("numTrials",
+                              &params.numTrials,
+                              "Number of trials (only used when "
+                              "\"--benchmark\")");
+        setBoolCmdLineOpt(cmdLineProc, &params.testReal,
+                          "testReal",
+                          "noTestReal",
+                          "Test real arithmetic");
+        setBoolCmdLineOpt(cmdLineProc, &params.testComplex,
+                          "testComplex",
+                          "noTestComplex",
+                          "Test complex arithmetic");
+        cmdLineProc.setOption("cacheBlockSize",
+                              &cacheSizeHintAsInt,
+                              "Cache size hint in bytes (0 means "
+                              "pick a reasonable default)");
+        setBoolCmdLineOpt(cmdLineProc,
+                          &params.contiguousCacheBlocks,
+                          "contiguousCacheBlocks",
+                          "noncontiguousCacheBlocks",
+                          "Whether cache blocks should be stored contiguously");
+        setBoolCmdLineOpt(cmdLineProc, &params.printFieldNames,
+                          "printFieldNames",
+                          "noPrintFieldNames",
+                          "Print field names (for machine-readable output only)");
+        setBoolCmdLineOpt(cmdLineProc, &params.printTrilinosTestStuff,
+                          "printTrilinosTestStuff",
+                          "noPrintTrilinosTestStuff",
+                          "Print output that makes the Trilinos test "
+                          "framework happy, but may make benchmark "
+                          "results' parsing scripts unhappy.");
+        setBoolCmdLineOpt(cmdLineProc, &params.humanReadable,
+                          "humanReadable",
+                          "machineReadable",
+                          "If set, make output easy to read by "
+                          "humans, but harder to parse.");
+        setBoolCmdLineOpt(cmdLineProc, &params.verbose,
+                          "verbose",
+                          "quiet",
+                          "Print verbose debugging information");
+        setBoolCmdLineOpt(cmdLineProc, &params.saveMatrices,
+                          "saveMatrices",
+                          "noSaveMatrices",
+                          "If set, dump matrices to files.");
+        cmdLineProc.setOption("NodeTsqr",
+                              &params.nodeTsqrType,
+                              "NodeTsqr subclass type");
+        cmdLineProc.parse(argc, argv);
+      }
+      catch(const Teuchos::CommandLineProcessor::UnrecognizedOption& e) {
+        cerr << "Unrecognized command-line option: " << e.what()
+             << endl;
+        throw; // rethrow the original exception object rather than a copy
+      }
+      catch(const Teuchos::CommandLineProcessor::HelpPrinted&) {
+        printedHelp = true;
+        return params; // Don't verify parameters in this case
+      }
+
+      // Validate command-line options.  We provide default values
+      // for unset options, so we don't have to validate those.
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (params.numRows <= 0, std::invalid_argument, "Number of "
+         "rows must be positive, but you set --numRows=" <<
+         params.numRows << ".");
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (params.numCols <= 0, std::invalid_argument, "Number of "
+         "columns must be positive, but you set --numCols=" <<
+         params.numCols << ".");
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (params.numRows < params.numCols, std::invalid_argument,
+         "Number of rows must be >= number of columns, but you set "
+         "--numRows=" << params.numRows << " and --numCols=" <<
+         params.numCols << ".");
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (params.benchmark && params.numTrials < 1,
+         std::invalid_argument, "Since you set --benchmark, the "
+         "number of trials must be positive, but you set --numTrials="
+         << params.numTrials << ".");
+      TEUCHOS_TEST_FOR_EXCEPTION
+        (cacheSizeHintAsInt < 0, std::invalid_argument, "Cache size "
+         "hint must be nonnegative, but you set --cacheBlockSize=" <<
+         cacheSizeHintAsInt << ".");
+      params.cacheSizeHint = static_cast<size_t>(cacheSizeHintAsInt);
+      return params;
+    }
+
+    // Scalar's Kokkos::ArithTraits value type, keeping Scalar's const.
+    template<class Scalar>
+    using kokkos_value_type = typename std::conditional<
+        ! std::is_const<Scalar>::value,
+        typename Kokkos::ArithTraits<Scalar>::val_type,
+        const typename Kokkos::ArithTraits<
+          typename std::remove_const<Scalar>::type>::val_type>::type;
+
+    // Return an unmanaged LayoutLeft host Kokkos::View of A's data.
+    // The View is first built with A.stride(1) rows (A's leading
+    // dimension), then restricted to A's first A.extent(0) rows.
+    template<class LO, class Scalar>
+    Kokkos::View<kokkos_value_type<Scalar>**,
+                 Kokkos::LayoutLeft, Kokkos::HostSpace,
+                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+    getHostMatrixView(const MatView<LO, Scalar>& A)
+    {
+      using value_type = kokkos_value_type<Scalar>;
+      using full_view_type =
+        Kokkos::View<value_type**, Kokkos::LayoutLeft, Kokkos::HostSpace,
+          Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+      const size_t numRows(A.extent(0));
+      const size_t numCols(A.extent(1));
+      const size_t leadingDim(A.stride(1));
+      full_view_type fullView
+        (reinterpret_cast<value_type*>(A.data()), leadingDim, numCols);
+      const std::pair<size_t, size_t> keptRows(0, numRows);
+      return Kokkos::subview(fullView, keptRows, Kokkos::ALL());
+    }
+
+    // Return a newly allocated LayoutLeft Kokkos::View, labeled
+    // `label`, with A's dimensions and a deep copy of A's entries.
+    template<class LO, class Scalar>
+    Kokkos::View<typename Kokkos::ArithTraits<Scalar>::val_type**,
+                 Kokkos::LayoutLeft>
+    getDeviceMatrixCopy(const MatView<LO, Scalar>& A,
+                        const std::string& label)
+    {
+      using value_type = typename Kokkos::ArithTraits<Scalar>::val_type;
+      using matrix_copy_type =
+        Kokkos::View<value_type**, Kokkos::LayoutLeft>;
+      const size_t numRows(A.extent(0));
+      const size_t numCols(A.extent(1));
+      // No need to initialize; the deep_copy fills the allocation.
+      matrix_copy_type copyOfA
+        (Kokkos::view_alloc(label, Kokkos::WithoutInitializing),
+         numRows, numCols);
+      Kokkos::deep_copy(copyOfA, getHostMatrixView(A));
+      return copyOfA;
+    }
+
+
+    // Return a workspace size that suffices for both factoring an
+    // nrows x ncols matrix (leading dimension lda) and computing its
+    // explicit Q factor in place: the larger of the two query results.
+    template<template<class SC> class LapackType, class Scalar>
+    static int
+    lworkQueryLapackQr(LapackType<Scalar>& lapack,
+                       const int nrows,
+                       const int ncols,
+                       const int lda)
+    {
+      const int factorLwork =
+        lapack.compute_QR_lwork(nrows, ncols, nullptr, lda);
+      // The explicit Q factor has the same dimensions as the input.
+      const int explicitQLwork = lapack.compute_explicit_Q_lwork
+        (nrows, ncols, ncols, nullptr, lda, nullptr);
+      return (factorLwork < explicitQLwork) ? explicitQLwork : factorLwork;
+    }
+
+    // Create the NodeTsqr implementation named by overrideNodeTsqrType
+    // (or by p.nodeTsqrType, if the override is empty), and set its
+    // "Cache Size Hint" parameter to p.cacheSizeHint.
+    template<class SC>
+    Teuchos::RCP<
+      typename ::TSQR::NodeTsqrFactory<SC, int, device_type>::node_tsqr_type
+    >
+    getNodeTsqr(const NodeTestParameters& p,
+                const std::string& overrideNodeTsqrType = "")
+    {
+      using factory_type = ::TSQR::NodeTsqrFactory<SC, int, device_type>;
+
+      // An empty override means "use the type named in p."
+      const std::string requestedType = overrideNodeTsqrType.empty() ?
+        p.nodeTsqrType : overrideNodeTsqrType;
+      auto impl = factory_type::getNodeTsqr(requestedType);
+      TEUCHOS_ASSERT( ! impl.is_null() );
+
+      auto implParams = Teuchos::parameterList("NodeTsqr");
+      implParams->set("Cache Size Hint", p.cacheSizeHint);
+      impl->setParameterList(implParams);
+      return impl;
+    }
+
+    // Print the '%'-prefixed, comma-separated list of output field names.
+    static void
+    printVerifyFieldNames(std::ostream& out)
+    {
+      const char* const fieldNames[] = {
+        "method", "scalarType", "numRows", "numCols", "cacheSizeHint",
+        "contiguousCacheBlocks", "frobA", "absFrobResid", "absFrobOrthog"
+      };
+      const char* separator = "%"; // the prefix precedes the first name
+      for(const char* name : fieldNames) {
+        out << separator << name;
+        separator = ",";
+      }
+      out << std::endl;
+    }
+
+    // Return the file name suffix "_<method>_<T>.txt", where <T> is a
+    // one-letter code for Scalar: "S" for float, "D" for double, "C"
+    // for std::complex<float>, "Z" for std::complex<double>, and "U"
+    // (unknown) for anything else.
+    template<class Scalar>
+    static std::string
+    getFileSuffix(const std::string& method)
+    {
+      const bool isFloat = std::is_same<Scalar, float>::value;
+      const bool isDouble = std::is_same<Scalar, double>::value;
+      const bool isComplexFloat =
+        std::is_same<Scalar, std::complex<float>>::value;
+      const bool isComplexDouble =
+        std::is_same<Scalar, std::complex<double>>::value;
+
+      const char* const scalarCode =
+        isFloat ? "S" :
+        isDouble ? "D" :
+        isComplexFloat ? "C" :
+        isComplexDouble ? "Z" : "U";
+
+      return "_" + method + "_" + scalarCode + ".txt";
+    }
+
+    // Test the accuracy of the NodeTsqr implementation named by
+    // params.nodeTsqrType, for the given Scalar type, on a
+    // params.numRows by params.numCols test matrix generated from
+    // iseed.  On return, iseed holds the seed for the next test.
+    // Print the results to out (one line, or a human-readable block
+    // if params.humanReadable), and diagnostics to std::cerr if
+    // params.verbose.  Return true if cache blocking (when tested)
+    // round-trips and both the relative residual and the
+    // orthogonality error are within their bounds, else false.
+    template<class Scalar>
+    static bool
+    verifyNodeTsqrTmpl(std::ostream& out,
+                       std::vector<int>& iseed,
+                       const NodeTestParameters& params)
+    {
+      using Teuchos::TypeNameTraits;
+      using std::cerr;
+      using std::endl;
+      using STS = Teuchos::ScalarTraits<Scalar>;
+      using mag_type = typename STS::magnitudeType;
+      using STM = Teuchos::ScalarTraits<mag_type>;
+      const bool verbose = params.verbose;
+      const std::string scalarType = TypeNameTraits<Scalar>::name();
+      const std::string fileSuffix =
+        getFileSuffix<Scalar>(params.nodeTsqrType);
+      if(verbose) {
+        cerr << "Test NodeTsqr with Scalar=" << scalarType << endl;
+      }
+
+      bool success = true;
+
+      const int nrows = params.numRows;
+      const int ncols = params.numCols;
+
+      Matrix<int, Scalar> A(nrows, ncols);
+      Matrix<int, Scalar> A_copy(nrows, ncols);
+      Matrix<int, Scalar> Q(nrows, ncols);
+      Matrix<int, Scalar> R(ncols, ncols);
+      if(std::numeric_limits<Scalar>::has_quiet_NaN) {
+        deep_copy(A, std::numeric_limits<Scalar>::quiet_NaN());
+        deep_copy(A_copy, std::numeric_limits<Scalar>::quiet_NaN());
+        deep_copy(Q, std::numeric_limits<Scalar>::quiet_NaN());
+        deep_copy(R, std::numeric_limits<Scalar>::quiet_NaN());
+      }
+      const int lda = nrows;
+      const int ldq = nrows;
+      const int ldr = ncols;
+
+      if(verbose) {
+        cerr << "-- Create test problem" << endl;
+      }
+      {
+        TSQR::Random::NormalGenerator<int, Scalar> gen(iseed);
+        nodeTestProblem(gen, nrows, ncols, A.data(), A.stride(1), true);
+        gen.getSeed(iseed); // fetch seed for the next test
+      }
+
+      if(params.saveMatrices) {
+        std::string filename = std::string("A") + fileSuffix;
+        if(verbose) {
+          cerr << "-- Save A to \"" << filename << "\"" << endl;
+        }
+        std::ofstream fileOut(filename.c_str());
+        print_local_matrix(fileOut, nrows, ncols, A.data(), A.stride(1));
+        fileOut.close();
+      }
+
+      auto nodeTsqrPtr = getNodeTsqr<Scalar>(params);
+      auto& actor = *nodeTsqrPtr;
+      if(verbose && actor.wants_device_memory()) {
+        cerr << "-- NodeTsqr claims to want device memory" << endl;
+      }
+
+      using IST = typename Kokkos::ArithTraits<Scalar>::val_type;
+      using device_matrix_type = Kokkos::View<IST**, Kokkos::LayoutLeft>;
+
+      auto A_h = getHostMatrixView(A.view());
+      auto A_copy_h = getHostMatrixView(A_copy.view());
+      auto Q_h = getHostMatrixView(Q.view());
+      device_matrix_type A_d;
+      device_matrix_type A_copy_d;
+      device_matrix_type Q_d;
+      if(actor.wants_device_memory()) {
+        A_d = getDeviceMatrixCopy(A.view(), "A_d");
+        // Don't copy A_copy yet; see below.
+        A_copy_d = device_matrix_type("A_copy_d", nrows, ncols);
+        Q_d = device_matrix_type("Q_d", nrows, ncols);
+      }
+
+      if(! params.contiguousCacheBlocks) {
+        if(verbose) {
+          cerr << "-- Copy A into A_copy" << endl;
+        }
+        deep_copy(A_copy, A);
+        if(actor.wants_device_memory()) {
+          deep_copy(A_copy_d, A_d);
+        }
+      }
+      else {
+        if(verbose) {
+          cerr << "-- Copy A into A_copy via cache_block" << endl;
+        }
+        if(actor.wants_device_memory()) {
+          Scalar* A_copy_d_raw =
+            reinterpret_cast<Scalar*>(A_copy_d.data());
+          const Scalar* A_d_raw =
+            reinterpret_cast<const Scalar*>(A_d.data());
+          actor.cache_block(nrows, ncols, A_copy_d_raw,
+                            A_d_raw, A_d.stride(1));
+          Kokkos::deep_copy(A_copy_h, A_copy_d);
+        }
+        else {
+          actor.cache_block(nrows, ncols, A_copy.data(),
+                            A.data(), A.stride(1));
+        }
+        if(verbose) {
+          cerr << "-- Verify cache_block result" << endl;
+        }
+
+        Matrix<int, Scalar> A2(nrows, ncols);
+        if(std::numeric_limits<Scalar>::has_quiet_NaN) {
+          deep_copy(A2, std::numeric_limits<Scalar>::quiet_NaN());
+        }
+        if(actor.wants_device_memory()) {
+          auto A2_h = getHostMatrixView(A2.view());
+          auto A2_d = getDeviceMatrixCopy(A2.view(), "A2_d");
+          Scalar* A2_d_raw = reinterpret_cast<Scalar*>(A2_d.data());
+          const Scalar* A_copy_d_raw =
+            reinterpret_cast<const Scalar*>(A_copy_d.data());
+          actor.un_cache_block(nrows, ncols, A2_d_raw,
+                               A2_d.stride(1), A_copy_d_raw);
+          Kokkos::deep_copy(A2_h, A2_d);
+        }
+        else {
+          actor.un_cache_block(nrows, ncols, A2.data(),
+                               A2.stride(1), A_copy.data());
+        }
+        const bool matrices_equal = matrix_equal(A, A2);
+        if(! matrices_equal) {
+          success = false;
+          if(verbose) {
+            cerr << "*** cache_block failed!" << endl;
+          }
+        }
+      }
+
+      if(verbose) {
+        cerr << "-- Fill R with zeros" << endl;
+      }
+      // We need to fill R with zeros, since the factorization may not
+      // overwrite the strict lower triangle of R.
+      deep_copy(R, Scalar {});
+
+      if(verbose) {
+        cerr << "-- Call NodeTsqr::factor" << endl;
+      }
+      // R is always in host memory, because that's what Belos wants.
+      auto factorOutput = [&]() {
+        if(actor.wants_device_memory()) {
+          Scalar* A_copy_d_raw =
+            reinterpret_cast<Scalar*>(A_copy_d.data());
+          TEUCHOS_ASSERT( nrows == 0 || ncols == 0 ||
+                          A_copy_d_raw != nullptr );
+          TEUCHOS_ASSERT( size_t(A_copy_d.extent(0)) == size_t(nrows) );
+          TEUCHOS_ASSERT( size_t(A_copy_d.extent(1)) == size_t(ncols) );
+          auto result =
+            actor.factor(nrows, ncols, A_copy_d_raw,
+                         A_copy_d.stride(1),
+                         R.data(), R.stride(1),
+                         params.contiguousCacheBlocks);
+          Kokkos::deep_copy(A_copy_h, A_copy_d);
+          return result;
+        }
+        else {
+          return actor.factor(nrows, ncols, A_copy.data(),
+                              A_copy.stride(1),
+                              R.data(), R.stride(1),
+                              params.contiguousCacheBlocks);
+        }
+      }();
+
+      if(params.saveMatrices) {
+        std::string filename = std::string("R") + fileSuffix;
+        if(verbose) {
+          cerr << "-- Save R to \"" << filename << "\"" << endl;
+        }
+        std::ofstream fileOut(filename.c_str());
+        print_local_matrix(fileOut, ncols, ncols, R.data(), R.stride(1));
+        fileOut.close();
+      }
+
+      if(verbose) {
+        cerr << "-- Call NodeTsqr::explicit_Q" << endl;
+      }
+      if(actor.wants_device_memory()) {
+        const Scalar* A_copy_d_raw =
+          reinterpret_cast<const Scalar*>(A_copy_d.data());
+        Scalar* Q_d_raw = reinterpret_cast<Scalar*>(Q_d.data());
+        TEUCHOS_ASSERT( nrows == 0 || ncols == 0 ||
+                        Q_d_raw != nullptr );
+        TEUCHOS_ASSERT( size_t(Q_d.extent(0)) == size_t(nrows) );
+        TEUCHOS_ASSERT( size_t(Q_d.extent(1)) == size_t(ncols) );
+        actor.explicit_Q(nrows, ncols,
+                         A_copy_d_raw, A_copy_d.stride(1),
+                         *factorOutput, ncols,
+                         Q_d_raw, Q_d.stride(1),
+                         params.contiguousCacheBlocks);
+        // We copy back to Q_h below, either with un_cache_block (if
+        // contiguous cache blocks) or directly (if not).
+      }
+      else {
+        actor.explicit_Q(nrows, ncols,
+                         A_copy.data(), A_copy.stride(1),
+                         *factorOutput, ncols,
+                         Q.data(), Q.stride(1),
+                         params.contiguousCacheBlocks);
+      }
+
+      // "Un"-cache-block the output, if contiguous cache blocks were
+      // used.  This is only necessary because local_verify() doesn't
+      // currently support contiguous cache blocks.
+      if(params.contiguousCacheBlocks) {
+        // Use A_copy as temporary storage for un-cache-blocking Q.
+        if(verbose) {
+          cerr << "-- Call NodeTsqr::un_cache_block" << endl;
+        }
+        if(actor.wants_device_memory()) {
+          Scalar* A_copy_d_raw =
+            reinterpret_cast<Scalar*>(A_copy_d.data());
+          const Scalar* Q_d_raw =
+            reinterpret_cast<const Scalar*>(Q_d.data());
+          actor.un_cache_block(nrows, ncols, A_copy_d_raw,
+                               A_copy_d.stride(1), Q_d_raw);
+          Kokkos::deep_copy(Q_h, A_copy_d);
+        }
+        else {
+          actor.un_cache_block(nrows, ncols, A_copy.data(),
+                               A_copy.stride(1), Q.data());
+          deep_copy(Q, A_copy);
+        }
+      }
+      else {
+        if(actor.wants_device_memory()) {
+          Kokkos::deep_copy(Q_h, Q_d);
+        }
+      }
+
+      if(params.saveMatrices) {
+        std::string filename = std::string("Q") + fileSuffix;
+        if(verbose) {
+          cerr << "-- Save Q to \"" << filename << "\"" << endl;
+        }
+        std::ofstream fileOut(filename.c_str());
+        print_local_matrix(fileOut, nrows, ncols, Q.data(), Q.stride(1));
+        fileOut.close();
+      }
+
+      if(verbose) {
+        cerr << "-- Call local_verify to validate the factorization" << endl;
+      }
+      auto results = local_verify(nrows, ncols, A.data(), lda,
+                                  Q.data(), ldq, R.data(), ldr);
+
+      if(verbose) {
+        cerr << "-- Compute accuracy bounds and check" << endl;
+      }
+
+      // Accuracy relates to the number of floating-point operations,
+      // which in turn is a function of the matrix's dimensions.
+      // Avoid overflow of the local Ordinal type, by casting first to
+      // a floating-point type.
+      const mag_type dimsProd = mag_type(nrows) * mag_type(ncols) *
+        mag_type(ncols);
+      const mag_type fudgeFactor(10.0);
+      // Relative residual error is ||A-Q*R|| / ||A||, or just
+      // ||A-Q*R|| if ||A|| == 0.  (The result had better be zero in
+      // the latter case.)  Square root of the matrix dimensions is an
+      // old heuristic from Wilkinson or perhaps even an earlier
+      // source.  We include a "fudge factor" so that the test won't
+      // fail unless there is a really good reason.
+      const mag_type relResidBound = fudgeFactor *
+        STM::squareroot(dimsProd) * STS::eps();
+
+      // Relative residual error; avoid division by zero.
+      const mag_type relResidError = results[0] /
+        (results[2] == STM::zero() ? STM::one() : results[2]);
+
+      if(relResidError > relResidBound) {
+        success = false;
+        if(verbose) {
+          // relResidError is divided by ||A||_F only when ||A||_F is
+          // nonzero, so only then label it as a relative residual.
+          const std::string relResStr
+            (results[2] == STM::zero() ? "" : " / ||A||_F");
+          cerr << "*** For NodeTsqr=" << params.nodeTsqrType
+               << " with Scalar=" << scalarType << ": "
+               << "Residual ||A - QR||_F" << relResStr
+               << " = " << relResidError << " > bound "
+               << relResidBound << "." << endl;
+        }
+      }
+
+      // Orthogonality of the matrix should not depend on the matrix
+      // dimensions, if we measure in the 2-norm.  However, we are
+      // measuring in the Frobenius norm, so it's appropriate to
+      // multiply eps by the number of entries in the matrix for which
+      // we compute the Frobenius norm.  We include a "fudge factor"
+      // for the same reason as mentioned above.
+      const mag_type orthoBound = fudgeFactor *
+        mag_type(ncols) * mag_type(ncols) * STS::eps();
+
+      const mag_type orthoError = results[1];
+      if(orthoError > orthoBound) {
+        success = false;
+        if(verbose) {
+          cerr << "*** For NodeTsqr=" << params.nodeTsqrType
+               << " with Scalar=" << scalarType << ": "
+               << "Orthogonality ||I - Q^* Q||_F = " << orthoError
+               << " > bound " << orthoBound << "." << endl;
+        }
+      }
+
+      if(params.humanReadable) {
+        out << "NodeTsqr subclass: " << params.nodeTsqrType
+            << endl
+            << "  - Scalar type: " << scalarType << endl
+            << "  - Matrix dimensions: " << nrows << " by " << ncols
+            << endl
+            << "  - Cache Size Hint: " << params.cacheSizeHint
+            << endl
+            << "  - Contiguous cache blocks: "
+            << (params.contiguousCacheBlocks ? "true" : "false")
+            << endl
+            << "  - Input matrix norm $\\| A \\|_F$: " << results[2]
+            << endl
+            << "  - Residual $\\| A - QR \\|_F$: " << results[0]
+            << endl
+            << "  - Orthogonality $\\| I - Q^* Q \\|_F$: "
+            << results[1] << endl
+            << endl;
+      }
+      else {
+        out << params.nodeTsqrType
+            << "," << scalarType
+            << "," << nrows
+            << "," << ncols
+            << "," << params.cacheSizeHint
+            << ","
+            << (params.contiguousCacheBlocks ? "true" : "false")
+            << "," << results[2]
+            << "," << results[0]
+            << "," << results[1];
+        out << endl;
+      }
+      return success;
+    }
+
+    // Run verifyNodeTsqrTmpl for float and double (if p.testReal) and
+    // for the two std::complex types (if p.testComplex), printing
+    // results to out.  Return whether all of those runs succeeded.
+    bool
+    verifyNodeTsqr(std::ostream& out,
+                   const NodeTestParameters& p)
+    {
+      // Each run starts from the seed that the previous run left
+      // behind, so the runs use independent pseudorandom streams.
+      std::vector<int> iseed{0, 0, 0, 1};
+      bool allPassed = true;
+      if(p.testReal) {
+        const bool floatOk = verifyNodeTsqrTmpl<float>(out, iseed, p);
+        const bool doubleOk = verifyNodeTsqrTmpl<double>(out, iseed, p);
+        allPassed = allPassed && floatOk && doubleOk;
+      }
+      if(p.testComplex) {
+#ifdef HAVE_TPETRATSQR_COMPLEX
+        const bool complexFloatOk =
+          verifyNodeTsqrTmpl<std::complex<float>>(out, iseed, p);
+        const bool complexDoubleOk =
+          verifyNodeTsqrTmpl<std::complex<double>>(out, iseed, p);
+        allPassed = allPassed && complexFloatOk && complexDoubleOk;
+#else // HAVE_TPETRATSQR_COMPLEX
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (true, std::logic_error, "TSQR was not built with complex "
+           "arithmetic support.");
+#endif // HAVE_TPETRATSQR_COMPLEX
+      }
+      return allPassed;
+    }
+
+    // Test the accuracy of the given RawQR implementation `lapack`
+    // (labeled lapackImplName in the output) for the given Scalar
+    // type.  Factor a params.numRows by params.numCols test matrix
+    // generated from iseed, compute the explicit Q factor in place,
+    // and print local_verify's results to out.  On return, iseed
+    // holds the seed for the next test.  The factorization runs on
+    // Kokkos::View copies if lapack.wants_device_memory() is true.
+    template<template<class SC> class LapackType, class Scalar>
+    static void
+    verifyLapackTmpl(std::ostream& out,
+                     std::vector<int>& iseed,
+                     LapackType<Scalar>& lapack,
+                     const NodeTestParameters& params,
+                     const std::string& lapackImplName)
+    {
+      using std::cerr;
+      using std::endl;
+      using STS = Teuchos::ScalarTraits<Scalar>;
+      using mag_type = typename STS::magnitudeType;
+      const bool verbose = params.verbose;
+
+      const std::string scalarType = Teuchos::TypeNameTraits<Scalar>::name();
+      const std::string fileSuffix = getFileSuffix<Scalar>("Lapack");
+
+      if(verbose) {
+        cerr << "Test RawQR<" << scalarType << "> implementation "
+             << lapackImplName << " whose type is "
+             << Teuchos::typeName(lapack) << endl;
+        if(lapack.wants_device_memory()) {
+          cerr << "-- RawQR subclass claims to want device memory" << endl;
+        }
+      }
+      const int nrows = params.numRows;
+      const int ncols = params.numCols;
+
+      Matrix<int, Scalar> A(nrows, ncols);
+      Matrix<int, Scalar> A_copy(nrows, ncols);
+      Matrix<int, Scalar> Q(nrows, ncols);
+      Matrix<int, Scalar> R(ncols, ncols);
+      if(std::numeric_limits<Scalar>::has_quiet_NaN) {
+        deep_copy(A, std::numeric_limits< Scalar>::quiet_NaN());
+        deep_copy(A_copy, std::numeric_limits<Scalar>::quiet_NaN());
+        deep_copy(Q, std::numeric_limits<Scalar>::quiet_NaN());
+        deep_copy(R, std::numeric_limits<Scalar>::quiet_NaN());
+      }
+      const int lda = nrows;
+      const int ldq = nrows;
+      const int ldr = ncols;
+
+      if(verbose) {
+        cerr << "-- Create test problem" << endl;
+      }
+      {
+        TSQR::Random::NormalGenerator<int, Scalar> gen(iseed);
+        nodeTestProblem(gen, nrows, ncols, A.data(), A.stride(1), true);
+        gen.getSeed(iseed); // fetch seed for the next test
+      }
+
+      if(params.saveMatrices) {
+        std::string filename = std::string("A") + fileSuffix;
+        if(verbose) {
+          cerr << "-- Save A to \"" << filename << "\"" << endl;
+        }
+        std::ofstream fileOut(filename.c_str());
+        print_local_matrix(fileOut, nrows, ncols, A.data(), A.stride(1));
+        fileOut.close();
+      }
+
+      using IST = typename Kokkos::ArithTraits<Scalar>::val_type;
+      using device_matrix_type = Kokkos::View<IST**, Kokkos::LayoutLeft>;
+
+      auto A_h = getHostMatrixView(A.view());
+      auto A_copy_h = getHostMatrixView(A_copy.view());
+      auto Q_h = getHostMatrixView(Q.view());
+      device_matrix_type A_d;
+      device_matrix_type A_copy_d;
+      device_matrix_type Q_d;
+      if(lapack.wants_device_memory()) {
+        A_d = getDeviceMatrixCopy(A.view(), "A_d");
+        // Don't copy A_copy yet; see below.
+        A_copy_d = device_matrix_type("A_copy_d", nrows, ncols);
+        Q_d = device_matrix_type("Q_d", nrows, ncols);
+      }
+
+      if(verbose) {
+        cerr << "-- Copy A into A_copy" << endl;
+      }
+      deep_copy(A_copy, A);
+      if(lapack.wants_device_memory()) {
+        deep_copy(A_copy_d, A_d);
+      }
+
+      if(verbose) {
+        cerr << "-- Fill R with zeros" << endl;
+      }
+      // We need to do this because the factorization may not
+      // overwrite the strict lower triangle of R.  R is always in
+      // host memory.
+      deep_copy(R, Scalar {});
+
+      if(verbose) {
+        cerr << "-- Do LAPACK lwork query" << endl;
+      }
+      const int lwork_geqrf = [&]() {
+        if(lapack.wants_device_memory()) {
+          Scalar* A_copy_d_raw =
+            reinterpret_cast<Scalar*>(A_copy_d.data());
+          const int A_copy_d_lda(A_copy_d.stride(1));
+          TEUCHOS_ASSERT( nrows == 0 || ncols == 0 ||
+                          A_copy_d_raw != nullptr );
+          TEUCHOS_ASSERT( size_t(A_copy_d.extent(0)) == size_t(nrows) );
+          TEUCHOS_ASSERT( size_t(A_copy_d.extent(1)) == size_t(ncols) );
+          return lapack.compute_QR_lwork(nrows, ncols, A_copy_d_raw,
+                                         A_copy_d_lda);
+        }
+        else {
+          Scalar* A_copy_raw = A_copy.data();
+          const int A_copy_lda(A_copy.stride(1));
+          return lapack.compute_QR_lwork(nrows, ncols, A_copy_raw,
+                                         A_copy_lda);
+        }
+      }();
+      // The same workspace is reused below for compute_explicit_Q, so
+      // it must be large enough for that call as well.
+      const int lwork_ungqr = lapack.compute_explicit_Q_lwork
+        (nrows, ncols, ncols, nullptr, ldq, nullptr);
+      const int lwork = std::max(lwork_geqrf, lwork_ungqr);
+      if(verbose) {
+        cerr << "-- lwork=" << lwork << endl;
+      }
+      std::vector<Scalar> work(lwork);
+      std::vector<Scalar> tau(ncols);
+
+      Kokkos::View<IST*> work_d;
+      Kokkos::View<IST*> tau_d;
+      if(lapack.wants_device_memory()) {
+        work_d = Kokkos::View<IST*>("work_d", lwork);
+        tau_d = Kokkos::View<IST*>("tau_d", ncols);
+      }
+
+      if(verbose) {
+        cerr << "-- Call compute_QR" << endl;
+      }
+
+      if(lapack.wants_device_memory()) {
+        Scalar* A_copy_d_raw =
+          reinterpret_cast<Scalar*>(A_copy_d.data());
+        Scalar* tau_d_raw = reinterpret_cast<Scalar*>(tau_d.data());
+        Scalar* work_d_raw =
+          reinterpret_cast<Scalar*>(work_d.data());
+        TEUCHOS_ASSERT( ncols == 0 || tau_d_raw != nullptr );
+        TEUCHOS_ASSERT( size_t(tau_d.extent(0)) >= size_t(ncols) );
+        TEUCHOS_ASSERT( lwork == 0 || work_d_raw != nullptr );
+        TEUCHOS_ASSERT( size_t(work_d.extent(0)) >= size_t(lwork) );
+        TEUCHOS_ASSERT( nrows == 0 || ncols == 0 ||
+                        A_copy_d_raw != nullptr );
+        TEUCHOS_ASSERT( size_t(A_copy_d.extent(0)) == size_t(nrows) );
+        TEUCHOS_ASSERT( size_t(A_copy_d.extent(1)) == size_t(ncols) );
+        lapack.compute_QR(nrows, ncols, A_copy_d_raw,
+                          A_copy_d.stride(1), tau_d_raw,
+                          work_d_raw, lwork);
+        Kokkos::deep_copy(A_copy_h, A_copy_d);
+      }
+      else {
+        lapack.compute_QR(nrows, ncols, A_copy.data(),
+                          A_copy.stride(1), tau.data(),
+                          work.data(), lwork);
+      }
+
+      if(verbose) {
+        cerr << "-- Copy R out of in-place result" << endl;
+      }
+      copy_upper_triangle(R, A_copy);
+      if(params.saveMatrices) {
+        std::string filename = std::string("R") + fileSuffix;
+        if(verbose) {
+          cerr << "-- Save R to \"" << filename << "\"" << endl;
+        }
+        std::ofstream fileOut(filename.c_str());
+        print_local_matrix(fileOut, ncols, ncols, R.data(), R.stride(1));
+        fileOut.close();
+      }
+
+      // The explicit Q factor will be computed in place, so copy the
+      // result of the factorization into Q.
+      deep_copy(Q, A_copy);
+      if(lapack.wants_device_memory()) {
+        deep_copy(Q_d, A_copy_d);
+      }
+
+      if(verbose) {
+        cerr << "-- Call Lapack::compute_explicit_Q" << endl;
+      }
+      if(lapack.wants_device_memory()) {
+        Scalar* Q_d_raw = reinterpret_cast<Scalar*>(Q_d.data());
+        const Scalar* tau_d_raw =
+          reinterpret_cast<const Scalar*>(tau_d.data());
+        Scalar* work_d_raw =
+          reinterpret_cast<Scalar*>(work_d.data());
+        lapack.compute_explicit_Q(nrows, ncols, ncols,
+                                  Q_d_raw, ldq, tau_d_raw,
+                                  work_d_raw, lwork);
+        deep_copy(Q_h, Q_d);
+      }
+      else {
+        lapack.compute_explicit_Q(nrows, ncols, ncols,
+                                  Q.data(), ldq, tau.data(),
+                                  work.data(), lwork);
+      }
+
+      if(params.saveMatrices) {
+        std::string filename = std::string("Q") + fileSuffix;
+        if(verbose) {
+          cerr << "-- Save Q to \"" << filename << "\"" << endl;
+        }
+        std::ofstream fileOut(filename.c_str());
+        print_local_matrix(fileOut, nrows, ncols, Q.data(), Q.stride(1));
+        fileOut.close();
+      }
+
+      if(verbose) {
+        cerr << "-- Call local_verify to validate the factorization" << endl;
+      }
+      auto results = local_verify(nrows, ncols, A.data(), lda,
+                                  Q.data(), ldq, R.data(), ldr);
+
+      if(params.humanReadable) {
+        out << lapackImplName << ":" << endl
+            << "  - Scalar type: " << scalarType << endl
+            << "  - Matrix dimensions: " << nrows << " by " << ncols
+            << endl
+            << "  - Matrix norm $\\| A \\|_F$: "
+            << results[2] << endl
+            << "  - Residual $\\| A - QR \\|_F$: "
+            << results[0] << endl
+            << "  - Orthogonality $\\| I - Q^* Q \\|_F$: "
+            << results[1] << endl
+            << endl;
+      }
+      else {
+        out << lapackImplName
+            << "," << scalarType
+            << "," << nrows
+            << "," << ncols
+            << ",0"     // cacheSizeHint
+            << ",false" // contiguousCacheBlocks
+            << "," << results[2]
+            << "," << results[0]
+            << "," << results[1];
+        out << endl;
+      }
+    }
+
+    // Run verifyLapackTmpl with CuSolver (only if both CUBLAS and
+    // CUSOLVER are enabled) and then with Lapack, for the given
+    // Scalar type.
+    template<class Scalar>
+    void
+    verifyLapackImplementations(std::ostream& out,
+                                std::vector<int>& iseed,
+                                const NodeTestParameters& p)
+    {
+#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER)
+      {
+        // Use a copy, so Lapack below starts from the same seed.
+        std::vector<int> cuSolverSeed(iseed);
+        auto cuSolverHandle = Impl::CuSolverHandle::getSingleton();
+        Kokkos::View<int> info("info");
+        Impl::CuSolver<Scalar> cuSolver(cuSolverHandle, info.data());
+        verifyLapackTmpl(out, cuSolverSeed, cuSolver, p, "CUSOLVER");
+      }
+#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER
+      Impl::Lapack<Scalar> hostLapack;
+      verifyLapackTmpl(out, iseed, hostLapack, p, "LAPACK");
+    }
+
+    // Run verifyLapackImplementations for float and double (if
+    // p.testReal) and for the two std::complex types (if
+    // p.testComplex), printing results to out.
+    void
+    verifyLapack(std::ostream& out,
+                 const NodeTestParameters& p)
+    {
+      // Shared seed, passed by reference from one run to the next.
+      std::vector<int> iseed{0, 0, 0, 1};
+
+      if(p.testReal) {
+        verifyLapackImplementations<float>(out, iseed, p);
+        verifyLapackImplementations<double>(out, iseed, p);
+      }
+      if(p.testComplex) {
+#ifdef HAVE_TPETRATSQR_COMPLEX
+        verifyLapackImplementations<std::complex<float>>(out, iseed, p);
+        verifyLapackImplementations<std::complex<double>>(out, iseed, p);
+#else // HAVE_TPETRATSQR_COMPLEX
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (true, std::logic_error, "TSQR was not built with complex "
+           "arithmetic support.");
+#endif // HAVE_TPETRATSQR_COMPLEX
+      }
+    }
+
+    // Print the '%'-prefixed, comma-separated list of benchmark
+    // output field names as a single line.
+    static void
+    printBenchmarkFieldNames(std::ostream& out)
+    {
+      const char prefix = '%';
+      // Adjacent string literals below form one string.
+      const char fieldNames[] =
+        "method,scalarType,numRows,numCols,cacheSizeHint,"
+        "contiguousCacheBlocks,numTrials,timing";
+
+      out << prefix << fieldNames << std::endl;
+    }
+
+    template<template<class SC> class LapackType, class Scalar>
+    void
+    benchmarkLapackTmpl(std::ostream& out, // benchmark results go here
+                        std::vector<int>& iseed, // in/out: PRNG seed
+                        LapackType<Scalar>& lapack, // QR kernels to time
+                        const NodeTestParameters& params,
+                        const std::string& lapackImplName) // output label
+    {
+      using std::endl;
+      // Time params.numTrials rounds of compute_QR + compute_explicit_Q.
+      const int numRows = params.numRows;
+      const int numCols = params.numCols;
+      const int numTrials = params.numTrials;
+
+      Matrix<int, Scalar> A(numRows, numCols);
+      Matrix<int, Scalar> Q(numRows, numCols);
+      Matrix<int, Scalar> R(numCols, numCols);
+      const int lda = numRows;
+      const int ldq = numRows;
+      // Generate A, then let gen.getSeed update iseed for the caller.
+      {
+        using prng_type = TSQR::Random::NormalGenerator<int, Scalar>;
+        prng_type gen(iseed);
+        nodeTestProblem(gen, numRows, numCols, A.data(), lda, false);
+        gen.getSeed(iseed);
+      }
+
+      using IST = typename Kokkos::ArithTraits<Scalar>::val_type;
+      using device_matrix_type =
+        Kokkos::View<IST**, Kokkos::LayoutLeft>;
+
+      auto A_h = getHostMatrixView(A.view()); // NOTE(review): unused below
+      auto Q_h = getHostMatrixView(Q.view()); // NOTE(review): unused below
+      device_matrix_type A_d;
+      device_matrix_type Q_d;
+      if(lapack.wants_device_memory()) {
+        A_d = getDeviceMatrixCopy(A.view(), "A_d");
+        Q_d = device_matrix_type("Q_d", numRows, numCols);
+      }
+
+      // Copy A into Q, since LAPACK QR overwrites the input.  We only
+      // need Q because LAPACK's computation of the explicit Q factor
+      // occurs in place.  This doesn't work with TSQR.  To give
+      // LAPACK QR the fullest possible advantage over TSQR, we don't
+      // allocate an A_copy here (as we would when benchmarking TSQR).
+      deep_copy(Q, A);
+      if(lapack.wants_device_memory()) {
+        deep_copy(Q_d, A_d);
+      }
+
+      // Determine the required workspace for the factorization
+      const int lwork =
+        lworkQueryLapackQr(lapack, numRows, numCols, lda);
+      std::vector<Scalar> work(lwork);
+      std::vector<Scalar> tau(numCols);
+
+      Kokkos::View<IST*> work_d;
+      Kokkos::View<IST*> tau_d;
+      if(lapack.wants_device_memory()) {
+        work_d = Kokkos::View<IST*>("work_d", lwork);
+        tau_d = Kokkos::View<IST*>("tau_d", numCols);
+      }
+      // NOTE: Q (or Q_d) is not reset from A between trials.
+      // Benchmark LAPACK's QR factorization for numTrials trials.
+      Teuchos::Time timer("LAPACK");
+      timer.start();
+      for(int trialNum = 0; trialNum < numTrials; ++trialNum) {
+        if(lapack.wants_device_memory()) {
+          Scalar* Q_raw = reinterpret_cast<Scalar*>(Q_d.data());
+          Scalar* tau_raw = reinterpret_cast<Scalar*>(tau_d.data());
+          Scalar* work_raw =
+            reinterpret_cast<Scalar*>(work_d.data());
+          lapack.compute_QR(numRows, numCols,
+                            Q_raw, Q_d.stride(1),
+                            tau_raw, work_raw, lwork);
+        }
+        else {
+          lapack.compute_QR(numRows, numCols,
+                            Q.data(), ldq,
+                            tau.data(), work.data(), lwork);
+        }
+
+        if(lapack.wants_device_memory()) {
+          // FIXME (mfh 18 Dec 2019) We should actually extract the
+          // upper triangle here and copy it to host, to get a fair
+          // comparison with TSQR.
+
+          Scalar* Q_raw = reinterpret_cast<Scalar*>(Q_d.data());
+          const Scalar* tau_raw =
+            reinterpret_cast<const Scalar*>(tau_d.data());
+          Scalar* work_raw =
+            reinterpret_cast<Scalar*>(work_d.data());
+          lapack.compute_explicit_Q(numRows, numCols, numCols,
+                                    Q_raw, Q_d.stride(1),
+                                    tau_raw, work_raw, lwork);
+        }
+        else {
+          // Extract the upper triangular factor R from Q (where it was
+          // computed in place by GEQRF), since UNGQR will overwrite all
+          // of Q with the explicit Q factor.
+          copy_upper_triangle(R, Q);
+          lapack.compute_explicit_Q(numRows, numCols, numCols,
+                                    Q.data(), ldq, tau.data(),
+                                    work.data(), lwork);
+        }
+      }
+      const double lapackTiming = timer.stop();
+      // NOTE(review): no fence before stop(); is the device path synchronous?
+      const std::string scalarType =
+        Teuchos::TypeNameTraits<Scalar>::name();
+
+      if(params.humanReadable) {
+        out << lapackImplName << ":" << endl
+            << "  Scalar: " << scalarType << endl
+            << "  numRows: " << numRows << endl
+            << "  numCols: " << numCols << endl
+            << "  numTrials: " << numTrials << endl
+            << "Total time (s) = " << lapackTiming << endl
+            << endl;
+      }
+      else {
+        // "0" refers to the cache size hint, which is not applicable
+        // in this case; we retain it for easy comparison of results
+        // with NodeTsqr (so that the number of fields is the same in
+        // both cases).  "false" (that follows 0) refers to whether or
+        // not contiguous cache blocks were used (see TSQR::NodeTsqr);
+        // this is also not applicable here.
+        out << lapackImplName
+            << "," << scalarType
+            << "," << numRows
+            << "," << numCols
+            << ",0"
+            << ",false"
+            << "," << numTrials
+            << "," << lapackTiming << endl;
+      }
+    }
+
+    // Benchmark every available LAPACK-like implementation (CUSOLVER
+    // if enabled, then LAPACK) on the same test problem for Scalar.
+    template<class Scalar>
+    void
+    benchmarkLapackImplementations(std::ostream& out,
+                                   std::vector<int>& iseed,
+                                   const NodeTestParameters& p)
+    {
+#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER)
+      {
+        // CuSolver runs on a private copy of the seed, leaving iseed
+        // untouched so that Lapack starts from the same seed.
+        auto cuSolverSeed = iseed;
+        auto cuHandle = Impl::CuSolverHandle::getSingleton();
+        Kokkos::View<int> cuInfo("info");
+        Impl::CuSolver<Scalar> cuSolver(cuHandle, cuInfo.data());
+        benchmarkLapackTmpl(out, cuSolverSeed, cuSolver, p, "CUSOLVER");
+      }
+#endif // HAVE_TPETRATSQR_CUBLAS && HAVE_TPETRATSQR_CUSOLVER
+      Impl::Lapack<Scalar> hostLapack;
+      benchmarkLapackTmpl(out, iseed, hostLapack, p, "LAPACK");
+    }
+
+    // Benchmark the LAPACK implementations for each requested Scalar type.
+    void
+    benchmarkLapack(std::ostream& out, const NodeTestParameters& p)
+    {
+      std::vector<int> seed{0, 0, 0, 1};
+      if(p.testReal) {
+        benchmarkLapackImplementations<float>(out, seed, p);
+        benchmarkLapackImplementations<double>(out, seed, p);
+      }
+      if(p.testComplex) {
+#ifdef HAVE_TPETRATSQR_COMPLEX
+        benchmarkLapackImplementations<std::complex<float>>(out, seed, p);
+        benchmarkLapackImplementations<std::complex<double>>(out, seed, p);
+#else // Don't HAVE_TPETRATSQR_COMPLEX
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (true, std::logic_error,
+           "TSQR was not built with complex arithmetic support.");
+#endif // HAVE_TPETRATSQR_COMPLEX
+      }
+    }
+
+    template<class Scalar>
+    void
+    benchmarkNodeTsqrTmpl(std::ostream& out, // benchmark results go here
+                          std::vector<int>& iseed, // in/out: PRNG seed
+                          NodeTsqr<int, Scalar>& actor, // what to time
+                          const NodeTestParameters& params,
+                          const std::string& nodeTsqrType) // output label
+    {
+      using std::endl;
+      // Time params.numTrials rounds of actor.factor + actor.explicit_Q.
+      const int numRows = params.numRows;
+      const int numCols = params.numCols;
+      const int numTrials = params.numTrials;
+      const bool contiguousCacheBlocks =
+        params.contiguousCacheBlocks;
+
+      Matrix<int, Scalar> A(numRows, numCols);
+      Matrix<int, Scalar> A_copy(numRows, numCols);
+      Matrix<int, Scalar> Q(numRows, numCols);
+      Matrix<int, Scalar> R(numCols, numCols);
+
+      {
+        using prng_type = TSQR::Random::NormalGenerator<int, Scalar>;
+        prng_type gen(iseed);
+        nodeTestProblem(gen, numRows, numCols,
+                        A.data(), A.stride(1), false);
+        gen.getSeed(iseed);
+      }
+      deep_copy(A_copy, A); // need copy since TSQR overwrites
+
+      using IST = typename Kokkos::ArithTraits<Scalar>::val_type;
+      using device_matrix_type =
+        Kokkos::View<IST**, Kokkos::LayoutLeft>;
+
+      auto A_copy_h = getHostMatrixView(A_copy.view()); // NOTE(review): unused
+      auto Q_h = getHostMatrixView(Q.view()); // NOTE(review): unused below
+      device_matrix_type A_copy_d;
+      device_matrix_type Q_d;
+      if(actor.wants_device_memory()) {
+        A_copy_d = getDeviceMatrixCopy(A_copy.view(), "A_copy_d");
+        Q_d = device_matrix_type("Q_d", numRows, numCols);
+      }
+      // NOTE: A_copy (or A_copy_d) is not reset from A between trials.
+      // Benchmark the given NodeTsqr implementation for numTrials trials.
+      Teuchos::Time timer("NodeTsqr");
+      timer.start();
+      for(int trialNum = 0; trialNum < numTrials; ++trialNum) {
+        if(actor.wants_device_memory()) {
+          Scalar* A_raw =
+            reinterpret_cast<Scalar*>(A_copy_d.data());
+          auto factorOutput =
+            actor.factor(numRows, numCols,
+                         A_raw, A_copy_d.stride(1),
+                         R.data(), R.stride(1),
+                         contiguousCacheBlocks);
+          // Unlike with LAPACK, this doesn't happen in place: the
+          // implicit Q factor is stored in A_copy_d, and the explicit
+          // Q factor is written to Q_d.
+          Scalar* Q_raw = reinterpret_cast<Scalar*>(Q_d.data());
+          actor.explicit_Q(numRows, numCols,
+                           A_raw, A_copy_d.stride(1),
+                           *factorOutput, numCols,
+                           Q_raw, Q_d.stride(1),
+                           contiguousCacheBlocks);
+        }
+        else {
+          Scalar* A_raw = A_copy.data();
+          auto factorOutput =
+            actor.factor(numRows, numCols,
+                         A_raw, A_copy.stride(1),
+                         R.data(), R.stride(1),
+                         contiguousCacheBlocks);
+          // Unlike with LAPACK, this doesn't happen in place: the
+          // implicit Q factor is stored in A_copy, and the explicit Q
+          // factor is written to Q.
+          Scalar* Q_raw = Q.data();
+          actor.explicit_Q(numRows, numCols,
+                           A_raw, A_copy.stride(1),
+                           *factorOutput, numCols,
+                           Q_raw, Q.stride(1),
+                           contiguousCacheBlocks);
+        }
+      }
+      const double nodeTsqrTiming = timer.stop();
+      // NOTE(review): no fence before stop(); is the device path synchronous?
+      const std::string scalarType =
+        Teuchos::TypeNameTraits<Scalar>::name();
+
+      if(params.humanReadable) {
+        out << "NodeTsqr:" << endl
+            << "  Implementation: " << nodeTsqrType << endl
+            << "  Scalar: " << scalarType << endl
+            << "  numRows: " << numRows << endl
+            << "  numCols: " << numCols << endl
+            << "  cache size hint (bytes): "
+            << params.cacheSizeHint << endl
+            << "  contiguous cache blocks? "
+            << (contiguousCacheBlocks ? "true" : "false") << endl
+            << "  # trials = " << numTrials << endl
+            << "Total time (s) = " << nodeTsqrTiming << endl
+            << endl;
+      }
+      else {
+        out << nodeTsqrType
+            << "," << scalarType
+            << "," << numRows
+            << "," << numCols
+            << "," << params.cacheSizeHint
+            << "," << (contiguousCacheBlocks ? "true" : "false")
+            << "," << numTrials
+            << "," << nodeTsqrTiming << endl;
+      }
+    }
+
+    // Benchmark one NodeTsqr implementation.  If nodeTsqrType == "",
+    // use p.nodeTsqrType, both to create it and to label the output.
+    template<class Scalar>
+    void
+    benchmarkNodeTsqrImplementation(std::ostream& out,
+                                    const std::vector<int>& iseed,
+                                    const NodeTestParameters& p,
+                                    const std::string& nodeTsqrType = "")
+    {
+      // Work on a copy of the seed, so that all NodeTsqr
+      // implementations start from the same pseudorandom seed.
+      std::vector<int> iseed_copy(iseed);
+      auto nodeTsqrPtr = getNodeTsqr<Scalar>(p, nodeTsqrType);
+      const std::string& label =
+        nodeTsqrType.empty() ? p.nodeTsqrType : nodeTsqrType;
+      benchmarkNodeTsqrTmpl(out, iseed_copy, *nodeTsqrPtr, p, label);
+    }
+
+    // Benchmark the NodeTsqr implementation(s) selected by p.nodeTsqrType.
+    template<class Scalar>
+    void
+    benchmarkNodeTsqrImplementations(std::ostream& out,
+                                     std::vector<int>& iseed,
+                                     const NodeTestParameters& p)
+    {
+      const auto& requested = p.nodeTsqrType;
+      const bool wantAll =
+        requested == "all" || requested == "ALL" || requested == "All";
+      if(! wantAll) {
+        benchmarkNodeTsqrImplementation<Scalar>(out, iseed, p);
+        return;
+      }
+      const char* const implNames[] = {
+        "CombineNodeTsqr",
+#if defined(HAVE_TPETRATSQR_CUBLAS) && defined(HAVE_TPETRATSQR_CUSOLVER)
+        "CuSolverNodeTsqr",
+#endif
+        "SequentialTsqr"};
+      for(const char* implName : implNames) {
+        benchmarkNodeTsqrImplementation<Scalar>(out, iseed, p, implName);
+      }
+    }
+
+    // Benchmark the selected NodeTsqr implementation(s) for each
+    // requested Scalar type.  Requesting complex types in a build
+    // without complex support throws std::logic_error.
+    void
+    benchmarkNodeTsqr(std::ostream& out,
+                      const NodeTestParameters& p)
+    {
+      std::vector<int> iseed{{0, 0, 0, 1}};
+      if(p.testReal) {
+        benchmarkNodeTsqrImplementations<float>(out, iseed, p);
+        benchmarkNodeTsqrImplementations<double>(out, iseed, p);
+      }
+      if(p.testComplex) {
+#ifdef HAVE_TPETRATSQR_COMPLEX
+        benchmarkNodeTsqrImplementations<std::complex<float>>
+          (out, iseed, p);
+        benchmarkNodeTsqrImplementations<std::complex<double>>
+          (out, iseed, p);
+#else // Don't HAVE_TPETRATSQR_COMPLEX
+        TEUCHOS_TEST_FOR_EXCEPTION
+          (true, std::logic_error,
+           "TSQR was not built with complex arithmetic support.");
+#endif // HAVE_TPETRATSQR_COMPLEX
+      }
+    }
+  } // namespace Test
+} // namespace TSQR
+
+// Test driver: parse the command line, then run the requested verify
+// and/or benchmark passes.  Returns EXIT_FAILURE if verifyNodeTsqr
+// reports failure or an exception is caught, else EXIT_SUCCESS.
+int
+main(int argc, char *argv[])
+{
+  using TSQR::Test::parseOptions;
+  using std::cerr;
+  using std::cout;
+  using std::endl;
+
+  bool success = true;
+  try {
+    // Fetch command-line parameters.  Doing so inside the try block
+    // lets the catch statements below report a parsing exception.
+    bool printedHelp = false;
+    auto params = parseOptions(argc, argv, printedHelp);
+    if(printedHelp) {
+      return EXIT_SUCCESS;
+    }
+
+    cout << "NodeTsqr verify/benchmark test options:" << endl;
+    printNodeTestParameters(cout, params, "  - ");
+
+    Kokkos::ScopeGuard kokkosScope(argc, argv);
+
+    // We allow the same run to do both benchmark and verify.
+    if(params.verify) {
+      if(! params.humanReadable) {
+        TSQR::Test::printVerifyFieldNames(cout);
+      }
+      TSQR::Test::verifyLapack(cout, params);
+      success = TSQR::Test::verifyNodeTsqr(cout, params);
+    }
+    if(params.benchmark) {
+      if(! params.humanReadable) {
+        TSQR::Test::printBenchmarkFieldNames(cout);
+      }
+      TSQR::Test::benchmarkLapack(cout, params);
+      TSQR::Test::benchmarkNodeTsqr(cout, params);
+    }
+
+    if(params.printTrilinosTestStuff) {
+      // The Trilinos test framework expects a message like this.
+      cout << "\nEnd Result: TEST "
+           << (success ? "PASSED" : "FAILED") << endl;
+    }
+  }
+  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, cerr, success);
+  return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/packages/tpetra/tsqr/test/Tsqr_TestSeqTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestSeqTsqr.cpp
deleted file mode 100644
index 26c4222dea57..000000000000
--- a/packages/tpetra/tsqr/test/Tsqr_TestSeqTsqr.cpp
+++ /dev/null
@@ -1,352 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#include "Tsqr_ConfigDefs.hpp"
-#include "Teuchos_ConfigDefs.hpp" // HAVE_MPI
-#include "Teuchos_Tuple.hpp"
-#ifdef HAVE_MPI
-#  include "Teuchos_GlobalMPISession.hpp"
-#  include "Teuchos_oblackholestream.hpp"
-#endif // HAVE_MPI
-#include "Teuchos_CommandLineProcessor.hpp"
-#include "Teuchos_DefaultComm.hpp"
-#include "Teuchos_StandardCatchMacros.hpp"
-#include "Tsqr_SeqTest.hpp"
-
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-#  include <complex>
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
-#include <sstream>
-#include <stdexcept>
-#include <vector>
-
-
-namespace TSQR {
-  namespace Trilinos {
-    namespace Test {
-
-      const char docString[] = "This program tests TSQR::SequentialTsqr, "
-        "which implements the sequential cache-blocked version of TSQR.  "
-        "Accuracy and performance tests are included.";
-
-      using Teuchos::RCP;
-      using Teuchos::Tuple;
-
-      /// \class SeqTestParameters
-      /// \brief Encapsulates values of command-line parameters
-      ///
-      struct SeqTestParameters {
-        SeqTestParameters () :
-          verify (false),
-          benchmark (false),
-          numRows (1000),
-          numCols (10),
-          numTrials (10),
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-          testComplex (true),
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-          cacheSizeHint (0), // choose a reasonable default
-          contiguousCacheBlocks (false),
-          printFieldNames (true),
-          printTrilinosTestStuff (true),
-          humanReadable (false),
-          debug (false)
-          {}
-
-        bool verify, benchmark;
-        int numRows, numCols, numTrials;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-        bool testComplex;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-        size_t cacheSizeHint;
-        bool contiguousCacheBlocks;
-        std::string additionalFieldNames, additionalData;
-        bool printFieldNames, printTrilinosTestStuff, humanReadable, debug;
-      };
-
-      static void
-        benchmark (std::ostream& out,
-            const SeqTestParameters& params)
-        {
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-          const bool testComplex = params.testComplex;
-#else
-          const bool testComplex = false;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
-          using TSQR::Test::benchmarkSeqTsqr;
-          benchmarkSeqTsqr (out,
-              params.numRows,
-              params.numCols,
-              params.numTrials,
-              params.cacheSizeHint,
-              params.contiguousCacheBlocks,
-              testComplex,
-              params.additionalFieldNames,
-              params.additionalData,
-              params.printFieldNames,
-              params.humanReadable);
-        }
-
-      static void
-        verify (std::ostream& out,
-            const SeqTestParameters& params)
-        {
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-          const bool testComplex = params.testComplex;
-#else
-          const bool testComplex = false;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-          const bool saveMatrices = false;
-
-          using TSQR::Test::verifySeqTsqr;
-          verifySeqTsqr (out,
-              params.numRows,
-              params.numCols,
-              params.cacheSizeHint,
-              testComplex,
-              saveMatrices,
-              params.contiguousCacheBlocks,
-              params.additionalFieldNames,
-              params.additionalData,
-              params.printFieldNames,
-              params.humanReadable,
-              params.debug);
-        }
-
-      /// \brief Parse command-line options for this test
-      ///
-      /// \param argc [in] As usual in C(++)
-      /// \param argv [in] As usual in C(++)
-      /// \param allowedToPrint [in] Whether this (MPI) process is allowed
-      ///   to print to stdout/stderr.  Different per (MPI) process.
-      /// \param printedHelp [out] Whether this (MPI) process printed the
-      ///   "help" display (summary of command-line options)
-      ///
-      /// \return Encapsulation of command-line options
-      static SeqTestParameters
-        parseOptions (int argc,
-            char* argv[],
-            const bool allowedToPrint,
-            bool& printedHelp)
-        {
-          using std::cerr;
-          using std::endl;
-
-          printedHelp = false;
-
-          // Command-line parameters, set to their default values.
-          SeqTestParameters params;
-          /// We really want the cache block size as a size_t, but
-          /// Teuchos::CommandLineProcessor doesn't offer that option.
-          /// So we read it in as an int, which means negative inputs
-          /// are possible.  We check for those below in the input
-          /// validation phase.
-          //
-          // Fetch default value of cacheSizeHint.
-          int cacheSizeHintAsInt = static_cast<int> (params.cacheSizeHint);
-          try {
-            using Teuchos::CommandLineProcessor;
-
-            CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true,
-                /* recognizeAllOptions=*/ true);
-            cmdLineProc.setDocString (docString);
-            cmdLineProc.setOption ("verify",
-                "noverify",
-                &params.verify,
-                "Test accuracy");
-            cmdLineProc.setOption ("benchmark",
-                "nobenchmark",
-                &params.benchmark,
-                "Test performance");
-            cmdLineProc.setOption ("nrows",
-                &params.numRows,
-                "Number of rows in the test matrix");
-            cmdLineProc.setOption ("ncols",
-                &params.numCols,
-                "Number of columns in the test matrix");
-            cmdLineProc.setOption ("ntrials",
-                &params.numTrials,
-                "Number of trials (only used when \"--benchmark\"");
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-            cmdLineProc.setOption ("complex",
-                "nocomplex",
-                &params.testComplex,
-                "Test complex arithmetic, as well as real");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-            cmdLineProc.setOption ("cache-block-size",
-                &cacheSizeHintAsInt,
-                "Cache size hint in bytes (0 means pick a reasonable default)");
-            cmdLineProc.setOption ("contiguous-cache-blocks",
-                "noncontiguous-cache-blocks",
-                &params.contiguousCacheBlocks,
-                "Whether cache blocks should be stored contiguously");
-            cmdLineProc.setOption ("field-names",
-                &params.additionalFieldNames,
-                "Any additional field name(s) (comma-delimited "
-                "string) to add to the benchmark output.  Empty "
-                "by default.  Good for things known when invoking "
-                "the benchmark executable, but not (easily) known "
-                "inside the benchmark -- e.g., environment "
-                "variables.");
-            cmdLineProc.setOption ("output-data",
-                &params.additionalData,
-                "Any additional data to add to the output, "
-                "corresponding to the above field name(s). "
-                "Empty by default.");
-            cmdLineProc.setOption ("print-field-names",
-                "no-print-field-names",
-                &params.printFieldNames,
-                "Print field names (for machine-readable output only)");
-            cmdLineProc.setOption ("print-trilinos-test-stuff",
-                "no-print-trilinos-test-stuff",
-                &params.printTrilinosTestStuff,
-                "Print output that makes the Trilinos test "
-                "framework happy (but makes benchmark results "
-                "parsing scripts unhappy)");
-            cmdLineProc.setOption ("human-readable",
-                "machine-readable",
-                &params.humanReadable,
-                "If set, make output easy to read by humans "
-                "(but hard to parse)");
-            cmdLineProc.setOption ("debug",
-                "nodebug",
-                &params.debug,
-                "Print debugging information");
-            cmdLineProc.parse (argc, argv);
-          }
-          catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) {
-            if (allowedToPrint)
-              cerr << "Unrecognized command-line option: " << e.what() << endl;
-            throw e;
-          }
-          catch (Teuchos::CommandLineProcessor::HelpPrinted& e) {
-            printedHelp = true;
-            return params; // Don't verify parameters in this case
-          }
-
-          // Validate command-line options.  We provide default values
-          // for unset options, so we don't have to validate those.
-          if (params.numRows <= 0)
-            throw std::invalid_argument ("Number of rows must be positive");
-          else if (params.numCols <= 0)
-            throw std::invalid_argument ("Number of columns must be positive");
-          else if (params.numRows < params.numCols)
-            throw std::invalid_argument ("Number of rows must be >= number of columns");
-          else if (params.benchmark && params.numTrials < 1)
-            throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1");
-          else
-          {
-            if (cacheSizeHintAsInt < 0)
-              throw std::invalid_argument ("Cache size hint must be nonnegative");
-            else
-              params.cacheSizeHint = static_cast< size_t > (cacheSizeHintAsInt);
-          }
-          return params;
-        }
-
-    } // namespace Test
-  } // namespace Trilinos
-} // namespace TSQR
-
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-  int
-main (int argc, char *argv[])
-{
-  using Teuchos::RCP;
-  using TSQR::Trilinos::Test::SeqTestParameters;
-  using TSQR::Trilinos::Test::parseOptions;
-
-#ifdef HAVE_MPI
-  typedef RCP< const Teuchos::Comm<int> > comm_ptr;
-
-  Teuchos::oblackholestream blackhole;
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole);
-  comm_ptr comm = Teuchos::DefaultComm<int>::getComm();
-  const int myRank = comm->getRank();
-  // Only Rank 0 gets to write to stdout.  The other MPI process ranks
-  // send their output to something that looks like /dev/null (and
-  // likely is, on Unix-y operating systems).
-  std::ostream& out = (myRank == 0) ? std::cout : blackhole;
-  // Only Rank 0 performs the tests.
-  const bool performingTests = (myRank == 0);
-  const bool allowedToPrint = (myRank == 0);
-
-#else // Don't HAVE_MPI: single-node test
-
-  const bool performingTests = true;
-  const bool allowedToPrint = true;
-  std::ostream& out = std::cout;
-#endif // HAVE_MPI
-
-  // Fetch command-line parameters.
-  bool printedHelp = false;
-  SeqTestParameters params =
-    parseOptions (argc, argv, allowedToPrint, printedHelp);
-  if (printedHelp)
-    return 0;
-
-  bool success = false;
-  bool verbose = false;
-  try {
-    if (performingTests)
-    {
-      using std::endl;
-
-      if (params.benchmark)
-        TSQR::Trilinos::Test::benchmark (out, params);
-
-      // We allow the same run to do both benchmark and verify.
-      if (params.verify)
-        TSQR::Trilinos::Test::verify (out, params);
-
-      success = true;
-
-      if (params.printTrilinosTestStuff)
-        // The Trilinos test framework expects a message like this.
-        out << "\nEnd Result: TEST PASSED" << endl;
-    }
-  }
-  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
-  return ( success ? EXIT_SUCCESS : EXIT_FAILURE );
-}
diff --git a/packages/tpetra/tsqr/test/Tsqr_TestTbbTsqr.cpp b/packages/tpetra/tsqr/test/Tsqr_TestTbbTsqr.cpp
deleted file mode 100644
index e70a8c1c3b3c..000000000000
--- a/packages/tpetra/tsqr/test/Tsqr_TestTbbTsqr.cpp
+++ /dev/null
@@ -1,473 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//          Kokkos: Node API and Parallel Node Kernels
-//              Copyright (2008) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ************************************************************************
-//@HEADER
-
-#include "Tsqr_ConfigDefs.hpp"
-#include "Teuchos_ConfigDefs.hpp" // HAVE_MPI
-#include "Teuchos_Tuple.hpp"
-#ifdef HAVE_MPI
-#  include "Teuchos_GlobalMPISession.hpp"
-#  include "Teuchos_oblackholestream.hpp"
-#endif // HAVE_MPI
-#include "Teuchos_CommandLineProcessor.hpp"
-#include "Teuchos_DefaultComm.hpp"
-#include "Teuchos_Time.hpp"
-#include "Teuchos_StandardCatchMacros.hpp"
-#include "Tsqr_TbbTest.hpp"
-
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-#  include <complex>
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
-#include <sstream>
-#include <stdexcept>
-#include <vector>
-
-
-namespace TSQR {
-namespace Trilinos {
-namespace Test {
-
-  const char docString[] = "This program tests TSQR::TbbTsqr, "
-    "which implements the Intel TBB intranode parallel version of TSQR.  "
-    "Accuracy and performance tests are included.";
-
-  using Teuchos::RCP;
-  using Teuchos::Tuple;
-
-  /// \class TbbTestParameters
-  /// \brief Encapsulates values of command-line parameters
-  struct TbbTestParameters {
-    TbbTestParameters () :
-      verify (false),
-      benchmark (false),
-      numCores (1),
-      numRows (1000),
-      numCols (10),
-      numTrials (10),
-      testReal (true),
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      testComplex (false),
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-      cacheSizeHint (0),
-      contiguousCacheBlocks (false),
-      printFieldNames (true),
-      humanReadable (false),
-      debug (false)
-    {}
-
-    bool verify, benchmark;
-    int numCores, numRows, numCols, numTrials;
-    bool testReal;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    bool testComplex;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-    size_t cacheSizeHint;
-    bool contiguousCacheBlocks, printFieldNames, humanReadable, debug;
-  };
-
-  static void
-  benchmark (const TbbTestParameters& params)
-  {
-    using TSQR::Test::benchmarkTbbTsqr;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    using std::complex;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
-    // Only print field names (if at all) for the first data type tested.
-    bool printedFieldNames = false;
-
-    if (params.testReal) {
-      {
-        std::string scalarTypeName ("float");
-        benchmarkTbbTsqr<int, float > (scalarTypeName,
-                                       params.numTrials,
-                                       params.numRows,
-                                       params.numCols,
-                                       params.numCores,
-                                       params.cacheSizeHint,
-                                       params.contiguousCacheBlocks,
-                                       params.printFieldNames && ! printedFieldNames,
-                                       params.humanReadable);
-        if (params.printFieldNames && ! printedFieldNames)
-          printedFieldNames = true;
-      }
-      {
-        std::string scalarTypeName ("double");
-        benchmarkTbbTsqr<int, double > (scalarTypeName,
-                                        params.numTrials,
-                                        params.numRows,
-                                        params.numCols,
-                                        params.numCores,
-                                        params.cacheSizeHint,
-                                        params.contiguousCacheBlocks,
-                                        params.printFieldNames && ! printedFieldNames,
-                                        params.humanReadable);
-        if (params.printFieldNames && ! printedFieldNames)
-          printedFieldNames = true;
-      }
-    }
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    if (params.testComplex) {
-      {
-        std::string scalarTypeName ("complex<float>");
-        benchmarkTbbTsqr<int, complex<float> > (scalarTypeName,
-                                                params.numTrials,
-                                                params.numRows,
-                                                params.numCols,
-                                                params.numCores,
-                                                params.cacheSizeHint,
-                                                params.contiguousCacheBlocks,
-                                                params.printFieldNames && ! printedFieldNames,
-                                                params.humanReadable);
-        if (params.printFieldNames && ! printedFieldNames)
-          printedFieldNames = true;
-      }
-      {
-        std::string scalarTypeName ("complex<double>");
-        benchmarkTbbTsqr<int, complex<double> > (scalarTypeName,
-                                                 params.numTrials,
-                                                 params.numRows,
-                                                 params.numCols,
-                                                 params.numCores,
-                                                 params.cacheSizeHint,
-                                                 params.contiguousCacheBlocks,
-                                                 params.printFieldNames && ! printedFieldNames,
-                                                 params.humanReadable);
-        if (params.printFieldNames && ! printedFieldNames)
-          printedFieldNames = true;
-      }
-    }
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-  }
-
-  static void
-  verify (const TbbTestParameters& params)
-  {
-    using TSQR::Test::verifyTbbTsqr;
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    using std::complex;
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-
-    std::vector<int> seed(4);
-    seed[0] = 0;
-    seed[1] = 0;
-    seed[2] = 0;
-    seed[3] = 1;
-
-    // Only print field names (if at all) for the first data type tested.
-    bool printedFieldNames = false;
-
-    if (params.testReal) {
-      {
-        TSQR::Random::NormalGenerator<int, float> gen (seed);
-        std::string scalarTypeName ("float");
-        verifyTbbTsqr<int, float> (scalarTypeName,
-                                   gen,
-                                   params.numRows,
-                                   params.numCols,
-                                   params.numCores,
-                                   params.cacheSizeHint,
-                                   params.contiguousCacheBlocks,
-                                   params.printFieldNames && ! printedFieldNames,
-                                   params.humanReadable,
-                                   params.debug);
-        if (params.printFieldNames && ! printedFieldNames) {
-          printedFieldNames = true;
-        }
-        gen.getSeed (seed);
-      }
-      {
-        TSQR::Random::NormalGenerator<int, double> gen (seed);
-        std::string scalarTypeName ("double");
-        verifyTbbTsqr<int, double> (scalarTypeName,
-                                    gen,
-                                    params.numRows,
-                                    params.numCols,
-                                    params.numCores,
-                                    params.cacheSizeHint,
-                                    params.contiguousCacheBlocks,
-                                    params.printFieldNames && ! printedFieldNames,
-                                    params.humanReadable,
-                                    params.debug);
-        if (params.printFieldNames && ! printedFieldNames) {
-          printedFieldNames = true;
-        }
-        gen.getSeed (seed);
-      }
-    } // if (params.testReal)
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-    if (params.testComplex) {
-      {
-        TSQR::Random::NormalGenerator<int, complex<float> > gen (seed);
-        std::string scalarTypeName ("complex<float>");
-        verifyTbbTsqr<int, complex<float> > (scalarTypeName,
-                                             gen,
-                                             params.numRows,
-                                             params.numCols,
-                                             params.numCores,
-                                             params.cacheSizeHint,
-                                             params.contiguousCacheBlocks,
-                                             params.printFieldNames && ! printedFieldNames,
-                                             params.humanReadable,
-                                             params.debug);
-        if (params.printFieldNames && ! printedFieldNames) {
-          printedFieldNames = true;
-        }
-        gen.getSeed (seed);
-      }
-      {
-        TSQR::Random::NormalGenerator<int, complex<double> > gen (seed);
-        std::string scalarTypeName ("complex<double>");
-        verifyTbbTsqr<int, complex<double> > (scalarTypeName,
-                                              gen,
-                                              params.numRows,
-                                              params.numCols,
-                                              params.numCores,
-                                              params.cacheSizeHint,
-                                              params.contiguousCacheBlocks,
-                                              params.printFieldNames && ! printedFieldNames,
-                                              params.humanReadable,
-                                              params.debug);
-        if (params.printFieldNames && ! printedFieldNames) {
-          printedFieldNames = true;
-        }
-        gen.getSeed (seed);
-      }
-    }
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-  }
-
-  /// \brief Parse command-line options for this test
-  ///
-  /// \param argc [in] As usual in C(++)
-  /// \param argv [in] As usual in C(++)
-  /// \param allowedToPrint [in] Whether this (MPI) process is allowed
-  ///   to print to stdout/stderr.  Different per (MPI) process.
-  /// \param printedHelp [out] Whether this (MPI) process printed the
-  ///   "help" display (summary of command-line options)
-  ///
-  /// \return Encapsulation of command-line options
-  static TbbTestParameters
-  parseOptions (int argc,
-                char* argv[],
-                const bool allowedToPrint,
-                bool& printedHelp)
-  {
-    using std::cerr;
-    using std::endl;
-
-    printedHelp = false;
-
-    // Command-line parameters, set to their default values.
-    TbbTestParameters params;
-    /// We really want the cache block size as a size_t, but
-    /// Teuchos::CommandLineProcessor doesn't offer that option.
-    /// So we read it in as an int, which means negative inputs
-    /// are possible.  We check for those below in the input
-    /// validation phase.
-    //
-    // Fetch default value of cacheSizeHint.
-    int cacheSizeHintAsInt = static_cast<int> (params.cacheSizeHint);
-    try {
-      using Teuchos::CommandLineProcessor;
-
-      CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true,
-                                        /* recognizeAllOptions=*/ true);
-      cmdLineProc.setDocString (docString);
-      cmdLineProc.setOption ("verify",
-                             "noverify",
-                             &params.verify,
-                             "Test accuracy");
-      cmdLineProc.setOption ("benchmark",
-                             "nobenchmark",
-                             &params.benchmark,
-                             "Test performance");
-      cmdLineProc.setOption ("nrows",
-                             &params.numRows,
-                             "Number of rows in the test matrix");
-      cmdLineProc.setOption ("ncols",
-                             &params.numCols,
-                             "Number of columns in the test matrix");
-      cmdLineProc.setOption ("ntrials",
-                             &params.numTrials,
-                             "Number of trials (only used when \"--benchmark\"");
-      cmdLineProc.setOption ("real",
-                             "noreal",
-                             &params.testReal,
-                             "Test real arithmetic");
-#ifdef HAVE_KOKKOSTSQR_COMPLEX
-      cmdLineProc.setOption ("complex",
-                             "nocomplex",
-                             &params.testComplex,
-                             "Test complex arithmetic");
-#endif // HAVE_KOKKOSTSQR_COMPLEX
-      cmdLineProc.setOption ("ncores",
-                             &params.numCores,
-                             "Number of cores to use for Intel TBB");
-      cmdLineProc.setOption ("cache-block-size",
-                             &cacheSizeHintAsInt,
-                             "Cache size hint in bytes (0 means pick a reasonable default)");
-      cmdLineProc.setOption ("contiguous-cache-blocks",
-                             "noncontiguous-cache-blocks",
-                             &params.contiguousCacheBlocks,
-                             "Whether cache blocks should be stored contiguously");
-      cmdLineProc.setOption ("print-field-names",
-                             "no-print-field-names",
-                             &params.printFieldNames,
-                             "Print field names (for machine-readable output only)");
-      cmdLineProc.setOption ("human-readable",
-                             "machine-readable",
-                             &params.humanReadable,
-                             "If set, make output easy to read by humans "
-                             "(but hard to parse)");
-      cmdLineProc.setOption ("debug",
-                             "nodebug",
-                             &params.debug,
-                             "Print debugging information");
-      cmdLineProc.parse (argc, argv);
-    }
-    catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) {
-      if (allowedToPrint) {
-        cerr << "Unrecognized command-line option: " << e.what() << endl;
-      }
-      throw e;
-    }
-    catch (Teuchos::CommandLineProcessor::HelpPrinted& e) {
-      printedHelp = true;
-      return params; // Don't verify parameters in this case
-    }
-
-    // Validate command-line options.  We provide default values
-    // for unset options, so we don't have to validate those.
-    if (params.numRows <= 0) {
-      throw std::invalid_argument ("Number of rows must be positive");
-    }
-    else if (params.numCols <= 0) {
-      throw std::invalid_argument ("Number of columns must be positive");
-    }
-    else if (params.numRows < params.numCols) {
-      throw std::invalid_argument ("Number of rows must be >= number of columns");
-    }
-    else if (params.benchmark && params.numTrials < 1) {
-      throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1");
-    }
-    else if (params.numCores < 1) {
-      throw std::invalid_argument ("\"--ncores\" option must be >= 1");
-    }
-    else {
-      if (cacheSizeHintAsInt < 0) {
-        throw std::invalid_argument ("Cache size hint must be nonnegative");
-      }
-      else {
-        params.cacheSizeHint = static_cast<size_t> (cacheSizeHintAsInt);
-      }
-    }
-    return params;
-  }
-
-} // namespace Test
-} // namespace Trilinos
-} // namespace TSQR
-
-int
-main (int argc, char *argv[])
-{
-  using Teuchos::RCP;
-  using TSQR::Trilinos::Test::TbbTestParameters;
-  using TSQR::Trilinos::Test::parseOptions;
-
-#ifdef HAVE_MPI
-  typedef RCP<const Teuchos::Comm<int> > comm_ptr;
-
-  Teuchos::oblackholestream blackhole;
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackhole);
-  comm_ptr comm = Teuchos::DefaultComm<int>::getComm();
-  const int myRank = comm->getRank();
-  // Only Rank 0 gets to write to stdout.  The other MPI process ranks
-  // send their output to something that looks like /dev/null (and
-  // likely is, on Unix-y operating systems).
-  std::ostream& out = (myRank == 0) ? std::cout : blackhole;
-  // Only Rank 0 performs the tests.
-  const bool performingTests = (myRank == 0);
-  const bool allowedToPrint = (myRank == 0);
-
-#else // Don't HAVE_MPI: single-node test
-
-  const bool performingTests = true;
-  const bool allowedToPrint = true;
-  std::ostream& out = std::cout;
-#endif // HAVE_MPI
-
-  // Fetch command-line parameters.
-  bool printedHelp = false;
-  TbbTestParameters params =
-    parseOptions (argc, argv, allowedToPrint, printedHelp);
-  if (printedHelp) {
-    return 0;
-  }
-
-  bool success = false;
-  bool verbose = false;
-  try {
-    if (performingTests) {
-      using std::endl;
-
-      // The same run may both benchmark and verify, if that's what
-      // the user wants.
-      if (params.verify) {
-        TSQR::Trilinos::Test::verify (params);
-      }
-      if (params.benchmark) {
-        TSQR::Trilinos::Test::benchmark (params);
-      }
-
-      success = true;
-
-      // The Trilinos test framework expects a message like this.
-      // Obviously we haven't tested anything, but eventually we
-      // will include accuracy integration tests.
-      out << "\nEnd Result: TEST PASSED" << endl;
-    }
-  }
-  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
-
-  return success ? EXIT_SUCCESS : EXIT_FAILURE;
-}
-
-