diff --git a/cmake/ctest/drivers/atdm/ats2/local-driver.sh b/cmake/ctest/drivers/atdm/ats2/local-driver.sh
index 8704a96fb282..eb9ffd6ab5aa 100755
--- a/cmake/ctest/drivers/atdm/ats2/local-driver.sh
+++ b/cmake/ctest/drivers/atdm/ats2/local-driver.sh
@@ -32,6 +32,8 @@ fi
 
 # Allow default setting for TPETRA_ASSUME_CUDA_AWARE_MPI=0 in trilinos_jsrun
 unset TPETRA_ASSUME_CUDA_AWARE_MPI
+atdm_config_ctest_regex_old="$ATDM_CONFIG_CTEST_REGEX"
+export ATDM_CONFIG_CTEST_REGEX="$ATDM_CONFIG_CTEST_REGEX -E Adelus"  # ctest -E takes a regex, not a glob
 
 echo
 echo "======================================================================="
@@ -44,6 +46,8 @@ set -x
 $WORKSPACE/Trilinos/cmake/ctest/drivers/atdm/ats2/local-driver-single-build.sh
 set +x
 
+export ATDM_CONFIG_CTEST_REGEX="$atdm_config_ctest_regex_old"
+
 if [[ "${Trilinos_CTEST_RUN_CUDA_AWARE_MPI}" == "1" ]]; then
   echo
   echo "======================================================================="
diff --git a/cmake/std/PullRequestLinuxCuda10.1.105uvmOffTestingSettings.cmake b/cmake/std/PullRequestLinuxCuda10.1.105uvmOffTestingSettings.cmake
index aa86835f17de..0de8d2ac8767 100644
--- a/cmake/std/PullRequestLinuxCuda10.1.105uvmOffTestingSettings.cmake
+++ b/cmake/std/PullRequestLinuxCuda10.1.105uvmOffTestingSettings.cmake
@@ -152,7 +152,6 @@ set (Kokkos_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build")
 set (KokkosKernels_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build")
 set (MueLu_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build")
 set (NOX_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build")
-set (Phalanx_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build")
 set (ROL_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build")
 set (Sacado_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build")
 set (SEACAS_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build")
@@ -163,127 +162,8 @@ set (Xpetra_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build")
 set (Zoltan2_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build")
 
 # Tpetra UVM = OFF tests
-set (TpetraCore_BlockCrsMatrix_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_Bug5072_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_BlankRowBugTest_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_iallreduce_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_idot_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsGraph_UnitTests0_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsGraph_UnitTests1_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsGraph_UnitTests_Swap_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsGraph_ReindexColumns_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_Issue601_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_Issue601_MPI_8_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsGraph_insertGlobalIndicesFiltered_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsGraph_PackUnpack_MPI_1_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsGraph_getNumDiags_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsGraph_UnpackIntoStaticGraph_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsGraph_StaticImportExport_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsGraph_UnpackMerge_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_UnitTests2_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_UnitTests3_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_UnitTests4_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_UnitTests_Swap_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_NonlocalAfterResume_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_LeftRightScale_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_2DRandomDist_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_WithGraph_Cuda_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_ReplaceDomainMapAndImporter_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_NonlocalSumInto_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_NonlocalSumInto_Ignore_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_Bug5978_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_Bug6069_1_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_Bug6069_2_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_Bug6171_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_ReplaceLocalValues_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_ReplaceDiagonal_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_MultipleFillCompletes_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_ReindexColumns_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_TransformValues_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_GetRowCopy_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_PackUnpack_MPI_1_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_Equilibration_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_StaticImportExport_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_sumIntoStaticProfileExtraSpace_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_createDeepCopy_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_UnpackMerge_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_Bug7745_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_Bug8794_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_RemoveEmptyProcesses_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_Albany182_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_Distributor_CreateFromSendsAndRecvs_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_Distributor_CreateFromSendsAndRecvs_MPI_8_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_Issue1752_MPI_2_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_FECrsGraph_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_FECrsMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_FEMultiVector_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_FixedHashTableTest_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_computeOffsetsFromCounts_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_ImportExport_ImportConstructExpert_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_UnpackLongRows_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_ExportToStaticGraphCrsMatrix_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_ImportExport2_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_InOutTest_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_simple_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_rmat_nodiag_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_simple_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_Binary_rmat_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_simple_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_3_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_6_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsMatrix_Dist_BinaryPerProcess_rmat_MPI_10_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Tpetra_CrsGraph_InOutTest_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMarket_Operator_Test_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_MatrixMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_FECrs_MatrixMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_copyConvert_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_StaticView_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_RowMatrixTransposer_test_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_RowMatrixTransposer_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_CrsMatrix_transpose_sortedRows_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_lesson03_power_method_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_lesson05_redistribution_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_FEMAssembly_InsertGlobalIndicesFESP_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_FEMAssembly_InsertGlobalIndicesFESPKokkos_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_FEMAssembly_TotalElementLoopSP_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_FEMAssembly_TotalElementLoopSPKokkos_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_AdditiveSchwarzHalo_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_BlockCrsPerfTest_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_NewReaderExample_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_NewReaderExample_rmat_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_guide_power_method_1_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_guide_matrix_fill_1_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_guide_data_redist_1_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-set (TpetraCore_EpetraRowMatrix_UnitTests_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
-
+set (TpetraCore_BlockCrsMatrix_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
+set (TpetraCore_BlockCrsPerfTest_MPI_4_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
 
 # ShyLU_DD UVM = OFF tests
 set (ShyLU_DDFROSch_test_thyra_xpetra_laplace_one_rank_TLP_IPOU_DIM3_TPETRA_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build")
diff --git a/cmake/std/atdm/ats2/tweaks/Tweaks.cmake b/cmake/std/atdm/ats2/tweaks/Tweaks.cmake
index f707b9fd1dbe..96d699941c0d 100644
--- a/cmake/std/atdm/ats2/tweaks/Tweaks.cmake
+++ b/cmake/std/atdm/ats2/tweaks/Tweaks.cmake
@@ -10,6 +10,12 @@ ATDM_SET_CACHE(Trilinos_CUDA_SLOTS_PER_GPU 2 CACHE STRING)
 # Disables across multiple builds on 'ats2'
 #
 
+IF (ATDM_NODE_TYPE STREQUAL "CUDA")
+  # Adelus always needs '-M -gpu' passed to jsrun, but trilinos_jsrun cannot do that for
+  # single-rank MPI runs without breaking the invocation of other Trilinos tests.
+  ATDM_SET_ENABLE(Adelus_vector_random_MPI_1_DISABLE ON)
+ENDIF()
+
 IF (ATDM_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
 
   # Disable some expensive KokkosKernels tests in pure debug builds (#6464)
diff --git a/packages/amesos2/src/Amesos2_Factory.cpp b/packages/amesos2/src/Amesos2_Factory.cpp
index ecfa91bdd245..dfa6a7f5a34d 100644
--- a/packages/amesos2/src/Amesos2_Factory.cpp
+++ b/packages/amesos2/src/Amesos2_Factory.cpp
@@ -169,11 +169,10 @@ namespace Amesos2 {
 
   std::string tolower (const std::string& s)
   {
-    std::locale loc;
     std::string rtn = s;
     const size_t len = rtn.length ();
     for (size_t i = 0; i < len; ++i) {
-      rtn[i] = tolower (rtn[i], loc);
+      rtn[i] = static_cast<char> (::tolower (static_cast<unsigned char> (rtn[i]))); // negative char is UB for ::tolower
     }
     return rtn;
   }
diff --git a/packages/amesos2/src/KLU2/Include/klu2_ordinaltraits.h b/packages/amesos2/src/KLU2/Include/klu2_ordinaltraits.h
index 572115c0651d..c7a4cf411178 100644
--- a/packages/amesos2/src/KLU2/Include/klu2_ordinaltraits.h
+++ b/packages/amesos2/src/KLU2/Include/klu2_ordinaltraits.h
@@ -110,35 +110,37 @@ struct KLU_OrdinalTraits<int>
 };
 
 template<>
-struct KLU_OrdinalTraits<long int>
+struct KLU_OrdinalTraits<ptrdiff_t>
 {
-    static inline long int btf_order (long int n, long int *Ap, long int *Ai,
-        double maxwork, double *work, long int *P, long int *Q, long int *R, long int *nmatch,
-        long int *Work)
+// These should all be UF_long, which presumably resolves to ptrdiff_t.
+// long int is the same as ptrdiff_t on Linux64, but use ptrdiff_t just to be safe.
+    static inline ptrdiff_t btf_order (ptrdiff_t n, ptrdiff_t *Ap, ptrdiff_t *Ai,
+        double maxwork, double *work, ptrdiff_t *P, ptrdiff_t *Q, ptrdiff_t *R, ptrdiff_t *nmatch,
+        ptrdiff_t *Work)
     {
         return (trilinos_btf_l_order (n, Ap, Ai, maxwork, work, P, Q, R, nmatch,
                     Work));
     }
 
-    static inline long int btf_strongcomp (long int n, long int *Ap, long int *Ai, long int *Q,
-        long int *P, long int *R, long int *Work)
+    static inline ptrdiff_t btf_strongcomp (ptrdiff_t n, ptrdiff_t *Ap, ptrdiff_t *Ai, ptrdiff_t *Q,
+        ptrdiff_t *P, ptrdiff_t *R, ptrdiff_t *Work)
     {
         return(trilinos_btf_l_strongcomp (n, Ap, Ai, Q, P, R, Work)) ;
     }
 
-    static inline long int amd_order (long int n, long int *Ap, long int *Ai, long int *P,
+    static inline ptrdiff_t amd_order (ptrdiff_t n, ptrdiff_t *Ap, ptrdiff_t *Ai, ptrdiff_t *P,
         double *Control, double *Info)
     {
         return (trilinos_amd_l_order(n, Ap, Ai, P, Control, Info)) ;
     }
 
-    static inline long int colamd (long int n_row, long int n_col, long int Alen, long int *A,
-        long int *p, double *knobs, long int *stats)
+    static inline ptrdiff_t colamd (ptrdiff_t n_row, ptrdiff_t n_col, ptrdiff_t Alen, ptrdiff_t *A,
+        ptrdiff_t *p, double *knobs, ptrdiff_t *stats)
     {
         return(trilinos_colamd_l (n_row, n_col, Alen, A, p, knobs, stats));
     }
 
-    static inline long int colamd_recommended (long int nnz, long int n_row, long int n_col)
+    static inline ptrdiff_t colamd_recommended (ptrdiff_t nnz, ptrdiff_t n_row, ptrdiff_t n_col)
     {
         return(trilinos_colamd_l_recommended(nnz, n_row, n_col));
     }
diff --git a/packages/anasazi/tpetra/example/TraceMinDavidson/TraceMinDavidsonLaplacianEx.cpp b/packages/anasazi/tpetra/example/TraceMinDavidson/TraceMinDavidsonLaplacianEx.cpp
index b211af00e978..d6139da60a32 100644
--- a/packages/anasazi/tpetra/example/TraceMinDavidson/TraceMinDavidsonLaplacianEx.cpp
+++ b/packages/anasazi/tpetra/example/TraceMinDavidson/TraceMinDavidsonLaplacianEx.cpp
@@ -383,12 +383,10 @@ void formLaplacian(const RCP<const CrsMatrix>& A, const bool weighted, const boo
 
   if(weighted)
   {
-    // These vectors hold the actual data
-    // The ArrayView objects just point to them
-    std::vector<GO> colIndices;
-    std::vector<Scalar> values;
-    Teuchos::ArrayView<GO> colIndicesView;
-    Teuchos::ArrayView<Scalar> valuesView;
+    using indices_view = typename CrsMatrix::nonconst_global_inds_host_view_type;
+    using values_view  = typename CrsMatrix::nonconst_values_host_view_type;
+    indices_view colIndices("colIndices");
+    values_view values("values");
 
     // This vector holds the diagonal
     RCP<Vector> diagonal = Teuchos::rcp(new Vector(rowMap));
@@ -406,15 +404,11 @@ void formLaplacian(const RCP<const CrsMatrix>& A, const bool weighted, const boo
       {
         // Figure out how many entries are in the row
         size_t numentries = L->getNumEntriesInGlobalRow(i);
-        colIndices.resize(numentries);
-        values.resize(numentries);
-
-        // Point the array views to the vectors
-        colIndicesView = Teuchos::arrayViewFromVector(colIndices);
-        valuesView = Teuchos::arrayViewFromVector(values);
+        Kokkos::resize(colIndices,numentries);
+        Kokkos::resize(values,numentries);
 
         // Get a copy of row i
-        L->getGlobalRowCopy(i,colIndicesView,valuesView,numentries);
+        L->getGlobalRowCopy(i,colIndices,values,numentries);
 
         for(size_t j=0; j<colIndices.size(); j++)
         {
@@ -430,7 +424,7 @@ void formLaplacian(const RCP<const CrsMatrix>& A, const bool weighted, const boo
         }
 
         // Reinsert the updated row
-        L->replaceGlobalValues(i, colIndicesView, valuesView);
+        L->replaceGlobalValues(i, colIndices, values);
       }
     }
 
diff --git a/packages/belos/epetra/test/MINRES/CMakeLists.txt b/packages/belos/epetra/test/MINRES/CMakeLists.txt
index 09a6a9a46fae..877a25fefa80 100644
--- a/packages/belos/epetra/test/MINRES/CMakeLists.txt
+++ b/packages/belos/epetra/test/MINRES/CMakeLists.txt
@@ -35,4 +35,19 @@ IF (${PACKAGE_NAME}_ENABLE_Triutils)
     EXEDEPS minres_hb
     )
 
+  ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_Ifpack)
+  IF(${PACKAGE_NAME}_ENABLE_Ifpack)
+    # Preconditioned MINRES driver: one ARGS entry for left, one for right preconditioning.
+    TRIBITS_ADD_EXECUTABLE_AND_TEST(
+      pminres_hb
+      SOURCES test_pminres_hb.cpp
+      COMM serial mpi
+      ARGS
+        "--verbose --filename=bcsstk14.hb --left-prec --max-iters=100"
+        "--verbose --filename=bcsstk14.hb --right-prec --max-iters=100"
+      STANDARD_PASS_OUTPUT
+      )
+
+  ENDIF(${PACKAGE_NAME}_ENABLE_Ifpack)
+
 ENDIF(${PACKAGE_NAME}_ENABLE_Triutils)
diff --git a/packages/belos/epetra/test/MINRES/test_pminres_hb.cpp b/packages/belos/epetra/test/MINRES/test_pminres_hb.cpp
new file mode 100644
index 000000000000..f4f76ef86b02
--- /dev/null
+++ b/packages/belos/epetra/test/MINRES/test_pminres_hb.cpp
@@ -0,0 +1,255 @@
+//@HEADER
+// ************************************************************************
+//
+//                 Belos: Block Linear Solvers Package
+//                  Copyright 2004 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+//
+// This driver reads a problem from a Harwell-Boeing (HB) file.
+// Multiple right-hand-sides are created randomly.
+// The initial guesses are all set to zero.
+//
+#include "BelosConfigDefs.hpp"
+#include "BelosLinearProblem.hpp"
+#include "BelosEpetraAdapter.hpp"
+#include "BelosMinresSolMgr.hpp"
+#include "BelosEpetraUtils.h"
+#include "Trilinos_Util.h"
+#include "Epetra_CrsMatrix.h"
+#include "Epetra_Map.h"
+#include "Teuchos_CommandLineProcessor.hpp"
+#include "Teuchos_ParameterList.hpp"
+#include "Teuchos_StandardCatchMacros.hpp"
+
+#include "Ifpack.h"
+//
+int main(int argc, char *argv[]) {
+  //
+  Teuchos::GlobalMPISession session(&argc, &argv, NULL);
+  //
+  using Teuchos::ParameterList;
+  using Teuchos::RCP;
+  using Teuchos::rcp;
+
+  bool success = false;
+  bool verbose = false;
+  try {
+    //
+    // Get test parameters from command-line processor
+    //
+    bool proc_verbose = false;
+    bool leftprec = true;      // left preconditioning or right.
+    int frequency = -1;        // how often residuals are printed by solver
+    int numrhs = 5;            // total number of right-hand sides to solve for
+    int maxiters = -1;         // maximum number of iterations for the solver to use
+    std::string filename("bcsstk14.hb");
+    double tol = 1.0e-5;       // relative residual tolerance
+
+    Teuchos::CommandLineProcessor cmdp(false,true);
+    cmdp.setOption("verbose","quiet",&verbose,"Print messages and results.");
+    cmdp.setOption("left-prec","right-prec",&leftprec,"Left preconditioning or right.");
+    cmdp.setOption("frequency",&frequency,"Solvers frequency for printing residuals (#iters).");
+    cmdp.setOption("filename",&filename,"Filename for Harwell-Boeing test matrix.");
+    cmdp.setOption("tol",&tol,"Relative residual tolerance used by Minres solver.");
+    cmdp.setOption("num-rhs",&numrhs,"Number of right-hand sides to be solved for.");
+    cmdp.setOption("max-iters",&maxiters,"Maximum number of iterations per linear system (-1 := adapted to problem/block size).");
+
+    if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) {
+      return -1;
+    }
+    if (!verbose)
+      frequency = -1;  // Reset frequency if verbosity is off
+    //
+    // Get the problem
+    //
+    int MyPID;
+    RCP<Epetra_CrsMatrix> A;
+    RCP<Epetra_MultiVector> X, B;
+    int return_val =Belos::Util::createEpetraProblem(filename,NULL,&A,&B,&X,&MyPID);
+    if(return_val != 0) return return_val;
+    proc_verbose = ( verbose && (MyPID==0) );
+    //
+    // Solve using Belos
+    //
+    typedef double                           ST;
+    typedef Epetra_Operator                  OP;
+    typedef Epetra_MultiVector               MV;
+    typedef Belos::OperatorTraits<ST,MV,OP> OPT;
+    typedef Belos::MultiVecTraits<ST,MV>    MVT;
+    //
+    // *****Construct initial guess and random right-hand-sides *****
+    //
+    if (numrhs != 1) {
+      X = rcp( new Epetra_MultiVector( A->Map(), numrhs ) );
+      MVT::MvRandom( *X );
+      B = rcp( new Epetra_MultiVector( A->Map(), numrhs ) );
+      OPT::Apply( *A, *X, *B );
+      MVT::MvInit( *X, 0.0 );
+    }
+    //
+    // ************Construct preconditioner*************
+    //
+    ParameterList ifpackList;
+
+    // allocates an IFPACK factory. No data is associated
+    // to this object (only method Create()).
+    Ifpack Factory;
+
+    // create the preconditioner. For valid PrecType values,
+    // please check the documentation
+    std::string PrecType = "ICT"; // incomplete Cholesky
+    int OverlapLevel = 0; // must be >= 0. If Comm.NumProc() == 1,
+    // it is ignored.
+
+    RCP<Ifpack_Preconditioner> Prec = Teuchos::rcp( Factory.Create(PrecType, &*A, OverlapLevel) );
+    if (Prec == Teuchos::null) throw std::runtime_error("Ifpack could not create the '" + PrecType + "' preconditioner"); // assert() vanishes under NDEBUG
+
+    // specify parameters for ICT
+    ifpackList.set("fact: drop tolerance", 1e-9);
+    ifpackList.set("fact: ict level-of-fill", 1.0);
+    // the combine mode is on the following:
+    // "Add", "Zero", "Insert", "InsertAdd", "Average", "AbsMax"
+    // Their meaning is as defined in file Epetra_CombineMode.h
+    ifpackList.set("schwarz: combine mode", "Add");
+    // sets the parameters
+    IFPACK_CHK_ERR(Prec->SetParameters(ifpackList));
+
+    // initialize the preconditioner. At this point the matrix must
+    // have been FillComplete()'d, but actual values are ignored.
+    IFPACK_CHK_ERR(Prec->Initialize());
+
+    // Builds the preconditioners, by looking for the values of
+    // the matrix.
+    IFPACK_CHK_ERR(Prec->Compute());
+
+    // Create the Belos preconditioned operator from the Ifpack preconditioner.
+    // NOTE:  This is necessary because Belos expects an operator to apply the
+    //        preconditioner with Apply() NOT ApplyInverse().
+    RCP<Belos::EpetraPrecOp> belosPrec = rcp( new Belos::EpetraPrecOp( Prec ) );
+
+    //
+    // *****Create parameter list for the Minres solver manager*****
+    //
+    const int NumGlobalElements = B->GlobalLength();
+    if (maxiters == -1)
+      maxiters = NumGlobalElements - 1; // maximum number of iterations to run
+    //
+    ParameterList belosList;
+    belosList.set( "Maximum Iterations", maxiters );       // Maximum number of iterations allowed
+    belosList.set( "Convergence Tolerance", tol );         // Relative convergence tolerance requested
+    if (verbose) {
+      belosList.set( "Verbosity", Belos::Errors + Belos::Warnings +
+          Belos::TimingDetails + Belos::FinalSummary + Belos::StatusTestDetails );
+      if (frequency > 0)
+        belosList.set( "Output Frequency", frequency );
+    }
+    else
+      belosList.set( "Verbosity", Belos::Errors + Belos::Warnings );
+    //
+    // *******Construct a preconditioned linear problem********
+    //
+    RCP<Belos::LinearProblem<double,MV,OP> > problem
+      = rcp( new Belos::LinearProblem<double,MV,OP>( A, X, B ) );
+    if (leftprec) {
+      problem->setLeftPrec( belosPrec );
+    }
+    else {
+      problem->setRightPrec( belosPrec );
+    }
+
+    bool set = problem->setProblem();
+    if (set == false) {
+      if (proc_verbose)
+        std::cout << std::endl << "ERROR:  Belos::LinearProblem failed to set up correctly!" << std::endl;
+      return -1;
+    }
+
+    // Create an iterative solver manager.
+    RCP< Belos::SolverManager<double,MV,OP> > solver
+      = rcp( new Belos::MinresSolMgr<double,MV,OP>(problem, rcp(&belosList,false)) );
+
+    //
+    // *******************************************************************
+    // *************Start the Minres iteration*************************
+    // *******************************************************************
+    if (proc_verbose) {
+      std::cout << std::endl << std::endl;
+      std::cout << "Dimension of matrix: " << NumGlobalElements << std::endl;
+      std::cout << "Number of right-hand sides: " << numrhs << std::endl;
+      std::cout << "Max number of Minres iterations: " << maxiters << std::endl;
+      std::cout << "Relative residual tolerance: " << tol << std::endl;
+      std::cout << std::endl;
+    }
+    //
+    // Perform solve
+    //
+    Belos::ReturnType ret = solver->solve();
+    //
+    // Compute actual residuals.
+    //
+    bool badRes = false;
+    std::vector<double> actual_resids( numrhs );
+    std::vector<double> rhs_norm( numrhs );
+    Epetra_MultiVector resid(A->Map(), numrhs);
+    OPT::Apply( *A, *X, resid );
+    MVT::MvAddMv( -1.0, resid, 1.0, *B, resid );
+    MVT::MvNorm( resid, actual_resids );
+    MVT::MvNorm( *B, rhs_norm );
+    if (proc_verbose)
+      std::cout<< "---------- Actual Residuals (normalized) ----------"<<std::endl<<std::endl;
+    for ( int i=0; i<numrhs; i++) { // checked on every rank, even when nothing is printed
+      double actRes = actual_resids[i]/rhs_norm[i];
+      if (proc_verbose)
+        std::cout<<"Problem "<<i<<" : \t"<< actRes <<std::endl;
+      if (actRes > tol) badRes = true;
+    }
+
+    success = ret==Belos::Converged && !badRes;
+
+    if (success) {
+      if (proc_verbose)
+        std::cout << std::endl << "End Result: TEST PASSED" << std::endl;
+    } else {
+      if (proc_verbose)
+        std::cout << std::endl << "End Result: TEST FAILED" << std::endl;
+    }
+  }
+  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
+
+  return ( success ? EXIT_SUCCESS : EXIT_FAILURE );
+} // end test_pminres_hb.cpp
+
diff --git a/packages/belos/src/BelosMinresIter.hpp b/packages/belos/src/BelosMinresIter.hpp
index 5cc4a452c6ce..4ca2de6e82ea 100644
--- a/packages/belos/src/BelosMinresIter.hpp
+++ b/packages/belos/src/BelosMinresIter.hpp
@@ -86,8 +86,8 @@ namespace Belos {
 /// Implementation of the preconditioned Minimal Residual Method
 /// (MINRES) iteration.  This a bilinear form implementation, that
 /// uses inner products of the form <x,My> to solve the preconditioned
-/// linear system M^{-1}*A x = b.  Thus, it is necessary that the
-/// left preconditioner M is positive definite.
+/// linear system.  Thus, it is necessary that the preconditioner M
+/// (left and/or right, as applied to the residual) is positive definite.
 ///
 /// \ingroup belos_solver_framework
 ///
@@ -402,7 +402,6 @@ class MinresIter : virtual public MinresIteration<ScalarType,MV,OP> {
 
     // Create convenience variables for zero, one.
     const ScalarType one = SCT::one();
-    const MagnitudeType zero = SMT::zero();
     const MagnitudeType m_zero = SMT::zero();
 
     // Set up y and v for the first Lanczos vector v_1.
@@ -417,6 +416,13 @@ class MinresIter : virtual public MinresIteration<ScalarType,MV,OP> {
 
     if ( lp_->getLeftPrec() != Teuchos::null ) {
       lp_->applyLeftPrec( *newstate.Y, *Y_ );
+      if ( lp_->getRightPrec() != Teuchos::null ) {
+        Teuchos::RCP<MV> tmp = MVT::CloneCopy( *Y_ );
+        lp_->applyRightPrec( *tmp, *Y_ );
+      }
+    }
+    else if ( lp_->getRightPrec() != Teuchos::null ) {
+      lp_->applyRightPrec( *newstate.Y, *Y_ );
     }
     else {
       if (newstate.Y != Y_) {
@@ -433,7 +439,7 @@ class MinresIter : virtual public MinresIteration<ScalarType,MV,OP> {
                         std::invalid_argument,
                         "The preconditioner is not positive definite." );
 
-    if( SCT::magnitude(beta1_(0,0)) == zero )
+    if( SCT::magnitude(beta1_(0,0)) == m_zero )
     {
         // X = 0
         Teuchos::RCP<MV> cur_soln_vec = lp_->getCurrLHSVec();
@@ -470,7 +476,6 @@ class MinresIter : virtual public MinresIteration<ScalarType,MV,OP> {
     Teuchos::SerialDenseMatrix<int,ScalarType> alpha( 1, 1 );
     Teuchos::SerialDenseMatrix<int,ScalarType> beta( beta1_ );
     phibar_ = Teuchos::ScalarTraits<ScalarType>::magnitude( beta1_(0,0) );
-    ScalarType shift = zero; // TODO Allow for proper shift.
 
     // Initialize a few variables.
     ScalarType oldBeta = zero;
@@ -513,10 +518,6 @@ class MinresIter : virtual public MinresIteration<ScalarType,MV,OP> {
       // Apply operator.
       lp_->applyOp (*V, *Y_);
 
-      // Apply shift
-      if (shift != zero)
-	MVT::MvAddMv (one, *Y_, -shift, *V, *Y_);
-
       if (iter_ > 1)
 	MVT::MvAddMv (one, *Y_, -beta(0,0)/oldBeta, *R1_, *Y_);
 
@@ -533,12 +534,19 @@ class MinresIter : virtual public MinresIteration<ScalarType,MV,OP> {
       R2_ = Y_;
       Y_ = tmpY;
 
-      // apply left preconditioner
+      // apply preconditioner
       if ( lp_->getLeftPrec() != Teuchos::null ) {
         lp_->applyLeftPrec( *R2_, *Y_ );
+        if ( lp_->getRightPrec() != Teuchos::null ) {
+          Teuchos::RCP<MV> tmp = MVT::CloneCopy( *Y_ );
+          lp_->applyRightPrec( *tmp, *Y_ );
+        }
+      }
+      else if ( lp_->getRightPrec() != Teuchos::null ) {
+        lp_->applyRightPrec( *R2_, *Y_ );
       } // else "y = r2"
       else {
-        MVT::MvAddMv( one, *R2_, zero, *R2_, *Y_ );
+        MVT::Assign( *R2_, *Y_ );
       }
 
       // Get new beta.
@@ -594,8 +602,8 @@ class MinresIter : virtual public MinresIteration<ScalarType,MV,OP> {
 
       // Update x:
       // x = x + phi*w;
-      //MVT::MvAddMv( one, *cur_soln_vec, phi, *W_, *cur_soln_vec );
-      lp_->updateSolution( W_, true, phi );
+      MVT::MvAddMv( one, *cur_soln_vec, phi, *W_, *cur_soln_vec );
+      lp_->updateSolution();
     } // end while (sTest_->checkStatus(this) != Passed)
   }
 
diff --git a/packages/epetra/src/Epetra_Comm.h b/packages/epetra/src/Epetra_Comm.h
index 65c9743f246c..9f482c494c04 100644
--- a/packages/epetra/src/Epetra_Comm.h
+++ b/packages/epetra/src/Epetra_Comm.h
@@ -460,6 +460,10 @@ class EPETRA_LIB_DLL_EXPORT Epetra_Comm {
   //! Create a distributor object.
   virtual Epetra_Distributor * CreateDistributor() const = 0;
   //! Create a directory object for the given Epetra_BlockMap.
+// CreateDirectory is defined in Winbase.h as a macro!
+#ifdef CreateDirectory
+#undef CreateDirectory
+#endif
   virtual Epetra_Directory * CreateDirectory(const Epetra_BlockMap & Map) const = 0;
   //@}
 
diff --git a/packages/epetra/src/Epetra_CrsGraph.cpp b/packages/epetra/src/Epetra_CrsGraph.cpp
index adc1e5c96781..cb460126cd0d 100644
--- a/packages/epetra/src/Epetra_CrsGraph.cpp
+++ b/packages/epetra/src/Epetra_CrsGraph.cpp
@@ -3065,3 +3065,13 @@ Epetra_CrsGraph& Epetra_CrsGraph::operator = (const Epetra_CrsGraph& Source) {
 
   return(*this);
 }
+
+//=============================================================================
+Epetra_IntSerialDenseVector& Epetra_CrsGraph::ExpertExtractIndexOffset(){
+   return CrsGraphData_->IndexOffset_;
+ }
+
+//=============================================================================
+Epetra_IntSerialDenseVector& Epetra_CrsGraph::ExpertExtractIndices() {
+  return CrsGraphData_->data->All_Indices_;
+ }
diff --git a/packages/epetra/src/Epetra_CrsGraph.h b/packages/epetra/src/Epetra_CrsGraph.h
index 59febb14c9b8..96ac3cfc1638 100644
--- a/packages/epetra/src/Epetra_CrsGraph.h
+++ b/packages/epetra/src/Epetra_CrsGraph.h
@@ -1004,8 +1004,22 @@ class EPETRA_LIB_DLL_EXPORT Epetra_CrsGraph: public Epetra_DistObject {
   //! Returns a pointer to the CrsGraphData instance this CrsGraph uses.
   /*! (Intended for developer use only for testing purposes.) */
   const Epetra_CrsGraphData* DataPtr() const {return(CrsGraphData_);}
+  
 
-        //! Forces FillComplete() to locally order ghostnodes associated with each remote processor in ascending order.
+  //! Returns a reference to the Epetra_IntSerialDenseVector used to hold the local IndexOffsets (CRS rowptr)
+  /*!
+    \warning This method is intended for experts only, its use may require user code modifications in future versions of Epetra.
+  */
+  Epetra_IntSerialDenseVector& ExpertExtractIndexOffset(); 
+
+  //! Returns a reference to the Epetra_IntSerialDenseVector used to hold the local All_Indices (CRS colind)
+  /*!
+    \warning This method is intended for experts only, its use may require user code modifications in future versions of Epetra.
+  */
+  Epetra_IntSerialDenseVector& ExpertExtractIndices();
+
+
+    //! Forces FillComplete() to locally order ghostnodes associated with each remote processor in ascending order.
         /*! To be compliant with AztecOO, FillComplete() already locally orders ghostnodes such that
             information received from processor k has a lower local numbering than information received
             from processor j if k is less than j.  SortGhostsAssociatedWithEachProcessor(True) further
diff --git a/packages/epetra/src/Epetra_MpiComm.h b/packages/epetra/src/Epetra_MpiComm.h
index abcd5723e1c4..6a14cb5d6797 100644
--- a/packages/epetra/src/Epetra_MpiComm.h
+++ b/packages/epetra/src/Epetra_MpiComm.h
@@ -474,6 +474,10 @@ class EPETRA_LIB_DLL_EXPORT Epetra_MpiComm: public Epetra_Object, public virtual
   //! Create a distributor object.
   Epetra_Distributor * CreateDistributor() const;
   //! Create a directory object for the given Epetra_BlockMap.
+// CreateDirectory is defined in Winbase.h as a macro!
+#ifdef CreateDirectory
+#undef CreateDirectory
+#endif
   Epetra_Directory * CreateDirectory(const Epetra_BlockMap & Map) const;
   //@}
 
diff --git a/packages/epetra/src/Epetra_SerialComm.h b/packages/epetra/src/Epetra_SerialComm.h
index 1a5ca0a47d68..43f38d505bec 100644
--- a/packages/epetra/src/Epetra_SerialComm.h
+++ b/packages/epetra/src/Epetra_SerialComm.h
@@ -441,6 +441,10 @@ class EPETRA_LIB_DLL_EXPORT Epetra_SerialComm: public Epetra_Object, public virt
   //! Create a distributor object.
   Epetra_Distributor * CreateDistributor() const;
   //! Create a directory object for the given Epetra_BlockMap.
+// CreateDirectory is defined in Winbase.h as a macro!
+#ifdef CreateDirectory
+#undef CreateDirectory
+#endif
   Epetra_Directory * CreateDirectory(const Epetra_BlockMap & Map) const;
   //@}
 
diff --git a/packages/ifpack/src/Ifpack_Hypre.cpp b/packages/ifpack/src/Ifpack_Hypre.cpp
index 5c9a1cef6e92..9fe6a6d43624 100644
--- a/packages/ifpack/src/Ifpack_Hypre.cpp
+++ b/packages/ifpack/src/Ifpack_Hypre.cpp
@@ -53,6 +53,7 @@
 #include "HYPRE_parcsr_ls.h"
 #include "krylov.h"
 #include "_hypre_parcsr_mv.h"
+#include "_hypre_parcsr_ls.h"
 #include "_hypre_IJ_mv.h"
 #include "HYPRE_parcsr_mv.h"
 #include "HYPRE.h"
@@ -792,6 +793,40 @@ int Ifpack_Hypre::Compute(){
     IFPACK_CHK_ERR(PrecondSetupPtr_(Preconditioner_, ParMatrix_, ParX_, ParY_));
   }
 
+  // Dump Hierarchy here for BoomerAMG Preconditioner
+  if(Dump_ && PrecondSolvePtr_ == &HYPRE_BoomerAMGSolve) {
+    hypre_ParAMGData   *amg_data = (hypre_ParAMGData*) Preconditioner_;
+    hypre_ParCSRMatrix    **A_array = hypre_ParAMGDataAArray(amg_data);
+    hypre_ParCSRMatrix    **P_array = hypre_ParAMGDataPArray(amg_data);
+    HYPRE_Int     **CF_marker_array = hypre_ParAMGDataCFMarkerArray(amg_data);
+    HYPRE_Int num_levels = hypre_ParAMGDataNumLevels(amg_data);
+
+    char ofs[80];
+    for(int k=0; k<num_levels; k++) {
+      // A 
+      sprintf(ofs,"A_matrix.bmg.%d.dat",k);
+      HYPRE_ParCSRMatrixPrint(A_array[k], ofs);
+      if(k!=num_levels-1) {
+        // P       
+        sprintf(ofs,"P_matrix.bmg.%d.dat",k);
+        HYPRE_ParCSRMatrixPrint(P_array[k], ofs);
+
+        // CF
+        // Note: Hypre outputs "-1" for F Points and "1" for C Points
+        HYPRE_Int local_size = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array[k]));
+        sprintf(ofs,"cf_marker.bmg.%d.dat.%d",k,Comm().MyPID());
+        FILE * f = fopen(ofs,"w");
+        if(f == NULL) IFPACK_CHK_ERR(-1); // fopen failed: report an error instead of writing through NULL
+        fprintf(f,"%%%%MatrixMarket matrix array real general\n");
+        fprintf(f,"%d 1\n",(int)local_size); // HYPRE_Int may be wider than int; cast to match %d
+        for(int i=0; i<local_size; i++)
+          fprintf(f,"%d\n",(int)CF_marker_array[k][i]);
+        fclose(f);
+      }
+    }
+  }//end dump for BoomerAMG
+
+
   IsComputed_ = true;
   NumCompute_ = NumCompute_ + 1;
   ComputeTime_ = ComputeTime_ + Time_.ElapsedTime();
diff --git a/packages/ifpack2/adapters/thyra/Thyra_Ifpack2PreconditionerFactory_def.hpp b/packages/ifpack2/adapters/thyra/Thyra_Ifpack2PreconditionerFactory_def.hpp
index fd74afdc97f6..a917da8927c8 100644
--- a/packages/ifpack2/adapters/thyra/Thyra_Ifpack2PreconditionerFactory_def.hpp
+++ b/packages/ifpack2/adapters/thyra/Thyra_Ifpack2PreconditionerFactory_def.hpp
@@ -170,9 +170,8 @@ void Ifpack2PreconditionerFactory<MatrixType>::initializePrec(
   // precTypeUpper is the upper-case version of preconditionerType.
   std::string precTypeUpper (preconditionerType);
   if (precTypeUpper.size () > 0) {
-    std::locale locale;
     for (size_t k = 0; k < precTypeUpper.size (); ++k) {
-      precTypeUpper[k] = std::toupper<char> (precTypeUpper[k], locale);
+      precTypeUpper[k] = static_cast<char>(::toupper(static_cast<unsigned char>(precTypeUpper[k]))); // unsigned char cast: toupper is undefined for negative char values
     }
   }
   
diff --git a/packages/ifpack2/example/RelaxationWithEquilibration.cpp b/packages/ifpack2/example/RelaxationWithEquilibration.cpp
index c045c27ed439..4eb218d7664b 100644
--- a/packages/ifpack2/example/RelaxationWithEquilibration.cpp
+++ b/packages/ifpack2/example/RelaxationWithEquilibration.cpp
@@ -913,26 +913,23 @@ densifyGatheredCrsMatrix (LO& errCode,
 {
   const LO numRows = LO (A.getRangeMap ()->getNodeNumElements ());
   const LO numCols = LO (A.getDomainMap ()->getNodeNumElements ());
+  using lids_type = typename Tpetra::CrsMatrix<SC, LO, GO, NT>::local_inds_host_view_type;
+  using vals_type = typename Tpetra::CrsMatrix<SC, LO, GO, NT>::values_host_view_type;
 
   using dense_matrix_type = HostDenseMatrix<SC, LO, GO, NT>;
   dense_matrix_type A_dense (label, numRows, numCols);
 
   for (LO lclRow = 0; lclRow < numRows; ++lclRow) {
-    LO numEnt = 0;
-    const LO* lclColInds = nullptr;
-    const SC* vals = nullptr;
-    const LO curErrCode = A.getLocalRowView (lclRow, numEnt, vals, lclColInds);
-    if (errCode != 0) {
-      errCode = curErrCode;
-    }
-    else {
-      for (LO k = 0; k < numEnt; ++k) {
-        const LO lclCol = lclColInds[k];
-        using impl_scalar_type =
-          typename Tpetra::CrsMatrix<SC, LO, GO, NT>::impl_scalar_type;
-        A_dense(lclRow, lclCol) += impl_scalar_type (vals[k]);
-      }
-    }
+    lids_type lclColInds;
+    vals_type vals;
+    A.getLocalRowView (lclRow, lclColInds, vals);
+    LO numEnt = vals.size();
+    for (LO k = 0; k < numEnt; ++k) {
+      const LO lclCol = lclColInds[k];
+      using impl_scalar_type =
+        typename Tpetra::CrsMatrix<SC, LO, GO, NT>::impl_scalar_type;
+      A_dense(lclRow, lclCol) += impl_scalar_type (vals[k]);
+    }   
   }
 
   return A_dense;
@@ -1125,8 +1122,8 @@ deepCopyFillCompleteCrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A)
     (! A.isFillComplete (), std::invalid_argument,
      "deepCopyFillCompleteCrsMatrix: Input matrix A must be fillComplete.");
   RCP<crs_matrix_type> A_copy (new crs_matrix_type (A.getCrsGraph ()));
-  auto A_copy_lcl = A_copy->getLocalMatrix ();
-  auto A_lcl = A.getLocalMatrix ();
+  auto A_copy_lcl = A_copy->getLocalMatrixDevice ();
+  auto A_lcl = A.getLocalMatrixDevice ();
   Kokkos::deep_copy (A_copy_lcl.values, A_lcl.values);
   A_copy->fillComplete (A.getDomainMap (), A.getRangeMap ());
   return A_copy;
diff --git a/packages/ifpack2/src/Ifpack2_BandedContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_BandedContainer_decl.hpp
index d36b348160f9..6e01346cd9a8 100644
--- a/packages/ifpack2/src/Ifpack2_BandedContainer_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_BandedContainer_decl.hpp
@@ -137,6 +137,7 @@ class BandedContainer
   using typename Container<MatrixType>::HostView;
   using typename ContainerImpl<MatrixType, LSC>::HostSubviewLocal;
   using typename ContainerImpl<MatrixType, LSC>::ConstHostSubviewLocal;
+  using typename ContainerImpl<MatrixType,LSC>::block_crs_matrix_type;
   using HostViewLocal = typename local_mv_type::dual_view_type::t_host;
 
   static_assert(std::is_same<MatrixType,
diff --git a/packages/ifpack2/src/Ifpack2_BandedContainer_def.hpp b/packages/ifpack2/src/Ifpack2_BandedContainer_def.hpp
index 20c0cbe17d34..8caa3c782647 100644
--- a/packages/ifpack2/src/Ifpack2_BandedContainer_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_BandedContainer_def.hpp
@@ -129,14 +129,17 @@ computeBandwidth()
         LO localCol = this->translateRowToCol(blockRows[j]);
         colToBlockOffset[localCol] = blockStart + j;
       }
+
+      using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
+      using h_vals_type = typename block_crs_matrix_type::values_host_view_type;
       for(LO blockRow = 0; blockRow < LO(blockRows.size()); blockRow++)
       {
         //get a raw view of the whole block row
-        const LO* indices;
-        SC* values;
-        LO numEntries;
+        h_inds_type indices;
+        h_vals_type values;
         LO inputRow = this->blockRows_[blockStart + blockRow];
-        this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values, numEntries);
+        this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values);
+        LO numEntries = (LO) indices.size();
         for(LO k = 0; k < numEntries; k++)
         {
           LO colOffset = colToBlockOffset[indices[k]];
@@ -285,14 +288,16 @@ void BandedContainer<MatrixType, LocalScalarType>::extract()
         LO localCol = this->translateRowToCol(blockRows[j]);
         colToBlockOffset[localCol] = blockStart + j;
       }
+      using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
+      using h_vals_type = typename block_crs_matrix_type::values_host_view_type;
       for(LO blockRow = 0; blockRow < LO(blockRows.size()); blockRow++)
       {
         //get a raw view of the whole block row
-        const LO* indices;
-        SC* values;
-        LO numEntries;
+        h_inds_type indices;
+        h_vals_type values;
         LO inputRow = this->blockRows_[blockStart + blockRow];
-        this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values, numEntries);
+        this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values);
+        LO numEntries = (LO) indices.size();        
         for(LO k = 0; k < numEntries; k++)
         {
           LO colOffset = colToBlockOffset[indices[k]];
diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
index aad796ef8d8b..4e4e75aadaac 100644
--- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
+++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
@@ -363,7 +363,7 @@ namespace Ifpack2 {
       typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
       typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
       typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
-      typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_type local_crs_graph_type;
+      typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type;
 
       ///
       /// simd vectorization
@@ -1563,10 +1563,10 @@ namespace Ifpack2 {
 
       // construct the D and R graphs in A = D + R.
       {
-        const auto& local_graph = g.getLocalGraph();
-        const auto& local_graph_rowptr = local_graph.row_map;
+        const auto local_graph = g.getLocalGraphHost();
+        const auto local_graph_rowptr = local_graph.row_map;
         TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
-        const auto& local_graph_colidx = local_graph.entries;
+        const auto local_graph_colidx = local_graph.entries;
 
         //assume no overlap.
 
@@ -1783,8 +1783,8 @@ namespace Ifpack2 {
           }
 
           // Allocate or view values.
-          amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->
-                               template getValues<node_memory_space>());
+          amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->getValuesDeviceNonConst());
+                               
         }
       }
     }
@@ -1914,8 +1914,8 @@ namespace Ifpack2 {
         packptr(interf_.packptr),
         max_partsz(interf_.max_partsz),
         // block crs matrix
-        A_rowptr(A_->getCrsGraph().getLocalGraph().row_map),
-        A_values(const_cast<block_crs_matrix_type*>(A_.get())->template getValues<memory_space>()),
+        A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map),
+        A_values(const_cast<block_crs_matrix_type*>(A_.get())->getValuesDeviceNonConst()),
         // block tridiags
         pack_td_ptr(btdm_.pack_td_ptr),
         flat_td_ptr(btdm_.flat_td_ptr),
@@ -3822,7 +3822,7 @@ namespace Ifpack2 {
 
       const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
       ComputeResidualVector<MatrixType>
-        compute_residual_vector(amd, A->getCrsGraph().getLocalGraph(), blocksize, interf,
+        compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf,
                                 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
 
       // norm manager workspace resize
diff --git a/packages/ifpack2/src/Ifpack2_Container_decl.hpp b/packages/ifpack2/src/Ifpack2_Container_decl.hpp
index 82063ab27de8..f43f89a6cf07 100644
--- a/packages/ifpack2/src/Ifpack2_Container_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_Container_decl.hpp
@@ -537,25 +537,33 @@ namespace Details {
   {
     using SC = Scalar;
     using LO = LocalOrdinal;
+
+    using block_crs_matrix_type = Tpetra::BlockCrsMatrix<SC, LO, GlobalOrdinal, Node>;
+
+    using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
+    using h_vals_type = typename block_crs_matrix_type::values_host_view_type;
     //! Constructor for row views (preferred)
-    StridedRowView(const SC* vals_, const LO* inds_, int blockSize_, size_t nnz_);
+    StridedRowView(h_vals_type vals_, h_inds_type inds_, int blockSize_, size_t nnz_);
+
+    //! Constructor for row views 
+    //    StridedRowView(const SC* vals_, const LO* inds_, int blockSize_, size_t nnz_);
 
     //! Constructor for deep copy (fallback, if matrix doesn't support row views)
     StridedRowView(Teuchos::Array<SC>& vals_, Teuchos::Array<LO>& inds_);
-    
+        
     SC val(size_t i) const;
     LO ind(size_t i) const;
 
     size_t size() const;
 
     private:
-      const SC* vals;
-      const LO* inds;
-      int blockSize;
-      size_t nnz;
-      //These arrays are only used if the inputMatrix_ doesn't support row views.
-      Teuchos::Array<SC> valsCopy;
-      Teuchos::Array<LO> indsCopy;
+    h_vals_type vals;
+    h_inds_type inds;
+    int blockSize;
+    size_t nnz;
+    //These arrays are only used if the inputMatrix_ doesn't support row views.
+    Teuchos::Array<SC> valsCopy;
+    Teuchos::Array<LO> indsCopy;
   };
 } // namespace Details
 
diff --git a/packages/ifpack2/src/Ifpack2_Container_def.hpp b/packages/ifpack2/src/Ifpack2_Container_def.hpp
index b5c9b220a7af..04f693422b8c 100644
--- a/packages/ifpack2/src/Ifpack2_Container_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_Container_def.hpp
@@ -264,13 +264,15 @@ void ContainerImpl<MatrixType, LocalScalarType>::DoGSBlock(
     //Use efficient blocked version
     ArrayView<const LO> blockRows = this->getBlockRows(i);
     const size_t localNumRows = this->blockSizes_[i];
+    using inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
+    using vals_type = typename block_crs_matrix_type::values_host_view_type;
     for(size_t j = 0; j < localNumRows; j++)
     {
       LO row = blockRows[j]; // Containers_[i]->ID (j);
-      LO numEntries;
-      SC* values;
-      const LO* colinds;
-      this->inputBlockMatrix_->getLocalRowView(row, colinds, values, numEntries);
+      vals_type values;
+      inds_type colinds;
+      this->inputBlockMatrix_->getLocalRowView(row, colinds, values);
+      LO numEntries = (LO) colinds.size();
       for(size_t m = 0; m < numVecs; m++)
       {
         for (int localR = 0; localR < this->bcrsBlockSize_; localR++)
@@ -318,8 +320,8 @@ void ContainerImpl<MatrixType, LocalScalarType>::DoGSBlock(
     //But, can only do this if the matrix is accessible directly from host, since it's not a DualView
     using container_exec_space = typename ContainerImpl<MatrixType, LocalScalarType>::crs_matrix_type::execution_space;
     container_exec_space().fence();
-    auto localA = this->inputCrsMatrix_->getLocalMatrix();
-    using size_type = typename crs_matrix_type::local_matrix_type::size_type;
+    auto localA = this->inputCrsMatrix_->getLocalMatrixHost();
+    using size_type = typename crs_matrix_type::local_matrix_host_type::size_type;
     const auto& rowmap = localA.graph.row_map;
     const auto& entries = localA.graph.entries;
     const auto& values = localA.values;
@@ -844,33 +846,47 @@ Details::StridedRowView<
   typename ContainerImpl<MatrixType, LocalScalarType>::NO>
 ContainerImpl<MatrixType, LocalScalarType>::
 getInputRowView(LO row) const
-{
+{  
+
+  typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type;
+
+  typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type;
+  typedef typename MatrixType::values_host_view_type values_host_view_type;
+  using IST = typename row_matrix_type::impl_scalar_type;
+
   if(this->hasBlockCrs_)
   {
-    const LO* colinds;
-    SC* values;
-    LO numEntries;
-    this->inputBlockMatrix_->getLocalRowView(row / this->bcrsBlockSize_, colinds, values, numEntries);
-    return StridedRowView(values + row % this->bcrsBlockSize_, colinds, this->bcrsBlockSize_, numEntries * this->bcrsBlockSize_);
+    using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
+    using h_vals_type = typename block_crs_matrix_type::values_host_view_type;
+    h_inds_type colinds;
+    h_vals_type values;
+    this->inputBlockMatrix_->getLocalRowView(row / this->bcrsBlockSize_, colinds, values);
+    size_t numEntries = colinds.size();
+    // Start the values view at this point row's offset within its block (row % bcrsBlockSize_);
+    // this mirrors the former raw-pointer form `values + row % this->bcrsBlockSize_`.
+    h_vals_type subvals = Kokkos::subview(values,std::pair<size_t,size_t>(row % this->bcrsBlockSize_,values.size()));
+    return StridedRowView(subvals, colinds, this->bcrsBlockSize_, numEntries * this->bcrsBlockSize_);
   }
   else if(!this->inputMatrix_->supportsRowViews())
   {
     size_t maxEntries = this->inputMatrix_->getNodeMaxNumRowEntries();
-    Teuchos::Array<LO> indsCopy(maxEntries);
-    Teuchos::Array<SC> valsCopy(maxEntries);
+    Teuchos::Array<LO> inds(maxEntries);
+    Teuchos::Array<SC> vals(maxEntries);
+    nonconst_local_inds_host_view_type inds_v(inds.data(),maxEntries);
+    nonconst_values_host_view_type vals_v(reinterpret_cast<IST*>(vals.data()),maxEntries);
     size_t numEntries;
-    this->inputMatrix_->getLocalRowCopy(row, indsCopy, valsCopy, numEntries);
-    indsCopy.resize(numEntries);
-    valsCopy.resize(numEntries);
-    return StridedRowView(valsCopy, indsCopy);
+    this->inputMatrix_->getLocalRowCopy(row, inds_v, vals_v, numEntries);
+    vals.resize(numEntries); inds.resize(numEntries);
+    return StridedRowView(vals, inds);
   }
   else
   {
-    const LO* colinds;
-    const SC* values;
-    LO numEntries;
-    this->inputMatrix_->getLocalRowViewRaw(row, numEntries, colinds, values);
-    return StridedRowView(values, colinds, 1, numEntries);
+    // CMS - This is dangerous and might not work.
+    local_inds_host_view_type colinds;
+    values_host_view_type values;
+    this->inputMatrix_->getLocalRowView(row, colinds, values);
+    return StridedRowView(values, colinds, 1, colinds.size());
   }
 }
 
@@ -890,14 +906,14 @@ namespace Details {
 //Implementation of Ifpack2::Details::StridedRowView
 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 StridedRowView<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-StridedRowView(const SC* vals_, const LO* inds_, int blockSize_, size_t nnz_)
+StridedRowView(h_vals_type vals_, h_inds_type inds_, int blockSize_, size_t nnz_)
   : vals(vals_), inds(inds_), blockSize(blockSize_), nnz(nnz_)
 {}
 
 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 StridedRowView<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 StridedRowView(Teuchos::Array<SC>& vals_, Teuchos::Array<LO>& inds_)
-  : vals(nullptr), inds(nullptr), blockSize(1), nnz(vals_.size())
+  : vals(), inds(), blockSize(1), nnz(vals_.size())
 {
   valsCopy.swap(vals_);
   indsCopy.swap(inds_);
@@ -911,7 +927,7 @@ val(size_t i) const
   TEUCHOS_TEST_FOR_EXCEPTION(i >= nnz, std::runtime_error,
       "Out-of-bounds access into Ifpack2::Container::StridedRowView");
   #endif
-  if(vals)
+  if(vals.size() > 0)
   {
     if(blockSize == 1)
       return vals[i];
@@ -931,7 +947,7 @@ ind(size_t i) const
         "Out-of-bounds access into Ifpack2::Container::StridedRowView");
   #endif
   //inds is smaller than vals by a factor of the block size (dofs/node)
-  if(inds)
+    if(inds.size() > 0)
   {
     if(blockSize == 1)
       return inds[i];
diff --git a/packages/ifpack2/src/Ifpack2_DenseContainer_def.hpp b/packages/ifpack2/src/Ifpack2_DenseContainer_def.hpp
index 058ffac13387..3c96bd09abc4 100644
--- a/packages/ifpack2/src/Ifpack2_DenseContainer_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_DenseContainer_def.hpp
@@ -160,14 +160,16 @@ void DenseContainer<MatrixType, LocalScalarType>::extract()
         LO localCol = this->translateRowToCol(blockRows[j]);
         colToBlockOffset[localCol] = blockStart + j;
       }
+      using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
+      using h_vals_type = typename block_crs_matrix_type::values_host_view_type;
       for(LO blockRow = 0; blockRow < LO(blockRows.size()); blockRow++)
       {
         //get a raw view of the whole block row
-        const LO* indices;
-        SC* values;
-        LO numEntries;
+        h_inds_type indices;
+        h_vals_type values;
         LO inputRow = this->blockRows_[blockStart + blockRow];
-        this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values, numEntries);
+        this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values);
+        LO numEntries = (LO) indices.size();
         for(LO k = 0; k < numEntries; k++)
         {
           LO colOffset = colToBlockOffset[indices[k]];
diff --git a/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp b/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp
index c058c876c385..7ec68c63a709 100644
--- a/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_Details_ChebyshevKernel_def.hpp
@@ -439,7 +439,7 @@ fusedCase (vector_type& W,
   using Impl::chebyshev_kernel_vector;
   using STS = Teuchos::ScalarTraits<SC>;
 
-  auto A_lcl = A.getLocalMatrix ();
+  auto A_lcl = A.getLocalMatrixDevice ();
   //D_inv, B, X and W are all Vectors, so it's safe to take the first column only
   auto Dinv_lcl = Kokkos::subview(D_inv.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0);
   auto B_lcl = Kokkos::subview(B.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0);
diff --git a/packages/ifpack2/src/Ifpack2_Details_DenseSolver_decl.hpp b/packages/ifpack2/src/Ifpack2_Details_DenseSolver_decl.hpp
index 03a05e654712..05190e78916a 100644
--- a/packages/ifpack2/src/Ifpack2_Details_DenseSolver_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_Details_DenseSolver_decl.hpp
@@ -125,6 +125,12 @@ class DenseSolver<MatrixType, false> :
   static_assert(std::is_same<MatrixType, row_matrix_type>::value,
                 "Ifpack2::Details::DenseSolver: Please use MatrixType = Tpetra::RowMatrix.");
 
+  typedef typename row_matrix_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename row_matrix_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename row_matrix_type::nonconst_values_host_view_type nonconst_values_host_view_type;
+
+
+
   //! Specialization of Tpetra::Map used by this class.
   typedef Tpetra::Map<local_ordinal_type, global_ordinal_type, node_type> map_type;
 
diff --git a/packages/ifpack2/src/Ifpack2_Details_DenseSolver_def.hpp b/packages/ifpack2/src/Ifpack2_Details_DenseSolver_def.hpp
index 9bfea906cf1c..460529cb9097 100644
--- a/packages/ifpack2/src/Ifpack2_Details_DenseSolver_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_Details_DenseSolver_def.hpp
@@ -642,8 +642,8 @@ extract (Teuchos::SerialDenseMatrix<int, scalar_type>& A_local_dense,
   // each row of A_local.
   const size_type maxNumRowEntries =
     static_cast<size_type> (A_local.getNodeMaxNumRowEntries ());
-  Array<LO> localIndices (maxNumRowEntries);
-  Array<scalar_type> values (maxNumRowEntries);
+  nonconst_local_inds_host_view_type localIndices ("localIndices",maxNumRowEntries);
+  nonconst_values_host_view_type values ("values",maxNumRowEntries);
 
   const LO numLocalRows = static_cast<LO> (rowMap.getNodeNumElements ());
   const LO minLocalRow = rowMap.getMinLocalIndex ();
@@ -661,8 +661,8 @@ extract (Teuchos::SerialDenseMatrix<int, scalar_type>& A_local_dense,
       static_cast<size_type> (A_local.getNumEntriesInLocalRow (localRow));
     size_t numEntriesOut = 0; // ignored
     A_local.getLocalRowCopy (localRow,
-                             localIndices (0, numEntriesInRow),
-                             values (0, numEntriesInRow),
+                             localIndices,
+                             values,                             
                              numEntriesOut);
     for (LO k = 0; k < numEntriesInRow; ++k) {
       const LO localCol = localIndices[k];
diff --git a/packages/ifpack2/src/Ifpack2_Details_Factory_def.hpp b/packages/ifpack2/src/Ifpack2_Details_Factory_def.hpp
index b62d22c58b8b..35aef8c8344c 100644
--- a/packages/ifpack2/src/Ifpack2_Details_Factory_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_Details_Factory_def.hpp
@@ -122,9 +122,9 @@ create (const std::string& precType,
   // precTypeUpper is the upper-case version of precType.
   std::string precTypeUpper (precType);
   if (precTypeUpper.size () > 0) {
-    std::locale locale;
     for (size_t k = 0; k < precTypeUpper.size (); ++k) {
-      precTypeUpper[k] = std::toupper<char> (precTypeUpper[k], locale);
+      // Go through unsigned char: handing ::toupper a negative char value is undefined behavior.
+      precTypeUpper[k] = static_cast<char> (::toupper (static_cast<unsigned char> (precTypeUpper[k])));
     }
   }
 
diff --git a/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp b/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp
index 137323cd1b6b..23c2b71d4134 100644
--- a/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp
+++ b/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp
@@ -51,14 +51,14 @@ namespace Details
     using crs_matrix_type = Tpetra::CrsMatrix<Scalar, LO, GO, NT>;
     using bcrs_matrix_type = Tpetra::BlockCrsMatrix<Scalar, LO, GO, NT>;
     using row_matrix_type = Tpetra::RowMatrix<Scalar, LO, GO, NT>;
-    using local_matrix_type = typename crs_matrix_type::local_matrix_type;
+    using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
     using vector_type = Tpetra::Vector<Scalar, LO, GO, NT>;
     using multivector_type = Tpetra::MultiVector<Scalar, LO, GO, NT>;
     using block_multivector_type = Tpetra::BlockMultiVector<Scalar, LO, GO, NT>;
-    using mem_space_t = typename local_matrix_type::memory_space;
-    using rowmap_t = typename local_matrix_type::row_map_type::HostMirror;
-    using entries_t = typename local_matrix_type::index_type::HostMirror;
-    using values_t = typename local_matrix_type::values_type::HostMirror;
+    using mem_space_t = typename local_matrix_device_type::memory_space;
+    using rowmap_t = typename local_matrix_device_type::row_map_type::HostMirror;
+    using entries_t = typename local_matrix_device_type::index_type::HostMirror;
+    using values_t = typename local_matrix_device_type::values_type::HostMirror;
     using Offset = typename rowmap_t::non_const_value_type;
     using IST = typename crs_matrix_type::impl_scalar_type;
     using KAT = Kokkos::ArithTraits<IST>;
@@ -66,6 +66,10 @@ namespace Details
     using InverseBlocks = Kokkos::View<IST***, typename bcrs_matrix_type::device_type>;
     using InverseBlocksHost = typename InverseBlocks::HostMirror;
 
+    typedef typename crs_matrix_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+    typedef typename crs_matrix_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+    typedef typename crs_matrix_type::nonconst_values_host_view_type nonconst_values_host_view_type;
+
     //Setup for CrsMatrix
     GaussSeidel(const crs_matrix_type& A, Teuchos::RCP<vector_type>& inverseDiagVec_, Teuchos::ArrayRCP<LO>& applyRows_, Scalar omega_)
     {
@@ -74,7 +78,7 @@ namespace Details
       applyRows = applyRows_;
       blockSize = 1;
       omega = omega_;
-      auto Alocal = A.getLocalMatrix();
+      auto Alocal = A.getLocalMatrixDevice();
       Arowmap = Kokkos::create_mirror_view(Alocal.graph.row_map);
       Aentries = Kokkos::create_mirror_view(Alocal.graph.entries);
       Avalues = Kokkos::create_mirror_view(Alocal.values);
@@ -95,8 +99,8 @@ namespace Details
       Aentries = entries_t(Kokkos::ViewAllocateWithoutInitializing("Aentries"), A.getNodeNumEntries());
       Avalues = values_t(Kokkos::ViewAllocateWithoutInitializing("Avalues"), A.getNodeNumEntries());
       size_t maxDegree = A.getNodeMaxNumRowEntries();
-      Teuchos::Array<Scalar> rowValues(maxDegree);
-      Teuchos::Array<LO> rowEntries(maxDegree);
+      nonconst_values_host_view_type rowValues("rowValues",maxDegree);
+      nonconst_local_inds_host_view_type rowEntries("rowEntries",maxDegree);
       size_t accum = 0;
       for(LO i = 0; i <= numRows; i++)
       {
@@ -104,7 +108,7 @@ namespace Details
         if(i == numRows)
           break;
         size_t degree;
-        A.getLocalRowCopy(i, rowEntries(), rowValues(), degree);
+        A.getLocalRowCopy(i, rowEntries, rowValues, degree);
         accum += degree;
         size_t rowBegin = Arowmap(i);
         for(size_t j = 0; j < degree; j++)
@@ -123,9 +127,9 @@ namespace Details
       Kokkos::deep_copy(inverseBlockDiag, inverseBlockDiag_);
       applyRows = applyRows_;
       omega = omega_;
-      auto AlocalGraph = A.getCrsGraph().getLocalGraph();
+      auto AlocalGraph = A.getCrsGraph().getLocalGraphDevice();
       //A.sync_host();  //note: this only syncs values, not graph
-      Avalues = A.getValuesHost();
+      Avalues = A.getValuesHostNonConst();
       Arowmap = Kokkos::create_mirror_view(AlocalGraph.row_map);
       Aentries = Kokkos::create_mirror_view(AlocalGraph.entries);
       Kokkos::deep_copy(Arowmap, AlocalGraph.row_map);
diff --git a/packages/ifpack2/src/Ifpack2_Details_InverseDiagonalKernel_def.hpp b/packages/ifpack2/src/Ifpack2_Details_InverseDiagonalKernel_def.hpp
index 146f6635a74f..a6775a308b39 100644
--- a/packages/ifpack2/src/Ifpack2_Details_InverseDiagonalKernel_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_Details_InverseDiagonalKernel_def.hpp
@@ -197,7 +197,8 @@ compute (vector_type& D_inv,
 
     // Canonicalize template arguments to avoid redundant instantiations.
   using d_type = typename vector_type::dual_view_type::t_dev;
-  using matrix_type = typename crs_matrix_type::local_matrix_type;
+  //  using h_matrix_type = typename crs_matrix_type::local_matrix_host_type;
+  using d_matrix_type = typename crs_matrix_type::local_matrix_device_type;
 
   const char kernel_label[] = "inverse_diagonal_kernel";
   using execution_space = typename NT::execution_space;
@@ -206,7 +207,7 @@ compute (vector_type& D_inv,
   auto policy = range_type(0, lclNumRows);
 
   d_type d = D_inv.getLocalViewDevice(Tpetra::Access::OverwriteAll);
-  matrix_type a = A_crs_->getLocalMatrix();
+  d_matrix_type a = A_crs_->getLocalMatrixDevice();
 
   if (do_l1) {
     constexpr bool do_l1_template = true;
@@ -214,7 +215,7 @@ compute (vector_type& D_inv,
       constexpr bool fix_tiny_template = true;
       using functor_type =
         Impl::InverseDiagonalWithExtraction<d_type,
-                               matrix_type,
+                               d_matrix_type,
                                offset_type,
                                do_l1_template,
                                fix_tiny_template>;
@@ -224,7 +225,7 @@ compute (vector_type& D_inv,
       constexpr bool fix_tiny_template = false;
       using functor_type =
         Impl::InverseDiagonalWithExtraction<d_type,
-                               matrix_type,
+                               d_matrix_type,
                                offset_type,
                                do_l1_template,
                                fix_tiny_template>;
@@ -237,7 +238,7 @@ compute (vector_type& D_inv,
       constexpr bool fix_tiny_template = true;
       using functor_type =
         Impl::InverseDiagonalWithExtraction<d_type,
-                               matrix_type,
+                               d_matrix_type,
                                offset_type,
                                do_l1_template,
                                fix_tiny_template>;
@@ -247,7 +248,7 @@ compute (vector_type& D_inv,
       constexpr bool fix_tiny_template = false;
       using functor_type =
         Impl::InverseDiagonalWithExtraction<d_type,
-                               matrix_type,
+                               d_matrix_type,
                                offset_type,
                                do_l1_template,
                                fix_tiny_template>;
diff --git a/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_decl.hpp b/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_decl.hpp
index f6386c941890..91550d9b075f 100644
--- a/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_decl.hpp
@@ -70,11 +70,16 @@ class OverlappingRowGraph :
   typedef typename GraphType::local_ordinal_type local_ordinal_type;
   typedef typename GraphType::global_ordinal_type global_ordinal_type;
   typedef typename GraphType::node_type node_type;
-
+  typedef typename GraphType::local_inds_host_view_type local_inds_host_view_type;  
+  typedef typename GraphType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;  
+  typedef typename GraphType::global_inds_host_view_type global_inds_host_view_type;
+  typedef typename GraphType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  
   typedef Tpetra::Export<local_ordinal_type, global_ordinal_type, node_type> export_type;
   typedef Tpetra::Import<local_ordinal_type, global_ordinal_type, node_type> import_type;
   typedef Tpetra::Map<local_ordinal_type, global_ordinal_type, node_type> map_type;
   typedef Tpetra::RowGraph<local_ordinal_type, global_ordinal_type, node_type> row_graph_type;
+  
   //@}
   //! \name Constructors and destructor
   //@{
@@ -225,10 +230,18 @@ class OverlappingRowGraph :
   /// \c globalRow does not belong to this process, then \c indices is
   /// not modified and \c numIndices is set to
   /// Teuchos::OrdinalTraits<size_t>::invalid() on output.
+  virtual void
+  getGlobalRowCopy (global_ordinal_type globalRow,
+                    nonconst_global_inds_host_view_type& indices,
+                    size_t& numIndices) const;
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void
   getGlobalRowCopy (global_ordinal_type globalRow,
                     const Teuchos::ArrayView<global_ordinal_type>& indices,
                     size_t& numIndices) const;
+#endif
+
 
   /// \brief Copy out a list of local column indices in the given
   ///   local row that are owned by the calling process.
@@ -247,10 +260,79 @@ class OverlappingRowGraph :
   /// <tt>localRow</tt> does not belong to this process, then
   /// <tt>indices</tt> is not modified and \c numIndices is set to
   /// Teuchos::OrdinalTraits<size_t>::invalid() on output.
+  virtual void
+  getLocalRowCopy (local_ordinal_type localRow,
+                   nonconst_local_inds_host_view_type& indices,
+                   size_t& numIndices) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void
   getLocalRowCopy (local_ordinal_type localRow,
                    const Teuchos::ArrayView<local_ordinal_type>& indices,
                    size_t& numIndices) const;
+#endif
+
+    /// \brief Get a constant, nonpersisting, locally indexed view of
+    ///   the given row of the graph.
+    ///
+    /// The returned views of the column indices are not guaranteed to
+    /// persist beyond the lifetime of <tt>this</tt>.  Furthermore,
+    /// some RowGraph implementations allow changing the values, or
+    /// the indices and values.  Any such changes invalidate the
+    /// returned views.
+    ///
+    /// This method only gets the entries in the given row that are
+    /// stored on the calling process.  Note that if the graph has an
+    /// overlapping row Map, it is possible that the calling process
+    /// does not store all the entries in that row.
+    ///
+    /// \pre <tt>isLocallyIndexed () && supportsRowViews ()</tt>
+    /// \post <tt>lclColInds.size () == getNumEntriesInLocalRow (lclRow)</tt>
+    ///
+    /// \param lclRow [in] Local index of the row.
+    /// \param lclColInds [out] Local indices of the columns in the
+    ///   row.  If the given row is not a valid row index on the
+    ///   calling process, then the result has no entries (its size is
+    ///   zero).
+    ///
+    /// Subclasses are expected to implement this method.  We would
+    /// have made this method pure virtual, but that would have broken
+    /// backwards compatibility, since we added the method at least
+    /// one major release after introducing this class.
+    virtual void
+    getLocalRowView (const local_ordinal_type lclRow,
+                     local_inds_host_view_type & lclColInds) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+    virtual void
+    getLocalRowView (const local_ordinal_type lclRow,
+                     Teuchos::ArrayView<const local_ordinal_type>& lclColInds) const;
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+
+    /// \brief Get a const, non-persisting view of the given global
+    ///   row's global column indices, as a global_inds_host_view_type.
+    ///
+    /// \param gblRow [in] Global index of the row.
+    /// \param gblColInds [out] Global column indices in the row.  If
+    ///   the given row is not a valid row index on the calling
+    ///   process, then the result has no entries (its size is zero).
+    ///
+    /// \pre <tt>! isLocallyIndexed()</tt>
+    /// \post <tt>gblColInds.size() == getNumEntriesInGlobalRow(gblRow)</tt>
+    ///
+    /// Subclasses are expected to implement this method.  We would
+    /// have made this method pure virtual, but that would have broken
+    /// backwards compatibility, since we added the method at least
+    /// one major release after introducing this class.
+    virtual void
+    getGlobalRowView (const global_ordinal_type gblRow,
+                      global_inds_host_view_type& gblColInds) const;
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+    virtual void
+    getGlobalRowView (const global_ordinal_type gblRow,
+                      Teuchos::ArrayView<const global_ordinal_type>& gblColInds) const;
+#endif
+
+
   //@}
 private:
   //! \name Internal data
diff --git a/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_def.hpp b/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_def.hpp
index 39126127244a..d9367503f9b9 100644
--- a/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_Details_OverlappingRowGraph_def.hpp
@@ -259,8 +259,27 @@ bool OverlappingRowGraph<GraphType>::isFillComplete () const
 {
   return true;
 }
-  
+ 
+// Copy a global row's column indices from whichever underlying graph holds the row.
+template<class GraphType>
+void OverlappingRowGraph<GraphType>::
+getGlobalRowCopy (global_ordinal_type globalRow,
+                  nonconst_global_inds_host_view_type& indices,
+                  size_t& numIndices) const
+{
+  using Teuchos::as;
+  const local_ordinal_type lclRow = rowMap_->getLocalElement (globalRow);
+  if (lclRow == Teuchos::OrdinalTraits<local_ordinal_type>::invalid ()) {
+    // No local index for this row: report an invalid count.
+    numIndices = Teuchos::OrdinalTraits<size_t>::invalid ();
+  } else if (as<size_t> (lclRow) < nonoverlappingGraph_->getNodeNumRows ()) {
+    nonoverlappingGraph_->getGlobalRowCopy (globalRow, indices, numIndices);
+  } else {
+    overlappingGraph_->getGlobalRowCopy (globalRow, indices, numIndices);
+  }
+}
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class GraphType>
 void
 OverlappingRowGraph<GraphType>::
@@ -279,13 +298,13 @@ getGlobalRowCopy (global_ordinal_type globalRow,
     }
   }
 }
-  
+#endif
 
 template<class GraphType>
 void
 OverlappingRowGraph<GraphType>::
-getLocalRowCopy (local_ordinal_type localRow, 
-                 const Teuchos::ArrayView<local_ordinal_type>& indices, 
+getLocalRowCopy (local_ordinal_type localRow,
+                 nonconst_local_inds_host_view_type& indices,
                  size_t& numIndices) const
 {
   using Teuchos::as;
@@ -298,7 +317,101 @@ getLocalRowCopy (local_ordinal_type localRow,
     overlappingGraph_->getLocalRowCopy (localRowOffset, indices, numIndices);
   }
 }
-  
+
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+// Deprecated ArrayView overload: copy a local row's column indices.
+template<class GraphType>
+void OverlappingRowGraph<GraphType>::
+getLocalRowCopy (local_ordinal_type localRow,
+                 const Teuchos::ArrayView<local_ordinal_type>& indices,
+                 size_t& numIndices) const
+{
+  const size_t numNonoverlapRows = nonoverlappingGraph_->getNodeNumRows ();
+  if (Teuchos::as<size_t> (localRow) >= numNonoverlapRows) {
+    // Past the nonoverlapping rows: read the overlapping graph at a re-based index.
+    const local_ordinal_type shiftedRow =
+      localRow - Teuchos::as<local_ordinal_type> (numNonoverlapRows);
+    overlappingGraph_->getLocalRowCopy (shiftedRow, indices, numIndices);
+    return;
+  }
+  nonoverlappingGraph_->getLocalRowCopy (localRow, indices, numIndices);
+}
+#endif
+
+// View a global row's column indices; a row with no local index yields a default-constructed view.
+template<class GraphType>
+void OverlappingRowGraph<GraphType>::
+getGlobalRowView (global_ordinal_type GlobalRow,
+                  global_inds_host_view_type &indices) const
+{
+  using Teuchos::as;
+  const local_ordinal_type lclRow = rowMap_->getLocalElement (GlobalRow);
+  if (lclRow == Teuchos::OrdinalTraits<local_ordinal_type>::invalid ()) {
+    indices = global_inds_host_view_type ();
+  } else if (as<size_t> (lclRow) < nonoverlappingGraph_->getNodeNumRows ()) {
+    nonoverlappingGraph_->getGlobalRowView (GlobalRow, indices);
+  } else {
+    overlappingGraph_->getGlobalRowView (GlobalRow, indices);
+  }
+}
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+// Deprecated ArrayView overload of getGlobalRowView.
+template<class GraphType>
+void OverlappingRowGraph<GraphType>::
+getGlobalRowView (global_ordinal_type GlobalRow,
+                  Teuchos::ArrayView<const global_ordinal_type>& indices) const
+{
+  using Teuchos::as;
+  const local_ordinal_type lclRow = rowMap_->getLocalElement (GlobalRow);
+  if (lclRow == Teuchos::OrdinalTraits<local_ordinal_type>::invalid ()) {
+    // No local index for this row: hand back a null view.
+    indices = Teuchos::null;
+  } else if (as<size_t> (lclRow) < nonoverlappingGraph_->getNodeNumRows ()) {
+    nonoverlappingGraph_->getGlobalRowView (GlobalRow, indices);
+  } else {
+    overlappingGraph_->getGlobalRowView (GlobalRow, indices);
+  }
+}
+#endif
+
+
+// View a local row's column indices from whichever underlying graph stores the row.
+template<class GraphType>
+void OverlappingRowGraph<GraphType>::
+getLocalRowView (local_ordinal_type LocalRow,
+                 local_inds_host_view_type & indices) const
+{
+  const size_t numNonoverlapRows = nonoverlappingGraph_->getNodeNumRows ();
+  if (Teuchos::as<size_t> (LocalRow) >= numNonoverlapRows) {
+    const local_ordinal_type shiftedRow =
+      LocalRow - Teuchos::as<local_ordinal_type> (numNonoverlapRows);
+    overlappingGraph_->getLocalRowView (shiftedRow, indices);
+    return;
+  }
+  nonoverlappingGraph_->getLocalRowView (LocalRow, indices);
+}
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+// Deprecated ArrayView overload of getLocalRowView.
+template<class GraphType>
+void OverlappingRowGraph<GraphType>::
+getLocalRowView (local_ordinal_type LocalRow,
+                 Teuchos::ArrayView<const local_ordinal_type>& indices) const
+{
+  const size_t numNonoverlapRows = nonoverlappingGraph_->getNodeNumRows ();
+  if (Teuchos::as<size_t> (LocalRow) >= numNonoverlapRows) {
+    const local_ordinal_type shiftedRow =
+      LocalRow - Teuchos::as<local_ordinal_type> (numNonoverlapRows);
+    overlappingGraph_->getLocalRowView (shiftedRow, indices);
+    return;
+  }
+  nonoverlappingGraph_->getLocalRowView (LocalRow, indices);
+}
+#endif
+
+
 } // namespace Details
 } // namespace Ifpack2
 
diff --git a/packages/ifpack2/src/Ifpack2_Details_RowGraph.hpp b/packages/ifpack2/src/Ifpack2_Details_RowGraph.hpp
index f9c0aa0866e6..862d828acc46 100644
--- a/packages/ifpack2/src/Ifpack2_Details_RowGraph.hpp
+++ b/packages/ifpack2/src/Ifpack2_Details_RowGraph.hpp
@@ -70,7 +70,10 @@ class RowGraph :
   typedef typename GraphType::local_ordinal_type local_ordinal_type;
   typedef typename GraphType::global_ordinal_type global_ordinal_type;
   typedef typename GraphType::node_type node_type;
-
+  typedef typename GraphType::local_inds_host_view_type local_inds_host_view_type;
+  typedef typename GraphType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename GraphType::global_inds_host_view_type global_inds_host_view_type;
+  typedef typename GraphType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
   //@}
   //! \name Destructor
   //@{
diff --git a/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_def.hpp b/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_def.hpp
index 79d14e65a9d3..8455f580e2b5 100644
--- a/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_Details_ScaledDampedResidual_def.hpp
@@ -384,7 +384,7 @@ fusedCase (vector_type& W,
   using Impl::scaled_damped_residual_vector;
   using STS = Teuchos::ScalarTraits<SC>;
 
-  auto A_lcl = A.getLocalMatrix ();
+  auto A_lcl = A.getLocalMatrixDevice ();
   auto Dinv_lcl = Kokkos::subview(D_inv.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0);
   auto B_lcl = Kokkos::subview(B.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0);
   auto X_lcl = Kokkos::subview(X_colMap.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0);
diff --git a/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_decl.hpp b/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_decl.hpp
index 0f14676cab70..17798192c00f 100644
--- a/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_decl.hpp
@@ -125,6 +125,10 @@ class TriDiSolver<MatrixType, false> :
 
   static_assert(std::is_same<MatrixType, row_matrix_type>::value, "Ifpack2::Details::TriDiSolver: The template parameter MatrixType must be a Tpetra::RowMatrix specialization.  Please don't use Tpetra::CrsMatrix (a subclass of Tpetra::RowMatrix) here anymore.  The constructor can take either a RowMatrix or a CrsMatrix just fine.");
 
+  typedef typename row_matrix_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename row_matrix_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename row_matrix_type::nonconst_values_host_view_type nonconst_values_host_view_type;
+
   //! Specialization of Tpetra::Map used by this class.
   typedef Tpetra::Map<local_ordinal_type, global_ordinal_type, node_type> map_type;
 
diff --git a/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_def.hpp b/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_def.hpp
index dcaf5e92c624..c843fea63cd5 100644
--- a/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_Details_TriDiSolver_def.hpp
@@ -629,8 +629,8 @@ void TriDiSolver<MatrixType, false>::extract (Teuchos::SerialTriDiMatrix<int, sc
   // each row of A_local.
   const size_type maxNumRowEntries =
     static_cast<size_type> (A_local.getNodeMaxNumRowEntries ());
-  Array<LO> localIndices (maxNumRowEntries);
-  Array<scalar_type> values (maxNumRowEntries);
+  nonconst_local_inds_host_view_type localIndices("localIndices",maxNumRowEntries);
+  nonconst_values_host_view_type values ("values",maxNumRowEntries);
 
   const LO numLocalRows = static_cast<LO> (rowMap.getNodeNumElements ());
   const LO minLocalRow = rowMap.getMinLocalIndex ();
@@ -648,8 +648,8 @@ void TriDiSolver<MatrixType, false>::extract (Teuchos::SerialTriDiMatrix<int, sc
       static_cast<size_type> (A_local.getNumEntriesInLocalRow (localRow));
     size_t numEntriesOut = 0; // ignored
     A_local.getLocalRowCopy (localRow,
-                             localIndices (0, numEntriesInRow),
-                             values (0, numEntriesInRow),
+                             localIndices,
+                             values,
                              numEntriesOut);
     for (LO k = 0; k < numEntriesInRow; ++k) {
       const LO localCol = localIndices[k];
diff --git a/packages/ifpack2/src/Ifpack2_DiagonalFilter_decl.hpp b/packages/ifpack2/src/Ifpack2_DiagonalFilter_decl.hpp
index 88e972608b13..b3a9ca21d102 100644
--- a/packages/ifpack2/src/Ifpack2_DiagonalFilter_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_DiagonalFilter_decl.hpp
@@ -79,9 +79,17 @@ class DiagonalFilter :
   typedef typename MatrixType::local_ordinal_type LocalOrdinal;
   typedef typename MatrixType::global_ordinal_type GlobalOrdinal;
   typedef typename MatrixType::node_type Node;
-  typedef typename Teuchos::ScalarTraits<Scalar>::magnitudeType magnitudeType;
+  typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type;
+  typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type;
+  typedef typename MatrixType::values_host_view_type values_host_view_type;
+
+  typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type;
 
-  typedef typename Tpetra::RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::mag_type mag_type;
+  typedef typename Teuchos::ScalarTraits<Scalar>::magnitudeType magnitudeType;
+  typedef Tpetra::RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> row_matrix_type;
+  typedef typename row_matrix_type::mag_type mag_type;
 
   static_assert(std::is_same<MatrixType, Tpetra::RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >::value, "Ifpack2::DiagonalFilter: The template parameter MatrixType must be a Tpetra::RowMatrix specialization.  Please don't use Tpetra::CrsMatrix (a subclass of Tpetra::RowMatrix) here anymore.  The constructor can take either a RowMatrix or a CrsMatrix just fine.");
 
@@ -186,10 +194,17 @@ class DiagonalFilter :
     with row \c GlobalRow. If \c GlobalRow does not belong to this node, then \c Indices and \c Values are unchanged and \c NumIndices is
     returned as Teuchos::OrdinalTraits<size_t>::invalid().
   */
+  virtual void
+  getGlobalRowCopy (GlobalOrdinal GlobalRow,
+                   nonconst_global_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow,
                                 const Teuchos::ArrayView<GlobalOrdinal> &Indices,
                                 const Teuchos::ArrayView<Scalar> &Values,
                                 size_t &NumEntries) const;
+#endif
 
   //! Extract a list of entries in a specified local row of the graph. Put into storage allocated by calling routine.
   /*!
@@ -202,11 +217,17 @@ class DiagonalFilter :
     with row \c LocalRow. If \c LocalRow is not valid for this node, then \c Indices and \c Values are unchanged and \c NumIndices is
     returned as Teuchos::OrdinalTraits<size_t>::invalid().
   */
+  virtual void
+  getLocalRowCopy (LocalOrdinal LocalRow,
+                   nonconst_local_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getLocalRowCopy(LocalOrdinal LocalRow,
                                const Teuchos::ArrayView<LocalOrdinal> &Indices,
                                const Teuchos::ArrayView<Scalar> &Values,
                                size_t &NumEntries) const ;
-
+#endif
   //! Extract a const, non-persisting view of global indices in a specified row of the matrix.
   /*!
     \param GlobalRow - (In) Global row number for which indices are desired.
@@ -217,10 +238,15 @@ class DiagonalFilter :
 
     Note: If \c GlobalRow does not belong to this node, then \c indices is set to null.
   */
+  virtual void
+  getGlobalRowView (GlobalOrdinal GlobalRow,
+                    global_inds_host_view_type &indices,
+                    values_host_view_type &values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getGlobalRowView(GlobalOrdinal GlobalRow,
                                 Teuchos::ArrayView<const GlobalOrdinal> &indices,
                                 Teuchos::ArrayView<const Scalar> &values) const;
-
+#endif
   //! Extract a const, non-persisting view of local indices in a specified row of the matrix.
   /*!
     \param LocalRow - (In) Local row number for which indices are desired.
@@ -231,10 +257,15 @@ class DiagonalFilter :
 
     Note: If \c LocalRow does not belong to this node, then \c indices is set to null.
   */
+  virtual void
+  getLocalRowView (LocalOrdinal LocalRow,
+                   local_inds_host_view_type & indices,
+                   values_host_view_type & values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getLocalRowView(LocalOrdinal LocalRow,
                                Teuchos::ArrayView<const LocalOrdinal> &indices,
                                Teuchos::ArrayView<const Scalar> &values) const;
-
+#endif
   //! \brief Get a copy of the diagonal entries owned by this node, with local row indices.
   /*! Returns a distributed Vector object partitioned according to this matrix's row map, containing the
     the zero and non-zero diagonals owned by this node. */
diff --git a/packages/ifpack2/src/Ifpack2_DiagonalFilter_def.hpp b/packages/ifpack2/src/Ifpack2_DiagonalFilter_def.hpp
index 6e28f00c05e5..cf31bd69562b 100644
--- a/packages/ifpack2/src/Ifpack2_DiagonalFilter_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_DiagonalFilter_def.hpp
@@ -65,8 +65,8 @@ DiagonalFilter (const Teuchos::RCP<const Tpetra::RowMatrix<Scalar,LocalOrdinal,G
   pos_.resize(getNodeNumRows());
   val_=Teuchos::rcp(new Tpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node>(A_->getRowMap()));
 
-  std::vector<LocalOrdinal> Indices(getNodeMaxNumRowEntries());
-  std::vector<Scalar> Values(getNodeMaxNumRowEntries());
+  nonconst_local_inds_host_view_type Indices("Indices",getNodeMaxNumRowEntries());
+  nonconst_values_host_view_type Values("Values",getNodeMaxNumRowEntries());
   size_t NumEntries;
   magnitudeType mysign;
 
@@ -239,10 +239,11 @@ bool DiagonalFilter<MatrixType>::isFillComplete() const
 
 template<class MatrixType>
 void DiagonalFilter<MatrixType>::
-getGlobalRowCopy (GlobalOrdinal GlobalRow,
-                  const Teuchos::ArrayView<GlobalOrdinal> &Indices,
-                  const Teuchos::ArrayView<Scalar> &Values,
-                  size_t &NumEntries) const
+  getGlobalRowCopy (GlobalOrdinal GlobalRow,
+                   nonconst_global_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const
+
 {
   Teuchos::ArrayRCP< const Scalar > myvals=val_->get1dView();
   LocalOrdinal LocalRow=getRowMap()->getLocalElement(GlobalRow);
@@ -253,12 +254,27 @@ getGlobalRowCopy (GlobalOrdinal GlobalRow,
     Values[pos_[LocalRow]] += myvals[LocalRow];
 }
 
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void DiagonalFilter<MatrixType>::
-getLocalRowCopy (LocalOrdinal LocalRow,
-                 const Teuchos::ArrayView<LocalOrdinal> &Indices,
-                 const Teuchos::ArrayView<Scalar> &Values,
-                 size_t &NumEntries) const
+getGlobalRowCopy (GlobalOrdinal GlobalRow,
+                  const Teuchos::ArrayView<GlobalOrdinal> &Indices,
+                  const Teuchos::ArrayView<Scalar> &Values,
+                  size_t &NumEntries) const {
+  using IST = typename row_matrix_type::impl_scalar_type;
+  nonconst_global_inds_host_view_type ind_in(Indices.data(),Indices.size());
+  nonconst_values_host_view_type val_in(reinterpret_cast<IST*>(Values.data()),Values.size());
+  getGlobalRowCopy(GlobalRow,ind_in,val_in,NumEntries); 
+}
+#endif
+
+template<class MatrixType>
+void DiagonalFilter<MatrixType>::
+ getLocalRowCopy (LocalOrdinal LocalRow,
+                   nonconst_local_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const
 {
   Teuchos::ArrayRCP< const Scalar > myvals=val_->get1dView();
 
@@ -268,6 +284,32 @@ getLocalRowCopy (LocalOrdinal LocalRow,
     Values[pos_[LocalRow]] += myvals[LocalRow];
 }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+template<class MatrixType>
+void
+DiagonalFilter<MatrixType>::
+getLocalRowCopy (LocalOrdinal LocalRow,
+                 const Teuchos::ArrayView<LocalOrdinal> &Indices,
+                 const Teuchos::ArrayView<Scalar> &Values,
+                 size_t &NumEntries) const
+{
+  // Deprecated overload: present the caller's arrays as host views and forward.
+  nonconst_local_inds_host_view_type indsView(Indices.data(),Indices.size());
+  nonconst_values_host_view_type valsView(reinterpret_cast<typename row_matrix_type::impl_scalar_type*>(Values.data()),Values.size());
+  getLocalRowCopy(LocalRow,indsView,valsView,NumEntries);
+}
+#endif
+
+
+template<class MatrixType>
+void DiagonalFilter<MatrixType>::getGlobalRowView (GlobalOrdinal /* GlobalRow */,
+    global_inds_host_view_type& /* indices */,
+    values_host_view_type& /* values */) const {
+  // Row views are not provided by this filter, so every call throws.
+  throw std::runtime_error("Ifpack2::DiagonalFilter: does not support getGlobalRowView.");
+}
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void DiagonalFilter<MatrixType>::
 getGlobalRowView (GlobalOrdinal /* GlobalRow */,
@@ -276,7 +318,17 @@ getGlobalRowView (GlobalOrdinal /* GlobalRow */,
 {
   throw std::runtime_error("Ifpack2::DiagonalFilter: does not support getGlobalRowView.");
 }
+#endif
 
+template<class MatrixType>
+void DiagonalFilter<MatrixType>::getLocalRowView (LocalOrdinal /* LocalRow */,
+                                                  local_inds_host_view_type& /* indices */,
+                                                  values_host_view_type& /* values */) const {
+  // Row views are not provided by this filter, so every call throws.
+  throw std::runtime_error("Ifpack2::DiagonalFilter: does not support getLocalRowView.");
+}
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void DiagonalFilter<MatrixType>::
 getLocalRowView (LocalOrdinal /* LocalRow */,
@@ -285,6 +337,7 @@ getLocalRowView (LocalOrdinal /* LocalRow */,
 {
   throw std::runtime_error("Ifpack2::DiagonalFilter: does not support getLocalRowView.");
 }
+#endif
 
 template<class MatrixType>
 void DiagonalFilter<MatrixType>::getLocalDiagCopy(Tpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &diag) const
diff --git a/packages/ifpack2/src/Ifpack2_DropFilter_decl.hpp b/packages/ifpack2/src/Ifpack2_DropFilter_decl.hpp
index 932184265b1c..263dba73d13b 100644
--- a/packages/ifpack2/src/Ifpack2_DropFilter_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_DropFilter_decl.hpp
@@ -81,9 +81,19 @@ class DropFilter :
   typedef typename MatrixType::local_ordinal_type LocalOrdinal;
   typedef typename MatrixType::global_ordinal_type GlobalOrdinal;
   typedef typename MatrixType::node_type Node;
+  typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type;
+  typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type;
+  typedef typename MatrixType::values_host_view_type values_host_view_type;
+
+  typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type;
+
   typedef typename Teuchos::ScalarTraits<Scalar>::magnitudeType magnitudeType;
 
-  typedef typename Tpetra::RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::mag_type mag_type;
+  typedef Tpetra::RowMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> row_matrix_type;
+  typedef typename row_matrix_type::mag_type mag_type;
+
 
   static_assert(std::is_same<MatrixType, Tpetra::RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >::value, "Ifpack2::DropFilter: The template parameter MatrixType must be a Tpetra::RowMatrix specialization.  Please don't use Tpetra::CrsMatrix (a subclass of Tpetra::RowMatrix) here anymore.  The constructor can take either a RowMatrix or a CrsMatrix just fine.");
 
@@ -186,10 +196,17 @@ class DropFilter :
     with row \c GlobalRow. If \c GlobalRow does not belong to this node, then \c Indices and \c Values are unchanged and \c NumIndices is
     returned as Teuchos::OrdinalTraits<size_t>::invalid().
   */
+  // Host-view overload; the ArrayView overload below is compiled only with TPETRA_ENABLE_DEPRECATED_CODE.
+  virtual void getGlobalRowCopy (GlobalOrdinal GlobalRow,
+                                 nonconst_global_inds_host_view_type &Indices,
+                                 nonconst_values_host_view_type &Values,
+                                 size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow,
                                 const Teuchos::ArrayView<GlobalOrdinal> &Indices,
                                 const Teuchos::ArrayView<Scalar> &Values,
                                 size_t &NumEntries) const;
+#endif
 
   //! Extract a list of entries in a specified local row of the graph. Put into storage allocated by calling routine.
   /*!
@@ -202,11 +219,17 @@ class DropFilter :
     with row \c DropRow. If \c DropRow is not valid for this node, then \c Indices and \c Values are unchanged and \c NumIndices is
     returned as Teuchos::OrdinalTraits<size_t>::invalid().
   */
+  // Host-view overload of getLocalRowCopy.
+  virtual void getLocalRowCopy (LocalOrdinal DropRow,
+                                nonconst_local_inds_host_view_type &Indices,
+                                nonconst_values_host_view_type &Values,
+                                size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getLocalRowCopy(LocalOrdinal DropRow,
                                const Teuchos::ArrayView<LocalOrdinal> &Indices,
                                const Teuchos::ArrayView<Scalar> &Values,
                                size_t &NumEntries) const ;
-
+#endif
   //! Extract a const, non-persisting view of global indices in a specified row of the matrix.
   /*!
     \param GlobalRow - (In) Global row number for which indices are desired.
@@ -217,10 +240,15 @@ class DropFilter :
 
     Note: If \c GlobalRow does not belong to this node, then \c indices is set to null.
   */
+  // Host-view overload of getGlobalRowView.
+  virtual void getGlobalRowView (GlobalOrdinal GlobalRow,
+                                 global_inds_host_view_type &indices,
+                                 values_host_view_type &values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getGlobalRowView(GlobalOrdinal GlobalRow,
                                 Teuchos::ArrayView<const GlobalOrdinal> &indices,
                                 Teuchos::ArrayView<const Scalar> &values) const;
-
+#endif
   //! Extract a const, non-persisting view of local indices in a specified row of the matrix.
   /*!
     \param DropRow - (In) Drop row number for which indices are desired.
@@ -231,10 +259,15 @@ class DropFilter :
 
     Note: If \c DropRow does not belong to this node, then \c indices is set to null.
   */
+  // Host-view overload; the row parameter is named DropRow to match the \param entry documented above.
+  virtual void getLocalRowView (LocalOrdinal DropRow,
+                                local_inds_host_view_type & indices,
+                                values_host_view_type & values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getLocalRowView(LocalOrdinal DropRow,
                                Teuchos::ArrayView<const LocalOrdinal> &indices,
                                Teuchos::ArrayView<const Scalar> &values) const;
-
+#endif
   //! \brief Get a copy of the diagonal entries owned by this node, with local row indices.
   /*! Returns a distributed Vector object partitioned according to this matrix's row map, containing the
     the zero and non-zero diagonals owned by this node. */
@@ -307,9 +340,9 @@ class DropFilter :
   //! NumEntries_[i] contains the nonzero entries in row `i'.
   std::vector<size_t> NumEntries_;
   //! Used in ExtractMyRowCopy, to avoid allocation each time.
-  mutable Teuchos::Array<LocalOrdinal> Indices_;
-  //! Used in ExtractMyRowCopy, to avoid allocation each time.
-  mutable Teuchos::Array<Scalar> Values_;
+  mutable nonconst_local_inds_host_view_type Indices_;
+  //! Scratch storage for row values, used by getLocalRowCopy() and apply() to avoid allocation each time.
+  mutable nonconst_values_host_view_type Values_;
 
 };
 
diff --git a/packages/ifpack2/src/Ifpack2_DropFilter_def.hpp b/packages/ifpack2/src/Ifpack2_DropFilter_def.hpp
index 32c19689b822..b602141c0374 100644
--- a/packages/ifpack2/src/Ifpack2_DropFilter_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_DropFilter_def.hpp
@@ -85,8 +85,8 @@ DropFilter<MatrixType>::DropFilter(const Teuchos::RCP<const Tpetra::RowMatrix<Sc
   MaxNumEntriesA_ = A_->getNodeMaxNumRowEntries();
 
   // ExtractMyRowCopy() will use these vectors
-  Indices_.resize(MaxNumEntries_);
-  Values_.resize(MaxNumEntries_);
+  Kokkos::resize(Indices_,MaxNumEntries_);
+  Kokkos::resize(Values_,MaxNumEntries_);
 
   size_t ActualMaxNumEntries = 0;
   for (size_t i = 0 ; i < NumRows_ ; ++i) {
@@ -278,6 +278,18 @@ bool DropFilter<MatrixType>::isFillComplete() const
 
 //==========================================================================
 template<class MatrixType>
+void DropFilter<MatrixType>::getGlobalRowCopy (GlobalOrdinal /* GlobalRow */,
+    nonconst_global_inds_host_view_type& /* Indices */,
+    nonconst_values_host_view_type& /* Values */,
+    size_t& /* NumEntries */) const
+{
+  // Not implemented for this filter: every call throws std::runtime_error.
+  throw std::runtime_error("Ifpack2::DropFilter does not implement getGlobalRowCopy.");
+}
+
+//==========================================================================
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE 
+template<class MatrixType>
 void DropFilter<MatrixType>::getGlobalRowCopy(GlobalOrdinal /* GlobalRow */,
                                                   const Teuchos::ArrayView<GlobalOrdinal> &/* Indices */,
                                                   const Teuchos::ArrayView<Scalar> &/* Values */,
@@ -285,13 +297,15 @@ void DropFilter<MatrixType>::getGlobalRowCopy(GlobalOrdinal /* GlobalRow */,
 {
   throw std::runtime_error("Ifpack2::DropFilter does not implement getGlobalRowCopy.");
 }
+#endif
 
 //==========================================================================
 template<class MatrixType>
-void DropFilter<MatrixType>::getLocalRowCopy(LocalOrdinal LocalRow,
-                                              const Teuchos::ArrayView<LocalOrdinal> &Indices,
-                                              const Teuchos::ArrayView<Scalar> &Values,
-                                              size_t &NumEntries) const
+void DropFilter<MatrixType>::
+  getLocalRowCopy (LocalOrdinal LocalRow,
+                   nonconst_local_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const
 {
   TEUCHOS_TEST_FOR_EXCEPTION((LocalRow < 0 || (size_t) LocalRow >=  NumRows_ || (size_t) Indices.size() <  NumEntries_[LocalRow]), std::runtime_error, "Ifpack2::DropFilter::getLocalRowCopy invalid row or array size.");
 
@@ -302,7 +316,7 @@ void DropFilter<MatrixType>::getLocalRowCopy(LocalOrdinal LocalRow,
   // This is because I need more space than that given by
   // the user (for the external nodes)
   size_t A_NumEntries=0;
-  A_->getLocalRowCopy(LocalRow,Indices_(),Values_(),A_NumEntries);
+  A_->getLocalRowCopy(LocalRow,Indices_,Values_,A_NumEntries);
 
   // loop over all nonzero elements of row MyRow,
   // and drop elements below specified threshold.
@@ -321,6 +335,30 @@ void DropFilter<MatrixType>::getLocalRowCopy(LocalOrdinal LocalRow,
 }
 
 //==========================================================================
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+template<class MatrixType>
+void DropFilter<MatrixType>::
+getLocalRowCopy (LocalOrdinal LocalRow,
+                 const Teuchos::ArrayView<LocalOrdinal> &Indices,
+                 const Teuchos::ArrayView<Scalar> &Values,
+                 size_t &NumEntries) const {
+  // Deprecated overload: present the caller's arrays as host views and forward.
+  nonconst_local_inds_host_view_type indsView(Indices.data(),Indices.size());
+  nonconst_values_host_view_type valsView(reinterpret_cast<typename row_matrix_type::impl_scalar_type*>(Values.data()),Values.size());
+  getLocalRowCopy(LocalRow,indsView,valsView,NumEntries);
+}
+#endif
+
+//==========================================================================
+template<class MatrixType>
+void DropFilter<MatrixType>::getGlobalRowView (GlobalOrdinal /* GlobalRow */,
+    global_inds_host_view_type& /* indices */,
+    values_host_view_type& /* values */) const {
+  // Row views are not provided by this filter, so every call throws.
+  throw std::runtime_error("Ifpack2::DropFilter: does not support getGlobalRowView.");
+}
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void DropFilter<MatrixType>::getGlobalRowView(GlobalOrdinal /* GlobalRow */,
                                                   Teuchos::ArrayView<const GlobalOrdinal> &/* indices */,
@@ -328,15 +366,25 @@ void DropFilter<MatrixType>::getGlobalRowView(GlobalOrdinal /* GlobalRow */,
 {
   throw std::runtime_error("Ifpack2::DropFilter: does not support getGlobalRowView.");
 }
-
+#endif
 //==========================================================================
 template<class MatrixType>
+void DropFilter<MatrixType>::getLocalRowView (LocalOrdinal /* LocalRow */,
+                                              local_inds_host_view_type& /* indices */,
+                                              values_host_view_type& /* values */) const {
+  // Row views are not provided by this filter, so every call throws.
+  throw std::runtime_error("Ifpack2::DropFilter: does not support getLocalRowView.");
+}
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+template<class MatrixType>
 void DropFilter<MatrixType>::getLocalRowView(LocalOrdinal /* LocalRow */,
                                                  Teuchos::ArrayView<const LocalOrdinal> &/* indices */,
                                                  Teuchos::ArrayView<const Scalar> &/* values */) const
 {
   throw std::runtime_error("Ifpack2::DropFilter: does not support getLocalRowView.");
 }
+#endif
 
 //==========================================================================
 template<class MatrixType>
@@ -383,21 +431,22 @@ void DropFilter<MatrixType>::apply(const Tpetra::MultiVector<Scalar,LocalOrdinal
   for (size_t i = 0 ; i < NumRows_ ; ++i) {
     size_t Nnz;
     // Use this class's getrow to make the below code simpler
-    getLocalRowCopy(i,Indices_(),Values_(),Nnz);
+    getLocalRowCopy(i,Indices_,Values_,Nnz);
+    Scalar* Values = reinterpret_cast<Scalar*>(Values_.data());
     if (mode==Teuchos::NO_TRANS){
       for (size_t j = 0 ; j < Nnz ; ++j)
         for (size_t k = 0 ; k < NumVectors ; ++k)
-          y_ptr[k][i] += Values_[j] * x_ptr[k][Indices_[j]];
+          y_ptr[k][i] += Values[j] * x_ptr[k][Indices_[j]];
     }
     else if (mode==Teuchos::TRANS){
       for (size_t j = 0 ; j < Nnz ; ++j)
         for (size_t k = 0 ; k < NumVectors ; ++k)
-          y_ptr[k][Indices_[j]] += Values_[j] * x_ptr[k][i];
+          y_ptr[k][Indices_[j]] += Values[j] * x_ptr[k][i];
     }
     else { //mode==Teuchos::CONJ_TRANS
       for (size_t j = 0 ; j < Nnz ; ++j)
         for (size_t k = 0 ; k < NumVectors ; ++k)
-          y_ptr[k][Indices_[j]] += Teuchos::ScalarTraits<Scalar>::conjugate(Values_[j]) * x_ptr[k][i];
+          y_ptr[k][Indices_[j]] += Teuchos::ScalarTraits<Scalar>::conjugate(Values[j]) * x_ptr[k][i];
     }
   }
 }
diff --git a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp
index 2cf6a45defb6..f1dd7af784dc 100644
--- a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp
@@ -140,9 +140,11 @@ class RBILUK : virtual public Ifpack2::RILUK< Tpetra::RowMatrix< typename Matrix
 
   //! The type of local indices in the input MatrixType.
   typedef typename MatrixType::local_ordinal_type local_ordinal_type;
+  typedef typename MatrixType::local_ordinal_type LO;
 
   //! The type of global indices in the input MatrixType.
   typedef typename MatrixType::global_ordinal_type global_ordinal_type;
+  typedef typename MatrixType::global_ordinal_type GO;
 
   //! The Node type used by the input MatrixType.
   typedef typename MatrixType::node_type node_type;
@@ -173,13 +175,13 @@ class RBILUK : virtual public Ifpack2::RILUK< Tpetra::RowMatrix< typename Matrix
   //! \name Implementation of KK ILU(k).
   //@{
   
-  typedef typename crs_matrix_type::local_matrix_type local_matrix_type;
-  typedef typename local_matrix_type::StaticCrsGraphType::row_map_type lno_row_view_t;
-  typedef typename local_matrix_type::StaticCrsGraphType::entries_type lno_nonzero_view_t;
-  typedef typename local_matrix_type::values_type scalar_nonzero_view_t;
-  typedef typename local_matrix_type::StaticCrsGraphType::device_type::memory_space TemporaryMemorySpace;
-  typedef typename local_matrix_type::StaticCrsGraphType::device_type::memory_space PersistentMemorySpace;
-  typedef typename local_matrix_type::StaticCrsGraphType::device_type::execution_space HandleExecSpace;
+  typedef typename crs_matrix_type::local_matrix_device_type local_matrix_device_type;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::row_map_type lno_row_view_t;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::entries_type lno_nonzero_view_t;
+  typedef typename local_matrix_device_type::values_type scalar_nonzero_view_t;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::device_type::memory_space TemporaryMemorySpace;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::device_type::memory_space PersistentMemorySpace;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::device_type::execution_space HandleExecSpace;
   typedef typename KokkosKernels::Experimental::KokkosKernelsHandle
       <typename lno_row_view_t::const_value_type, typename lno_nonzero_view_t::const_value_type, typename scalar_nonzero_view_t::value_type,
       HandleExecSpace, TemporaryMemorySpace,PersistentMemorySpace > kk_handle_type;
@@ -333,10 +335,16 @@ class RBILUK : virtual public Ifpack2::RILUK< Tpetra::RowMatrix< typename Matrix
   typedef Teuchos::ScalarTraits<impl_scalar_type> STS;
   typedef Teuchos::ScalarTraits<magnitude_type> STM;
   typedef typename block_crs_matrix_type::little_block_type little_block_type;
+  typedef typename block_crs_matrix_type::little_block_host_type little_block_host_type;
   typedef typename block_crs_matrix_type::little_vec_type little_vec_type;
   typedef typename block_crs_matrix_type::little_host_vec_type little_host_vec_type;
   typedef typename block_crs_matrix_type::const_host_little_vec_type const_host_little_vec_type;
 
+  using local_inds_host_view_type = typename block_crs_matrix_type::local_inds_host_view_type;
+  using values_host_view_type     = typename block_crs_matrix_type::values_host_view_type;
+  using local_inds_device_view_type = typename block_crs_matrix_type::local_inds_device_view_type;
+  using values_device_view_type     = typename block_crs_matrix_type::values_device_view_type;
+
   void allocate_L_and_U_blocks();
   void initAllValues (const block_crs_matrix_type& A);
 
diff --git a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp
index 4dd78f5b66d4..e11343e2d626 100644
--- a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp
@@ -277,22 +277,22 @@ RBILUK<MatrixType>::
 initAllValues (const block_crs_matrix_type& A)
 {
   using Teuchos::RCP;
-  typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> map_type;
+  typedef Tpetra::Map<LO,GO,node_type> map_type;
 
-  local_ordinal_type NumIn = 0, NumL = 0, NumU = 0;
+  LO NumIn = 0, NumL = 0, NumU = 0;
   bool DiagFound = false;
   size_t NumNonzeroDiags = 0;
   size_t MaxNumEntries = A.getNodeMaxNumRowEntries();
-  local_ordinal_type blockMatSize = blockSize_*blockSize_;
+  LO blockMatSize = blockSize_*blockSize_;
 
   // First check that the local row map ordering is the same as the local portion of the column map.
   // The extraction of the strictly lower/upper parts of A, as well as the factorization,
   // implicitly assume that this is the case.
-  Teuchos::ArrayView<const global_ordinal_type> rowGIDs = A.getRowMap()->getNodeElementList();
-  Teuchos::ArrayView<const global_ordinal_type> colGIDs = A.getColMap()->getNodeElementList();
+  Teuchos::ArrayView<const GO> rowGIDs = A.getRowMap()->getNodeElementList();
+  Teuchos::ArrayView<const GO> colGIDs = A.getColMap()->getNodeElementList();
   bool gidsAreConsistentlyOrdered=true;
-  global_ordinal_type indexOfInconsistentGID=0;
-  for (global_ordinal_type i=0; i<rowGIDs.size(); ++i) {
+  GO indexOfInconsistentGID=0;
+  for (GO i=0; i<rowGIDs.size(); ++i) {
     if (rowGIDs[i] != colGIDs[i]) {
       gidsAreConsistentlyOrdered=false;
       indexOfInconsistentGID=i;
@@ -307,8 +307,8 @@ initAllValues (const block_crs_matrix_type& A)
 
   // Allocate temporary space for extracting the strictly
   // lower and upper parts of the matrix A.
-  Teuchos::Array<local_ordinal_type> LI(MaxNumEntries);
-  Teuchos::Array<local_ordinal_type> UI(MaxNumEntries);
+  Teuchos::Array<LO> LI(MaxNumEntries);
+  Teuchos::Array<LO> UI(MaxNumEntries);
   Teuchos::Array<scalar_type> LV(MaxNumEntries*blockMatSize);
   Teuchos::Array<scalar_type> UV(MaxNumEntries*blockMatSize);
 
@@ -322,6 +322,7 @@ initAllValues (const block_crs_matrix_type& A)
   // host, so sync to host first.  The const_cast is unfortunate but
   // is our only option to make this correct.
 
+  /*
   const_cast<block_crs_matrix_type&> (A).sync_host ();
   L_block_->sync_host ();
   U_block_->sync_host ();
@@ -330,6 +331,7 @@ initAllValues (const block_crs_matrix_type& A)
   L_block_->modify_host ();
   U_block_->modify_host ();
   D_block_->modify_host ();
+  */
 
   RCP<const map_type> rowMap = L_block_->getRowMap ();
 
@@ -343,14 +345,17 @@ initAllValues (const block_crs_matrix_type& A)
 
   //TODO BMK: Revisit this fence when BlockCrsMatrix is refactored.
   Kokkos::fence();
+  using inds_type = typename row_matrix_type::local_inds_host_view_type;
+  using vals_type = typename row_matrix_type::values_host_view_type;
   for (size_t myRow=0; myRow<A.getNodeNumRows(); ++myRow) {
-    local_ordinal_type local_row = myRow;
+    LO local_row = myRow;
 
     //TODO JJH 4April2014 An optimization is to use getLocalRowView.  Not all matrices support this,
     //                    we'd need to check via the Tpetra::RowMatrix method supportsRowViews().
-    const local_ordinal_type * InI = 0;
-    scalar_type * InV = 0;
-    A.getLocalRowView(local_row, InI, InV, NumIn);
+    inds_type InI;
+    vals_type InV;
+    A.getLocalRowView(local_row, InI, InV);
+    NumIn = (LO)InI.size();
 
     // Split into L and U (we don't assume that indices are ordered).
 
@@ -358,14 +363,14 @@ initAllValues (const block_crs_matrix_type& A)
     NumU = 0;
     DiagFound = false;
 
-    for (local_ordinal_type j = 0; j < NumIn; ++j) {
-      const local_ordinal_type k = InI[j];
-      const local_ordinal_type blockOffset = blockMatSize*j;
+    for (LO j = 0; j < NumIn; ++j) {
+      const LO k = InI[j];
+      const LO blockOffset = blockMatSize*j;
 
       if (k == local_row) {
         DiagFound = true;
         // Store perturbed diagonal in Tpetra::Vector D_
-        for (local_ordinal_type jj = 0; jj < blockMatSize; ++jj)
+        for (LO jj = 0; jj < blockMatSize; ++jj)
           diagValues[jj] = this->Rthresh_ * InV[blockOffset+jj] + IFPACK2_SGN(InV[blockOffset+jj]) * this->Athresh_;
         D_block_->replaceLocalValues(local_row, &InI[j], diagValues.getRawPtr(), 1);
       }
@@ -380,15 +385,15 @@ initAllValues (const block_crs_matrix_type& A)
       }
       else if (k < local_row) {
         LI[NumL] = k;
-        const local_ordinal_type LBlockOffset = NumL*blockMatSize;
-        for (local_ordinal_type jj = 0; jj < blockMatSize; ++jj)
+        const LO LBlockOffset = NumL*blockMatSize;
+        for (LO jj = 0; jj < blockMatSize; ++jj)
           LV[LBlockOffset+jj] = InV[blockOffset+jj];
         NumL++;
       }
       else if (Teuchos::as<size_t>(k) <= rowMap->getNodeNumElements()) {
         UI[NumU] = k;
-        const local_ordinal_type UBlockOffset = NumU*blockMatSize;
-        for (local_ordinal_type jj = 0; jj < blockMatSize; ++jj)
+        const LO UBlockOffset = NumU*blockMatSize;
+        for (LO jj = 0; jj < blockMatSize; ++jj)
           UV[UBlockOffset+jj] = InV[blockOffset+jj];
         NumU++;
       }
@@ -400,7 +405,7 @@ initAllValues (const block_crs_matrix_type& A)
       ++NumNonzeroDiags;
     } else
     {
-      for (local_ordinal_type jj = 0; jj < blockSize_; ++jj)
+      for (LO jj = 0; jj < blockSize_; ++jj)
         diagValues[jj*(blockSize_+1)] = this->Athresh_;
       D_block_->replaceLocalValues(local_row, &local_row, diagValues.getRawPtr(), 1);
     }
@@ -416,6 +421,7 @@ initAllValues (const block_crs_matrix_type& A)
 
   // NOTE (mfh 27 May 2016) Sync back to device, in case compute()
   // ever gets a device implementation.
+  /*
   {
     typedef typename block_crs_matrix_type::device_type device_type;
     const_cast<block_crs_matrix_type&> (A).template sync<device_type> ();
@@ -423,6 +429,7 @@ initAllValues (const block_crs_matrix_type& A)
     U_block_->template sync<device_type> ();
     D_block_->template sync<device_type> ();
   }
+  */
   this->isInitialized_ = true;
 }
 
@@ -430,7 +437,7 @@ namespace { // (anonymous)
 
 // For a given Kokkos::View type, possibly unmanaged, get the
 // corresponding managed Kokkos::View type.  This is handy for
-// translating from little_block_type or little_vec_type (both
+// translating from little_block_host_type or little_host_vec_type (both
 // possibly unmanaged) to their managed versions.
 template<class LittleBlockType>
 struct GetManagedView {
@@ -477,8 +484,9 @@ void RBILUK<MatrixType>::compute ()
   if (! A_block_.is_null ()) {
     Teuchos::RCP<block_crs_matrix_type> A_nc =
       Teuchos::rcp_const_cast<block_crs_matrix_type> (A_block_);
-    A_nc->sync_host ();
+    //    A_nc->sync_host ();
   }
+  /*
   L_block_->sync_host ();
   U_block_->sync_host ();
   D_block_->sync_host ();
@@ -486,6 +494,7 @@ void RBILUK<MatrixType>::compute ()
   L_block_->modify_host ();
   U_block_->modify_host ();
   D_block_->modify_host ();
+  */
 
   Teuchos::Time timer ("RBILUK::compute");
   double startTime = timer.wallTime();
@@ -501,18 +510,18 @@ void RBILUK<MatrixType>::compute ()
     initAllValues (*A_block_);
 
     size_t NumIn;
-    local_ordinal_type NumL, NumU, NumURead;
+    LO NumL, NumU, NumURead;
 
     // Get Maximum Row length
     const size_t MaxNumEntries =
       L_block_->getNodeMaxNumRowEntries () + U_block_->getNodeMaxNumRowEntries () + 1;
 
-    const local_ordinal_type blockMatSize = blockSize_*blockSize_;
+    const LO blockMatSize = blockSize_*blockSize_;
 
     // FIXME (mfh 08 Nov 2015, 24 May 2016) We need to move away from
     // expressing these strides explicitly, in order to finish #177
     // (complete Kokkos-ization of BlockCrsMatrix) thoroughly.
-    const local_ordinal_type rowStride = blockSize_;
+    const LO rowStride = blockSize_;
 
     Teuchos::Array<int> ipiv_teuchos(blockSize_);
     Kokkos::View<int*, Kokkos::HostSpace,
@@ -524,59 +533,61 @@ void RBILUK<MatrixType>::compute ()
     size_t num_cols = U_block_->getColMap()->getNodeNumElements();
     Teuchos::Array<int> colflag(num_cols);
 
-    typename GetManagedView<little_block_type>::managed_non_const_type diagModBlock ("diagModBlock", blockSize_, blockSize_);
-    typename GetManagedView<little_block_type>::managed_non_const_type matTmp ("matTmp", blockSize_, blockSize_);
-    typename GetManagedView<little_block_type>::managed_non_const_type multiplier ("multiplier", blockSize_, blockSize_);
+    typename GetManagedView<little_block_host_type>::managed_non_const_type diagModBlock ("diagModBlock", blockSize_, blockSize_);
+    typename GetManagedView<little_block_host_type>::managed_non_const_type matTmp ("matTmp", blockSize_, blockSize_);
+    typename GetManagedView<little_block_host_type>::managed_non_const_type multiplier ("multiplier", blockSize_, blockSize_);
 
 //    Teuchos::ArrayRCP<scalar_type> DV = D_->get1dViewNonConst(); // Get view of diagonal
 
     // Now start the factorization.
 
     // Need some integer workspace and pointers
-    local_ordinal_type NumUU;
+    LO NumUU;
     for (size_t j = 0; j < num_cols; ++j) {
       colflag[j] = -1;
     }
-    Teuchos::Array<local_ordinal_type> InI(MaxNumEntries, 0);
+    Teuchos::Array<LO> InI(MaxNumEntries, 0);
     Teuchos::Array<scalar_type> InV(MaxNumEntries*blockMatSize,STM::zero());
 
-    const local_ordinal_type numLocalRows = L_block_->getNodeNumRows ();
-    for (local_ordinal_type local_row = 0; local_row < numLocalRows; ++local_row) {
+    const LO numLocalRows = L_block_->getNodeNumRows ();
+    for (LO local_row = 0; local_row < numLocalRows; ++local_row) {
 
       // Fill InV, InI with current row of L, D and U combined
 
       NumIn = MaxNumEntries;
-      const local_ordinal_type * colValsL;
-      scalar_type * valsL;
+      local_inds_host_view_type colValsL;
+      values_host_view_type valsL;
 
-      L_block_->getLocalRowView(local_row, colValsL, valsL, NumL);
-      for (local_ordinal_type j = 0; j < NumL; ++j)
+      L_block_->getLocalRowView(local_row, colValsL, valsL);
+      NumL = (LO) colValsL.size();
+      for (LO j = 0; j < NumL; ++j)
       {
-        const local_ordinal_type matOffset = blockMatSize*j;
-        little_block_type lmat((typename little_block_type::value_type*) &valsL[matOffset],blockSize_,rowStride);
-        little_block_type lmatV((typename little_block_type::value_type*) &InV[matOffset],blockSize_,rowStride);
+        const LO matOffset = blockMatSize*j;
+        little_block_host_type lmat((typename little_block_host_type::value_type*) &valsL[matOffset],blockSize_,rowStride);
+        little_block_host_type lmatV((typename little_block_host_type::value_type*) &InV[matOffset],blockSize_,rowStride);
         //lmatV.assign(lmat);
         Tpetra::COPY (lmat, lmatV);
         InI[j] = colValsL[j];
       }
 
-      little_block_type dmat = D_block_->getLocalBlock(local_row, local_row);
-      little_block_type dmatV((typename little_block_type::value_type*) &InV[NumL*blockMatSize], blockSize_, rowStride);
+      little_block_host_type dmat = D_block_->getLocalBlockHostNonConst(local_row, local_row);
+      little_block_host_type dmatV((typename little_block_host_type::value_type*) &InV[NumL*blockMatSize], blockSize_, rowStride);
       //dmatV.assign(dmat);
       Tpetra::COPY (dmat, dmatV);
       InI[NumL] = local_row;
 
-      const local_ordinal_type * colValsU;
-      scalar_type * valsU;
-      U_block_->getLocalRowView(local_row, colValsU, valsU, NumURead);
+      local_inds_host_view_type colValsU;
+      values_host_view_type valsU;
+      U_block_->getLocalRowView(local_row, colValsU, valsU);
+      NumURead = (LO) colValsU.size();
       NumU = 0;
-      for (local_ordinal_type j = 0; j < NumURead; ++j)
+      for (LO j = 0; j < NumURead; ++j)
       {
         if (!(colValsU[j] < numLocalRows)) continue;
         InI[NumL+1+j] = colValsU[j];
-        const local_ordinal_type matOffset = blockMatSize*(NumL+1+j);
-        little_block_type umat((typename little_block_type::value_type*) &valsU[blockMatSize*j], blockSize_, rowStride);
-        little_block_type umatV((typename little_block_type::value_type*) &InV[matOffset], blockSize_, rowStride);
+        const LO matOffset = blockMatSize*(NumL+1+j);
+        little_block_host_type umat((typename little_block_host_type::value_type*) &valsU[blockMatSize*j], blockSize_, rowStride);
+        little_block_host_type umatV((typename little_block_host_type::value_type*) &InV[matOffset], blockSize_, rowStride);
         //umatV.assign(umat);
         Tpetra::COPY (umat, umatV);
         NumU += 1;
@@ -589,8 +600,8 @@ void RBILUK<MatrixType>::compute ()
       }
 
 #ifndef IFPACK2_RBILUK_INITIAL
-      for (local_ordinal_type i = 0; i < blockSize_; ++i)
-        for (local_ordinal_type j = 0; j < blockSize_; ++j){
+      for (LO i = 0; i < blockSize_; ++i)
+        for (LO j = 0; j < blockSize_; ++j){
           {
             diagModBlock(i,j) = 0;
           }
@@ -600,13 +611,13 @@ void RBILUK<MatrixType>::compute ()
       Kokkos::deep_copy (diagModBlock, diagmod);
 #endif
 
-      for (local_ordinal_type jj = 0; jj < NumL; ++jj) {
-        local_ordinal_type j = InI[jj];
-        little_block_type currentVal((typename little_block_type::value_type*) &InV[jj*blockMatSize], blockSize_, rowStride); // current_mults++;
+      for (LO jj = 0; jj < NumL; ++jj) {
+        LO j = InI[jj];
+        little_block_host_type currentVal((typename little_block_host_type::value_type*) &InV[jj*blockMatSize], blockSize_, rowStride); // current_mults++;
         //multiplier.assign(currentVal);
         Tpetra::COPY (currentVal, multiplier);
 
-        const little_block_type dmatInverse = D_block_->getLocalBlock(j,j);
+        const little_block_host_type dmatInverse = D_block_->getLocalBlockHostNonConst(j,j);
         // alpha = 1, beta = 0
 #ifndef IFPACK2_RBILUK_INITIAL_NOKK
         KokkosBatched::Experimental::SerialGemm
@@ -621,18 +632,19 @@ void RBILUK<MatrixType>::compute ()
         //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast<impl_scalar_type*> (currentVal.data ()), reinterpret_cast<impl_scalar_type*> (dmatInverse.data ()), reinterpret_cast<impl_scalar_type*> (matTmp.data ()), blockSize_);
         //currentVal.assign(matTmp);
         Tpetra::COPY (matTmp, currentVal);
+        local_inds_host_view_type UUI;
+        values_host_view_type UUV;
 
-        const local_ordinal_type * UUI;
-        scalar_type * UUV;
-        U_block_->getLocalRowView(j, UUI, UUV, NumUU);
+        U_block_->getLocalRowView(j, UUI, UUV);
+        NumUU = (LO) UUI.size();
 
         if (this->RelaxValue_ == STM::zero ()) {
-          for (local_ordinal_type k = 0; k < NumUU; ++k) {
+          for (LO k = 0; k < NumUU; ++k) {
             if (!(UUI[k] < numLocalRows)) continue;
             const int kk = colflag[UUI[k]];
             if (kk > -1) {
-              little_block_type kkval((typename little_block_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride);
-              little_block_type uumat((typename little_block_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride);
+              little_block_host_type kkval((typename little_block_host_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride);
+              little_block_host_type uumat((typename little_block_host_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride);
 #ifndef IFPACK2_RBILUK_INITIAL_NOKK
         KokkosBatched::Experimental::SerialGemm
           <KokkosBatched::Experimental::Trans::NoTranspose,
@@ -648,12 +660,12 @@ void RBILUK<MatrixType>::compute ()
           }
         }
         else {
-          for (local_ordinal_type k = 0; k < NumUU; ++k) {
+          for (LO k = 0; k < NumUU; ++k) {
             if (!(UUI[k] < numLocalRows)) continue;
             const int kk = colflag[UUI[k]];
-            little_block_type uumat((typename little_block_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride);
+            little_block_host_type uumat((typename little_block_host_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride);
             if (kk > -1) {
-              little_block_type kkval((typename little_block_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride);
+              little_block_host_type kkval((typename little_block_host_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride);
 #ifndef IFPACK2_RBILUK_INITIAL_NOKK
         KokkosBatched::Experimental::SerialGemm
           <KokkosBatched::Experimental::Trans::NoTranspose,
@@ -723,8 +735,8 @@ void RBILUK<MatrixType>::compute ()
           "lapackInfo = " << lapackInfo << " which indicates an error in the matrix inverse GETRI.");
       }
 
-      for (local_ordinal_type j = 0; j < NumU; ++j) {
-        little_block_type currentVal((typename little_block_type::value_type*) &InV[(NumL+1+j)*blockMatSize], blockSize_, rowStride); // current_mults++;
+      for (LO j = 0; j < NumU; ++j) {
+        little_block_host_type currentVal((typename little_block_host_type::value_type*) &InV[(NumL+1+j)*blockMatSize], blockSize_, rowStride); // current_mults++;
         // scale U by the diagonal inverse
 #ifndef IFPACK2_RBILUK_INITIAL_NOKK
         KokkosBatched::Experimental::SerialGemm
@@ -762,6 +774,7 @@ void RBILUK<MatrixType>::compute ()
   } // Stop timing
 
   // Sync everything back to device, for efficient solves.
+  /*
   {
     typedef typename block_crs_matrix_type::device_type device_type;
     if (! A_block_.is_null ()) {
@@ -773,6 +786,7 @@ void RBILUK<MatrixType>::compute ()
     U_block_->template sync<device_type> ();
     D_block_->template sync<device_type> ();
   }
+  */
 
   this->isComputed_ = true;
   this->numCompute_ += 1;
@@ -814,15 +828,15 @@ apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_t
     "complex Scalar type.  Please talk to the Ifpack2 developers to get this "
     "fixed.  There is a FIXME in this file about this very issue.");
 
-  const local_ordinal_type blockMatSize = blockSize_*blockSize_;
+  const LO blockMatSize = blockSize_*blockSize_;
 
-  const local_ordinal_type rowStride = blockSize_;
+  const LO rowStride = blockSize_;
 
   BMV yBlock (Y, * (A_block_->getGraph ()->getDomainMap ()), blockSize_);
   const BMV xBlock (X, * (A_block_->getColMap ()), blockSize_);
 
   Teuchos::Array<scalar_type> lclarray(blockSize_);
-  little_vec_type lclvec((typename little_vec_type::value_type*)&lclarray[0], blockSize_);
+  little_host_vec_type lclvec((typename little_host_vec_type::value_type*)&lclarray[0], blockSize_);
   const scalar_type one = STM::one ();
   const scalar_type zero = STM::zero ();
 
@@ -838,14 +852,14 @@ apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_t
         // input and output to alias one another.
         //
         // FIXME (mfh 24 Jan 2014) Cache this temp multivector.
-        const local_ordinal_type numVectors = xBlock.getNumVectors();
+        const LO numVectors = xBlock.getNumVectors();
         BMV cBlock (* (A_block_->getGraph ()->getDomainMap ()), blockSize_, numVectors);
         BMV rBlock (* (A_block_->getGraph ()->getDomainMap ()), blockSize_, numVectors);
-        for (local_ordinal_type imv = 0; imv < numVectors; ++imv)
+        for (LO imv = 0; imv < numVectors; ++imv)
         {
           for (size_t i = 0; i < D_block_->getNodeNumRows(); ++i)
           {
-            local_ordinal_type local_row = i;
+            LO local_row = i;
             const_host_little_vec_type xval = 
                    xBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::ReadOnly);
             little_host_vec_type cval = 
@@ -853,20 +867,19 @@ apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_t
             //cval.assign(xval);
             Tpetra::COPY (xval, cval);
 
-            local_ordinal_type NumL;
-            const local_ordinal_type * colValsL;
-            scalar_type * valsL;
+            local_inds_host_view_type colValsL;
+            values_host_view_type valsL;
+            L_block_->getLocalRowView(local_row, colValsL, valsL);
+            LO NumL = (LO) colValsL.size();
 
-            L_block_->getLocalRowView(local_row, colValsL, valsL, NumL);
-
-            for (local_ordinal_type j = 0; j < NumL; ++j)
+            for (LO j = 0; j < NumL; ++j)
             {
-              local_ordinal_type col = colValsL[j];
+              LO col = colValsL[j];
               const_host_little_vec_type prevVal = 
                     cBlock.getLocalBlockHost(col, imv, Tpetra::Access::ReadOnly);
 
-              const local_ordinal_type matOffset = blockMatSize*j;
-              little_block_type lij((typename little_block_type::value_type*) &valsL[matOffset],blockSize_,rowStride);
+              const LO matOffset = blockMatSize*j;
+              little_block_host_type lij((typename little_block_host_type::value_type*) &valsL[matOffset],blockSize_,rowStride);
 
               //cval.matvecUpdate(-one, lij, prevVal);
               Tpetra::GEMV (-one, lij, prevVal, cval);
@@ -878,12 +891,12 @@ apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_t
         D_block_->applyBlock(cBlock, rBlock);
 
         // Solve U Y = R.
-        for (local_ordinal_type imv = 0; imv < numVectors; ++imv)
+        for (LO imv = 0; imv < numVectors; ++imv)
         {
-          const local_ordinal_type numRows = D_block_->getNodeNumRows();
-          for (local_ordinal_type i = 0; i < numRows; ++i)
+          const LO numRows = D_block_->getNodeNumRows();
+          for (LO i = 0; i < numRows; ++i)
           {
-            local_ordinal_type local_row = (numRows-1)-i;
+            LO local_row = (numRows-1)-i;
             const_host_little_vec_type rval = 
                    rBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::ReadOnly);
             little_host_vec_type yval = 
@@ -891,20 +904,19 @@ apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_t
             //yval.assign(rval);
             Tpetra::COPY (rval, yval);
 
-            local_ordinal_type NumU;
-            const local_ordinal_type * colValsU;
-            scalar_type * valsU;
-
-            U_block_->getLocalRowView(local_row, colValsU, valsU, NumU);
+            local_inds_host_view_type colValsU;
+            values_host_view_type valsU;      
+            U_block_->getLocalRowView(local_row, colValsU, valsU);
+            LO NumU = (LO) colValsU.size();
 
-            for (local_ordinal_type j = 0; j < NumU; ++j)
+            for (LO j = 0; j < NumU; ++j)
             {
-              local_ordinal_type col = colValsU[NumU-1-j];
+              LO col = colValsU[NumU-1-j];
               const_host_little_vec_type prevVal = 
                    yBlock.getLocalBlockHost(col, imv, Tpetra::Access::ReadOnly);
 
-              const local_ordinal_type matOffset = blockMatSize*(NumU-1-j);
-              little_block_type uij((typename little_block_type::value_type*) &valsU[matOffset], blockSize_, rowStride);
+              const LO matOffset = blockMatSize*(NumU-1-j);
+              little_block_host_type uij((typename little_block_host_type::value_type*) &valsU[matOffset], blockSize_, rowStride);
 
               //yval.matvecUpdate(-one, uij, prevVal);
               Tpetra::GEMV (-one, uij, prevVal, yval);
diff --git a/packages/ifpack2/src/Ifpack2_ILUT_decl.hpp b/packages/ifpack2/src/Ifpack2_ILUT_decl.hpp
index 553d70478043..400687d68d34 100644
--- a/packages/ifpack2/src/Ifpack2_ILUT_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_ILUT_decl.hpp
@@ -123,6 +123,14 @@ class ILUT :
                             global_ordinal_type,
                             node_type> row_matrix_type;
 
+  typedef typename row_matrix_type::global_inds_host_view_type global_inds_host_view_type;
+  typedef typename row_matrix_type::local_inds_host_view_type local_inds_host_view_type;
+  typedef typename row_matrix_type::values_host_view_type values_host_view_type;
+
+  typedef typename row_matrix_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename row_matrix_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename row_matrix_type::nonconst_values_host_view_type nonconst_values_host_view_type;
+
   static_assert(std::is_same<MatrixType, row_matrix_type>::value, "Ifpack2::ILUT: The template parameter MatrixType must be a Tpetra::RowMatrix specialization.  Please don't use Tpetra::CrsMatrix (a subclass of Tpetra::RowMatrix) here anymore.  The constructor can take either a RowMatrix or a CrsMatrix just fine.");
 
   //! Type of the Tpetra::CrsMatrix specialization that this class uses for the L and U factors.
diff --git a/packages/ifpack2/src/Ifpack2_ILUT_def.hpp b/packages/ifpack2/src/Ifpack2_ILUT_def.hpp
index 6bbab2d5beae..63d962a5b822 100644
--- a/packages/ifpack2/src/Ifpack2_ILUT_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_ILUT_def.hpp
@@ -491,18 +491,17 @@ void ILUT<MatrixType>::compute ()
     // =================== //
     // start factorization //
     // =================== //
-
-    ArrayRCP<local_ordinal_type> ColIndicesARCP;
-    ArrayRCP<scalar_type>       ColValuesARCP;
+    nonconst_local_inds_host_view_type ColIndicesARCP;
+    nonconst_values_host_view_type ColValuesARCP;
     if (! A_local_->supportsRowViews ()) {
       const size_t maxnz = A_local_->getNodeMaxNumRowEntries ();
-      ColIndicesARCP.resize (maxnz);
-      ColValuesARCP.resize (maxnz);
+      Kokkos::resize(ColIndicesARCP,maxnz);
+      Kokkos::resize(ColValuesARCP,maxnz);
     }
 
     for (local_ordinal_type row_i = 0 ; row_i < myNumRows ; ++row_i) {
-      ArrayView<const local_ordinal_type> ColIndicesA;
-      ArrayView<const scalar_type> ColValuesA;
+      local_inds_host_view_type  ColIndicesA;
+      values_host_view_type ColValuesA;
       size_t RowNnz;
 
       if (A_local_->supportsRowViews ()) {
@@ -510,9 +509,9 @@ void ILUT<MatrixType>::compute ()
         RowNnz = ColIndicesA.size ();
       }
       else {
-        A_local_->getLocalRowCopy (row_i, ColIndicesARCP (), ColValuesARCP (), RowNnz);
-        ColIndicesA = ColIndicesARCP (0, RowNnz);
-        ColValuesA = ColValuesARCP (0, RowNnz);
+        A_local_->getLocalRowCopy (row_i, ColIndicesARCP, ColValuesARCP, RowNnz);
+        ColIndicesA = Kokkos::subview(ColIndicesARCP,std::make_pair((size_t)0, RowNnz));
+        ColValuesA  = Kokkos::subview(ColValuesARCP,std::make_pair((size_t)0, RowNnz));
       }
 
       // Always include the diagonal in the U factor. The value should get
@@ -612,7 +611,7 @@ void ILUT<MatrixType>::compute ()
       // Put indices and values for L into arrays and then into the L_ matrix.
 
       //   first, the original entries from the L section of A:
-      for (size_type i = 0; i < ColIndicesA.size (); ++i) {
+      for (size_type i = 0; i < (size_type)ColIndicesA.size (); ++i) {
         if (ColIndicesA[i] < row_i) {
           L_tmp_idx[row_i].push_back(ColIndicesA[i]);
           L_tmpv[row_i].push_back(cur_row[ColIndicesA[i]]);
diff --git a/packages/ifpack2/src/Ifpack2_IlukGraph.hpp b/packages/ifpack2/src/Ifpack2_IlukGraph.hpp
index 9242bb5c70b0..cac7ae4a167b 100644
--- a/packages/ifpack2/src/Ifpack2_IlukGraph.hpp
+++ b/packages/ifpack2/src/Ifpack2_IlukGraph.hpp
@@ -58,6 +58,7 @@
 #include <Teuchos_ParameterList.hpp>
 #include <Teuchos_CommHelpers.hpp>
 #include <Tpetra_CrsGraph.hpp>
+#include <Tpetra_Details_WrappedDualView.hpp>
 #include <Tpetra_Import.hpp>
 #include <Ifpack2_CreateOverlapGraph.hpp>
 #include <Ifpack2_Parameters.hpp>
@@ -111,6 +112,13 @@ class IlukGraph : public Teuchos::Describable {
                            global_ordinal_type,
                            node_type> crs_graph_type;
 
+
+
+  typedef typename crs_graph_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename crs_graph_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename crs_graph_type::global_inds_host_view_type global_inds_host_view_type;
+  typedef typename crs_graph_type::local_inds_host_view_type  local_inds_host_view_type;
+
   /// \brief Constructor.
   ///
   /// Create a IlukGraph object using the input graph and specified
@@ -280,34 +288,39 @@ void IlukGraph<GraphType, KKHandleType>::initialize()
 
   using device_type = typename node_type::device_type;
   using execution_space = typename device_type::execution_space;
-  Kokkos::DualView<size_t*, device_type> numEntPerRow("numEntPerRow", NumMyRows);
-  auto numEntPerRow_d = numEntPerRow.template view<device_type>();
-  auto localOverlapGraph = OverlapGraph_->getLocalGraph();
+  using dual_view_type = Kokkos::DualView<size_t*,device_type>;
+  dual_view_type numEntPerRow_dv("numEntPerRow",NumMyRows);
+  Tpetra::Details::WrappedDualView<dual_view_type> numEntPerRow(numEntPerRow_dv);
 
   const auto overalloc = Overalloc_;
   const auto levelfill = LevelFill_;
-  numEntPerRow.sync_device();
-  numEntPerRow.modify_device();
-  Kokkos::parallel_for("CountOverlapGraphRowEntries",
-    Kokkos::RangePolicy<execution_space>(0, NumMyRows),
-    KOKKOS_LAMBDA(const int i)
-    {
-      // Heuristic to get the maximum number of entries per row.
-      int RowMaxNumIndices = localOverlapGraph.rowConst(i).length;
-      numEntPerRow_d(i) = (levelfill == 0) ? RowMaxNumIndices  // No additional storage needed
-                                  : ceil(static_cast<double>(RowMaxNumIndices) 
-                                        * pow(overalloc, levelfill));
-    });
+  {
+    // Scope the device view and the local overlap graph handle so that both
+    // go out of scope at the closing brace, before the host-side work below.
+    auto numEntPerRow_d = numEntPerRow.getDeviceView(Tpetra::Access::OverwriteAll);
+    auto localOverlapGraph = OverlapGraph_->getLocalGraphDevice();
+    Kokkos::parallel_for("CountOverlapGraphRowEntries",
+                         Kokkos::RangePolicy<execution_space>(0, NumMyRows),
+                         KOKKOS_LAMBDA(const int i)
+                         {
+                           // Heuristic to get the maximum number of entries per row.
+                           int RowMaxNumIndices = localOverlapGraph.rowConst(i).length;
+                           numEntPerRow_d(i) = (levelfill == 0) ? RowMaxNumIndices  // No additional storage needed
+                             : ceil(static_cast<double>(RowMaxNumIndices)
+                                    * pow(overalloc, levelfill));
+                         });
+  }
 
   bool insertError;  // No error found yet while inserting entries
   do {
     insertError = false;
+    Teuchos::ArrayView<const size_t> a_numEntPerRow(numEntPerRow.getHostView(Tpetra::Access::ReadOnly).data(),NumMyRows);
     L_Graph_ = rcp (new crs_graph_type (OverlapGraph_->getRowMap (),
                                         OverlapGraph_->getRowMap (),
-                                        numEntPerRow));
+                                        a_numEntPerRow));
     U_Graph_ = rcp (new crs_graph_type (OverlapGraph_->getRowMap (),
                                         OverlapGraph_->getRowMap (),
-                                        numEntPerRow));
+                                        a_numEntPerRow));
 
     Array<local_ordinal_type> L (MaxNumIndices);
     Array<local_ordinal_type> U (MaxNumIndices);
@@ -317,7 +330,7 @@ void IlukGraph<GraphType, KKHandleType>::initialize()
     NumMyDiagonals_ = 0;
 
     for (int i = 0; i< NumMyRows; ++i) {
-      ArrayView<const local_ordinal_type> my_indices;
+      local_inds_host_view_type my_indices;
       OverlapGraph_->getLocalRowView (i, my_indices);
 
       // Split into L and U (we don't assume that indices are ordered).
@@ -352,12 +365,10 @@ void IlukGraph<GraphType, KKHandleType>::initialize()
         ++NumMyDiagonals_;
       }
       if (NumL) {
-        ArrayView<const local_ordinal_type> L_view = L.view (0, NumL);
-        L_Graph_->insertLocalIndices (i, L_view);
+        L_Graph_->insertLocalIndices (i, NumL, L.data());
       }
       if (NumU) {
-        ArrayView<const local_ordinal_type> U_view = U.view (0, NumU);
-        U_Graph_->insertLocalIndices (i, U_view);
+        U_Graph_->insertLocalIndices (i, NumU, U.data());
       }
     }
 
@@ -394,16 +405,16 @@ void IlukGraph<GraphType, KKHandleType>::initialize()
           size_t LenL = L_Graph_->getNumEntriesInLocalRow(i);
           size_t LenU = U_Graph_->getNumEntriesInLocalRow(i);
           size_t Len = LenL + LenU + 1;
-
           CurrentRow.resize(Len);
-
-          L_Graph_->getLocalRowCopy(i, CurrentRow(), LenL);  // Get L Indices
+          nonconst_local_inds_host_view_type CurrentRow_view(CurrentRow.data(),CurrentRow.size());
+          L_Graph_->getLocalRowCopy(i, CurrentRow_view, LenL);  // Get L Indices
           CurrentRow[LenL] = i;                              // Put in Diagonal
           if (LenU > 0) {
-            ArrayView<local_ordinal_type> URowView = CurrentRow.view (LenL+1,
-                                                                      LenU);
+            ArrayView<local_ordinal_type> URowView = CurrentRow.view (LenL+1,LenU);            
+            nonconst_local_inds_host_view_type URowView_v(URowView.data(),URowView.size());
+
             // Get U Indices
-            U_Graph_->getLocalRowCopy (i, URowView, LenU);
+            U_Graph_->getLocalRowCopy (i, URowView_v, LenU);
           }
 
           // Construct linked list for current row
@@ -425,7 +436,7 @@ void IlukGraph<GraphType, KKHandleType>::initialize()
             int NextInList = LinkList[Next];
             int RowU = Next;
             // Get Indices for this row of U
-            ArrayView<const local_ordinal_type> IndicesU;
+            local_inds_host_view_type IndicesU;
             U_Graph_->getLocalRowView (RowU, IndicesU);
             // FIXME (mfh 23 Dec 2013) size() returns ptrdiff_t, not int.
             int LengthRowU = IndicesU.size ();
@@ -464,15 +475,13 @@ void IlukGraph<GraphType, KKHandleType>::initialize()
           }
 
           // Put pattern into L and U
-
-          CurrentRow.resize (0);
+          CurrentRow.resize(0);
 
           Next = First;
 
           // Lower
-
           while (Next < i) {
-            CurrentRow.push_back (Next);
+            CurrentRow.push_back(Next);
             Next = LinkList[Next];
           }
 
@@ -481,7 +490,7 @@ void IlukGraph<GraphType, KKHandleType>::initialize()
           // particular, it does not actually change the column Map.
           L_Graph_->removeLocalIndices (i); // Delete current set of Indices
           if (CurrentRow.size() > 0) {
-            L_Graph_->insertLocalIndices (i, CurrentRow ());
+            L_Graph_->insertLocalIndices (i, CurrentRow.size(),CurrentRow.data());
           }
 
           // Diagonal
@@ -494,8 +503,7 @@ void IlukGraph<GraphType, KKHandleType>::initialize()
           Next = LinkList[Next];
 
           // Upper
-
-          CurrentRow.resize (0);
+          CurrentRow.resize(0);
           LenU = 0;
 
           while (Next < NumMyRows) {
@@ -511,7 +519,7 @@ void IlukGraph<GraphType, KKHandleType>::initialize()
 
           U_Graph_->removeLocalIndices (i); // Delete current set of Indices
           if (LenU > 0) {
-            U_Graph_->insertLocalIndices (i, CurrentRow ());
+            U_Graph_->insertLocalIndices (i, CurrentRow.size(),CurrentRow.data());
           }
 
           // Allocate and fill Level info for this row
@@ -523,8 +531,7 @@ void IlukGraph<GraphType, KKHandleType>::initialize()
       }
       catch (std::runtime_error &e) {
         insertError = true;
-        numEntPerRow.sync_device();
-        numEntPerRow.modify_device();
+        auto numEntPerRow_d = numEntPerRow.getDeviceView(Tpetra::Access::OverwriteAll);
         Kokkos::parallel_for("CountOverlapGraphRowEntries",
           Kokkos::RangePolicy<execution_space>(0, NumMyRows),
           KOKKOS_LAMBDA(const int i)
@@ -564,11 +571,11 @@ void IlukGraph<GraphType, KKHandleType>::initialize(const Teuchos::RCP<KKHandleT
   using Teuchos::REDUCE_SUM;
   using Teuchos::reduceAll;
 
-  typedef typename crs_graph_type::local_graph_type local_graph_type;
-  typedef typename local_graph_type::size_type      size_type;
-  typedef typename local_graph_type::data_type      data_type;
-  typedef typename local_graph_type::array_layout   array_layout;
-  typedef typename local_graph_type::device_type    device_type;
+  typedef typename crs_graph_type::local_graph_device_type local_graph_device_type;
+  typedef typename local_graph_device_type::size_type      size_type;
+  typedef typename local_graph_device_type::data_type      data_type;
+  typedef typename local_graph_device_type::array_layout   array_layout;
+  typedef typename local_graph_device_type::device_type    device_type;
 
   typedef typename Kokkos::View<size_type*, array_layout, device_type> lno_row_view_t;
   typedef typename Kokkos::View<data_type*, array_layout, device_type> lno_nonzero_view_t;
@@ -578,7 +585,7 @@ void IlukGraph<GraphType, KKHandleType>::initialize(const Teuchos::RCP<KKHandleT
   // FIXME (mfh 23 Dec 2013) Use size_t or whatever
   // getNodeNumElements() returns, instead of ptrdiff_t.
   const int NumMyRows = OverlapGraph_->getRowMap()->getNodeNumElements();
-  auto localOverlapGraph = OverlapGraph_->getLocalGraph();
+  auto localOverlapGraph = OverlapGraph_->getLocalGraphDevice();
 
   if (KernelHandle->get_spiluk_handle()->get_nrows() < static_cast<size_type>(NumMyRows)) {
     KernelHandle->get_spiluk_handle()->reset_handle(NumMyRows,
diff --git a/packages/ifpack2/src/Ifpack2_LinePartitioner_decl.hpp b/packages/ifpack2/src/Ifpack2_LinePartitioner_decl.hpp
index bb882a8a810b..1678526e5391 100644
--- a/packages/ifpack2/src/Ifpack2_LinePartitioner_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_LinePartitioner_decl.hpp
@@ -82,6 +82,8 @@ class LinePartitioner : public OverlappingPartitioner<GraphType> {
   typedef Tpetra::RowGraph<local_ordinal_type, global_ordinal_type, node_type>  row_graph_type;
   typedef Tpetra::MultiVector<double,local_ordinal_type, global_ordinal_type, node_type>  multivector_type;
 
+  typedef typename row_graph_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename row_graph_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
 
   //! Constructor.
   LinePartitioner(const Teuchos::RCP<const row_graph_type>& graph);
diff --git a/packages/ifpack2/src/Ifpack2_LinePartitioner_def.hpp b/packages/ifpack2/src/Ifpack2_LinePartitioner_def.hpp
index 7bf97cc206e9..74fa0f10b927 100644
--- a/packages/ifpack2/src/Ifpack2_LinePartitioner_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_LinePartitioner_def.hpp
@@ -122,7 +122,7 @@ int LinePartitioner<GraphType,Scalar>::Compute_Blocks_AutoLine(Teuchos::ArrayVie
   size_t N               = this->Graph_->getNodeNumRows();
   size_t allocated_space = this->Graph_->getNodeMaxNumRowEntries();
 
-  Teuchos::Array<LO>     cols(allocated_space);
+  nonconst_local_inds_host_view_type cols("cols",allocated_space);
   Teuchos::Array<LO>     indices(allocated_space);
   Teuchos::Array<double> dist(allocated_space);
 
@@ -137,7 +137,7 @@ int LinePartitioner<GraphType,Scalar>::Compute_Blocks_AutoLine(Teuchos::ArrayVie
     if(blockIndices[i] != invalid) continue;
 
     // Get neighbors and sort by distance
-    this->Graph_->getLocalRowCopy(i,cols(),nz);
+    this->Graph_->getLocalRowCopy(i,cols,nz);
     double x0 = (!xvals.is_null()) ? xvals[i/NumEqns_] : zero;
     double y0 = (!yvals.is_null()) ? yvals[i/NumEqns_] : zero;
     double z0 = (!zvals.is_null()) ? zvals[i/NumEqns_] : zero;
@@ -190,7 +190,8 @@ void LinePartitioner<GraphType,Scalar>::local_automatic_line_search(int NumEqns,
 
   size_t N               = this->Graph_->getNodeNumRows();
   size_t allocated_space = this->Graph_->getNodeMaxNumRowEntries();
-  Teuchos::ArrayView<LO>     cols    = itemp();
+
+  nonconst_local_inds_host_view_type cols(itemp.data(),allocated_space);
   Teuchos::ArrayView<LO>     indices = itemp.view(allocated_space,allocated_space);
   Teuchos::ArrayView<double> dist= dtemp();
 
@@ -199,7 +200,7 @@ void LinePartitioner<GraphType,Scalar>::local_automatic_line_search(int NumEqns,
     size_t nz=0;
     LO neighbors_in_line=0;
 
-    this->Graph_->getLocalRowCopy(next,cols(),nz);
+    this->Graph_->getLocalRowCopy(next,cols,nz);
     double x0 = (!xvals.is_null()) ? xvals[next/NumEqns_] : zero;
     double y0 = (!yvals.is_null()) ? yvals[next/NumEqns_] : zero;
     double z0 = (!zvals.is_null()) ? zvals[next/NumEqns_] : zero;
diff --git a/packages/ifpack2/src/Ifpack2_LocalFilter_decl.hpp b/packages/ifpack2/src/Ifpack2_LocalFilter_decl.hpp
index 4b6bf895def3..444f0d12cb43 100644
--- a/packages/ifpack2/src/Ifpack2_LocalFilter_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_LocalFilter_decl.hpp
@@ -189,9 +189,21 @@ class LocalFilter :
   //! The Node type used by the input MatrixType.
   typedef typename MatrixType::node_type node_type;
 
+
+  typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type;
+  typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type;
+  typedef typename MatrixType::values_host_view_type values_host_view_type;
+
+  typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type;
+
+
   //! The type of the magnitude (absolute value) of a matrix entry.
   typedef typename Teuchos::ScalarTraits<scalar_type>::magnitudeType magnitude_type;
 
+
+
   //! Type of the Tpetra::RowMatrix specialization that this class uses.
   typedef Tpetra::RowMatrix<scalar_type,
                             local_ordinal_type,
@@ -329,11 +341,18 @@ class LocalFilter :
   /// process, then \c Indices and \c Values are unchanged and
   /// \c NumIndices is <tt>Teuchos::OrdinalTraits<size_t>::invalid()</tt>
   /// on output.
+  virtual void
+  getGlobalRowCopy (global_ordinal_type GlobalRow,
+                   nonconst_global_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void
   getGlobalRowCopy (global_ordinal_type GlobalRow,
                     const Teuchos::ArrayView<global_ordinal_type> &Indices,
                     const Teuchos::ArrayView<scalar_type> &Values,
                     size_t &NumEntries) const;
+#endif
 
   /// \brief Get the entries in the given row, using local indices.
   ///
@@ -348,12 +367,19 @@ class LocalFilter :
   /// process, then \c Indices and \c Values are unchanged and
   /// \c NumIndices is <tt>Teuchos::OrdinalTraits<size_t>::invalid()</tt>
   /// on output.
+  virtual void
+  getLocalRowCopy (local_ordinal_type LocalRow,
+                   nonconst_local_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+
   virtual void
   getLocalRowCopy (local_ordinal_type LocalRow,
                    const Teuchos::ArrayView<local_ordinal_type> &Indices,
                    const Teuchos::ArrayView<scalar_type> &Values,
                    size_t &NumEntries) const ;
-
+#endif
   //! Extract a const, non-persisting view of global indices in a specified row of the matrix.
   /*!
     \param GlobalRow [in] Global row number for which indices are desired.
@@ -365,10 +391,16 @@ class LocalFilter :
 
     Note: If \c GlobalRow does not belong to this node, then \c indices is set to null.
   */
+  virtual void
+  getGlobalRowView (global_ordinal_type GlobalRow,
+                    global_inds_host_view_type &indices,
+                    values_host_view_type &values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void
   getGlobalRowView (global_ordinal_type GlobalRow,
                     Teuchos::ArrayView<const global_ordinal_type> &indices,
                     Teuchos::ArrayView<const scalar_type> &values) const;
+#endif
 
   //! Extract a const, non-persisting view of local indices in a specified row of the matrix.
   /*!
@@ -381,11 +413,16 @@ class LocalFilter :
 
     Note: If \c LocalRow does not belong to this node, then \c indices is set to null.
   */
+  virtual void
+  getLocalRowView (local_ordinal_type LocalRow,
+    local_inds_host_view_type & indices,
+    values_host_view_type & values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void
   getLocalRowView (local_ordinal_type LocalRow,
                    Teuchos::ArrayView<const local_ordinal_type> &indices,
                    Teuchos::ArrayView<const scalar_type> &values) const;
-
+#endif
   /// \brief Get the diagonal entries of the (locally filtered) matrix.
   ///
   /// \param diag [in/out] On input: a Tpetra::Vector whose Map is the
@@ -508,11 +545,12 @@ class LocalFilter :
   //! NumEntries_[i] contains the nonzero entries in row `i'.
   std::vector<size_t> NumEntries_;
 
-  //! Temporary array used in getLocalRowCopy().
-  mutable Teuchos::Array<local_ordinal_type> localIndices_;
+  //! Temporary index array used in getLocalRowCopy(), to avoid allocation each time.
+  mutable nonconst_local_inds_host_view_type localIndices_;
+  mutable nonconst_local_inds_host_view_type localIndicesForGlobalCopy_;
+  //! Temporary value array used in getLocalRowCopy(), to avoid allocation each time.
+  mutable nonconst_values_host_view_type Values_;
 
-  //! Temporary array used in getLocalRowCopy().
-  mutable Teuchos::Array<scalar_type> Values_;
 };// class LocalFilter
 
 }// namespace Ifpack2
diff --git a/packages/ifpack2/src/Ifpack2_LocalFilter_def.hpp b/packages/ifpack2/src/Ifpack2_LocalFilter_def.hpp
index 5cd4b100cca7..edadd5e192c1 100644
--- a/packages/ifpack2/src/Ifpack2_LocalFilter_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_LocalFilter_def.hpp
@@ -177,8 +177,9 @@ LocalFilter (const Teuchos::RCP<const row_matrix_type>& A) :
   MaxNumEntriesA_ = A_->getNodeMaxNumRowEntries ();
 
   // Allocate temporary arrays for getLocalRowCopy().
-  localIndices_.resize (MaxNumEntries_);
-  Values_.resize (MaxNumEntries_);
+  Kokkos::resize(localIndices_,MaxNumEntries_);
+  Kokkos::resize(localIndicesForGlobalCopy_,MaxNumEntries_);
+  Kokkos::resize(Values_,MaxNumEntries_);
 
   // now compute:
   // - the number of nonzero per row
@@ -427,10 +428,10 @@ bool LocalFilter<MatrixType>::isFillComplete () const
 template<class MatrixType>
 void
 LocalFilter<MatrixType>::
-getGlobalRowCopy (global_ordinal_type globalRow,
-                  const Teuchos::ArrayView<global_ordinal_type>& globalIndices,
-                  const Teuchos::ArrayView<scalar_type>& values,
-                  size_t& numEntries) const
+  getGlobalRowCopy (global_ordinal_type globalRow,
+                   nonconst_global_inds_host_view_type &globalIndices,
+                   nonconst_values_host_view_type &values,
+                   size_t& numEntries) const
 {
   typedef local_ordinal_type LO;
   typedef typename Teuchos::Array<LO>::size_type size_type;
@@ -452,29 +453,44 @@ getGlobalRowCopy (global_ordinal_type globalRow,
     // FIXME (mfh 26 Mar 2014) If local_ordinal_type ==
     // global_ordinal_type, we could just alias the input array
     // instead of allocating a temporary array.
-    Teuchos::Array<LO> localIndices (numEntries);
-    this->getLocalRowCopy (localRow, localIndices (), values, numEntries);
+
+    // getLocalRowCopy itself uses localIndices_ as scratch space, so pass a second temporary array here.
+    this->getLocalRowCopy (localRow, localIndicesForGlobalCopy_, values, numEntries);
 
     const map_type& colMap = * (this->getColMap ());
 
     // Don't fill the output array beyond its size.
     const size_type numEnt =
       std::min (static_cast<size_type> (numEntries),
-                std::min (globalIndices.size (), values.size ()));
+                std::min ((size_type)globalIndices.size (), (size_type)values.size ()));
     for (size_type k = 0; k < numEnt; ++k) {
-      globalIndices[k] = colMap.getGlobalElement (localIndices[k]);
+      globalIndices[k] = colMap.getGlobalElement (localIndicesForGlobalCopy_[k]);
     }
   }
 }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+template<class MatrixType>
+void
+LocalFilter<MatrixType>::
+getGlobalRowCopy (global_ordinal_type globalRow,
+                  const Teuchos::ArrayView<global_ordinal_type>& Indices,
+                  const Teuchos::ArrayView<scalar_type>& Values,
+                  size_t& numEntries) const {
+  using IST = typename row_matrix_type::impl_scalar_type;
+  nonconst_global_inds_host_view_type ind_in(Indices.data(),Indices.size());
+  nonconst_values_host_view_type val_in(reinterpret_cast<IST*>(Values.data()),Values.size());
+  getGlobalRowCopy(globalRow,ind_in,val_in,numEntries);  
+}
+#endif
 
 template<class MatrixType>
 void
 LocalFilter<MatrixType>::
 getLocalRowCopy (local_ordinal_type LocalRow,
-                 const Teuchos::ArrayView<local_ordinal_type> &Indices,
-                 const Teuchos::ArrayView<scalar_type> &Values,
-                 size_t &NumEntries) const
+                 nonconst_local_inds_host_view_type &Indices,
+                 nonconst_values_host_view_type &Values,
+                 size_t& NumEntries) const
 {
   typedef local_ordinal_type LO;
   typedef global_ordinal_type GO;
@@ -486,7 +502,7 @@ getLocalRowCopy (local_ordinal_type LocalRow,
   }
 
   if (A_->getRowMap()->getComm()->getSize() == 1) {
-    A_->getLocalRowCopy (LocalRow, Indices (), Values (), NumEntries);
+    A_->getLocalRowCopy (LocalRow, Indices, Values, NumEntries);
     return;
   }
 
@@ -524,7 +540,7 @@ getLocalRowCopy (local_ordinal_type LocalRow,
   // column indices.  CrsMatrix could take a set of column indices,
   // and return their corresponding values.
   size_t numEntInMat = 0;
-  A_->getLocalRowCopy (LocalRow, localIndices_ (), Values_ (), numEntInMat);
+  A_->getLocalRowCopy (LocalRow, localIndices_, Values_ , numEntInMat);
 
   // Fill the user's arrays with the "local" indices and values in
   // that row.  Note that the matrix might have a different column Map
@@ -573,7 +589,35 @@ getLocalRowCopy (local_ordinal_type LocalRow,
   }
 }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+template<class MatrixType>
+void
+LocalFilter<MatrixType>::
+getLocalRowCopy (local_ordinal_type globalRow,
+                 const Teuchos::ArrayView<local_ordinal_type> &Indices,
+                 const Teuchos::ArrayView<scalar_type> &Values,
+             size_t &NumEntries) const
+{
+  using IST = typename row_matrix_type::impl_scalar_type;
+  nonconst_local_inds_host_view_type ind_in(Indices.data(),Indices.size());
+  nonconst_values_host_view_type val_in(reinterpret_cast<IST*>(Values.data()),Values.size());
+  getLocalRowCopy(globalRow,ind_in,val_in,NumEntries);  
+}
+#endif
+
 
+template<class MatrixType>
+void
+LocalFilter<MatrixType>::
+getGlobalRowView (global_ordinal_type /*GlobalRow*/,
+                    global_inds_host_view_type &/*indices*/,
+                    values_host_view_type &/*values*/) const 
+{
+  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
+    "Ifpack2::LocalFilter does not implement getGlobalRowView.");
+}
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void
 LocalFilter<MatrixType>::
@@ -584,8 +628,21 @@ getGlobalRowView (global_ordinal_type /* GlobalRow */,
   TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
     "Ifpack2::LocalFilter does not implement getGlobalRowView.");
 }
+#endif
+
+template<class MatrixType>
+void
+LocalFilter<MatrixType>::
+getLocalRowView (local_ordinal_type /*LocalRow*/,
+    local_inds_host_view_type &/*indices*/,
+    values_host_view_type &/*values*/) const 
+{
+  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
+    "Ifpack2::LocalFilter does not implement getLocalRowView.");
+}
 
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void
 LocalFilter<MatrixType>::
@@ -596,6 +653,7 @@ getLocalRowView (local_ordinal_type /* LocalRow */,
   TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
     "Ifpack2::LocalFilter does not implement getLocalRowView.");
 }
+#endif
 
 
 template<class MatrixType>
@@ -738,13 +796,14 @@ applyNonAliased (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global
     for (size_t i = 0; i < numRows; ++i) {
       size_t Nnz;
       // Use this class's getrow to make the below code simpler
-      getLocalRowCopy (i, localIndices_ (), Values_ (), Nnz);
+      getLocalRowCopy (i, localIndices_ , Values_ , Nnz);
+      scalar_type* Values = reinterpret_cast<scalar_type*>(Values_.data());
       if (mode == Teuchos::NO_TRANS) {
         for (size_t j = 0; j < Nnz; ++j) {
           const local_ordinal_type col = localIndices_[j];
           for (size_t k = 0; k < NumVectors; ++k) {
             y_ptr[i + y_stride*k] +=
-              alpha * Values_[j] * x_ptr[col + x_stride*k];
+              alpha * Values[j] * x_ptr[col + x_stride*k];
           }
         }
       }
@@ -753,7 +812,7 @@ applyNonAliased (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global
           const local_ordinal_type col = localIndices_[j];
           for (size_t k = 0; k < NumVectors; ++k) {
             y_ptr[col + y_stride*k] +=
-              alpha * Values_[j] * x_ptr[i + x_stride*k];
+              alpha * Values[j] * x_ptr[i + x_stride*k];
           }
         }
       }
@@ -762,7 +821,7 @@ applyNonAliased (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global
           const local_ordinal_type col = localIndices_[j];
           for (size_t k = 0; k < NumVectors; ++k) {
             y_ptr[col + y_stride*k] +=
-              alpha * STS::conjugate (Values_[j]) * x_ptr[i + x_stride*k];
+              alpha * STS::conjugate (Values[j]) * x_ptr[i + x_stride*k];
           }
         }
       }
@@ -777,13 +836,14 @@ applyNonAliased (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global
     for (size_t i = 0; i < numRows; ++i) {
       size_t Nnz;
       // Use this class's getrow to make the below code simpler
-      getLocalRowCopy (i, localIndices_ (), Values_ (), Nnz);
+      getLocalRowCopy (i, localIndices_ , Values_ , Nnz);
+      scalar_type* Values = reinterpret_cast<scalar_type*>(Values_.data());
       if (mode == Teuchos::NO_TRANS) {
         for (size_t k = 0; k < NumVectors; ++k) {
           ArrayView<const scalar_type> x_local = (x_ptr())[k]();
           ArrayView<scalar_type>       y_local = (y_ptr())[k]();
           for (size_t j = 0; j < Nnz; ++j) {
-            y_local[i] += alpha * Values_[j] * x_local[localIndices_[j]];
+            y_local[i] += alpha * Values[j] * x_local[localIndices_[j]];
           }
         }
       }
@@ -792,7 +852,7 @@ applyNonAliased (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global
           ArrayView<const scalar_type> x_local = (x_ptr())[k]();
           ArrayView<scalar_type>       y_local = (y_ptr())[k]();
           for (size_t j = 0; j < Nnz; ++j) {
-            y_local[localIndices_[j]] += alpha * Values_[j] * x_local[i];
+            y_local[localIndices_[j]] += alpha * Values[j] * x_local[i];
           }
         }
       }
@@ -802,7 +862,7 @@ applyNonAliased (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global
           ArrayView<scalar_type>       y_local = (y_ptr())[k]();
           for (size_t j = 0; j < Nnz; ++j) {
             y_local[localIndices_[j]] +=
-              alpha * STS::conjugate (Values_[j]) * x_local[i];
+              alpha * STS::conjugate (Values[j]) * x_local[i];
           }
         }
       }
@@ -839,15 +899,15 @@ LocalFilter<MatrixType>::getFrobeniusNorm () const
   typedef typename Teuchos::Array<scalar_type>::size_type size_type;
 
   const size_type maxNumRowEnt = getNodeMaxNumRowEntries ();
-  Teuchos::Array<local_ordinal_type> ind (maxNumRowEnt);
-  Teuchos::Array<scalar_type> val (maxNumRowEnt);
+  nonconst_local_inds_host_view_type ind ("ind",maxNumRowEnt);
+  nonconst_values_host_view_type val ("val",maxNumRowEnt);
   const size_t numRows = static_cast<size_t> (localRowMap_->getNodeNumElements ());
 
   // FIXME (mfh 03 Apr 2013) Scale during sum to avoid overflow.
   mag_type sumSquared = STM::zero ();
   for (size_t i = 0; i < numRows; ++i) {
     size_t numEntries = 0;
-    this->getLocalRowCopy (i, ind (), val (), numEntries);
+    this->getLocalRowCopy (i, ind, val, numEntries);
     for (size_type k = 0; k < static_cast<size_type> (numEntries); ++k) {
       const mag_type v_k_abs = STS::magnitude (val[k]);
       sumSquared += v_k_abs * v_k_abs;
diff --git a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_decl.hpp b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_decl.hpp
index 0cab7ef2e6d6..22e4c368c8db 100644
--- a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_decl.hpp
@@ -115,14 +115,14 @@ class LocalSparseTriangularSolver :
                  "either a RowMatrix or a CrsMatrix just fine.");
 
   // Use the local matrix types
-  using local_matrix_type = typename crs_matrix_type::local_matrix_type;
-  using local_matrix_graph_type = typename local_matrix_type::StaticCrsGraphType;
-  using lno_row_view_t = typename local_matrix_graph_type::row_map_type;
-  using lno_nonzero_view_t = typename local_matrix_graph_type::entries_type;
-  using scalar_nonzero_view_t = typename local_matrix_type::values_type;
-  using TemporaryMemorySpace = typename local_matrix_graph_type::device_type::memory_space;
-  using PersistentMemorySpace = typename local_matrix_graph_type::device_type::memory_space;
-  using HandleExecSpace = typename local_matrix_graph_type::device_type::execution_space;
+  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
+  using local_matrix_graph_device_type = typename local_matrix_device_type::StaticCrsGraphType;
+  using lno_row_view_t = typename local_matrix_graph_device_type::row_map_type;
+  using lno_nonzero_view_t = typename local_matrix_graph_device_type::entries_type;
+  using scalar_nonzero_view_t = typename local_matrix_device_type::values_type;
+  using TemporaryMemorySpace = typename local_matrix_graph_device_type::device_type::memory_space;
+  using PersistentMemorySpace = typename local_matrix_graph_device_type::device_type::memory_space;
+  using HandleExecSpace = typename local_matrix_graph_device_type::device_type::execution_space;
   using k_handle = typename KokkosKernels::Experimental::KokkosKernelsHandle<typename lno_row_view_t::const_value_type, typename lno_nonzero_view_t::const_value_type, typename scalar_nonzero_view_t::value_type, HandleExecSpace, TemporaryMemorySpace,PersistentMemorySpace >;
 
   /// \brief Constructor
diff --git a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp
index 8851131e6236..023364dfa677 100644
--- a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp
@@ -377,7 +377,7 @@ initialize ()
   using Tpetra::Details::determineLocalTriangularStructure;
   using crs_matrix_type = Tpetra::CrsMatrix<scalar_type, local_ordinal_type,
     global_ordinal_type, node_type>;
-  using local_matrix_type = typename crs_matrix_type::local_matrix_type;
+  using local_matrix_type = typename crs_matrix_type::local_matrix_device_type;
   using LO = local_ordinal_type;
 
   const char prefix[] = "Ifpack2::LocalSparseTriangularSolver::initialize: ";
@@ -411,7 +411,7 @@ initialize ()
   // mfh 30 Apr 2018: See GitHub Issue #2658.
   constexpr bool ignoreMapsForTriStructure = true;
   auto lclTriStructure = [&] {
-    auto lclMatrix = A_crs_->getLocalMatrix ();
+    auto lclMatrix = A_crs_->getLocalMatrixDevice ();
     auto lclRowMap = A_crs_->getRowMap ()->getLocalMap ();
     auto lclColMap = A_crs_->getColMap ()->getLocalMap ();
     auto lclTriStruct =
@@ -429,7 +429,7 @@ initialize ()
   if (reverseStorage_ && lclTriStructure.couldBeUpperTriangular &&
       htsImpl_.is_null ()) {
     // Reverse the storage for an upper triangular matrix
-    auto Alocal = A_crs_->getLocalMatrix();
+    auto Alocal = A_crs_->getLocalMatrixDevice();
     auto ptr    = Alocal.graph.row_map;
     auto ind    = Alocal.graph.entries;
     auto val    = Alocal.values;
@@ -563,7 +563,7 @@ compute ()
   if (Teuchos::nonnull(kh_) && this->isKokkosKernelsSptrsv_)
   {
     auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type> (A_);
-    auto Alocal = A_crs->getLocalMatrix();
+    auto Alocal = A_crs->getLocalMatrixDevice();
     auto ptr    = Alocal.graph.row_map;
     auto ind    = Alocal.graph.entries;
     auto val    = Alocal.values;
@@ -731,7 +731,7 @@ localTriangularSolve (const MV& Y,
   if (Teuchos::nonnull(kh_) && this->isKokkosKernelsSptrsv_ && trans == "N")
   {
     auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type> (this->A_);
-    auto A_lclk = A_crs->getLocalMatrix ();
+    auto A_lclk = A_crs->getLocalMatrixDevice ();
     auto ptr    = A_lclk.graph.row_map;
     auto ind    = A_lclk.graph.entries;
     auto val    = A_lclk.values;
@@ -753,7 +753,11 @@ localTriangularSolve (const MV& Y,
   else
   {
     const std::string diag = this->diag_;
-    auto A_lcl = this->A_crs_->getLocalMatrix ();
+    // NOTE (mfh 20 Aug 2017): KokkosSparse::trsv currently is a
+    // sequential, host-only code.  See
+    // https://github.com/kokkos/kokkos-kernels/issues/48.  Hence we
+    // request the host version of the local matrix here.
+    auto A_lcl = this->A_crs_->getLocalMatrixHost ();
 
     if (X.isConstantStride () && Y.isConstantStride ()) {
       auto X_lcl = X.getLocalViewHost (Tpetra::Access::ReadWrite);
diff --git a/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_decl.hpp b/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_decl.hpp
index 1b8dcba6995a..6578a7256bc4 100644
--- a/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_decl.hpp
@@ -80,6 +80,9 @@ class OverlappingPartitioner : public Partitioner<GraphType> {
   typedef typename GraphType::local_ordinal_type local_ordinal_type;
   typedef typename GraphType::global_ordinal_type global_ordinal_type;
   typedef typename GraphType::node_type node_type;
+  typedef typename GraphType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename GraphType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+ 
   typedef Tpetra::RowGraph<local_ordinal_type, global_ordinal_type, node_type> row_graph_type;
 
   //! Constructor.
diff --git a/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_def.hpp b/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_def.hpp
index 058097930b33..1d732ccf911f 100644
--- a/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_OverlappingPartitioner_def.hpp
@@ -309,10 +309,8 @@ void OverlappingPartitioner<GraphType>::computeOverlappingPartitions()
     // of row `i'.
 
     int MaxNumEntries_tmp = Graph_->getNodeMaxNumRowEntries();
-    Teuchos::Array<local_ordinal_type> Indices;
-    Indices.resize (MaxNumEntries_tmp);
-    Teuchos::Array<local_ordinal_type> newIndices;
-    newIndices.resize(MaxNumEntries_tmp);
+    nonconst_local_inds_host_view_type Indices("Indices",MaxNumEntries_tmp);
+    nonconst_local_inds_host_view_type newIndices("newIndices",MaxNumEntries_tmp);
     
     if (!maintainSparsity_) {
 
@@ -322,7 +320,7 @@ void OverlappingPartitioner<GraphType>::computeOverlappingPartitions()
           const local_ordinal_type LRID = Parts_[part][i];
           
           size_t numIndices;
-          Graph_->getLocalRowCopy (LRID, Indices (), numIndices);
+          Graph_->getLocalRowCopy (LRID, Indices, numIndices);
 
           for (size_t j = 0; j < numIndices; ++j) {
             // use *local* indices only
@@ -366,12 +364,12 @@ void OverlappingPartitioner<GraphType>::computeOverlappingPartitions()
           const local_ordinal_type LRID = Parts_[part][i];
           
           size_t numIndices;
-          Graph_->getLocalRowCopy (LRID, Indices (), numIndices);
+          Graph_->getLocalRowCopy (LRID, Indices, numIndices);
           //JJH: the entries in Indices are already sorted.  However, the Tpetra documentation states
           //     that we can't count on this always being true, hence we sort.  Also note that there are
           //     unused entries at the end of Indices (it's sized to hold any row).  This means we can't
           //     just use Indices.end() in sorting and in std::includes
-          std::sort(Indices.begin(),Indices.begin()+numIndices);
+          Tpetra::sort(Indices,numIndices);
 
           for (size_t j = 0; j < numIndices; ++j) {
             // use *local* indices only
@@ -389,10 +387,12 @@ void OverlappingPartitioner<GraphType>::computeOverlappingPartitions()
               // Check if row associated with "col" increases connectivity already defined by row LRID's stencil.
               // If it does and maintainSparsity_ is true, do not add "col" to the current partition (block).
               size_t numNewIndices;
-              Graph_->getLocalRowCopy(col, newIndices(), numNewIndices);
-              std::sort(newIndices.begin(),newIndices.begin()+numNewIndices);
-              bool isSubset = std::includes(Indices.begin(),Indices.begin()+numIndices,
-                                   newIndices.begin(),newIndices.begin()+numNewIndices);
+              Graph_->getLocalRowCopy(col, newIndices, numNewIndices);
+              Tpetra::sort(newIndices,numNewIndices);
+              auto Indices_rcp = Kokkos::Compat::persistingView<nonconst_local_inds_host_view_type>(Indices, 0, numIndices);
+              auto newIndices_rcp = Kokkos::Compat::persistingView<nonconst_local_inds_host_view_type>(newIndices, 0, numNewIndices);
+              bool isSubset = std::includes(Indices_rcp.begin(),Indices_rcp.begin()+numIndices,
+                                   newIndices_rcp.begin(),newIndices_rcp.begin()+numNewIndices);
               if (isSubset) {
                 tmp[part].push_back (col);
               }
diff --git a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_decl.hpp b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_decl.hpp
index 35c28396cd9f..552a5967e96e 100644
--- a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_decl.hpp
@@ -65,6 +65,14 @@ class OverlappingRowMatrix :
   typedef typename MatrixType::global_ordinal_type global_ordinal_type;
   typedef typename MatrixType::node_type node_type;
   typedef typename Teuchos::ScalarTraits<scalar_type>::magnitudeType magnitude_type;
+  typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type;
+  typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type;
+  typedef typename MatrixType::values_host_view_type values_host_view_type;
+
+  typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type;
+
   using row_matrix_type = Tpetra::RowMatrix<scalar_type, local_ordinal_type,
 					    global_ordinal_type, node_type>;
 
@@ -207,12 +215,18 @@ class OverlappingRowMatrix :
     with row \c GlobalRow. If \c GlobalRow does not belong to this node, then \c Indices and \c Values are unchanged and \c NumIndices is
     returned as Teuchos::OrdinalTraits<size_t>::invalid().
   */
+  virtual void
+  getGlobalRowCopy (global_ordinal_type GlobalRow,
+                    nonconst_global_inds_host_view_type &Indices,
+                    nonconst_values_host_view_type &Values,
+                    size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void
   getGlobalRowCopy (global_ordinal_type GlobalRow,
                     const Teuchos::ArrayView<global_ordinal_type> &Indices,
                     const Teuchos::ArrayView<scalar_type> &Values,
                     size_t &NumEntries) const;
-
+#endif
   //! Extract a list of entries in a specified local row of the graph. Put into storage allocated by calling routine.
   /*!
     \param LocalRow - (In) Local row number for which indices are desired.
@@ -224,11 +238,18 @@ class OverlappingRowMatrix :
     with row \c LocalRow. If \c LocalRow is not valid for this node, then \c Indices and \c Values are unchanged and \c NumIndices is
     returned as Teuchos::OrdinalTraits<size_t>::invalid().
   */
+  virtual void
+  getLocalRowCopy (local_ordinal_type LocalRow,
+                   nonconst_local_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void
   getLocalRowCopy (local_ordinal_type LocalRow,
                    const Teuchos::ArrayView<local_ordinal_type> &Indices,
                    const Teuchos::ArrayView<scalar_type> &Values,
                    size_t &NumEntries) const;
+#endif
 
   //! Extract a const, non-persisting view of global indices in a specified row of the matrix.
   /*!
@@ -240,11 +261,16 @@ class OverlappingRowMatrix :
 
     Note: If \c GlobalRow does not belong to this node, then \c indices is set to null.
   */
+  virtual void
+  getGlobalRowView (global_ordinal_type GlobalRow,
+                    global_inds_host_view_type &indices,
+                    values_host_view_type &values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void
   getGlobalRowView (global_ordinal_type GlobalRow,
                     Teuchos::ArrayView<const global_ordinal_type> &indices,
                     Teuchos::ArrayView<const scalar_type> &values) const;
-
+#endif
   //! Extract a const, non-persisting view of local indices in a specified row of the matrix.
   /*!
     \param LocalRow - (In) Local row number for which indices are desired.
@@ -255,10 +281,16 @@ class OverlappingRowMatrix :
 
     Note: If \c LocalRow does not belong to this node, then \c indices is set to null.
   */
+ virtual void
+  getLocalRowView (local_ordinal_type LocalRow,
+                   local_inds_host_view_type & indices,
+                   values_host_view_type & values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void
   getLocalRowView (local_ordinal_type LocalRow,
                    Teuchos::ArrayView<const local_ordinal_type> &indices,
                    Teuchos::ArrayView<const scalar_type> &values) const;
+#endif
 
   //! \brief Get a copy of the diagonal entries owned by this node, with local row indices.
   /*! Returns a distributed Vector object partitioned according to this matrix's row map, containing the
@@ -368,11 +400,11 @@ class OverlappingRowMatrix :
 
   //! Graph of the matrix (as returned by getGraph()).
   Teuchos::RCP<const row_graph_type> graph_;
-
   //! Used in apply(), to avoid allocation each time.
-  mutable Teuchos::Array<local_ordinal_type> Indices_;
+  mutable nonconst_local_inds_host_view_type Indices_;
   //! Used in apply(), to avoid allocation each time.
-  mutable Teuchos::Array<scalar_type> Values_;
+  mutable nonconst_values_host_view_type Values_;
+
 
 }; // class OverlappingRowMatrix
 
diff --git a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp
index 8e220719cbd3..772825300b76 100644
--- a/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_OverlappingRowMatrix_def.hpp
@@ -223,8 +223,8 @@ OverlappingRowMatrix (const Teuchos::RCP<const row_matrix_type>& A,
   graph_ = Teuchos::rcp_const_cast<const row_graph_type>
     (Teuchos::rcp_implicit_cast<row_graph_type> (graph));
   // Resize temp arrays
-  Indices_.resize (MaxNumEntries_);
-  Values_.resize (MaxNumEntries_);
+  Kokkos::resize(Indices_,MaxNumEntries_);
+  Kokkos::resize(Values_,MaxNumEntries_);
 }
 
 
@@ -412,10 +412,10 @@ bool OverlappingRowMatrix<MatrixType>::isFillComplete() const
 template<class MatrixType>
 void
 OverlappingRowMatrix<MatrixType>::
-getGlobalRowCopy (global_ordinal_type GlobalRow,
-                  const Teuchos::ArrayView<global_ordinal_type> &Indices,
-                  const Teuchos::ArrayView<scalar_type>& Values,
-                  size_t& NumEntries) const
+ getGlobalRowCopy (global_ordinal_type GlobalRow,
+                   nonconst_global_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const
 {
   const local_ordinal_type LocalRow = RowMap_->getLocalElement (GlobalRow);
   if (LocalRow == Teuchos::OrdinalTraits<local_ordinal_type>::invalid ()) {
@@ -429,14 +429,27 @@ getGlobalRowCopy (global_ordinal_type GlobalRow,
   }
 }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+template<class MatrixType>
+void OverlappingRowMatrix<MatrixType>::
+getGlobalRowCopy (global_ordinal_type GlobalRow,
+                  const Teuchos::ArrayView<global_ordinal_type> &Indices,
+                  const Teuchos::ArrayView<scalar_type> &Values,
+                  size_t &NumEntries) const {
+  using IST = typename row_matrix_type::impl_scalar_type;
+  nonconst_global_inds_host_view_type ind_in(Indices.data(),Indices.size());
+  nonconst_values_host_view_type val_in(reinterpret_cast<IST*>(Values.data()),Values.size());
+  getGlobalRowCopy(GlobalRow,ind_in,val_in,NumEntries); 
+}
+#endif
 
 template<class MatrixType>
 void
 OverlappingRowMatrix<MatrixType>::
-getLocalRowCopy (local_ordinal_type LocalRow,
-                 const Teuchos::ArrayView<local_ordinal_type> &Indices,
-                 const Teuchos::ArrayView<scalar_type> &Values,
-                 size_t &NumEntries) const
+  getLocalRowCopy (local_ordinal_type LocalRow,
+                   nonconst_local_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const
 {
   using Teuchos::as;
   const size_t numMyRowsA = A_->getNodeNumRows ();
@@ -448,7 +461,42 @@ getLocalRowCopy (local_ordinal_type LocalRow,
   }
 }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+template<class MatrixType>
+void
+OverlappingRowMatrix<MatrixType>::
+getLocalRowCopy (local_ordinal_type LocalRow,
+                 const Teuchos::ArrayView<local_ordinal_type> &Indices,
+                 const Teuchos::ArrayView<scalar_type> &Values,
+             size_t &NumEntries) const
+{
+  using IST = typename row_matrix_type::impl_scalar_type;
+  nonconst_local_inds_host_view_type ind_in(Indices.data(),Indices.size());
+  nonconst_values_host_view_type val_in(reinterpret_cast<IST*>(Values.data()),Values.size());
+  getLocalRowCopy(LocalRow,ind_in,val_in,NumEntries);  
+}
+#endif
 
+template<class MatrixType>
+void
+OverlappingRowMatrix<MatrixType>::
+getGlobalRowView (global_ordinal_type GlobalRow,
+                  global_inds_host_view_type &indices,
+                  values_host_view_type &values) const {
+  const local_ordinal_type LocalRow = RowMap_->getLocalElement (GlobalRow);
+  if (LocalRow == Teuchos::OrdinalTraits<local_ordinal_type>::invalid())  {
+    indices = global_inds_host_view_type();
+    values = values_host_view_type();
+  } else {
+    if (Teuchos::as<size_t> (LocalRow) < A_->getNodeNumRows ()) {
+      A_->getGlobalRowView (GlobalRow, indices, values);
+    } else {
+      ExtMatrix_->getGlobalRowView (GlobalRow, indices, values);
+    }
+  }
+}
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void
 OverlappingRowMatrix<MatrixType>::
@@ -468,8 +516,26 @@ getGlobalRowView (global_ordinal_type GlobalRow,
     }
   }
 }
+#endif
+
+template<class MatrixType>
+void
+OverlappingRowMatrix<MatrixType>::
+  getLocalRowView (local_ordinal_type LocalRow,
+                   local_inds_host_view_type & indices,
+                   values_host_view_type & values) const {
+  using Teuchos::as;
+  const size_t numMyRowsA = A_->getNodeNumRows ();
+  if (as<size_t> (LocalRow) < numMyRowsA) {
+    A_->getLocalRowView (LocalRow, indices, values);
+  } else {
+    ExtMatrix_->getLocalRowView (LocalRow - as<local_ordinal_type> (numMyRowsA),
+                                 indices, values);
+  }
 
+}
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void
 OverlappingRowMatrix<MatrixType>::
@@ -486,6 +552,7 @@ getLocalRowView (local_ordinal_type LocalRow,
                                  indices, values);
   }
 }
+#endif
 
 
 template<class MatrixType>
@@ -776,8 +843,8 @@ void OverlappingRowMatrix<MatrixType>::describe(Teuchos::FancyOStream &out,
                   << std::setw(width) << nE;
               if (vl == VERB_EXTREME) {
                 if (isGloballyIndexed()) {
-                  ArrayView<const typename MatrixType::global_ordinal_type> rowinds;
-                  ArrayView<const typename MatrixType::scalar_type> rowvals;
+                  global_inds_host_view_type rowinds;
+                  values_host_view_type rowvals;
                   getGlobalRowView (gid, rowinds, rowvals);
                   for (size_t j = 0; j < nE; ++j) {
                     out << " (" << rowinds[j]
@@ -786,8 +853,8 @@ void OverlappingRowMatrix<MatrixType>::describe(Teuchos::FancyOStream &out,
                   }
                 }
                 else if (isLocallyIndexed()) {
-                  ArrayView<const typename MatrixType::local_ordinal_type> rowinds;
-                  ArrayView<const typename MatrixType::scalar_type> rowvals;
+                  local_inds_host_view_type rowinds;
+                  values_host_view_type rowvals;
                   getLocalRowView (r, rowinds, rowvals);
                   for (size_t j=0; j < nE; ++j) {
                     out << " (" << getColMap()->getGlobalElement(rowinds[j])
diff --git a/packages/ifpack2/src/Ifpack2_Parameters.cpp b/packages/ifpack2/src/Ifpack2_Parameters.cpp
index dcd15337c3dd..519d9db3a4c4 100644
--- a/packages/ifpack2/src/Ifpack2_Parameters.cpp
+++ b/packages/ifpack2/src/Ifpack2_Parameters.cpp
@@ -132,6 +132,7 @@ void getValidParameters(Teuchos::ParameterList& params)
   params.set("relaxation: banded container superdiagonals", -1);
   params.set("relaxation: banded container subdiagonals", -1);
   params.set("relaxation: mtgs cluster size", 1);
+  params.set("relaxation: long row threshold", 0);
 
   // Ifpack2_SPARSKIT.cpp
   // ap 25 May 2016: all SPARSKIT for backwards compatibility ONLY
diff --git a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp
index a7be84980ff3..7a067f21a39d 100644
--- a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp
@@ -294,17 +294,27 @@ class RILUK:
 
   template <class NewMatrixType> friend class RILUK;
 
+  // Host view types exported by crs_matrix_type for row indices and values.
+  using global_inds_host_view_type = typename crs_matrix_type::global_inds_host_view_type;
+  using local_inds_host_view_type = typename crs_matrix_type::local_inds_host_view_type;
+  using values_host_view_type = typename crs_matrix_type::values_host_view_type;
+
+  // Non-const counterparts (e.g. output buffers passed to getLocalRowCopy).
+  using nonconst_global_inds_host_view_type = typename crs_matrix_type::nonconst_global_inds_host_view_type;
+  using nonconst_local_inds_host_view_type = typename crs_matrix_type::nonconst_local_inds_host_view_type;
+  using nonconst_values_host_view_type = typename crs_matrix_type::nonconst_values_host_view_type;
+
   //@}
   //! \name Implementation of Kokkos Kernels ILU(k).
   //@{
 
-  typedef typename crs_matrix_type::local_matrix_type local_matrix_type;
-  typedef typename local_matrix_type::StaticCrsGraphType::row_map_type lno_row_view_t;
-  typedef typename local_matrix_type::StaticCrsGraphType::entries_type lno_nonzero_view_t;
-  typedef typename local_matrix_type::values_type scalar_nonzero_view_t;
-  typedef typename local_matrix_type::StaticCrsGraphType::device_type::memory_space TemporaryMemorySpace;
-  typedef typename local_matrix_type::StaticCrsGraphType::device_type::memory_space PersistentMemorySpace;
-  typedef typename local_matrix_type::StaticCrsGraphType::device_type::execution_space HandleExecSpace;
+  typedef typename crs_matrix_type::local_matrix_device_type local_matrix_device_type;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::row_map_type lno_row_view_t;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::entries_type lno_nonzero_view_t;
+  typedef typename local_matrix_device_type::values_type scalar_nonzero_view_t;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::device_type::memory_space TemporaryMemorySpace;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::device_type::memory_space PersistentMemorySpace;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::device_type::execution_space HandleExecSpace;
   typedef typename KokkosKernels::Experimental::KokkosKernelsHandle
     <typename lno_row_view_t::const_value_type, typename lno_nonzero_view_t::const_value_type, typename scalar_nonzero_view_t::value_type,
     HandleExecSpace, TemporaryMemorySpace,PersistentMemorySpace > kk_handle_type;
diff --git a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp
index c404695ea131..335513595984 100644
--- a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp
@@ -47,6 +47,8 @@
 #include "Ifpack2_LocalSparseTriangularSolver.hpp"
 #include "Ifpack2_Details_getParamTryingTypes.hpp"
 #include "Kokkos_Sort.hpp"
+#include "KokkosKernels_SparseUtils.hpp"
+#include "KokkosKernels_Sorting.hpp"
 
 namespace Ifpack2 {
 
@@ -522,14 +524,12 @@ void RILUK<MatrixType>::initialize ()
                                     A_local_->getColMap (),
                                     entriesPerRow()));
         // copy entries into A_local_crs
-        Teuchos::Array<local_ordinal_type> indices(A_local_->getNodeMaxNumRowEntries());
-        Teuchos::Array<scalar_type> values(A_local_->getNodeMaxNumRowEntries());
+        nonconst_local_inds_host_view_type indices("indices",A_local_->getNodeMaxNumRowEntries());
+        nonconst_values_host_view_type values("values",A_local_->getNodeMaxNumRowEntries());
         for(local_ordinal_type i = 0; i < numRows; i++) {
           size_t numEntries = 0;
-          A_local_->getLocalRowCopy(i, indices(), values(), numEntries);
-          ArrayView<const local_ordinal_type> indicesInsert(indices.data(), numEntries);
-          ArrayView<const scalar_type> valuesInsert(values.data(), numEntries);
-          A_local_crs_nc->insertLocalValues(i, indicesInsert, valuesInsert);
+          A_local_->getLocalRowCopy(i, indices, values, numEntries);
+          A_local_crs_nc->insertLocalValues(i, numEntries, reinterpret_cast<scalar_type*>(values.data()), indices.data());
         }
         A_local_crs_nc->fillComplete (A_local_->getDomainMap (), A_local_->getRangeMap ());
         A_local_crs = rcp_const_cast<const crs_matrix_type> (A_local_crs_nc);
@@ -604,10 +604,10 @@ initAllValues (const row_matrix_type& A)
 
   // Allocate temporary space for extracting the strictly
   // lower and upper parts of the matrix A.
-  Teuchos::Array<local_ordinal_type> InI(MaxNumEntries);
+  nonconst_local_inds_host_view_type InI("InI",MaxNumEntries);
   Teuchos::Array<local_ordinal_type> LI(MaxNumEntries);
   Teuchos::Array<local_ordinal_type> UI(MaxNumEntries);
-  Teuchos::Array<scalar_type> InV(MaxNumEntries);
+  nonconst_values_host_view_type InV("InV",MaxNumEntries);
   Teuchos::Array<scalar_type> LV(MaxNumEntries);
   Teuchos::Array<scalar_type> UV(MaxNumEntries);
 
@@ -640,7 +640,7 @@ initAllValues (const row_matrix_type& A)
 
     //TODO JJH 4April2014 An optimization is to use getLocalRowView.  Not all matrices support this,
     //                    we'd need to check via the Tpetra::RowMatrix method supportsRowViews().
-    A.getLocalRowCopy (local_row, InI(), InV(), NumIn); // Get Values and Indices
+    A.getLocalRowCopy (local_row, InI, InV, NumIn); // Get Values and Indices
 
     // Split into L and U (we don't assume that indices are ordered).
 
@@ -777,25 +777,30 @@ void RILUK<MatrixType>::compute ()
 
     // Need some integer workspace and pointers
     size_t NumUU;
-    Teuchos::ArrayView<const local_ordinal_type> UUI;
-    Teuchos::ArrayView<const scalar_type> UUV;
+    local_inds_host_view_type UUI;
+    values_host_view_type UUV;
     for (size_t j = 0; j < num_cols; ++j) {
       colflag[j] = -1;
     }
-
+    using IST = typename row_matrix_type::impl_scalar_type;
     for (size_t i = 0; i < L_->getNodeNumRows (); ++i) {
       local_ordinal_type local_row = i;
 
       // Fill InV, InI with current row of L, D and U combined
 
       NumIn = MaxNumEntries;
-      L_->getLocalRowCopy (local_row, InI (), InV (), NumL);
+      nonconst_local_inds_host_view_type InI_v(InI.data(),MaxNumEntries);
+      nonconst_values_host_view_type     InV_v(reinterpret_cast<IST*>(InV.data()),MaxNumEntries);
+
+      L_->getLocalRowCopy (local_row, InI_v , InV_v, NumL);
 
       InV[NumL] = DV(i); // Put in diagonal
       InI[NumL] = local_row;
 
-      U_->getLocalRowCopy (local_row, InI (NumL+1, MaxNumEntries-NumL-1),
-                           InV (NumL+1, MaxNumEntries-NumL-1), NumU);
+      nonconst_local_inds_host_view_type InI_sub(InI.data()+NumL+1,MaxNumEntries-NumL-1);
+      nonconst_values_host_view_type     InV_sub(reinterpret_cast<IST*>(InV.data())+NumL+1,MaxNumEntries-NumL-1);
+  
+      U_->getLocalRowCopy (local_row, InI_sub,InV_sub, NumU);
       NumIn = NumL+NumU+1;
 
       // Set column flags
@@ -807,7 +812,7 @@ void RILUK<MatrixType>::compute ()
 
       for (size_t jj = 0; jj < NumL; ++jj) {
         local_ordinal_type j = InI[jj];
-        scalar_type multiplier = InV[jj]; // current_mults++;
+        IST multiplier = InV[jj]; // current_mults++;
         
         InV[jj] *= static_cast<scalar_type>(DV(j));
         
@@ -821,9 +826,10 @@ void RILUK<MatrixType>::compute ()
             // colflag above using size_t (which is generally unsigned),
             // but now we're querying it using int (which is signed).
             if (kk > -1) {
-              InV[kk] -= multiplier * UUV[k];
+              InV[kk] -= static_cast<scalar_type>(multiplier * UUV[k]);
             }
           }
+
         }
         else {
           for (size_t k = 0; k < NumUU; ++k) {
@@ -832,14 +838,15 @@ void RILUK<MatrixType>::compute ()
             // but now we're querying it using int (which is signed).
             const int kk = colflag[UUI[k]];
             if (kk > -1) {
-              InV[kk] -= multiplier*UUV[k];
+              InV[kk] -= static_cast<scalar_type>(multiplier*UUV[k]);
             }
             else {
-              diagmod -= multiplier*UUV[k];
+              diagmod -= static_cast<scalar_type>(multiplier*UUV[k]);
             }
           }
         }
       }
+
       if (NumL) {
         // Replace current row of L
         L_->replaceLocalValues (local_row, InI (0, NumL), InV (0, NumL));
@@ -868,7 +875,7 @@ void RILUK<MatrixType>::compute ()
       }
 
       if (NumU) {
-        // Replace current row of L and U
+        // Replace current row of U
         U_->replaceLocalValues (local_row, InI (NumL+1, NumU), InV (NumL+1, NumU));
       }
 
@@ -909,20 +916,18 @@ void RILUK<MatrixType>::compute ()
                                     A_local_->getColMap (),
                                     entriesPerRow()));
         // copy entries into A_local_crs
-        Teuchos::Array<local_ordinal_type> indices(A_local_->getNodeMaxNumRowEntries());
-        Teuchos::Array<scalar_type> values(A_local_->getNodeMaxNumRowEntries());
+        nonconst_local_inds_host_view_type indices("indices",A_local_->getNodeMaxNumRowEntries());
+        nonconst_values_host_view_type values("values",A_local_->getNodeMaxNumRowEntries());
         for(local_ordinal_type i = 0; i < numRows; i++) {
           size_t numEntries = 0;
-          A_local_->getLocalRowCopy(i, indices(), values(), numEntries);
-          ArrayView<const local_ordinal_type> indicesInsert(indices.data(), numEntries);
-          ArrayView<const scalar_type> valuesInsert(values.data(), numEntries);
-          A_local_crs_nc->insertLocalValues(i, indicesInsert, valuesInsert);
+          A_local_->getLocalRowCopy(i, indices, values, numEntries);
+          A_local_crs_nc->insertLocalValues(i, numEntries, reinterpret_cast<scalar_type*>(values.data()),indices.data());
         }
         A_local_crs_nc->fillComplete (A_local_->getDomainMap (), A_local_->getRangeMap ());
         A_local_crs = rcp_const_cast<const crs_matrix_type> (A_local_crs_nc);
       }
-      A_local_rowmap_  = A_local_crs->getLocalMatrix().graph.row_map;
-      A_local_entries_ = A_local_crs->getLocalMatrix().graph.entries;
+      A_local_rowmap_  = A_local_crs->getLocalMatrixDevice().graph.row_map;
+      A_local_entries_ = A_local_crs->getLocalMatrixDevice().graph.entries;
       A_local_values_  = A_local_crs->getLocalValuesView();
     }
 
@@ -934,13 +939,15 @@ void RILUK<MatrixType>::compute ()
       U_->setAllToScalar (STS::zero ());
     }
     
-    auto L_rowmap  = L_->getLocalMatrix().graph.row_map;
-    auto L_entries = L_->getLocalMatrix().graph.entries;
+    using row_map_type = typename crs_matrix_type::local_matrix_device_type::row_map_type;
+
+    row_map_type L_rowmap  = L_->getLocalMatrixDevice().graph.row_map;
+    auto L_entries = L_->getLocalMatrixDevice().graph.entries;
     auto L_values  = L_->getLocalValuesView();
-    auto U_rowmap  = U_->getLocalMatrix().graph.row_map;
-    auto U_entries = U_->getLocalMatrix().graph.entries;
+    row_map_type U_rowmap  = U_->getLocalMatrixDevice().graph.row_map;
+    auto U_entries = U_->getLocalMatrixDevice().graph.entries;
     auto U_values  = U_->getLocalValuesView();
-    
+
     KokkosSparse::Experimental::spiluk_numeric( KernelHandle_.getRawPtr(), LevelOfFill_, 
                                                 A_local_rowmap_, A_local_entries_, A_local_values_, 
                                                 L_rowmap, L_entries, L_values, U_rowmap, U_entries, U_values );
diff --git a/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp b/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp
index 05be9ed133ad..5be4526df414 100644
--- a/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp
@@ -208,7 +208,7 @@ option.  See the documentation of setParameters() for details.
 Gauss-Seidel / SOR also comes in a symmetric version.  This method
 first does a Forward sweep, then a Backward sweep.  Only the symmetric
 version of this preconditioner is guaranteed to be symmetric (or Hermitian,
-if the matrix's data are complex).
+if the matrix data are complex).
 
 Users may set the relaxation method via the "relaxation: type"
 parameter.  For all relaxation methods, users may specify the number
@@ -617,20 +617,22 @@ class Relaxation :
   typedef Tpetra::Map<local_ordinal_type, global_ordinal_type, node_type> map_type;
   typedef Tpetra::Import<local_ordinal_type, global_ordinal_type, node_type> import_type;
 
-  Teuchos::RCP<Ifpack2::Details::InverseDiagonalKernel<op_type> > invDiagKernel_;
+  typedef typename crs_matrix_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename crs_matrix_type::nonconst_values_host_view_type nonconst_values_host_view_type;
 
+  Teuchos::RCP<Ifpack2::Details::InverseDiagonalKernel<op_type> > invDiagKernel_;
 
   //@}
   //! \name Implementation of multithreaded Gauss-Seidel.
   //@{
 
-  typedef typename crs_matrix_type::local_matrix_type local_matrix_type;
-  typedef typename local_matrix_type::StaticCrsGraphType::row_map_type lno_row_view_t;
-  typedef typename local_matrix_type::StaticCrsGraphType::entries_type lno_nonzero_view_t;
-  typedef typename local_matrix_type::values_type scalar_nonzero_view_t;
-  typedef typename local_matrix_type::StaticCrsGraphType::device_type TemporaryWorkSpace;
-  typedef typename local_matrix_type::StaticCrsGraphType::device_type PersistentWorkSpace;
-  typedef typename local_matrix_type::StaticCrsGraphType::execution_space MyExecSpace;
+  typedef typename crs_matrix_type::local_matrix_device_type local_matrix_device_type;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::row_map_type lno_row_view_t;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::entries_type lno_nonzero_view_t;
+  typedef typename local_matrix_device_type::values_type scalar_nonzero_view_t;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::device_type TemporaryWorkSpace;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::device_type PersistentWorkSpace;
+  typedef typename local_matrix_device_type::StaticCrsGraphType::execution_space MyExecSpace;
   typedef typename KokkosKernels::Experimental::KokkosKernelsHandle
       <typename lno_row_view_t::const_value_type, local_ordinal_type,typename scalar_nonzero_view_t::value_type,
       MyExecSpace, TemporaryWorkSpace,PersistentWorkSpace > mt_kernel_handle_type;
@@ -797,6 +799,8 @@ class Relaxation :
   bool checkDiagEntries_ = false;
   //! For MTSGS, the cluster size (use point coloring if equal to 1)
   int clusterSize_ = 1;
+  //! For MTSGS, the threshold for long/bulk rows (rows with at least this many nonzeros)
+  int longRowThreshold_ = 0;
 
   //! Number of outer-sweeps for the two-stage Gauss Seidel
   int NumOuterSweeps_ = 1;
diff --git a/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp b/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp
index 1eda34ff938a..e72195b98600 100644
--- a/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp
@@ -327,6 +327,9 @@ Relaxation<MatrixType>::getValidParameters () const
     const int cluster_size = 1;
     pl->set("relaxation: mtgs cluster size", cluster_size);
 
+    const int long_row_threshold = 0;
+    pl->set("relaxation: long row threshold", long_row_threshold);
+
     validParams_ = rcp_const_cast<const ParameterList> (pl);
   }
   return validParams_;
@@ -367,6 +370,9 @@ void Relaxation<MatrixType>::setParametersImpl (Teuchos::ParameterList& pl)
   int cluster_size = 1;
   if(pl.isParameter ("relaxation: mtgs cluster size")) //optional parameter
     cluster_size = pl.get<int> ("relaxation: mtgs cluster size");
+  int long_row_threshold = 0;
+  if(pl.isParameter ("relaxation: long row threshold")) //optional parameter
+    long_row_threshold = pl.get<int> ("relaxation: long row threshold");
 
   Teuchos::ArrayRCP<local_ordinal_type> localSmoothingIndices = pl.get<Teuchos::ArrayRCP<local_ordinal_type> >("relaxation: local smoothing indices");
 
@@ -378,6 +384,18 @@ void Relaxation<MatrixType>::setParametersImpl (Teuchos::ParameterList& pl)
     pl.remove("relaxation: inner damping factor");
     pl.set("relaxation: inner damping factor",df);
   }
+  //If long row algorithm was requested, make sure non-cluster (point) multicolor Gauss-Seidel (aka MTGS/MTSGS) will be used.
+  if (long_row_threshold > 0) {
+    TEUCHOS_TEST_FOR_EXCEPTION(
+        cluster_size != 1, std::invalid_argument, "Ifpack2::Relaxation: "
+        "Requested long row MTGS/MTSGS algorithm and cluster GS/SGS, but those are not compatible.");
+    TEUCHOS_TEST_FOR_EXCEPTION(
+        precType != Details::RelaxationType::MTGS && precType != Details::RelaxationType::MTSGS,
+        std::invalid_argument, "Ifpack2::Relaxation: "
+        "Requested long row MTGS/MTSGS algorithm, but this is only compatible with preconditioner types "
+        "'MT Gauss-Seidel' and 'MT Symmetric Gauss-Seidel'.");
+  }
+
   const ST innerDampingFactor = pl.get<ST> ("relaxation: inner damping factor");
   const int numInnerSweeps = pl.get<int> ("relaxation: inner sweeps");
   const int numOuterSweeps = pl.get<int> ("relaxation: outer sweeps");
@@ -396,6 +414,7 @@ void Relaxation<MatrixType>::setParametersImpl (Teuchos::ParameterList& pl)
   fixTinyDiagEntries_    = fixTinyDiagEntries;
   checkDiagEntries_      = checkDiagEntries;
   clusterSize_           = cluster_size;
+  longRowThreshold_      = long_row_threshold;
   is_matrix_structurally_symmetric_ = is_matrix_structurally_symmetric;
   ifpack2_dump_matrix_ = ifpack2_dump_matrix;
   localSmoothingIndices_ = localSmoothingIndices;
@@ -726,12 +745,14 @@ void Relaxation<MatrixType>::initialize ()
       if (mtKernelHandle_->get_gs_handle () == nullptr) {
         if (PrecType_ == Details::GS2 || PrecType_ == Details::SGS2)
           mtKernelHandle_->create_gs_handle (KokkosSparse::GS_TWOSTAGE);
-        else if(this->clusterSize_ == 1)
+        else if(this->clusterSize_ == 1) {
           mtKernelHandle_->create_gs_handle ();
+          mtKernelHandle_->get_point_gs_handle()->set_long_row_threshold(longRowThreshold_);
+        }
         else
           mtKernelHandle_->create_gs_handle (KokkosSparse::CLUSTER_DEFAULT, this->clusterSize_);
       }
-      local_matrix_type kcsr = crsMat->getLocalMatrix ();
+      local_matrix_device_type kcsr = crsMat->getLocalMatrixDevice ();
       if (PrecType_ == Details::GS2 || PrecType_ == Details::SGS2) {
         // set parameters for two-stage GS
         mtKernelHandle_->set_gs_set_num_inner_sweeps (NumInnerSweeps_);
@@ -895,13 +916,14 @@ void Relaxation<MatrixType>::computeBlockCrs ()
     if (DoL1Method_ && IsParallel_) {
       const scalar_type two = one + one;
       const size_t maxLength = A_->getNodeMaxNumRowEntries ();
-      Array<LO> indices (maxLength);
-      Array<scalar_type> values (maxLength * blockSize * blockSize);
+      nonconst_local_inds_host_view_type indices ("indices",maxLength);
+      nonconst_values_host_view_type values_ ("values",maxLength * blockSize * blockSize);
       size_t numEntries = 0;
 
       for (LO i = 0; i < lclNumMeshRows; ++i) {
         // FIXME (mfh 16 Dec 2015) Get views instead of copies.
-        blockCrsA->getLocalRowCopy (i, indices (), values (), numEntries);
+        blockCrsA->getLocalRowCopy (i, indices, values_, numEntries);
+        scalar_type * values = reinterpret_cast<scalar_type*>(values_.data());
 
         auto diagBlock = Kokkos::subview (blockDiag, i, ALL (), ALL ());
         for (LO subRow = 0; subRow < blockSize; ++subRow) {
@@ -1226,12 +1248,12 @@ void Relaxation<MatrixType>::compute ()
         auto diag = Diagonal->getLocalViewHost(Tpetra::Access::ReadWrite);
         const magnitude_type two = STM::one () + STM::one ();
         const size_t maxLength = A_row.getNodeMaxNumRowEntries ();
-        Array<local_ordinal_type> indices (maxLength);
-        Array<scalar_type> values (maxLength);
+        nonconst_local_inds_host_view_type indices("indices",maxLength);
+        nonconst_values_host_view_type values("values",maxLength);
         size_t numEntries;
 
         for (LO i = 0; i < numMyRows; ++i) {
-          A_row.getLocalRowCopy (i, indices (), values (), numEntries);
+          A_row.getLocalRowCopy (i, indices, values, numEntries);
           magnitude_type diagonal_boost = STM::zero ();
           for (size_t k = 0 ; k < numEntries; ++k) {
             if (indices[k] >= numMyRows) {
@@ -1304,7 +1326,7 @@ void Relaxation<MatrixType>::compute ()
         (crsMat == nullptr, std::logic_error, methodName << ": "
          "Multithreaded Gauss-Seidel methods currently only work "
          "when the input matrix is a Tpetra::CrsMatrix.");
-      local_matrix_type kcsr = crsMat->getLocalMatrix ();
+      local_matrix_device_type kcsr = crsMat->getLocalMatrixDevice ();
 
       //TODO BMK: This should be ReadOnly, and KokkosKernels should accept a
       //const-valued view for user-provided D^-1. OK for now, Diagonal_ is nonconst.
@@ -2080,7 +2102,7 @@ ApplyInverseMTGS_CrsMatrix(
       */
   }
 
-  local_matrix_type kcsr = crsMat->getLocalMatrix ();
+  local_matrix_device_type kcsr = crsMat->getLocalMatrixDevice ();
 
   bool update_y_vector = true;
   //false as it was done up already, and we dont want to zero it in each sweep.
diff --git a/packages/ifpack2/src/Ifpack2_ReorderFilter_decl.hpp b/packages/ifpack2/src/Ifpack2_ReorderFilter_decl.hpp
index 33a928a8990e..53d039b10181 100644
--- a/packages/ifpack2/src/Ifpack2_ReorderFilter_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_ReorderFilter_decl.hpp
@@ -73,6 +73,14 @@ class ReorderFilter :
   typedef typename MatrixType::local_ordinal_type local_ordinal_type;
   typedef typename MatrixType::global_ordinal_type global_ordinal_type;
   typedef typename MatrixType::node_type node_type;
+  typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type;
+  typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type;
+  typedef typename MatrixType::values_host_view_type values_host_view_type;
+
+  typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type;
+
   typedef typename Teuchos::ScalarTraits<scalar_type>::magnitudeType magnitude_type;
   typedef Tpetra::RowMatrix<scalar_type,
                             local_ordinal_type,
@@ -209,11 +217,17 @@ class ReorderFilter :
     with row \c GlobalRow. If \c GlobalRow does not belong to this node, then \c Indices and \c Values are unchanged and \c NumIndices is
     returned as Teuchos::OrdinalTraits<size_t>::invalid().
   */
+  virtual void
+  getGlobalRowCopy (global_ordinal_type GlobalRow,
+                   nonconst_global_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getGlobalRowCopy(global_ordinal_type GlobalRow,
                                 const Teuchos::ArrayView<global_ordinal_type> &Indices,
                                 const Teuchos::ArrayView<scalar_type> &Values,
                                 size_t &NumEntries) const;
-
+#endif
   //! Extract a list of entries in a specified local row of the graph. Put into storage allocated by calling routine.
   /*!
     \param LocalRow   - (In) Local row number for which indices are desired.
@@ -225,10 +239,17 @@ class ReorderFilter :
     with row \c LocalRow. If \c LocalRow is not valid for this node, then \c Indices and \c Values are unchanged and \c NumIndices is
     returned as Teuchos::OrdinalTraits<size_t>::invalid().
   */
+  virtual void
+  getLocalRowCopy (local_ordinal_type LocalRow,
+                   nonconst_local_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getLocalRowCopy(local_ordinal_type DropRow,
                                const Teuchos::ArrayView<local_ordinal_type> &Indices,
                                const Teuchos::ArrayView<scalar_type> &Values,
                                size_t &NumEntries) const ;
+#endif
 
   //! Extract a const, non-persisting view of global indices in a specified row of the matrix.
   /*!
@@ -239,10 +260,15 @@ class ReorderFilter :
     \pre <tt>isLocallyIndexed() == false</tt>
     Note: If \c GlobalRow does not belong to this node, then \c indices is set to null.
   */
+  virtual void
+  getGlobalRowView (global_ordinal_type GlobalRow,
+                    global_inds_host_view_type &indices,
+                    values_host_view_type &values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getGlobalRowView(global_ordinal_type GlobalRow,
                                 Teuchos::ArrayView<const global_ordinal_type> &indices,
                                 Teuchos::ArrayView<const scalar_type> &values) const;
-
+#endif
   //! Extract a const, non-persisting view of local indices in a specified row of the matrix.
   /*!
     \param LocalRow - (In) Local row number for which indices are desired.
@@ -253,10 +279,16 @@ class ReorderFilter :
 
     Note: If \c LocalRow does not belong to this node, then \c indices is set to null.
   */
+  virtual void
+  getLocalRowView (local_ordinal_type LocalRow,
+                   local_inds_host_view_type & indices,
+                   values_host_view_type & values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+
   virtual void getLocalRowView(local_ordinal_type LocalRow,
                                Teuchos::ArrayView<const local_ordinal_type> &indices,
                                Teuchos::ArrayView<const scalar_type> &values) const;
-
+#endif
   //! \brief Get a copy of the diagonal entries owned by this node, with local row indices.
   /*! Returns a distributed Vector object partitioned according to this matrix's row map, containing the
     the zero and non-zero diagonals owned by this node. */
@@ -352,11 +384,10 @@ class ReorderFilter :
   //! Permutation: Reordered to original
   Teuchos::ArrayRCP<local_ordinal_type> reverseperm_;
 
-  //! Used in apply, to avoid allocation each time.
-  mutable Teuchos::Array<local_ordinal_type> Indices_;
-  //! Used in apply, to avoid allocation each time.
-  mutable Teuchos::Array<scalar_type> Values_;
-
+  //! Scratch buffer of local indices, reused (e.g. by getGlobalRowCopy) to avoid allocation each time.
+  mutable nonconst_local_inds_host_view_type Indices_;
+  //! Scratch buffer of values (sized in the constructor as a temp array for apply), to avoid allocation each time.
+  mutable nonconst_values_host_view_type Values_;
 };// class ReorderFilter
 
 }// namespace Ifpack2
diff --git a/packages/ifpack2/src/Ifpack2_ReorderFilter_def.hpp b/packages/ifpack2/src/Ifpack2_ReorderFilter_def.hpp
index 1d3671d32f3a..0a216dd96103 100644
--- a/packages/ifpack2/src/Ifpack2_ReorderFilter_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_ReorderFilter_def.hpp
@@ -79,8 +79,8 @@ ReorderFilter (const Teuchos::RCP<const row_matrix_type>& A,
     "Ifpack2::ReorderFilter: The input matrix is not square.");
 
   // Temp arrays for apply
-  Indices_.resize (A_->getNodeMaxNumRowEntries ());
-  Values_.resize (A_->getNodeMaxNumRowEntries ());
+  Kokkos::resize(Indices_,A_->getNodeMaxNumRowEntries ());
+  Kokkos::resize(Values_,A_->getNodeMaxNumRowEntries ());
 }
 
 
@@ -286,10 +286,10 @@ bool ReorderFilter<MatrixType>::isFillComplete() const
 
 template<class MatrixType>
 void ReorderFilter<MatrixType>::
-getGlobalRowCopy (global_ordinal_type globalRow,
-                  const Teuchos::ArrayView<global_ordinal_type>& globalInd,
-                  const Teuchos::ArrayView<scalar_type>& val,
-                  size_t& numEntries) const
+ getGlobalRowCopy (global_ordinal_type globalRow,
+                   nonconst_global_inds_host_view_type &globalInd,
+                   nonconst_values_host_view_type &val,
+                   size_t& numEntries) const
 {
   using Teuchos::Array;
   using Teuchos::ArrayView;
@@ -306,37 +306,38 @@ getGlobalRowCopy (global_ordinal_type globalRow,
     << " is not owned by the calling process with rank "
     << rowMap.getComm ()->getRank () << ".");
 
-  if (sizeof (GO) == sizeof (LO)) {
-    // This means we can convert local to global in place.
-    ArrayView<LO> localInd = av_reinterpret_cast<LO> (globalInd);
-    this->getLocalRowCopy (localRow, localInd, val, numEntries);
+  // The Indices_ temp array is only used in apply, not getLocalRowCopy, so this is safe
+  numEntries = this->getNumEntriesInLocalRow (localRow);
+  this->getLocalRowCopy (localRow, Indices_, val, numEntries);
 
-    // Convert local indices back to global indices.
-    for (size_t k = 0; k < numEntries; ++k) {
-      globalInd[k] = rowMap.getGlobalElement (localInd[k]);
-    }
-  }
-  else {
-    // LO and GO have different sizes, so we need a temp array
-    // for converting local to global.
-    numEntries = this->getNumEntriesInLocalRow (localRow);
-    Array<LO> localInd (numEntries);
-    this->getLocalRowCopy (localRow, localInd, val, numEntries);
-
-    // Convert local indices back to global indices.
-    for (size_t k = 0; k < numEntries; ++k) {
-      globalInd[k] = rowMap.getGlobalElement (localInd[k]);
-    }
+  // Convert local indices back to global indices.
+  for (size_t k = 0; k < numEntries; ++k) {
+    globalInd[k] = rowMap.getGlobalElement (Indices_[k]);
   }
 }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+// Deprecated ArrayView overload: wraps the caller's arrays in host views and forwards.
+template<class MatrixType>
+void ReorderFilter<MatrixType>::getGlobalRowCopy (global_ordinal_type globalRow,
+    const Teuchos::ArrayView<global_ordinal_type>& Indices,
+    const Teuchos::ArrayView<scalar_type>& Values,
+    size_t& numEntries) const
+{
+  using impl_scalar_t = typename row_matrix_type::impl_scalar_type;
+  nonconst_global_inds_host_view_type indsView (Indices.data (), Indices.size ());
+  nonconst_values_host_view_type valsView (reinterpret_cast<impl_scalar_t*> (Values.data ()), Values.size ());
+  getGlobalRowCopy (globalRow, indsView, valsView, numEntries);
+}
+#endif
 
 template<class MatrixType>
 void ReorderFilter<MatrixType>::
 getLocalRowCopy (local_ordinal_type LocalRow,
-                 const Teuchos::ArrayView<local_ordinal_type> &Indices,
-                 const Teuchos::ArrayView<scalar_type> &Values,
-                 size_t &NumEntries) const
+    nonconst_local_inds_host_view_type &Indices,
+    nonconst_values_host_view_type &Values,
+    size_t& NumEntries) const
+
 {
   TEUCHOS_TEST_FOR_EXCEPTION(
     ! A_->getRowMap ()->isNodeLocalElement (LocalRow),
@@ -370,7 +371,29 @@ getLocalRowCopy (local_ordinal_type LocalRow,
   }
 }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+// Deprecated ArrayView overload: wraps the buffers in host views and forwards.
+template<class MatrixType>
+void ReorderFilter<MatrixType>::getLocalRowCopy (local_ordinal_type LocalRow,
+    const Teuchos::ArrayView<local_ordinal_type>& Indices,
+    const Teuchos::ArrayView<scalar_type>& Values, size_t& NumEntries) const
+{
+  using impl_scalar_t = typename row_matrix_type::impl_scalar_type;
+  nonconst_local_inds_host_view_type indsView (Indices.data (), Indices.size ());
+  nonconst_values_host_view_type valsView (reinterpret_cast<impl_scalar_t*> (Values.data ()), Values.size ());
+  getLocalRowCopy (LocalRow, indsView, valsView, NumEntries);
+}
+#endif
+
+template<class MatrixType>
+void ReorderFilter<MatrixType>::getGlobalRowView(global_ordinal_type /* GlobalRow */,
+                                                  global_inds_host_view_type &/*indices*/,
+                                                  values_host_view_type &/*values*/) const
+{
+  throw std::runtime_error("Ifpack2::ReorderFilter: does not support getGlobalRowView.");
+}
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void ReorderFilter<MatrixType>::
 getGlobalRowView (global_ordinal_type /* GlobalRow */,
@@ -379,8 +402,18 @@ getGlobalRowView (global_ordinal_type /* GlobalRow */,
 {
   throw std::runtime_error("Ifpack2::ReorderFilter: does not support getGlobalRowView.");
 }
+#endif
 
 
+template<class MatrixType>
+void ReorderFilter<MatrixType>::getLocalRowView(local_ordinal_type /* LocalRow */,
+    local_inds_host_view_type & /*indices*/,
+    values_host_view_type & /*values*/) const
+{
+  throw std::runtime_error("Ifpack2::ReorderFilter: does not support getLocalRowView.");
+}
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void ReorderFilter<MatrixType>::
 getLocalRowView (local_ordinal_type /* LocalRow */,
@@ -389,6 +422,7 @@ getLocalRowView (local_ordinal_type /* LocalRow */,
 {
   throw std::runtime_error("Ifpack2::ReorderFilter: does not support getLocalRowView.");
 }
+#endif
 
 
 template<class MatrixType>
@@ -445,25 +479,26 @@ apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_t
   for (size_t i = 0; i < A_->getNodeNumRows (); ++i) {
     size_t Nnz;
     // Use this class's getrow to make the below code simpler
-    getLocalRowCopy (i, Indices_ (), Values_ (), Nnz);
+    getLocalRowCopy (i, Indices_ , Values_ , Nnz);
+    scalar_type* Values = reinterpret_cast<scalar_type*>(Values_.data());
     if (mode == Teuchos::NO_TRANS) {
       for (size_t j = 0; j < Nnz; ++j) {
         for (size_t k = 0; k < NumVectors; ++k) {
-          y_ptr[k][i] += Values_[j] * x_ptr[k][Indices_[j]];
+          y_ptr[k][i] += Values[j] * x_ptr[k][Indices_[j]];
         }
       }
     }
     else if (mode == Teuchos::TRANS) {
       for (size_t j = 0; j < Nnz; ++j) {
         for (size_t k = 0; k < NumVectors; ++k) {
-          y_ptr[k][Indices_[j]] += Values_[j] * x_ptr[k][i];
+          y_ptr[k][Indices_[j]] += Values[j] * x_ptr[k][i];
         }
       }
     }
     else { //mode==Teuchos::CONJ_TRANS
       for (size_t j = 0; j < Nnz; ++j) {
         for (size_t k = 0; k < NumVectors; ++k) {
-          y_ptr[k][Indices_[j]] += STS::conjugate(Values_[j]) * x_ptr[k][i];
+          y_ptr[k][Indices_[j]] += STS::conjugate(Values[j]) * x_ptr[k][i];
         }
       }
     }
diff --git a/packages/ifpack2/src/Ifpack2_SingletonFilter_decl.hpp b/packages/ifpack2/src/Ifpack2_SingletonFilter_decl.hpp
index d454ab06aa8c..d540bfdccad7 100644
--- a/packages/ifpack2/src/Ifpack2_SingletonFilter_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_SingletonFilter_decl.hpp
@@ -68,6 +68,14 @@ class SingletonFilter :
   typedef typename MatrixType::local_ordinal_type LocalOrdinal;
   typedef typename MatrixType::global_ordinal_type GlobalOrdinal;
   typedef typename MatrixType::node_type Node;
+  typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type;
+  typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type;
+  typedef typename MatrixType::values_host_view_type values_host_view_type;
+
+  typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type;
+
   typedef typename Teuchos::ScalarTraits<Scalar>::magnitudeType magnitudeType;
   typedef Tpetra::RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> row_matrix_type;
   typedef typename row_matrix_type::mag_type mag_type;
@@ -173,11 +181,17 @@ class SingletonFilter :
     with row \c GlobalRow. If \c GlobalRow does not belong to this node, then \c Indices and \c Values are unchanged and \c NumIndices is
     returned as Teuchos::OrdinalTraits<size_t>::invalid().
   */
+  virtual void
+  getGlobalRowCopy (GlobalOrdinal GlobalRow,
+                   nonconst_global_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow,
                                 const Teuchos::ArrayView<GlobalOrdinal> &Indices,
                                 const Teuchos::ArrayView<Scalar> &Values,
                                 size_t &NumEntries) const;
-
+#endif
   //! Extract a list of entries in a specified local row of the graph. Put into storage allocated by calling routine.
   /*!
     \param LocalRow - (In) Local row number for which indices are desired.
@@ -189,11 +203,17 @@ class SingletonFilter :
     with row \c LocalRow. If \c LocalRow is not valid for this node, then \c Indices and \c Values are unchanged and \c NumIndices is
     returned as Teuchos::OrdinalTraits<size_t>::invalid().
   */
+  virtual void
+  getLocalRowCopy (LocalOrdinal LocalRow,
+                   nonconst_local_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getLocalRowCopy(LocalOrdinal LocalRow,
                                const Teuchos::ArrayView<LocalOrdinal> &Indices,
                                const Teuchos::ArrayView<Scalar> &Values,
                                size_t &NumEntries) const ;
-
+#endif
   //! Extract a const, non-persisting view of global indices in a specified row of the matrix.
   /*!
     \param GlobalRow - (In) Global row number for which indices are desired.
@@ -204,10 +224,15 @@ class SingletonFilter :
 
     Note: If \c GlobalRow does not belong to this node, then \c indices is set to null.
   */
+  virtual void
+  getGlobalRowView (GlobalOrdinal GlobalRow,
+                    global_inds_host_view_type &indices,
+                    values_host_view_type &values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getGlobalRowView(GlobalOrdinal GlobalRow,
                                 Teuchos::ArrayView<const GlobalOrdinal> &indices,
                                 Teuchos::ArrayView<const Scalar> &values) const;
-
+#endif
   //! Extract a const, non-persisting view of local indices in a specified row of the matrix.
   /*!
     \param LocalRow - (In) Local row number for which indices are desired.
@@ -218,10 +243,15 @@ class SingletonFilter :
 
     Note: If \c LocalRow does not belong to this node, then \c indices is set to null.
   */
+  virtual void
+  getLocalRowView (LocalOrdinal LocalRow,
+                   local_inds_host_view_type & indices,
+                   values_host_view_type & values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getLocalRowView(LocalOrdinal LocalRow,
                                Teuchos::ArrayView<const LocalOrdinal> &indices,
                                Teuchos::ArrayView<const Scalar> &values) const;
-
+#endif
   //! \brief Get a copy of the diagonal entries owned by this node, with local row indices.
   /*! Returns a distributed Vector object partitioned according to this matrix's row map, containing the
     the zero and non-zero diagonals owned by this node. */
@@ -330,9 +360,9 @@ class SingletonFilter :
   //! NumEntries_[i] contains the nonzero entries in row `i'.
   std::vector<size_t> NumEntries_;
   //! Used in ExtractMyRowCopy, to avoid allocation each time.
-  mutable Teuchos::Array<LocalOrdinal> Indices_;
+  mutable nonconst_local_inds_host_view_type Indices_;
   //! Used in ExtractMyRowCopy, to avoid allocation each time.
-  mutable Teuchos::Array<Scalar> Values_;
+  mutable nonconst_values_host_view_type Values_;
 };// class SingletonFilter
 
 }// namespace Ifpack2
diff --git a/packages/ifpack2/src/Ifpack2_SingletonFilter_def.hpp b/packages/ifpack2/src/Ifpack2_SingletonFilter_def.hpp
index 2a08ba929665..4390df0818a8 100644
--- a/packages/ifpack2/src/Ifpack2_SingletonFilter_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_SingletonFilter_def.hpp
@@ -75,8 +75,8 @@ SingletonFilter<MatrixType>::SingletonFilter(const Teuchos::RCP<const Tpetra::Ro
   MaxNumEntriesA_ = A_->getNodeMaxNumRowEntries();
 
   // ExtractMyRowCopy() will use these vectors
-  Indices_.resize(MaxNumEntriesA_);
-  Values_.resize(MaxNumEntriesA_);
+  Kokkos::resize(Indices_,MaxNumEntriesA_);
+  Kokkos::resize(Values_,MaxNumEntriesA_);
 
   // Initialize reordering vector to -1
   Reorder_.resize(NumRowsA_);
@@ -285,6 +285,17 @@ bool SingletonFilter<MatrixType>::isFillComplete() const
   return A_->isFillComplete();
 }
 
+template<class MatrixType>
+void SingletonFilter<MatrixType>::
+getGlobalRowCopy (GlobalOrdinal /*GlobalRow*/,
+                  nonconst_global_inds_host_view_type &/*Indices*/,
+                  nonconst_values_host_view_type &/*Values*/,
+                  size_t& /*NumEntries*/) const
+{ // Not implemented for this filter: unconditionally throws.
+  throw std::runtime_error("Ifpack2::SingletonFilter does not implement getGlobalRowCopy.");
+}
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void SingletonFilter<MatrixType>::getGlobalRowCopy(GlobalOrdinal /* GlobalRow */,
                                                   const Teuchos::ArrayView<GlobalOrdinal> &/* Indices */,
@@ -293,18 +304,20 @@ void SingletonFilter<MatrixType>::getGlobalRowCopy(GlobalOrdinal /* GlobalRow */
 {
   throw std::runtime_error("Ifpack2::SingletonFilter does not implement getGlobalRowCopy.");
 }
+#endif
 
 template<class MatrixType>
-void SingletonFilter<MatrixType>::getLocalRowCopy(LocalOrdinal LocalRow,
-                                              const Teuchos::ArrayView<LocalOrdinal> &Indices,
-                                              const Teuchos::ArrayView<Scalar> &Values,
-                                              size_t &NumEntries) const
+void SingletonFilter<MatrixType>::
+  getLocalRowCopy (LocalOrdinal LocalRow,
+                   nonconst_local_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const
 {
   TEUCHOS_TEST_FOR_EXCEPTION((LocalRow < 0 || (size_t) LocalRow >=  NumRows_ || (size_t) Indices.size() <  NumEntries_[LocalRow]), std::runtime_error, "Ifpack2::SingletonFilter::getLocalRowCopy invalid row or array size.");
 
   size_t Nnz;
   LocalOrdinal ARow = InvReorder_[LocalRow];
-  A_->getLocalRowCopy(ARow,Indices_(),Values_(),Nnz);
+  A_->getLocalRowCopy(ARow,Indices_,Values_,Nnz);
 
   // populate the user's vectors
   NumEntries = 0;
@@ -316,9 +329,32 @@ void SingletonFilter<MatrixType>::getLocalRowCopy(LocalOrdinal LocalRow,
       NumEntries++;
     }
   }
+}
 
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+// Deprecated ArrayView overload: wraps the buffers in host views and forwards.
+template<class MatrixType>
+void SingletonFilter<MatrixType>::getLocalRowCopy (LocalOrdinal LocalRow,
+    const Teuchos::ArrayView<LocalOrdinal>& Indices,
+    const Teuchos::ArrayView<Scalar>& Values, size_t& NumEntries) const
+{
+  using impl_scalar_t = typename row_matrix_type::impl_scalar_type;
+  nonconst_local_inds_host_view_type indsView (Indices.data (), Indices.size ());
+  nonconst_values_host_view_type valsView (reinterpret_cast<impl_scalar_t*> (Values.data ()), Values.size ());
+  getLocalRowCopy (LocalRow, indsView, valsView, NumEntries);
+}
+#endif
+
+template<class MatrixType>
+void SingletonFilter<MatrixType>::getGlobalRowView(GlobalOrdinal /* GlobalRow */,
+                                                  global_inds_host_view_type &/*indices*/,
+                                                  values_host_view_type &/*values*/) const
+{
+  throw std::runtime_error("Ifpack2::SingletonFilter: does not support getGlobalRowView.");
 }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void SingletonFilter<MatrixType>::getGlobalRowView(GlobalOrdinal /* GlobalRow */,
                                                   Teuchos::ArrayView<const GlobalOrdinal> &/* indices */,
@@ -326,7 +362,17 @@ void SingletonFilter<MatrixType>::getGlobalRowView(GlobalOrdinal /* GlobalRow */
 {
   throw std::runtime_error("Ifpack2::SingletonFilter: does not support getGlobalRowView.");
 }
+#endif
+
+template<class MatrixType>
+void SingletonFilter<MatrixType>::getLocalRowView(LocalOrdinal /* LocalRow */,
+    local_inds_host_view_type & /*indices*/,
+    values_host_view_type & /*values*/) const
+{
+  throw std::runtime_error("Ifpack2::SingletonFilter: does not support getLocalRowView.");
+}
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 template<class MatrixType>
 void SingletonFilter<MatrixType>::getLocalRowView(LocalOrdinal /* LocalRow */,
                                                  Teuchos::ArrayView<const LocalOrdinal> &/* indices */,
@@ -334,6 +380,7 @@ void SingletonFilter<MatrixType>::getLocalRowView(LocalOrdinal /* LocalRow */,
 {
   throw std::runtime_error("Ifpack2::SingletonFilter: does not support getLocalRowView.");
 }
+#endif
 
 template<class MatrixType>
 void SingletonFilter<MatrixType>::getLocalDiagCopy(Tpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &diag) const
@@ -380,7 +427,7 @@ void SingletonFilter<MatrixType>::apply(const Tpetra::MultiVector<Scalar,LocalOr
   for (size_t i = 0 ; i < NumRows_ ; ++i) {
     size_t Nnz;
     // Use this class's getrow to make the below code simpler
-    getLocalRowCopy(i,Indices_(),Values_(),Nnz);
+    getLocalRowCopy(i,Indices_,Values_,Nnz);
     if (mode==Teuchos::NO_TRANS){
       for (size_t j = 0 ; j < Nnz ; ++j)
         for (size_t k = 0 ; k < NumVectors ; ++k)
@@ -430,7 +477,7 @@ void SingletonFilter<MatrixType>::SolveSingletonsTempl(const Tpetra::MultiVector
     LocalOrdinal ii = SingletonIndex_[i];
     // get the diagonal value for the singleton
     size_t Nnz;
-    A_->getLocalRowCopy(ii,Indices_(),Values_(),Nnz);
+    A_->getLocalRowCopy(ii,Indices_,Values_,Nnz);
     for (size_t j = 0 ; j < Nnz ; ++j) {
       if (Indices_[j] == ii) {
         for (size_t k = 0 ; k < LHS.getNumVectors() ; ++k)
@@ -467,7 +514,7 @@ void SingletonFilter<MatrixType>::CreateReducedRHSTempl(const Tpetra::MultiVecto
   for (size_t i = 0 ; i < NumRows_ ; ++i) {
     LocalOrdinal ii = InvReorder_[i];
     size_t Nnz;
-    A_->getLocalRowCopy(ii,Indices_(),Values_(),Nnz);
+    A_->getLocalRowCopy(ii,Indices_,Values_,Nnz);
 
     for (size_t j = 0 ; j < Nnz ; ++j) {
       if (Reorder_[Indices_[j]] == -1) {
diff --git a/packages/ifpack2/src/Ifpack2_SparseContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_SparseContainer_decl.hpp
index c931c621e870..a7eb5a2489f1 100644
--- a/packages/ifpack2/src/Ifpack2_SparseContainer_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_SparseContainer_decl.hpp
@@ -167,6 +167,8 @@ class SparseContainer
   using InverseGlobalOrdinal = typename InverseType::global_ordinal_type;
   using InverseNode = typename InverseType::node_type;
 
+  using typename ContainerImpl<MatrixType, InverseScalar>::block_crs_matrix_type;
+
   using inverse_mv_type = Tpetra::MultiVector<InverseScalar, InverseLocalOrdinal, InverseGlobalOrdinal, InverseNode>;
   using InverseCrs = Tpetra::CrsMatrix<InverseScalar, InverseLocalOrdinal, InverseGlobalOrdinal, InverseNode>;
   using InverseMap = typename Tpetra::Map<InverseLocalOrdinal, InverseGlobalOrdinal, InverseNode>;
diff --git a/packages/ifpack2/src/Ifpack2_SparseContainer_def.hpp b/packages/ifpack2/src/Ifpack2_SparseContainer_def.hpp
index b83700ff0dd2..75c3d81e9090 100644
--- a/packages/ifpack2/src/Ifpack2_SparseContainer_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_SparseContainer_def.hpp
@@ -529,14 +529,16 @@ extract ()
       Array<size_t> rowEntryCounts(blockPointSize, 0);
       //blockRow counts the BlockCrs LIDs that are going into this block
       //Rows are inserted into the CrsMatrix in sequential order
+      using inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
+      using vals_type = typename block_crs_matrix_type::values_host_view_type;
       for(LO blockRow = 0; blockRow < blockSize; blockRow++)
       {
         //get a raw view of the whole block row
-        const LO* indices;
-        SC* values;
-        LO numEntries;
+        inds_type indices;
+        vals_type values;
         LO inputRow = this->blockRows_[blockStart + blockRow];
-        this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values, numEntries);
+        this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values);
+        LO numEntries = (LO) indices.size();
         for(LO br = 0; br < this->bcrsBlockSize_; br++)
         {
           for(LO k = 0; k < numEntries; k++)
@@ -557,11 +559,11 @@ extract ()
       for(LO blockRow = 0; blockRow < blockSize; blockRow++)
       {
         //get a raw view of the whole block row
-        const LO* indices;
-        SC* values;
-        LO numEntries;
+        inds_type indices;
+        vals_type values;
         LO inputRow = this->blockRows_[blockStart + blockRow];
-        this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values, numEntries);
+        this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values);
+        LO numEntries = (LO) indices.size();
         for(LO br = 0; br < this->bcrsBlockSize_; br++)
         {
           indicesToInsert.clear();
diff --git a/packages/ifpack2/src/Ifpack2_SparsityFilter_decl.hpp b/packages/ifpack2/src/Ifpack2_SparsityFilter_decl.hpp
index 8a0c27499f2e..1d3ff1b4d600 100644
--- a/packages/ifpack2/src/Ifpack2_SparsityFilter_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_SparsityFilter_decl.hpp
@@ -86,6 +86,14 @@ class SparsityFilter :
   typedef typename MatrixType::local_ordinal_type LocalOrdinal;
   typedef typename MatrixType::global_ordinal_type GlobalOrdinal;
   typedef typename MatrixType::node_type Node;
+  typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type;
+  typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type;
+  typedef typename MatrixType::values_host_view_type values_host_view_type;
+
+  typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type;
+
   typedef typename Teuchos::ScalarTraits<Scalar>::magnitudeType magnitudeType;
   typedef Tpetra::RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> row_matrix_type;
   typedef typename row_matrix_type::mag_type mag_type;
@@ -192,11 +200,17 @@ class SparsityFilter :
     with row \c GlobalRow. If \c GlobalRow does not belong to this node, then \c Indices and \c Values are unchanged and \c NumIndices is
     returned as Teuchos::OrdinalTraits<size_t>::invalid().
   */
+  virtual void
+  getGlobalRowCopy (GlobalOrdinal GlobalRow,
+                    nonconst_global_inds_host_view_type &Indices,
+                    nonconst_values_host_view_type &Values,
+                    size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE  
   virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow,
                                 const Teuchos::ArrayView<GlobalOrdinal> &Indices,
                                 const Teuchos::ArrayView<Scalar> &Values,
                                 size_t &NumEntries) const;
-
+#endif
   //! Extract a list of entries in a specified local row of the graph. Put into storage allocated by calling routine.
   /*!
     \param DropRow - (In) Drop row number for which indices are desired.
@@ -208,11 +222,18 @@ class SparsityFilter :
     with row \c DropRow. If \c DropRow is not valid for this node, then \c Indices and \c Values are unchanged and \c NumIndices is
     returned as Teuchos::OrdinalTraits<size_t>::invalid().
   */
+
+  virtual void
+  getLocalRowCopy (LocalOrdinal LocalRow,
+                   nonconst_local_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getLocalRowCopy(LocalOrdinal DropRow,
                                const Teuchos::ArrayView<LocalOrdinal> &Indices,
                                const Teuchos::ArrayView<Scalar> &Values,
                                size_t &NumEntries) const ;
-
+#endif
   //! Extract a const, non-persisting view of global indices in a specified row of the matrix.
   /*!
     \param GlobalRow - (In) Global row number for which indices are desired.
@@ -223,10 +244,15 @@ class SparsityFilter :
 
     Note: If \c GlobalRow does not belong to this node, then \c indices is set to null.
   */
+  virtual void
+  getGlobalRowView (GlobalOrdinal GlobalRow,
+                    global_inds_host_view_type &indices,
+                    values_host_view_type &values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getGlobalRowView(GlobalOrdinal GlobalRow,
                                 Teuchos::ArrayView<const GlobalOrdinal> &indices,
                                 Teuchos::ArrayView<const Scalar> &values) const;
-
+#endif
   //! Extract a const, non-persisting view of local indices in a specified row of the matrix.
   /*!
     \param DropRow - (In) Drop row number for which indices are desired.
@@ -237,9 +263,15 @@ class SparsityFilter :
 
     Note: If \c DropRow does not belong to this node, then \c indices is set to null.
   */
+  virtual void
+  getLocalRowView (LocalOrdinal LocalRow,
+                   local_inds_host_view_type & indices,
+                   values_host_view_type & values) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void getLocalRowView(LocalOrdinal DropRow,
                                Teuchos::ArrayView<const LocalOrdinal> &indices,
                                Teuchos::ArrayView<const Scalar> &values) const;
+#endif
 
   //! \brief Get a copy of the diagonal entries owned by this node, with local row indices.
   /*! Returns a distributed Vector object partitioned according to this matrix's row map, containing the
@@ -315,9 +347,9 @@ class SparsityFilter :
   //! NumEntries_[i] contains the nonzero entries in row `i'.
   std::vector<size_t> NumEntries_;
   //! Used in ExtractMyRowCopy, to avoid allocation each time.
-  mutable Teuchos::Array<LocalOrdinal> Indices_;
+  mutable nonconst_local_inds_host_view_type Indices_;
   //! Used in ExtractMyRowCopy, to avoid allocation each time
-  mutable Teuchos::Array<Scalar> Values_;
+  mutable nonconst_values_host_view_type Values_;
 
 };// class SparsityFilter
 
diff --git a/packages/ifpack2/src/Ifpack2_SparsityFilter_def.hpp b/packages/ifpack2/src/Ifpack2_SparsityFilter_def.hpp
index c0ae66b9e8a4..814420e2b019 100644
--- a/packages/ifpack2/src/Ifpack2_SparsityFilter_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_SparsityFilter_def.hpp
@@ -86,8 +86,8 @@ SparsityFilter<MatrixType>::SparsityFilter(const Teuchos::RCP<const Tpetra::RowM
   MaxNumEntriesA_ = A_->getNodeMaxNumRowEntries();
 
   // ExtractMyRowCopy() will use these vectors
-  Indices_.resize(MaxNumEntries_);
-  Values_.resize(MaxNumEntries_);
+  Kokkos::resize(Indices_,MaxNumEntries_);
+  Kokkos::resize(Values_,MaxNumEntries_);
 
   size_t ActualMaxNumEntries = 0;
   for (size_t i = 0 ; i < NumRows_ ; ++i) {
@@ -274,22 +274,36 @@ bool SparsityFilter<MatrixType>::isFillComplete() const
 
 //==========================================================================
 template<class MatrixType>
-void SparsityFilter<MatrixType>::getGlobalRowCopy(GlobalOrdinal /* GlobalRow */,
+void SparsityFilter<MatrixType>::
+getGlobalRowCopy (GlobalOrdinal /*GlobalRow*/,
+                  nonconst_global_inds_host_view_type &/*Indices*/,
+                  nonconst_values_host_view_type &/*Values*/,
+                  size_t& /*NumEntries*/) const {
+  throw std::runtime_error("Ifpack2::SparsityFilter does not implement getGlobalRowCopy.");
+}
+
+//==========================================================================
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE  
+template<class MatrixType>
+void SparsityFilter<MatrixType>::
+getGlobalRowCopy(GlobalOrdinal /* GlobalRow */,
                                                   const Teuchos::ArrayView<GlobalOrdinal> &/* Indices */,
                                                   const Teuchos::ArrayView<Scalar> &/* Values */,
                                                   size_t &/* NumEntries */) const
 {
   throw std::runtime_error("Ifpack2::SparsityFilter does not implement getGlobalRowCopy.");
 }
+#endif
 
 //==========================================================================
 template<class MatrixType>
-void SparsityFilter<MatrixType>::getLocalRowCopy(LocalOrdinal LocalRow,
-                                              const Teuchos::ArrayView<LocalOrdinal> &Indices,
-                                              const Teuchos::ArrayView<Scalar> &Values,
-                                              size_t &NumEntries) const
+void SparsityFilter<MatrixType>::
+  getLocalRowCopy (LocalOrdinal LocalRow,
+      nonconst_local_inds_host_view_type &Indices,
+      nonconst_values_host_view_type &Values,
+      size_t& NumEntries) const 
 {
-  TEUCHOS_TEST_FOR_EXCEPTION((LocalRow < 0 || (size_t) LocalRow >=  NumRows_ || (size_t) Indices.size() <  NumEntries_[LocalRow]), std::runtime_error, "Ifpack2::SparsityFilter::getLocalRowCopy invalid row or array size.");
+  TEUCHOS_TEST_FOR_EXCEPTION((LocalRow < 0 || (size_t) LocalRow >=  NumRows_ || (size_t) Indices.size() <  NumEntries_[LocalRow]), std::runtime_error, "Ifpack2::SparsityFilter::getLocalRowCopy invalid row or array size.");
 
   // Note: This function will work correctly if called by apply, say, with Indices_ and Values_ as
   // parameters.  The structure of the loop below should make that obvious.
@@ -298,7 +312,7 @@ void SparsityFilter<MatrixType>::getLocalRowCopy(LocalOrdinal LocalRow,
   // This is because I need more space than that given by
   // the user (for the external nodes)
   size_t A_NumEntries=0;
-  A_->getLocalRowCopy(LocalRow,Indices_(),Values_(),A_NumEntries);
+  A_->getLocalRowCopy(LocalRow,Indices_,Values_,A_NumEntries);
   magnitudeType Threshold = Teuchos::ScalarTraits<magnitudeType>::zero();
   std::vector<magnitudeType> Values2(A_NumEntries,Teuchos::ScalarTraits<magnitudeType>::zero());
 
@@ -339,25 +353,61 @@ void SparsityFilter<MatrixType>::getLocalRowCopy(LocalOrdinal LocalRow,
       break;
   }
 
+
 }
 
+//==========================================================================
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+// Deprecated ArrayView overload: wraps the caller's index/value buffers in
+// host views (pointer + length) and forwards to the host-view overload.
+template<class MatrixType>
+void SparsityFilter<MatrixType>::getLocalRowCopy (LocalOrdinal DropRow,
+    const Teuchos::ArrayView<LocalOrdinal>& Indices,
+    const Teuchos::ArrayView<Scalar>& Values, size_t& NumEntries) const
+{
+  using impl_scalar_t = typename row_matrix_type::impl_scalar_type;
+  nonconst_local_inds_host_view_type indsView (Indices.data (), Indices.size ());
+  nonconst_values_host_view_type valsView (reinterpret_cast<impl_scalar_t*> (Values.data ()), Values.size ());
+  getLocalRowCopy (DropRow, indsView, valsView, NumEntries);
+}
+#endif
+
 //==========================================================================
 template<class MatrixType>
+void SparsityFilter<MatrixType>::getGlobalRowView(GlobalOrdinal /* GlobalRow */,
+                                                  global_inds_host_view_type &/*indices*/,
+                                                  values_host_view_type &/*values*/) const
+{
+  throw std::runtime_error("Ifpack2::SparsityFilter: does not support getGlobalRowView.");
+}
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+template<class MatrixType>
 void SparsityFilter<MatrixType>::getGlobalRowView(GlobalOrdinal /* GlobalRow */,
                                                   Teuchos::ArrayView<const GlobalOrdinal> &/* indices */,
                                                   Teuchos::ArrayView<const Scalar> &/* values */) const
 {
   throw std::runtime_error("Ifpack2::SparsityFilter: does not support getGlobalRowView.");
 }
+#endif
 
 //==========================================================================
 template<class MatrixType>
+void SparsityFilter<MatrixType>::getLocalRowView(LocalOrdinal /* LocalRow */,
+    local_inds_host_view_type & /*indices*/,
+    values_host_view_type & /*values*/) const
+{
+  throw std::runtime_error("Ifpack2::SparsityFilter: does not support getLocalRowView.");
+}
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+template<class MatrixType>
 void SparsityFilter<MatrixType>::getLocalRowView(LocalOrdinal /* LocalRow */,
                                                  Teuchos::ArrayView<const LocalOrdinal> &/* indices */,
                                                  Teuchos::ArrayView<const Scalar> &/* values */) const
 {
   throw std::runtime_error("Ifpack2::SparsityFilter: does not support getLocalRowView.");
 }
+#endif
 
 //==========================================================================
 template<class MatrixType>
@@ -404,21 +454,22 @@ void SparsityFilter<MatrixType>::apply(const Tpetra::MultiVector<Scalar,LocalOrd
   for (size_t i = 0 ; i < NumRows_ ; ++i) {
     size_t Nnz;
     // Use this class's getrow to make the below code simpler
-    getLocalRowCopy(i,Indices_(),Values_(),Nnz);
+    getLocalRowCopy(i,Indices_,Values_,Nnz);
+    Scalar* Values = reinterpret_cast<Scalar*>(Values_.data());
     if (mode==Teuchos::NO_TRANS){
       for (size_t j = 0 ; j < Nnz ; ++j)
         for (size_t k = 0 ; k < NumVectors ; ++k)
-          y_ptr[k][i] += Values_[j] * x_ptr[k][Indices_[j]];
+          y_ptr[k][i] += Values[j] * x_ptr[k][Indices_[j]];
     }
     else if (mode==Teuchos::TRANS){
       for (size_t j = 0 ; j < Nnz ; ++j)
         for (size_t k = 0 ; k < NumVectors ; ++k)
-          y_ptr[k][Indices_[j]] += Values_[j] * x_ptr[k][i];
+          y_ptr[k][Indices_[j]] += Values[j] * x_ptr[k][i];
     }
     else { //mode==Teuchos::CONJ_TRANS
       for (size_t j = 0 ; j < Nnz ; ++j)
         for (size_t k = 0 ; k < NumVectors ; ++k)
-          y_ptr[k][Indices_[j]] += Teuchos::ScalarTraits<Scalar>::conjugate(Values_[j]) * x_ptr[k][i];
+          y_ptr[k][Indices_[j]] += Teuchos::ScalarTraits<Scalar>::conjugate(Values[j]) * x_ptr[k][i];
     }
   }
 }
diff --git a/packages/ifpack2/src/Ifpack2_TriDiContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_TriDiContainer_decl.hpp
index df2d0aca7e76..014ef0ed2f0d 100644
--- a/packages/ifpack2/src/Ifpack2_TriDiContainer_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_TriDiContainer_decl.hpp
@@ -138,7 +138,7 @@ class TriDiContainer
   using HostViewLocal = typename Kokkos::View<LSC**, Kokkos::HostSpace>;
   using typename ContainerImpl<MatrixType, LocalScalarType>::HostSubviewLocal;
   using typename ContainerImpl<MatrixType, LocalScalarType>::ConstHostSubviewLocal;
-
+  using typename ContainerImpl<MatrixType, LocalScalarType>::block_crs_matrix_type;
   static_assert (std::is_same<MatrixType, Tpetra::RowMatrix<SC, LO, GO, NO>>::value,
                  "Ifpack2::TriDiContainer: MatrixType must be a Tpetra::RowMatrix specialization.");
 
diff --git a/packages/ifpack2/src/Ifpack2_TriDiContainer_def.hpp b/packages/ifpack2/src/Ifpack2_TriDiContainer_def.hpp
index a6f345273f47..ca53ad31bce1 100644
--- a/packages/ifpack2/src/Ifpack2_TriDiContainer_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_TriDiContainer_def.hpp
@@ -161,14 +161,16 @@ void TriDiContainer<MatrixType, LocalScalarType>::extract()
         LO localCol = this->translateRowToCol(blockRows[j]);
         colToBlockOffset[localCol] = blockStart + j;
       }
+      using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
+      using h_vals_type = typename block_crs_matrix_type::values_host_view_type;
       for(LO blockRow = 0; blockRow < LO(blockRows.size()); blockRow++)
       {
         //get a raw view of the whole block row
-        const LO* indices;
-        SC* values;
-        LO numEntries;
+        h_inds_type indices;
+        h_vals_type values;
         LO inputRow = this->blockRows_[blockStart + blockRow];
-        this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values, numEntries);
+        this->inputBlockMatrix_->getLocalRowView(inputRow, indices, values);
+        LO numEntries = (LO) indices.size();
         for(LO k = 0; k < numEntries; k++)
         {
           LO colOffset = colToBlockOffset[indices[k]];
diff --git a/packages/ifpack2/src/Ifpack2_Utilities.cpp b/packages/ifpack2/src/Ifpack2_Utilities.cpp
index 2e3e8807b5ab..e7f4cbc15eab 100644
--- a/packages/ifpack2/src/Ifpack2_Utilities.cpp
+++ b/packages/ifpack2/src/Ifpack2_Utilities.cpp
@@ -49,9 +49,8 @@ namespace Details {
     // precTypeUpper is the upper-case version of precType.
     std::string precTypeUpper (precType);
     if (precTypeUpper.size () > 0) {
-      std::locale locale;
       for (size_t k = 0; k < precTypeUpper.size (); ++k) {
-        precTypeUpper[k] = std::toupper<char> (precTypeUpper[k], locale);
+        precTypeUpper[k] = static_cast<char> (::toupper (static_cast<unsigned char> (precTypeUpper[k]))); // go through unsigned char: passing a negative char to toupper is undefined behavior
       }
     }
     return precTypeUpper;
diff --git a/packages/ifpack2/test/belos/build_problem.hpp b/packages/ifpack2/test/belos/build_problem.hpp
index 085f527449bf..6654040415ef 100644
--- a/packages/ifpack2/test/belos/build_problem.hpp
+++ b/packages/ifpack2/test/belos/build_problem.hpp
@@ -206,8 +206,10 @@ build_problem (Teuchos::ParameterList& test_params,
     // new matrix.
     RCP<crs_matrix_type> A_constGraph (new crs_matrix_type (A->getCrsGraph ()));
     // Copy the values row by row from A into A_constGraph.
-    ArrayView<const LO> ind;
-    ArrayView<const Scalar> val;
+    using lids_type = typename crs_matrix_type::local_inds_host_view_type;
+    using vals_type = typename crs_matrix_type::values_host_view_type;
+    lids_type ind;
+    vals_type val;
     const LO numLocalRows = static_cast<LO> (A->getNodeNumRows ());
     for (LO localRow = 0; localRow < numLocalRows; ++localRow) {
       A->getLocalRowView (localRow, ind, val);
diff --git a/packages/ifpack2/test/belos/tpetra_native.cpp b/packages/ifpack2/test/belos/tpetra_native.cpp
index 0be088d9732b..8fe9a760b394 100644
--- a/packages/ifpack2/test/belos/tpetra_native.cpp
+++ b/packages/ifpack2/test/belos/tpetra_native.cpp
@@ -32,8 +32,8 @@ deepCopyFillCompleteCrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A)
     (! A.isFillComplete (), std::invalid_argument,
      "deepCopyFillCompleteCrsMatrix: Input matrix A must be fillComplete.");
   RCP<crs_matrix_type> A_copy (new crs_matrix_type (A.getCrsGraph ()));
-  auto A_copy_lcl = A_copy->getLocalMatrix ();
-  auto A_lcl = A.getLocalMatrix ();
+  auto A_copy_lcl = A_copy->getLocalMatrixDevice ();
+  auto A_lcl = A.getLocalMatrixDevice ();
   Kokkos::deep_copy (A_copy_lcl.values, A_lcl.values);
   A_copy->fillComplete (A.getDomainMap (), A.getRangeMap ());
   return A_copy;
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockCrsUtil.hpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockCrsUtil.hpp
index feee75b31aad..fb96744ea6ac 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockCrsUtil.hpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockCrsUtil.hpp
@@ -244,9 +244,9 @@ struct BlockCrsMatrixMaker {
   // of local column abs sums.
   static void make_row_and_col_diag_dominant (Tpetra_BlockCrsMatrix& a) {
     const auto& g = a.getCrsGraph();
-    const auto& rowptr = g.getLocalGraph().row_map;
-    const auto& colidx = g.getLocalGraph().entries;
-    const auto& values = a.getValuesHost();
+    const auto& rowptr = g.getLocalGraphHost().row_map;
+    const auto& colidx = g.getLocalGraphHost().entries;
+    const auto& values = a.getValuesHostNonConst();
 
     const auto row_map = g.getRowMap();
     const auto col_map = g.getColMap();
@@ -385,10 +385,10 @@ struct BlockCrsMatrixMaker {
       }
     }
 
-    typename Tpetra_CrsGraph::local_graph_type g;
+    typename Tpetra_CrsGraph::local_graph_device_type g;
     {
-      typedef typename Tpetra_CrsGraph::local_graph_type::row_map_type row_map_type;
-      typedef typename Tpetra_CrsGraph::local_graph_type::entries_type entries_type;
+      typedef typename Tpetra_CrsGraph::local_graph_device_type::row_map_type row_map_type;
+      typedef typename Tpetra_CrsGraph::local_graph_device_type::entries_type entries_type;
       const GO nr = my_row_gids.size();
       typename row_map_type::non_const_type::HostMirror rowptr("rowptr", nr + 1);
       typename entries_type::HostMirror colidx;
@@ -440,7 +440,7 @@ struct BlockCrsMatrixMaker {
         Kokkos::deep_copy(row_map_tmp, rowptr);
         entries_type entries("entries", colidx.size());
         Kokkos::deep_copy(entries, colidx);
-        g = typename Tpetra_CrsGraph::local_graph_type(entries, row_map_tmp);
+        g = typename Tpetra_CrsGraph::local_graph_device_type(entries, row_map_tmp);
       }
 
       if ( ! tridiags_only) {
@@ -490,8 +490,8 @@ struct BlockCrsMatrixMaker {
   get_offdiag_idxs (const StructuredBlock& sb, const Tpetra_CrsGraph& g, const Tpetra_Map& col_map,
                     const Int& lr, const Int& I, const Int& J, const Int& K, Int offdiag_idxs[2]) {
     offdiag_idxs[0] = offdiag_idxs[1] = -1;
-    const auto& rowptr = g.getLocalGraph().row_map;
-    const auto& colidx = g.getLocalGraph().entries;
+    const auto& rowptr = g.getLocalGraphHost().row_map;
+    const auto& colidx = g.getLocalGraphHost().entries;
     GO rid_offdiags[2];
     rid_offdiags[0] = rid_offdiags[1] = Teuchos::OrdinalTraits<GO>::invalid();
     if (K > 0) rid_offdiags[0] = sb.ijk2id(I, J, K-1);
@@ -529,8 +529,8 @@ struct BlockCrsMatrixMaker {
     // Raw pointers for threading.
     auto m = mr.get();
     auto g = gr.get();
-    const auto& rowptr = g->getLocalGraph().row_map;
-    const auto& colidx = g->getLocalGraph().entries;
+    const auto& rowptr = g->getLocalGraphHost().row_map;
+    const auto& colidx = g->getLocalGraphHost().entries;
     const LO nr = rowptr.extent_int(0) - 1;
     const auto row_map = g->getRowMap().get();
     const auto col_map = g->getColMap().get();
@@ -596,7 +596,7 @@ struct BlockCrsMatrixMaker {
       if (tridiag_is_identity || block_diag)
         zero_offdiag_idxs(offdiag_idxs, blockrow);
       for (size_t j = rowptr(lr); j < rowptr(lr+1); ++j) {
-        auto block = m->getLocalBlock(lr, colidx(j));
+        auto block = m->getLocalBlockHostNonConst(lr, colidx(j));
         const auto b = j - rowptr(lr);
         for (Int bi = 0; bi < bs; ++bi)
           for (Int bj = 0; bj < bs; ++bj)
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockRelaxation.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockRelaxation.cpp
index 1b9689290d30..6c2f944c3937 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockRelaxation.cpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockRelaxation.cpp
@@ -122,14 +122,16 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, Test0, Scalar, LocalOr
 
   prec.applyMat(x, y);
 
-  Teuchos::ArrayRCP<const Scalar> yview = y.get1dView();
-
   //Since crsmatrix is a diagonal matrix with 2 on the diagonal,
   //y should be full of 2's now.
 
   Teuchos::ArrayRCP<Scalar> twos(num_rows_per_proc*2, 2);
 
-  TEST_COMPARE_FLOATING_ARRAYS(yview, twos(), Teuchos::ScalarTraits<Scalar>::eps());
+  {
+    // Restrict scope of host access
+    Teuchos::ArrayRCP<const Scalar> yview = y.get1dView();
+    TEST_COMPARE_FLOATING_ARRAYS(yview, twos(), Teuchos::ScalarTraits<Scalar>::eps());
+  }
 
   prec.apply(x, y);
 
@@ -137,7 +139,11 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, Test0, Scalar, LocalOr
 
   Teuchos::ArrayRCP<Scalar> halfs(num_rows_per_proc*2, 0.5);
 
-  TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), Teuchos::ScalarTraits<Scalar>::eps());
+  {
+    // Restrict scope of host access
+    Teuchos::ArrayRCP<const Scalar> yview = y.get1dView();
+    TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), Teuchos::ScalarTraits<Scalar>::eps());
+  }
 }
 
 // Test apply() with x == y.
@@ -744,16 +750,20 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, TestBlockContainers, S
   //the (block) graph should be diagonal
   auto crsgraph = tif_utest::create_banded_graph<LO,GO,Node>(num_rows_per_proc, 2);
 
-  auto bcrsmatrix = Teuchos::rcp(new
-      Tpetra::BlockCrsMatrix<Scalar,LO,GO,Node>(*crsgraph, blockSize));
+  using block_crs_matrix_type = Tpetra::BlockCrsMatrix<Scalar,LO,GO,Node>;
+  using h_inds = typename block_crs_matrix_type::local_inds_host_view_type;
+  using h_vals = typename block_crs_matrix_type::nonconst_values_host_view_type;
+
+  auto bcrsmatrix = Teuchos::rcp(new block_crs_matrix_type(*crsgraph, blockSize));
+                                
   
   //Fill in values of the the matrix
   for(LO l_row = 0; (size_t) l_row < bcrsmatrix->getNodeNumRows(); ++l_row)
   {
-    const LO * inds;
-    Scalar * vals;
-    LO numInd;
-    bcrsmatrix->getLocalRowView(l_row, inds, vals, numInd);
+    h_inds inds;
+    h_vals vals;
+    bcrsmatrix->getLocalRowViewNonConst(l_row, inds, vals);
+    LO numInd = (LO) inds.size();
     for(int k = 0; k < blockSize * blockSize * numInd; k++)
       vals[k] = 0;
     for (LO j = 0; j < numInd; ++j)
@@ -895,16 +905,18 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, TestBlockContainersDec
   //the (block) graph should be diagonal
   auto crsgraph = tif_utest::create_banded_graph<LO,GO,Node>(num_rows_per_proc, 1);
 
-  auto bcrsmatrix = Teuchos::rcp(new
-      Tpetra::BlockCrsMatrix<Scalar,LO,GO,Node>(*crsgraph, blockSize));
+  using block_crs_matrix_type = Tpetra::BlockCrsMatrix<Scalar,LO,GO,Node>;
+  using h_inds = typename block_crs_matrix_type::local_inds_host_view_type;
+  using h_vals = typename block_crs_matrix_type::nonconst_values_host_view_type;
+  auto bcrsmatrix = Teuchos::rcp(new block_crs_matrix_type(*crsgraph, blockSize));
   
   //Fill in values of the the matrix
   for(LO l_row = 0; (size_t) l_row < bcrsmatrix->getNodeNumRows(); ++l_row)
   {
-    const LO * inds;
-    Scalar * vals;
-    LO numInd;
-    bcrsmatrix->getLocalRowView(l_row, inds, vals, numInd);
+    h_inds inds;
+    h_vals vals;
+    bcrsmatrix->getLocalRowViewNonConst(l_row, inds, vals);
+    LO numInd = (LO)inds.size();
     for (LO j = 0; j < numInd; ++j)
     {
       const LO lcl_col = inds[j];
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp
index ef7573dea2cb..912e8e5e5baa 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp
@@ -79,7 +79,7 @@ struct BlockTriDiContainerTester {
     const auto col_map = g.getColMap();
     const auto gid = row_map->getGlobalElement(row_lid_to_match);
     const auto col_lid = col_map->getLocalElement(gid);
-    auto block = A.getLocalBlock(row_lid, col_lid);
+    auto block = A.getLocalBlockHostNonConst(row_lid, col_lid);
     const Int bs = block.extent(1);
     for (Int bi = 0; bi < bs; ++bi)
       for (Int bj = 0; bj < bs; ++bj)
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFactory.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFactory.cpp
index 5c400c75db79..b1b1c10af499 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFactory.cpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFactory.cpp
@@ -61,6 +61,8 @@ for preconditioners it produces.
 #include <Tpetra_BlockMultiVector.hpp>
 #include <Tpetra_BlockCrsMatrix.hpp>
 
+#include <Ifpack2_BlockRelaxation.hpp>
+
 namespace {
 using Tpetra::global_size_t;
 typedef tif_utest::Node Node;
@@ -187,7 +189,28 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Factory, BlockCrs, Scalar, LocalOrdinal
   check_precond_basics(prec_relax, out, success);
   check_precond_apply(prec_relax, out, success);
 
-  // NOTE: As we expand support for the BlockCrsMatrix to other smoother types besides RELAXATION, tests should be added here.
+  // Basic block relaxation tests
+  prec_relax = factory.create<row_matrix_type> ("BLOCKRELAXATION", rowmatrix);
+  TEST_EQUALITY(prec_relax != Teuchos::null, true);
+  check_precond_basics(prec_relax, out, success);
+  check_precond_apply(prec_relax, out, success);
+
+  // Block-Tridiagonal
+  {
+    Teuchos::ParameterList params;
+    params.set("relaxation: container", "BlockTriDi");
+    params.set("relaxation: type", "MT Split Jacobi");
+    params.set("partitioner: type", "linear");
+    params.set("partitioner: local parts", num_rows_per_proc);
+
+    prec_relax = factory.create<row_matrix_type> ("BLOCKRELAXATION", rowmatrix);
+    TEST_EQUALITY(prec_relax != Teuchos::null, true);
+    prec_relax->setParameters(params);
+    check_precond_basics(prec_relax, out, success);
+    check_precond_apply(prec_relax, out, success);
+  }
+
+
 }
 
 
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFiltering.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFiltering.cpp
index f3e67a00902e..aedd3f28e200 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFiltering.cpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestFiltering.cpp
@@ -128,18 +128,22 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Filtering, Test0, Scalar, LocalOrdinal,
 
   // Apply w/ GetRow
   size_t max_nz_per_row=LocalA.getNodeMaxNumRowEntries();
-  Teuchos::Array<LocalOrdinal> Indices(max_nz_per_row);
-  Teuchos::Array<Scalar> Values(max_nz_per_row);
-  Teuchos::ArrayRCP<const Scalar> xview=lx.get1dView();
-
-  for(LocalOrdinal i=0; i < (LocalOrdinal)num_rows_per_proc; i++){
-    size_t NumEntries;
-    LocalA.getLocalRowCopy(i,Indices(),Values(),NumEntries);
-    Scalar sum=0;
-    for(LocalOrdinal j=0; (size_t) j < NumEntries; j++){
-      sum+=Values[j] * xview[Indices[j]];
+  using lids_type = typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::nonconst_local_inds_host_view_type;
+  using vals_type = typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::nonconst_values_host_view_type;
+  lids_type Indices("Indices",max_nz_per_row);
+  vals_type Values("Values",max_nz_per_row);
+  {// Host view needs to be scoped
+    Teuchos::ArrayRCP<const Scalar> xview=lx.get1dView();
+
+    for(LocalOrdinal i=0; i < (LocalOrdinal)num_rows_per_proc; i++){
+      size_t NumEntries;
+      LocalA.getLocalRowCopy(i,Indices,Values,NumEntries);
+      Scalar sum=0;
+      for(LocalOrdinal j=0; (size_t) j < NumEntries; j++){
+        sum+=Values[j] * xview[Indices[j]];
+      }
+      lz.replaceLocalValue(i,sum);
     }
-    lz.replaceLocalValue(i,sum);
   }
 
   // Diff
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestHelpers.hpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestHelpers.hpp
index 9c1a71e13f40..ba437e387d3d 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestHelpers.hpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestHelpers.hpp
@@ -682,8 +682,11 @@ template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node>
 Teuchos::RCP<const Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >
   create_banded_block_matrix(const Teuchos::RCP<const Tpetra::CrsGraph<LocalOrdinal,GlobalOrdinal,Node> >& graph, const int blockSize, const size_t rbandwidth)
 {
-  Teuchos::RCP<Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > bcrsmatrix
-    = Teuchos::rcp(new Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>(*graph, blockSize));
+  using block_crs_matrix_type = Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+  using h_inds = typename block_crs_matrix_type::local_inds_host_view_type;
+  using h_vals = typename block_crs_matrix_type::nonconst_values_host_view_type;
+
+  Teuchos::RCP<block_crs_matrix_type> bcrsmatrix = Teuchos::rcp(new block_crs_matrix_type(*graph, blockSize));
   const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node>& meshRowMap = *bcrsmatrix->getRowMap();
 
   const int blockMatSize = blockSize*blockSize;
@@ -699,11 +702,10 @@ Teuchos::RCP<const Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node
 
   for(LocalOrdinal l_row = 0; (size_t) l_row < meshRowMap.getNodeNumElements(); ++l_row)
   {
-
-    const LocalOrdinal * inds;
-    Scalar * vals;
-    LocalOrdinal numInd;
-    bcrsmatrix->getLocalRowView(l_row, inds, vals, numInd);
+    h_inds inds;
+    h_vals vals;
+    bcrsmatrix->getLocalRowViewNonConst(l_row, inds, vals);
+    LocalOrdinal numInd = (LocalOrdinal)inds.size();
     for (LocalOrdinal j = 0; j < numInd; ++j)
     {
       const LocalOrdinal lcl_col = inds[j];
@@ -954,6 +956,16 @@ Teuchos::RCP<const Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > c
   class NotCrsMatrix :
     public Ifpack2::Details::RowMatrix<Tpetra::RowMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > {
   public:
+    typedef typename Tpetra::RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal,Node> MatrixType;
+    typedef typename MatrixType::global_inds_host_view_type global_inds_host_view_type;
+    typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type;
+    typedef typename MatrixType::values_host_view_type values_host_view_type;
+
+    typedef typename MatrixType::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+    typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+    typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type;
+
+
     NotCrsMatrix (Teuchos::RCP<Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >& A) : A_(A){;}
     virtual ~NotCrsMatrix(){;}
     virtual Teuchos::RCP<const Teuchos::Comm<int> > getComm() const {return A_->getComm();}
@@ -984,26 +996,50 @@ Teuchos::RCP<const Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > c
     virtual bool isFillComplete() const {return A_->isFillComplete();}
     virtual bool supportsRowViews() const {return A_->supportsRowViews();}
 
+    // Host-view overload: forwards the global row copy request to the wrapped matrix A_.
+    virtual void getGlobalRowCopy (GlobalOrdinal GlobalRow,
+                                   nonconst_global_inds_host_view_type &indices,
+                                   nonconst_values_host_view_type &values, size_t &NumEntries) const {A_->getGlobalRowCopy(GlobalRow,indices,values,NumEntries);}
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     virtual void
     getGlobalRowCopy (GlobalOrdinal GlobalRow,
                       const Teuchos::ArrayView<GlobalOrdinal> &Indices,
                       const Teuchos::ArrayView<Scalar> &Values,
                       size_t &NumEntries) const {A_->getGlobalRowCopy(GlobalRow,Indices,Values,NumEntries);}
+#endif
 
+    // Host-view overload: forwards the local row copy request to the wrapped matrix A_.
+    virtual void getLocalRowCopy (LocalOrdinal LocalRow,
+                                  nonconst_local_inds_host_view_type & indices,
+                                  nonconst_values_host_view_type & values, size_t &NumEntries) const {A_->getLocalRowCopy(LocalRow,indices,values,NumEntries);}
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     virtual void
     getLocalRowCopy (LocalOrdinal LocalRow,
                      const Teuchos::ArrayView<LocalOrdinal> &Indices,
                      const Teuchos::ArrayView<Scalar> &Values,
                      size_t &NumEntries) const {A_->getLocalRowCopy(LocalRow,Indices,Values,NumEntries);}
+#endif
+
+    // Host-view overload: forwards the global row view request to the wrapped matrix A_.
+    virtual void getGlobalRowView (GlobalOrdinal GlobalRow,
+                                   global_inds_host_view_type &indices,
+                                   values_host_view_type &values) const {A_->getGlobalRowView(GlobalRow,indices,values);}
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     virtual void
     getGlobalRowView (GlobalOrdinal GlobalRow,
                       Teuchos::ArrayView<const GlobalOrdinal> &indices,
                       Teuchos::ArrayView<const Scalar> &values) const {A_->getGlobalRowView(GlobalRow,indices,values);}
-
+#endif
+    // Host-view overload: forwards the local row view request to the wrapped matrix A_.
+    virtual void getLocalRowView (LocalOrdinal LocalRow,
+                                  local_inds_host_view_type & indices,
+                                  values_host_view_type & values) const {A_->getLocalRowView(LocalRow,indices,values);}
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     virtual void
     getLocalRowView (LocalOrdinal LocalRow,
                      Teuchos::ArrayView<const LocalOrdinal> &indices,
                      Teuchos::ArrayView<const Scalar> &values) const {A_->getLocalRowView(LocalRow,indices,values);}
+#endif
 
     virtual void getLocalDiagCopy (Tpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &diag) const {A_->getLocalDiagCopy(diag);}
     virtual void leftScale (const Tpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x) {A_->leftScale(x);}
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestIlukGraph.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestIlukGraph.cpp
index 69859e9c6a57..9195659ffc11 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestIlukGraph.cpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestIlukGraph.cpp
@@ -72,7 +72,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_2_DECL(Ifpack2IlukGraph, IlukGraphTest0, LocalOrdinal
 //Teuchos::FancyOStream& out, bool& success
 
   typedef Tpetra::CrsGraph<LocalOrdinal,GlobalOrdinal,Node> crs_graph_type;
-  typedef typename crs_graph_type::local_graph_type local_graph_type;
+  typedef typename crs_graph_type::local_graph_device_type local_graph_type;
   typedef typename local_graph_type::row_map_type lno_row_view_t;
   typedef typename local_graph_type::entries_type lno_nonzero_view_t;
   typedef typename local_graph_type::device_type::memory_space TemporaryMemorySpace;
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp
index 82a8e1edc081..4c4e6d0945c9 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp
@@ -124,7 +124,7 @@ localSolve (Tpetra::MultiVector<
     (mode == Teuchos::TRANS ? "T" : "N");
   const std::string diag = implicitUnitDiag ? "U" : "N";
 
-  auto A_lcl = A.getLocalMatrix ();
+  auto A_lcl = A.getLocalMatrixHost ();
 
   if (X.isConstantStride () && Y.isConstantStride ()) {
     auto X_lcl = X.getLocalViewHost (Tpetra::Access::OverwriteAll);
@@ -207,6 +207,7 @@ void testCompareToLocalSolve (bool& success, Teuchos::FancyOStream& out,
   typedef GlobalOrdinal GO;
   typedef Tpetra::Map<LO, GO> map_type;
   typedef typename map_type::device_type device_type;
+  typedef Tpetra::CrsGraph<LO, GO> crs_graph_type;
   typedef Tpetra::CrsMatrix<Scalar, LO, GO> crs_matrix_type;
   typedef Tpetra::RowMatrix<Scalar, LO, GO> row_matrix_type;
   typedef Tpetra::MultiVector<Scalar, LO, GO> mv_type;
@@ -363,10 +364,10 @@ void testCompareToLocalSolve (bool& success, Teuchos::FancyOStream& out,
       // (it shouldn't).
       RCP<crs_matrix_type> A_copy;
       {
-        typedef typename crs_matrix_type::local_matrix_type local_matrix_type;
-        typedef typename crs_matrix_type::local_graph_type local_graph_type;
+        typedef typename crs_matrix_type::local_matrix_device_type local_matrix_type;
+        typedef typename crs_graph_type::local_graph_device_type local_graph_type;
 
-        local_matrix_type A_lcl = A->getLocalMatrix ();
+        local_matrix_type A_lcl = A->getLocalMatrixDevice ();
 
         typename local_matrix_type::row_map_type::non_const_type ptr ("A_copy.ptr", A_lcl.graph.row_map.extent (0));
         Kokkos::deep_copy (ptr, A_lcl.graph.row_map);
@@ -763,64 +764,41 @@ testArrowMatrixWithDense (bool& success, Teuchos::FancyOStream& out, const LO lc
   TEST_EQUALITY( c(lclNumRows-1), c_n_expected );
 }
 
-template<class SC = Tpetra::Vector<>::scalar_type,
-         class LO = Tpetra::Vector<>::local_ordinal_type,
-         class GO = Tpetra::Vector<>::global_ordinal_type>
-void testArrowMatrix (bool& success, Teuchos::FancyOStream& out)
+
+template<class crs_matrix_type, class map_type>
+bool
+testArrowMatrixAssembly(const int lclNumRows,
+                        const bool explicitlyStoreUnitDiagonalOfL,
+                        RCP<const map_type> rowMap,
+                        RCP<const map_type> colMap,
+                        RCP<const map_type> domMap,
+                        RCP<const map_type> ranMap,
+                        RCP<crs_matrix_type> & L,
+                        RCP<crs_matrix_type> & U,
+                        Teuchos::FancyOStream& out)
 {
-  typedef Tpetra::Map<LO, GO> map_type;
-  typedef typename map_type::device_type device_type;
-  typedef Tpetra::CrsMatrix<SC, LO, GO> crs_matrix_type;
-  typedef Tpetra::RowMatrix<SC, LO, GO> row_matrix_type;
-  typedef Tpetra::Vector<SC, LO, GO> vec_type;
-  typedef Ifpack2::LocalSparseTriangularSolver<row_matrix_type> solver_type;
+  int gblSuccess=1, lclSuccess=1;
+  bool success=true;
+  using LO = typename crs_matrix_type::local_ordinal_type;
+  using SC = typename crs_matrix_type::scalar_type;
+
   typedef Kokkos::Details::ArithTraits<SC> KAT;
   typedef typename KAT::val_type IST;
   typedef typename KAT::mag_type mag_type;
-  int lclSuccess = 1;
-  int gblSuccess = 1;
-
-  const bool explicitlyStoreUnitDiagonalOfL = false;
-
-  Teuchos::OSTab tab0 (out);
-  out << "Ifpack2::LocalSparseTriangularSolver: Test with arrow matrix" << endl;
-  Teuchos::OSTab tab1 (out);
-
-  auto comm = Tpetra::getDefaultComm ();
-
-  const LO lclNumRows = 8; // power of two (see above)
-  const LO lclNumCols = lclNumRows;
-  const GO gblNumRows = comm->getSize () * lclNumRows;
-  const GO indexBase = 0;
-  RCP<const map_type> rowMap =
-    rcp (new map_type (static_cast<GST> (gblNumRows),
-                       static_cast<std::size_t> (lclNumRows),
-                       indexBase, comm));
-
-  // At this point, we know Kokkos has been initialized, so test the
-  // dense version of the problem.
-  testArrowMatrixWithDense<SC, LO, device_type> (success, out, lclNumRows);
-
-  // If we construct an upper or lower triangular matrix with an
-  // implicit unit diagonal, then we need to specify the column Map
-  // explicitly.  Otherwise, the matrix will report having the wrong
-  // number of columns.  In this case, the local matrix is square and
-  // every column is populated, so we can set column Map = row Map.
-  RCP<const map_type> colMap = rowMap;
-  RCP<const map_type> domMap = rowMap;
-  RCP<const map_type> ranMap = rowMap;
-
-  typedef typename crs_matrix_type::local_graph_type local_graph_type;
-  typedef typename crs_matrix_type::local_matrix_type local_matrix_type;
+  typedef typename crs_matrix_type::local_graph_device_type local_graph_type;
+  typedef typename crs_matrix_type::local_matrix_device_type local_matrix_type;
   typedef typename local_matrix_type::row_map_type::non_const_type row_offsets_type;
   typedef typename local_graph_type::entries_type::non_const_type col_inds_type;
   typedef typename local_matrix_type::values_type::non_const_type values_type;
 
+  const LO lclNumCols = lclNumRows;
+
+  auto comm = rowMap->getComm();
+
   //
   // The suffix _d here stands for (GPU) "device," and the suffix _h
+  // stands for (CPU) "host."
+  // stands for (CPU) "host."  
   //
-
   row_offsets_type L_ptr_d ("ptr", lclNumRows + 1);
   auto L_ptr_h = Kokkos::create_mirror_view (L_ptr_d);
   row_offsets_type U_ptr_d ("ptr", lclNumRows + 1);
@@ -906,7 +884,7 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out)
   TEST_EQUALITY( gblSuccess, 1 );
   if (! gblSuccess) {
     out << "Aborting test" << endl;
-    return;
+    return gblSuccess;
   }
 
   Kokkos::deep_copy (L_ptr_d, L_ptr_h);
@@ -918,7 +896,6 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out)
   Kokkos::deep_copy (U_val_d, U_val_h);
 
   out << "Create the lower triangular Tpetra::CrsMatrix L" << endl;
-  RCP<crs_matrix_type> L;
   TEST_NOTHROW( L = rcp (new crs_matrix_type (rowMap, colMap, L_ptr_d, L_ind_d, L_val_d)) );
   TEST_ASSERT( ! L.is_null () );
   lclSuccess = success ? 1 : 0;
@@ -927,7 +904,7 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out)
   TEST_EQUALITY( gblSuccess, 1 );
   if (! gblSuccess) {
     out << "Aborting test" << endl;
-    return;
+    return gblSuccess;
   }
   out << "Call fillComplete on the lower triangular matrix L" << endl;
   TEST_NOTHROW( L->fillComplete (domMap, ranMap) );
@@ -937,17 +914,114 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out)
   TEST_EQUALITY( gblSuccess, 1 );
   if (! gblSuccess) {
     out << "Aborting test" << endl;
-    return;
+    return gblSuccess;
   }
 
+  out << "Create the upper triangular Tpetra::CrsMatrix U" << endl;
+  TEST_NOTHROW( U = rcp (new crs_matrix_type (rowMap, colMap, U_ptr_d, U_ind_d, U_val_d)) );
+  TEST_ASSERT( ! U.is_null () );
+  lclSuccess = success ? 1 : 0;
+  gblSuccess = 0; // output argument
+  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
+  TEST_EQUALITY( gblSuccess, 1 );
+  if (! gblSuccess) {
+    out << "Aborting test" << endl;
+    return gblSuccess;
+  }
+  out << "Call fillComplete on the upper triangular matrix U" << endl;
+  TEST_NOTHROW( U->fillComplete (domMap, ranMap) );
+  lclSuccess = success ? 1 : 0;
+  gblSuccess = 0; // output argument
+  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
+  TEST_EQUALITY( gblSuccess, 1 );
+  if (! gblSuccess) {
+    out << "Aborting test" << endl;
+    return gblSuccess;
+  }
+  return gblSuccess;
+}
+
+
+template<class SC = Tpetra::Vector<>::scalar_type,
+         class LO = Tpetra::Vector<>::local_ordinal_type,
+         class GO = Tpetra::Vector<>::global_ordinal_type>
+void testArrowMatrix (bool& success, Teuchos::FancyOStream& out)
+{
+  typedef Tpetra::Map<LO, GO> map_type;
+  typedef typename map_type::device_type device_type;
+  typedef Tpetra::CrsMatrix<SC, LO, GO> crs_matrix_type;
+  typedef Tpetra::RowMatrix<SC, LO, GO> row_matrix_type;
+  typedef Tpetra::Vector<SC, LO, GO> vec_type;
+  typedef Ifpack2::LocalSparseTriangularSolver<row_matrix_type> solver_type;
+  typedef Kokkos::Details::ArithTraits<SC> KAT;
+  typedef typename KAT::val_type IST;
+  typedef typename KAT::mag_type mag_type;
+  int lclSuccess = 1;
+  int gblSuccess = 1;
+
+  const bool explicitlyStoreUnitDiagonalOfL = false;
+
+  Teuchos::OSTab tab0 (out);
+  out << "Ifpack2::LocalSparseTriangularSolver: Test with arrow matrix" << endl;
+  Teuchos::OSTab tab1 (out);
+
+  auto comm = Tpetra::getDefaultComm ();
+
+  const LO lclNumRows = 8; // power of two (see above)
+  const LO lclNumCols = lclNumRows;
+  const GO gblNumRows = comm->getSize () * lclNumRows;
+  const GO indexBase = 0;
+  RCP<const map_type> rowMap =
+    rcp (new map_type (static_cast<GST> (gblNumRows),
+                       static_cast<std::size_t> (lclNumRows),
+                       indexBase, comm));
+
+  // At this point, we know Kokkos has been initialized, so test the
+  // dense version of the problem.
+  testArrowMatrixWithDense<SC, LO, device_type> (success, out, lclNumRows);
+
+  // If we construct an upper or lower triangular matrix with an
+  // implicit unit diagonal, then we need to specify the column Map
+  // explicitly.  Otherwise, the matrix will report having the wrong
+  // number of columns.  In this case, the local matrix is square and
+  // every column is populated, so we can set column Map = row Map.
+  RCP<const map_type> colMap = rowMap;
+  RCP<const map_type> domMap = rowMap;
+  RCP<const map_type> ranMap = rowMap;
+
+  // The matrix assembly lives in its own function (and therefore its own
+  // scope) so that the accessors used below do not violate the "you can't
+  // hold a host view and a device view at the same time" assumption.
+  RCP<crs_matrix_type> L, U;
+
+  gblSuccess=testArrowMatrixAssembly(lclNumRows,
+                                     explicitlyStoreUnitDiagonalOfL,
+                                     rowMap,colMap,domMap,ranMap,
+                                     L,U,out);
+  if(!gblSuccess) { success = false; return; } // the helper records failures in its own local 'success'; propagate them so this test does not pass silently
+
+  typedef typename crs_matrix_type::local_graph_device_type local_graph_type;
+  typedef typename crs_matrix_type::local_matrix_device_type local_matrix_type;
+  typedef typename local_matrix_type::row_map_type::non_const_type row_offsets_type;
+  typedef typename local_graph_type::entries_type::non_const_type col_inds_type;
+  typedef typename local_matrix_type::values_type::non_const_type values_type;
+
+  typedef typename crs_matrix_type::local_inds_host_view_type const_local_inds_type;
+  typedef typename crs_matrix_type::values_host_view_type const_values_type;
+
+  const IST ONE = KAT::one ();
+  const IST TWO = KAT::one () + KAT::one ();
+  // Don't cast directly from an integer type to IST,
+  // since if IST is complex, that cast may not exist.
+  const IST N = static_cast<IST> (static_cast<mag_type> (lclNumRows));
+  const IST d = TWO * N;
+
   out << "Make sure that the last row of L is correct" << endl;
   {
     Teuchos::OSTab tab2 (out);
 
-    // FIXME (mfh 23 Aug 2016) This may depend on UVM.
-    // We should instead rely on dual view semantics here.
-    Teuchos::ArrayView<const LO> lclColInds;
-    Teuchos::ArrayView<const SC> vals;
+    const_local_inds_type lclColInds;
+    const_values_type vals;
 
     L->getLocalRowView (lclNumRows - 1, lclColInds, vals);
     if (explicitlyStoreUnitDiagonalOfL) {
@@ -983,28 +1057,6 @@ void testArrowMatrix (bool& success, Teuchos::FancyOStream& out)
     return;
   }
 
-  out << "Create the upper triangular Tpetra::CrsMatrix U" << endl;
-  RCP<crs_matrix_type> U;
-  TEST_NOTHROW( U = rcp (new crs_matrix_type (rowMap, colMap, U_ptr_d, U_ind_d, U_val_d)) );
-  TEST_ASSERT( ! U.is_null () );
-  lclSuccess = success ? 1 : 0;
-  gblSuccess = 0; // output argument
-  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
-  TEST_EQUALITY( gblSuccess, 1 );
-  if (! gblSuccess) {
-    out << "Aborting test" << endl;
-    return;
-  }
-  out << "Call fillComplete on the upper triangular matrix U" << endl;
-  TEST_NOTHROW( U->fillComplete (domMap, ranMap) );
-  lclSuccess = success ? 1 : 0;
-  gblSuccess = 0; // output argument
-  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
-  TEST_EQUALITY( gblSuccess, 1 );
-  if (! gblSuccess) {
-    out << "Aborting test" << endl;
-    return;
-  }
 
   out << "Create the solver for L" << endl;
   RCP<solver_type> L_solver;
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver2.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver2.cpp
index 061e66a121db..4434a9c6d3ee 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver2.cpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver2.cpp
@@ -18,7 +18,7 @@ namespace {
 
     const crs_graph_type& G_crs = dynamic_cast<const crs_graph_type&> (G);
 
-    auto G_lcl = G_crs.getLocalGraph ();
+    auto G_lcl = G_crs.getLocalGraphDevice ();
     auto lclRowMap = G.getRowMap ()->getLocalMap ();
     auto lclColMap = G.getColMap ()->getLocalMap ();
     return determineLocalTriangularStructure (G_lcl, lclRowMap, lclColMap, true);
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp
index cb136bd10e61..954d5acc0bc0 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp
@@ -228,8 +228,8 @@ void reducedMatvec(const OverlappedMatrixClass & A,
   if(overlapLevel >= (int) hstarts.size()) 
     throw std::runtime_error("reducedMatvec: Exceeded available overlap");
 
-  auto undA_lcl = undA->getLocalMatrix ();
-  auto extA_lcl = extA->getLocalMatrix ();
+  auto undA_lcl = undA->getLocalMatrixDevice ();
+  auto extA_lcl = extA->getLocalMatrixDevice ();
   auto X_lcl = X.getLocalViewDevice (Tpetra::Access::ReadOnly);
   auto Y_lcl = Y.getLocalViewDevice (Tpetra::Access::OverwriteAll);
   
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp
index b7458a923f8f..61b7a7e77ade 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp
@@ -392,8 +392,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, BandedBlockCrsMatrixWithDropping, Scal
 
 TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, BlockMatrixOps, Scalar, LocalOrdinal, GlobalOrdinal)
 {
-  typedef Kokkos::View<Scalar**,Kokkos::LayoutRight,Kokkos::MemoryTraits<Kokkos::Unmanaged> > little_block_type;
-  typedef Kokkos::View<Scalar*,Kokkos::LayoutRight,Kokkos::MemoryTraits<Kokkos::Unmanaged> > little_vec_type;
+  typedef Kokkos::View<Scalar**,Kokkos::LayoutRight,Kokkos::HostSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > little_block_type;
+  typedef Kokkos::View<Scalar*,Kokkos::LayoutRight,Kokkos::HostSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > little_vec_type;
   typedef typename Kokkos::Details::ArithTraits<Scalar>::val_type impl_scalar_type;
   typedef Teuchos::ScalarTraits<Scalar> STS;
 
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRelaxation.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRelaxation.cpp
index 3d383702d1cf..2469df5a8980 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRelaxation.cpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRelaxation.cpp
@@ -1133,6 +1133,64 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, MTSGS, Scalar, LocalOrdinal
   }
 }
 
+TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, MTSGS_LongRows, Scalar, LocalOrdinal, GlobalOrdinal)
+{
+  using Teuchos::RCP;
+  using Teuchos::rcp;
+  using Teuchos::ParameterList;
+  using crs_matrix_type = Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+  using row_matrix_type = Tpetra::RowMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+  using MV = Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+  using map_type = Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node>;
+  using prec_type = Ifpack2::Relaxation<row_matrix_type>;
+  using STS = Teuchos::ScalarTraits<Scalar>;
+  using STM = typename STS::magnitudeType;
+  std::string version = Ifpack2::Version();
+  out << "Ifpack2::Version(): " << version << std::endl;
+  //Generate banded test matrix
+  RCP<const map_type> rowmap = tif_utest::create_tpetra_map<LocalOrdinal, GlobalOrdinal, Node>(100);
+  RCP<const crs_matrix_type> A = tif_utest::create_banded_matrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(rowmap, 3);
+  RCP<prec_type> prec = rcp(new prec_type(A));
+  ParameterList goodParams;
+  goodParams.set("relaxation: type", "MT Symmetric Gauss-Seidel");
+  goodParams.set("relaxation: sweeps", 3);
+  goodParams.set("relaxation: long row threshold", 3);
+  //Try setting up the preconditioner with a relaxation type that is incompatible with the long row threshold, and make sure this throws.
+  {
+    ParameterList badParams = goodParams;
+    badParams.set("relaxation: type", "Gauss-Seidel");
+    TEST_THROW (prec->setParameters (badParams), std::invalid_argument);
+  }
+  //Try setting up cluster GS preconditioner with long row algorithm enabled - should also throw.
+  {
+    ParameterList badParams = goodParams;
+    badParams.set("relaxation: mtgs cluster size", 4);
+    TEST_THROW(prec->setParameters (badParams), std::invalid_argument);
+  }
+  prec->setParameters (goodParams);
+  prec->initialize();
+  prec->compute();
+  //Set up linear problem
+  const int numVecs = 10;
+  MV x(A->getDomainMap(), numVecs, true);
+  MV b(rowmap, numVecs, false);
+  b.randomize();
+  Kokkos::View<STM*, Kokkos::HostSpace> initNorms("Initial norms", numVecs);
+  //Norms of b, which serve as the residual norms for the starting solution of zero
+  b.norm2(initNorms);
+  prec->apply(b, x);
+  //Compute residual vector = b - Ax
+  MV residual(b, Teuchos::Copy);
+  A->apply(x, residual, Teuchos::NO_TRANS, -STS::one(), STS::one());
+  Kokkos::View<STM*, Kokkos::HostSpace> resNorms("Residual norms", numVecs);
+  residual.norm2(resNorms);
+  //Make sure all residual norms are significantly smaller than initial (below half of the initial norm)
+  for(int i = 0; i < numVecs; i++)
+  {
+    TEST_COMPARE(resNorms(i), <, 0.5 * initNorms(i));
+  }
+}
+
 TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, ClusterMTSGS, Scalar, LocalOrdinal, GlobalOrdinal)
 {
   using Teuchos::RCP;
@@ -1193,6 +1251,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, ClusterMTSGS, Scalar, Local
   TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2Relaxation, TestLowerTriangularBlockCrsMatrix, Scalar, LO, GO ) \
   TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2Relaxation, TestUpperTriangularBlockCrsMatrix, Scalar, LO, GO ) \
   TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2Relaxation, MTSGS, Scalar, LO, GO ) \
+  TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2Relaxation, MTSGS_LongRows, Scalar, LO, GO ) \
   TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2Relaxation, ClusterMTSGS, Scalar, LO, GO )
 
   //TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( Ifpack2Relaxation, SGS_mult_sweeps, Scalar, LO, GO )
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestSingleProcessRILUK.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestSingleProcessRILUK.cpp
index d9ac6214f887..404d63f0adb9 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestSingleProcessRILUK.cpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestSingleProcessRILUK.cpp
@@ -73,7 +73,7 @@ template<class MatrixType, class VectorType>
 void remove_diags_and_scale(const MatrixType& L, const MatrixType& U,
                             Teuchos::RCP<MatrixType>& Ln, Teuchos::RCP<MatrixType>& Un, Teuchos::RCP<VectorType>& Dn) {
 
-  typedef typename MatrixType::local_matrix_type local_matrix_type;
+  typedef typename MatrixType::local_matrix_device_type local_matrix_type;
   typedef typename std::remove_const<typename local_matrix_type::size_type>::type    size_type;
   typedef typename std::remove_const<typename local_matrix_type::ordinal_type>::type ordinal_type;
   typedef typename std::remove_const<typename local_matrix_type::value_type>::type   value_type;
@@ -87,11 +87,11 @@ void remove_diags_and_scale(const MatrixType& L, const MatrixType& U,
   typedef Kokkos::TeamPolicy<execution_space> team_policy;
   typedef typename Kokkos::TeamPolicy<execution_space>::member_type member_type;
 
-  auto L_rowmap  = L.getLocalMatrix().graph.row_map;
-  auto L_entries = L.getLocalMatrix().graph.entries;
+  auto L_rowmap  = L.getLocalMatrixDevice().graph.row_map;
+  auto L_entries = L.getLocalMatrixDevice().graph.entries;
   auto L_values  = L.getLocalValuesView();
-  auto U_rowmap  = U.getLocalMatrix().graph.row_map;
-  auto U_entries = U.getLocalMatrix().graph.entries;
+  auto U_rowmap  = U.getLocalMatrixDevice().graph.row_map;
+  auto U_entries = U.getLocalMatrixDevice().graph.entries;
   auto U_values  = U.getLocalValuesView();
 
   rowmap_type  Ln_rowmap ("Ln_rowmap",  L_rowmap.extent(0));
@@ -290,6 +290,23 @@ void Ifpack2RILUKSingleProcess_test1 (bool& success, Teuchos::FancyOStream& out,
   RCP<const map_type> rowmap =
     tif_utest::create_tpetra_map<LO, GO, Node> (num_rows_per_proc);
 
+  // Matrix
+  // [ 2 .1  0  0  0]
+  // [.1  2  0  0  0]
+  // [ 0 .1  2 .1  0]
+  // [ 0  0 .1  2 .1]
+  // [ 0  0  0 .1  2]
+
+  // Matlab's Factors
+  // L
+  // Diagonal = 1 (implied)
+  // Subdiagonal (approx) = .05, .0501, .0501 .0501
+
+  // U
+  // Diagonal (approx)      = 2 1.995 1.995 1.995 1.995 
+  // Superdiagonal (approx) = .1 .1 .1 .1
+
+
   if (rowmap->getComm ()->getSize () > 1) {
     out << "This test may only be run in serial "
       "or with a single MPI process." << endl;
@@ -300,6 +317,12 @@ void Ifpack2RILUKSingleProcess_test1 (bool& success, Teuchos::FancyOStream& out,
   RCP<const crs_matrix_type> crsmatrix =
     tif_utest::create_test_matrix2<Scalar,LO,GO,Node>(rowmap);
 
+  {//CMS
+    auto out = Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cout));
+    *out<<"***** A *****"<<std::endl;
+    crsmatrix->describe(*out,Teuchos::VERB_EXTREME);
+  }
+
   //----------------Default trisolver----------------//
   {
     out << "Creating preconditioner" << endl;
@@ -316,7 +339,18 @@ void Ifpack2RILUKSingleProcess_test1 (bool& success, Teuchos::FancyOStream& out,
     out << "Calling initialize() and compute()" << endl;
     prec.initialize();
     prec.compute();
-    
+   
+  {//CMS
+    auto out = Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cout));
+    *out<<"***** Test L *****"<<std::endl;
+    prec.getL().describe(*out,Teuchos::VERB_EXTREME);
+    *out<<"***** Test U *****"<<std::endl;
+    prec.getU().describe(*out,Teuchos::VERB_EXTREME);
+    *out<<"***** Test D *****"<<std::endl;
+    prec.getD().describe(*out,Teuchos::VERB_EXTREME);
+  }
+
+ 
     out << "Creating test problem" << endl;
     MV x (rowmap, 2);
     MV y (rowmap, 2);
@@ -408,6 +442,8 @@ void Ifpack2RILUKSingleProcess_test1 (bool& success, Teuchos::FancyOStream& out,
       test_alpha_beta(-0.42, 4.2, mode);
     }
   }
+
+  return;//CMS
   //----------------Kokkos Kernels SPTRSV----------------//
   {
     out << "Creating preconditioner" << endl;
@@ -700,20 +736,18 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2RILUKSingleProcess, IgnoreRowMapGIDs, S
       //Copy the banded matrix into a new matrix with permuted row map GIDs.
       //This matrix will have the sparsity pattern as the original matrix.
       RCP<crs_matrix_type> permutedMatrix = Teuchos::rcp(new crs_matrix_type(permRowMap, 5));
-      Teuchos::Array<GO> Inds(5);
-      Teuchos::Array<GO> pInds(5);
-      Teuchos::Array<Scalar>        Vals(5);
-      Teuchos::Array<Scalar>        pVals(5);
+      typename crs_matrix_type::nonconst_global_inds_host_view_type Inds("Inds",5), pInds("pInds",5);
+      typename crs_matrix_type::nonconst_values_host_view_type Vals("Vals",5), pVals("pVals",5);
       size_t numEntries;
       for (global_size_t i=0; i<num_rows_per_proc; ++i) {
-        crsmatrix->getGlobalRowCopy(i,Inds(),Vals(),numEntries);
-        pInds.resize(numEntries);
-        pVals.resize(numEntries);
+        crsmatrix->getGlobalRowCopy(i,Inds,Vals,numEntries);
+        Kokkos::resize(pInds,numEntries);
+        Kokkos::resize(pVals,numEntries);
         for (size_t j=0; j<numEntries; ++j) {
           pInds[j] = origToPerm[Inds[j]];
           pVals[j] = Vals[j];
         }
-        permutedMatrix->insertGlobalValues(origToPerm[i],pInds(),pVals());
+        permutedMatrix->insertGlobalValues(origToPerm[i],numEntries,pVals.data(),pInds.data());
       }
       permutedMatrix->fillComplete();
     
diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_Basis.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_Basis.hpp
index 13954baec214..7ca8e527efd9 100644
--- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_Basis.hpp
+++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_Basis.hpp
@@ -362,92 +362,7 @@ using HostBasisPtr = BasisPtr<typename Kokkos::HostSpace::device_type, OutputTyp
      
         Note that only the basic exact-sequence operators are supported at the moment: VALUE, GRAD, DIV, CURL.
      */
-    Kokkos::DynRankView<OutputValueType,DeviceType> allocateOutputView( const int numPoints, const EOperator operatorType = OPERATOR_VALUE) const
-    {
-      const bool operatorSupported = (operatorType == OPERATOR_VALUE) || (operatorType == OPERATOR_GRAD) || (operatorType == OPERATOR_CURL) || (operatorType == OPERATOR_DIV);
-      INTREPID2_TEST_FOR_EXCEPTION(!operatorSupported, std::invalid_argument, "operator is not supported by allocateOutputView()");
-      
-      const int numFields = this->getCardinality();
-      const int spaceDim  = basisCellTopology_.getDimension();
-      
-      // KK: this needs to be updated after nate works on tensorthings
-      using OutputViewAllocatable = Kokkos::DynRankView<outputValueType,DeviceType>;
-      
-      switch (functionSpace_)
-      {
-        case FUNCTION_SPACE_HGRAD:
-          if (operatorType == OPERATOR_VALUE)
-          {
-            // scalar-valued container
-            OutputViewAllocatable dataView("BasisValues HGRAD VALUE data", numFields, numPoints);
-            return dataView;
-          }
-          else if (operatorType == OPERATOR_GRAD)
-          {
-            OutputViewAllocatable dataView("BasisValues HGRAD GRAD data", numFields, numPoints, spaceDim);
-            return dataView;
-          }
-          else
-          {
-            INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()");
-          }
-        case FUNCTION_SPACE_HDIV:
-          if (operatorType == OPERATOR_VALUE)
-          {
-            // vector-valued container
-            OutputViewAllocatable dataView("BasisValues HDIV VALUE data", numFields, numPoints, spaceDim);
-            return dataView;
-          }
-          else if (operatorType == OPERATOR_DIV)
-          {
-            // scalar-valued curl
-            OutputViewAllocatable dataView("BasisValues HDIV DIV data", numFields, numPoints);
-            return dataView;
-          }
-          else
-          {
-            INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()");
-          }
-        case FUNCTION_SPACE_HCURL:
-          if (operatorType == OPERATOR_VALUE)
-          {
-            OutputViewAllocatable dataView("BasisValues HCURL VALUE data", numFields, numPoints, spaceDim);
-            return dataView;
-          }
-          else if (operatorType == OPERATOR_CURL)
-          {
-            if (spaceDim != 2)
-            {
-              // vector-valued curl
-              OutputViewAllocatable dataView("BasisValues HCURL CURL data", numFields, numPoints, spaceDim);
-              return dataView;
-            }
-            else
-            {
-              // scalar-valued curl
-              OutputViewAllocatable dataView("BasisValues HCURL CURL data (scalar)", numFields, numPoints);
-              return dataView;
-            }
-          }
-          else
-          {
-            INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()");
-          }
-        case FUNCTION_SPACE_HVOL:
-          if (operatorType == OPERATOR_VALUE)
-          {
-            // vector-valued container
-            OutputViewAllocatable dataView("BasisValues HVOL VALUE data", numFields, numPoints);
-            return dataView;
-          }
-          else
-          {
-            INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()");
-          }
-        default:
-          INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()");
-      }
-    }
+    Kokkos::DynRankView<OutputValueType,DeviceType> allocateOutputView( const int numPoints, const EOperator operatorType = OPERATOR_VALUE) const;
     
     /** \brief Allocate BasisValues container suitable for passing to the getValues() variant that takes a TensorPoints container as argument.
      
@@ -456,7 +371,8 @@ using HostBasisPtr = BasisPtr<typename Kokkos::HostSpace::device_type, OutputTyp
      */
     virtual BasisValues<OutputValueType,DeviceType> allocateBasisValues( TensorPoints<PointValueType,DeviceType> points, const EOperator operatorType = OPERATOR_VALUE) const
     {
-      const bool operatorSupported = (operatorType == OPERATOR_VALUE) || (operatorType == OPERATOR_GRAD) || (operatorType == OPERATOR_CURL) || (operatorType == OPERATOR_DIV);
+      const bool operatorIsDk = (operatorType >= OPERATOR_D1) && (operatorType <= OPERATOR_D10);
+      const bool operatorSupported = (operatorType == OPERATOR_VALUE) || (operatorType == OPERATOR_GRAD) || (operatorType == OPERATOR_CURL) || (operatorType == OPERATOR_DIV) || operatorIsDk;
       INTREPID2_TEST_FOR_EXCEPTION(!operatorSupported, std::invalid_argument, "operator is not supported by allocateBasisValues");
       
 //      // this default implementation employs a trivial tensor-product structure; make sure that points also have a trivial tensor product structure:
diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisDef.hpp
index c87fe3a9e0d8..07f018e3a8f2 100644
--- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisDef.hpp
+++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisDef.hpp
@@ -862,6 +862,108 @@ namespace Intrepid2 {
                                   ">>> ERROR: (Intrepid2::getValues_HGRAD_Args) dim 0 (number of basis functions) of outputValues must equal basis cardinality.");
   }
 
+  template<typename Device,
+            typename outputValueType,
+            typename pointValueType>
+  Kokkos::DynRankView<outputValueType,Device>
+  Basis<Device,outputValueType,pointValueType>::allocateOutputView( const int numPoints, const EOperator operatorType) const
+  {
+    const bool operatorIsDk = (operatorType >= OPERATOR_D1) && (operatorType <= OPERATOR_D10);
+    const bool operatorSupported = (operatorType == OPERATOR_VALUE) || (operatorType == OPERATOR_GRAD) || (operatorType == OPERATOR_CURL) || (operatorType == OPERATOR_DIV) || operatorIsDk;
+    INTREPID2_TEST_FOR_EXCEPTION(!operatorSupported, std::invalid_argument, "operator is not supported by allocateOutputView()");
+    
+    const int numFields = this->getCardinality();
+    const int spaceDim  = basisCellTopology_.getDimension();
+    
+    using OutputViewAllocatable = Kokkos::DynRankView<outputValueType,DeviceType>;
+    
+    switch (functionSpace_)
+    {
+      case FUNCTION_SPACE_HGRAD:
+        if (operatorType == OPERATOR_VALUE)
+        {
+          // scalar-valued container
+          OutputViewAllocatable dataView("BasisValues HGRAD VALUE data", numFields, numPoints);
+          return dataView;
+        }
+        else if (operatorType == OPERATOR_GRAD)
+        {
+          OutputViewAllocatable dataView("BasisValues HGRAD GRAD data", numFields, numPoints, spaceDim);
+          return dataView;
+        }
+        else if (operatorIsDk)
+        {
+          ordinal_type dkCardinality = getDkCardinality(operatorType, spaceDim);
+          OutputViewAllocatable dataView("BasisValues HGRAD Dk data", numFields, numPoints, dkCardinality);
+          return dataView;
+        }
+        else
+        {
+          INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()");
+        }
+      case FUNCTION_SPACE_HDIV:
+        if (operatorType == OPERATOR_VALUE)
+        {
+          // vector-valued container
+          OutputViewAllocatable dataView("BasisValues HDIV VALUE data", numFields, numPoints, spaceDim);
+          return dataView;
+        }
+        else if (operatorType == OPERATOR_DIV)
+        {
+          // scalar-valued divergence
+          OutputViewAllocatable dataView("BasisValues HDIV DIV data", numFields, numPoints);
+          return dataView;
+        }
+        else
+        {
+          INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()");
+        }
+      case FUNCTION_SPACE_HCURL:
+        if (operatorType == OPERATOR_VALUE)
+        {
+          OutputViewAllocatable dataView("BasisValues HCURL VALUE data", numFields, numPoints, spaceDim);
+          return dataView;
+        }
+        else if (operatorType == OPERATOR_CURL)
+        {
+          if (spaceDim != 2)
+          {
+            // vector-valued curl
+            OutputViewAllocatable dataView("BasisValues HCURL CURL data", numFields, numPoints, spaceDim);
+            return dataView;
+          }
+          else
+          {
+            // scalar-valued curl
+            OutputViewAllocatable dataView("BasisValues HCURL CURL data (scalar)", numFields, numPoints);
+            return dataView;
+          }
+        }
+        else
+        {
+          INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()");
+        }
+      case FUNCTION_SPACE_HVOL:
+        if (operatorType == OPERATOR_VALUE)
+        {
+          // scalar-valued container
+          OutputViewAllocatable dataView("BasisValues HVOL VALUE data", numFields, numPoints);
+          return dataView;
+        }
+        else if (operatorIsDk)
+        {
+          ordinal_type dkCardinality = getDkCardinality(operatorType, spaceDim);
+          OutputViewAllocatable dataView("BasisValues HVOL Dk data", numFields, numPoints, dkCardinality);
+          return dataView;
+        }
+        else
+        {
+          INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()");
+        }
+      default:
+        INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "operator/space combination not supported by allocateOutputView()");
+    }
+  }
 }
 
 #endif
diff --git a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp
index 9313684fe262..f02256131951 100644
--- a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp
+++ b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp
@@ -399,7 +399,7 @@ namespace Intrepid2 {
               }
             });
             
-            if (composedTransform_.underlyingMatchesNotional())
+            if (composedTransform_.underlyingMatchesLogical())
             {
               const auto & composedTransformView = composedTransform_.getUnderlyingView4();
               Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
diff --git a/packages/intrepid2/src/Shared/Intrepid2_ArgExtractor.hpp b/packages/intrepid2/src/Shared/Intrepid2_ArgExtractor.hpp
new file mode 100644
index 000000000000..74a5fbba01c5
--- /dev/null
+++ b/packages/intrepid2/src/Shared/Intrepid2_ArgExtractor.hpp
@@ -0,0 +1,214 @@
+// @HEADER
+// ************************************************************************
+//
+//                           Intrepid2 Package
+//                 Copyright (2007) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Kyungjoo Kim  (kyukim@sandia.gov),
+//                    Mauro Perego  (mperego@sandia.gov), or
+//                    Nate Roberts  (nvrober@sandia.gov)
+//
+// ************************************************************************
+// @HEADER
+
+/** \file   Intrepid2_ArgExtractor.hpp
+    \brief  Header file with various static argument-extractor classes.  These are useful for writing efficient, templated code in terms of a subset of the arguments passed into a specified functor.  See Intrepid2::Data, and specifically its storeInPlaceCombination() implementation, for an example.
+    \author Created by Nate Roberts.
+*/
+
+#ifndef __Intrepid2_ArgExtractor_HPP__
+#define __Intrepid2_ArgExtractor_HPP__
+
+#include "Intrepid2_ConfigDefs.hpp"
+#include "Intrepid2_DeviceAssert.hpp"
+#include "Intrepid2_Types.hpp"
+#include "Intrepid2_Utils.hpp"
+
+#include "Kokkos_Core.hpp"
+
+namespace Intrepid2 {
+  /** \class  Intrepid2::ConstantArgExtractor
+      \brief Argument extractor class which ignores the index arguments it is given and always accesses entry 0 of the provided container, i.e. returns view(0).
+  */
+  template<class reference_type>
+  struct ConstantArgExtractor
+  {
+    template<class ViewType, class ...IntArgs>
+    static KOKKOS_INLINE_FUNCTION reference_type get(const ViewType &view, const IntArgs&... intArgs)
+    {
+      return view(0);
+    }
+  };
+
+  /** \class  Intrepid2::FullArgExtractor
+      \brief Argument extractor class which forwards all index arguments, in the order given, to the provided container, i.e. returns view(intArgs...).
+  */
+  template<class reference_type>
+  struct FullArgExtractor
+  {
+    template<class ViewType, class ...IntArgs>
+    static KOKKOS_INLINE_FUNCTION reference_type get(const ViewType &view, const IntArgs&... intArgs)
+    {
+      return view(intArgs...);
+    }
+  };
+
+  /** \class  Intrepid2::SingleArgExtractor
+      \brief Argument extractor class which passes a single argument, indicated by the zero-based template parameter whichArg (values 0 through 5 are implemented), to the provided container; when whichArg is not less than the number of arguments supplied, get() aborts.
+  */
+  template<class reference_type, int whichArg>
+  struct SingleArgExtractor
+  {
+    template< bool B, class T = reference_type >
+    using enable_if_t = typename std::enable_if<B,T>::type;
+    
+    template<class ViewType, class int_type, int M=whichArg>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<M == 0>
+    get(const ViewType &view, const int_type &i0)
+    {
+      return view(i0);
+    }
+    
+    template<class ViewType, class int_type, class ...IntArgs, int M=whichArg>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<M == 0>
+    get(const ViewType &view, const int_type &i0, const IntArgs&... intArgs)
+    {
+      return view(i0);
+    }
+    
+    template<class ViewType, class int_type, int M=whichArg>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<M == 1>
+    get(const ViewType &view, const int_type &i0, const int_type &i1)
+    {
+      return view(i1);
+    }
+    
+    template<class ViewType, class int_type, class ...IntArgs, int M=whichArg>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<M == 1>
+    get(const ViewType &view, const int_type &i0, const int_type &i1, const IntArgs&... intArgs)
+    {
+      return view(i1);
+    }
+    
+    template<class ViewType, class int_type, int M=whichArg>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<M == 2>
+    get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2)
+    {
+      return view(i2);
+    }
+    
+    template<class ViewType, class int_type, class ...IntArgs, int M=whichArg>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<M == 2>
+    get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const IntArgs&... intArgs)
+    {
+      return view(i2);
+    }
+    
+    template<class ViewType, class int_type, int M=whichArg>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<M == 3>
+    get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const int_type &i3)
+    {
+      return view(i3);
+    }
+    
+    template<class ViewType, class int_type, class ...IntArgs, int M=whichArg>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<M == 3>
+    get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const int_type &i3, const IntArgs&... intArgs)
+    {
+      return view(i3);
+    }
+    
+    template<class ViewType, class int_type, int M=whichArg>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<M == 4>
+    get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const int_type &i3, const int_type &i4)
+    {
+      return view(i4);
+    }
+    
+    template<class ViewType, class int_type, class ...IntArgs, int M=whichArg>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<M == 4>
+    get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const int_type &i3, const int_type &i4, const IntArgs&... intArgs)
+    {
+      return view(i4);
+    }
+    
+    template<class ViewType, class int_type, int M=whichArg>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<M == 5>
+    get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const int_type &i3, const int_type &i4, const int_type &i5)
+    {
+      return view(i5);
+    }
+    
+    template<class ViewType, class int_type, class ...IntArgs, int M=whichArg>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<M == 5>
+    get(const ViewType &view, const int_type &i0, const int_type &i1, const int_type &i2, const int_type &i3, const int_type &i4, const int_type &i5, const IntArgs&... intArgs)
+    {
+      return view(i5);
+    }
+    
+    // the commented-out code below is a cleaner way to implement the overloads above (one generic get() instead of one pair per index), but we can't support this on CUDA until we can require KOKKOS_ENABLE_CUDA_CONSTEXPR
+    /*
+    template<class ViewType, class ...IntArgs>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<whichArg < sizeof...(IntArgs), reference_type>
+    get(const ViewType &view, const IntArgs&... intArgs)
+    {
+      const auto & arg = std::get<whichArg>(std::tuple<IntArgs...>(intArgs...));
+      return view(arg);
+    }
+     */
+    
+    template<class ViewType, class ...IntArgs>
+    static KOKKOS_INLINE_FUNCTION
+    enable_if_t<whichArg >= sizeof...(IntArgs), reference_type>
+    get(const ViewType &view, const IntArgs&... intArgs)
+    {
+      INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"calling SingleArgExtractor with out-of-bounds argument");
+      Kokkos::abort("Intrepid2::SingleArgExtractor: calling SingleArgExtractor with out-of-bounds argument\n");
+      return view(0); // not expected to be reached after the abort above; this line added to avoid missing return statement warning under nvcc
+    }
+  };
+}
+#endif
diff --git a/packages/intrepid2/src/Shared/Intrepid2_Data.hpp b/packages/intrepid2/src/Shared/Intrepid2_Data.hpp
index feccd646e6a5..1495bee47c3e 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_Data.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_Data.hpp
@@ -8,11 +8,12 @@
 #ifndef Intrepid2_Data_h
 #define Intrepid2_Data_h
 
+#include "Intrepid2_ArgExtractor.hpp"
 #include "Intrepid2_ScalarView.hpp"
 #include "Intrepid2_Utils.hpp"
 
 /** \file  Intrepid2_Data.hpp
-   \brief  Defines the Data class, a wrapper around a Kokkos::View that allows data that is constant or repeating in various notional dimensions to be stored just once, while providing a similar interface to that of View.
+   \brief  Defines the Data class, a wrapper around a Kokkos::View that allows data that is constant or repeating in various logical dimensions to be stored just once, while providing a similar interface to that of View.
    \author Created by N.V. Roberts.
 */
 
@@ -44,7 +45,7 @@ namespace Intrepid2 {
     DataVariationType variationType;
     int dataExtent;
     int variationModulus; // should be equal to dataExtent variationType other than MODULAR and CONSTANT
-    int blockPlusDiagonalFirstNonDiagonal = -1; // only relevant for variationType == BLOCK_PLUS_DIAGONAL
+    int blockPlusDiagonalLastNonDiagonal = -1; // only relevant for variationType == BLOCK_PLUS_DIAGONAL
   };
 
   //! Returns DimensionInfo for a Data container that combines (through multiplication, say, or addition) the two specified DimensionInfo specifications in one of its dimensions.
@@ -134,8 +135,8 @@ namespace Intrepid2 {
             combinedDimensionInfo.variationType    = GENERAL;
             combinedDimensionInfo.dataExtent       = max(myDataExtent,otherDataExtent);
             combinedDimensionInfo.variationModulus = combinedDimensionInfo.dataExtent;
-            // for this case, we want to take the minimum of the two Data objects' blockPlusDiagonalFirstNonDiagonal as the combined object's blockPlusDiagonalFirstNonDiagonal
-            combinedDimensionInfo.blockPlusDiagonalFirstNonDiagonal = min(myData.blockPlusDiagonalFirstNonDiagonal, otherData.blockPlusDiagonalFirstNonDiagonal);
+            // for this case, we want to take the minimum of the two Data objects' blockPlusDiagonalLastNonDiagonal as the combined object's blockPlusDiagonalLastNonDiagonal
+            combinedDimensionInfo.blockPlusDiagonalLastNonDiagonal = min(myData.blockPlusDiagonalLastNonDiagonal, otherData.blockPlusDiagonalLastNonDiagonal);
         }
         break;
       case GENERAL:
@@ -153,19 +154,19 @@ namespace Intrepid2 {
 
     /**
       \class  Intrepid2::Data
-      \brief  Wrapper around a Kokkos::View that allows data that is constant or repeating in various notional dimensions to be stored just once, while providing a similar interface to that of View.
+      \brief  Wrapper around a Kokkos::View that allows data that is constant or repeating in various logical dimensions to be stored just once, while providing a similar interface to that of View.
      
-      The Data class distinguishes between the notional extent and the data extent.  For example, one could construct a data container corresponding to constant (cell, point) data with 100 cells
+      The Data class distinguishes between the logical extent and the data extent.  For example, one could construct a data container corresponding to constant (cell, point) data with 100 cells
      and 25 points per cell as follows:
           auto cpData = Data(value, Kokkos::Array<int>{100,25});
-     The data extent of the container is 1 in every dimension, while the notional extent is 100 in the first dimension, and 25 in the second.  Similarly, the notional rank of the container is 2, but the rank of the
+     The data extent of the container is 1 in every dimension, while the logical extent is 100 in the first dimension, and 25 in the second.  Similarly, the logical rank of the container is 2, but the rank of the
      underlying View is 1.
      
-     There are four possible variation types in a notional dimension:
-     - GENERAL: the data varies arbitrarily.  The underlying View will have the same extent in its corresponding dimension (which may be distinct from the notional dimension).
+     There are four possible variation types in a logical dimension:
+     - GENERAL: the data varies arbitrarily.  The underlying View will have the same extent in its corresponding dimension (which may be distinct from the logical dimension).
      - CONSTANT: the data does not vary.  The underlying View will not have a dimension corresponding to this dimension.
      - MODULAR: the data varies with a modulus.  The underlying View will have a corresponding dimension with extent corresponding to the modulus.
-     - BLOCK_PLUS_DIAGONAL: the data varies in this notional dimension and one other, corresponding to a square matrix that has some (possibly trivial) full block, with diagonal entries in the remaining dimensions.  The underlying View will have one dimension corresponding to the two notional dimensions, with extent corresponding to the number of nonzeros in the matrix.
+     - BLOCK_PLUS_DIAGONAL: the data varies in this logical dimension and one other, corresponding to a square matrix that has some (possibly trivial) full block, with diagonal entries in the remaining dimensions.  The underlying View will have one dimension corresponding to the two logical dimensions, with extent corresponding to the number of nonzeros in the matrix.
      
   */
   template<class DataScalar,typename DeviceType>
@@ -187,8 +188,8 @@ namespace Intrepid2 {
     Kokkos::Array<int,7> variationModulus_;            // for each dimension, a value by which indices should be modulused (only used when variationType_ is MODULAR)
     int blockPlusDiagonalLastNonDiagonal_ = -1;        // last row/column that is part of the non-diagonal part of the matrix indicated by BLOCK_PLUS_DIAGONAL (if any dimensions are thus marked)
     
-    bool hasNontrivialModulusUNUSED_;  // this is a little nutty, but having this UNUSED member variable improves performance, probably by shifting the alignment of underlyingMatchesNotional_.  This is true with nvcc; it may also be true with Apple clang
-    bool underlyingMatchesNotional_;   // if true, this Data object has the same rank and extent as the underlying view
+    bool hasNontrivialModulusUNUSED_;  // this is a little nutty, but having this UNUSED member variable improves performance, probably by shifting the alignment of underlyingMatchesLogical_.  This is true with nvcc; it may also be true with Apple clang
+    bool underlyingMatchesLogical_;   // if true, this Data object has the same rank and extent as the underlying view
     Kokkos::Array<ordinal_type,7> activeDims_;
     int numActiveDims_; // how many of the 7 entries are actually filled in
     
@@ -253,7 +254,7 @@ namespace Intrepid2 {
       
       numActiveDims_ = 0;
       int blockPlusDiagonalCount = 0;
-      underlyingMatchesNotional_ = true;
+      underlyingMatchesLogical_ = true;
       for (ordinal_type i=0; i<7; i++)
       {
         if (variationType_[i] == GENERAL)
@@ -271,7 +272,7 @@ namespace Intrepid2 {
         }
         else if (variationType_[i] == MODULAR)
         {
-          underlyingMatchesNotional_ = false;
+          underlyingMatchesLogical_ = false;
           if (extents_[i] != getUnderlyingViewExtent(numActiveDims_))
           {
             const int dataExtent = getUnderlyingViewExtent(numActiveDims_);
@@ -291,7 +292,7 @@ namespace Intrepid2 {
         }
         else if (variationType_[i] == BLOCK_PLUS_DIAGONAL)
         {
-          underlyingMatchesNotional_ = false;
+          underlyingMatchesLogical_ = false;
           blockPlusDiagonalCount++;
           if (blockPlusDiagonalCount == 1) // first dimension thus marked --> active
           {
@@ -318,7 +319,7 @@ namespace Intrepid2 {
         {
           if (i < rank_)
           {
-            underlyingMatchesNotional_ = false;
+            underlyingMatchesLogical_ = false;
           }
           variationModulus_[i] = 1; // trivial modulus
         }
@@ -326,7 +327,7 @@ namespace Intrepid2 {
       
       if (rank_ != dataRank_)
       {
-        underlyingMatchesNotional_ = false;
+        underlyingMatchesLogical_ = false;
       }
       
       for (int d=numActiveDims_; d<7; d++)
@@ -340,75 +341,608 @@ namespace Intrepid2 {
         INTREPID2_TEST_FOR_EXCEPTION(variationModulus_[d] == 0, std::logic_error, "variationModulus should not ever be 0");
       }
     }
-    
+
   public:
-    //! Returns an l-value reference to the specified nominal entry in the underlying view.  Note that for variation types other than GENERAL, multiple valid argument sets will refer to the same memory location.  Intended for Intrepid2 developers and expert users only.
-    KOKKOS_INLINE_FUNCTION
-    reference_type getWritableEntry(const int & i0, const int & i1, const int & i2,
-                                    const int & i3, const int & i4, const int & i5,
-                                    const int & i6) const
+    //! Argument extractor for use with a Data object into which a value will be stored: get() forwards every index argument to the container's getWritableEntry() and returns the resulting reference.
+    struct FullArgExtractorWritableData
     {
-      if (underlyingMatchesNotional_)
+      template<class ViewType, class ...IntArgs>
+      static KOKKOS_INLINE_FUNCTION reference_type get(const ViewType &view, const IntArgs&... intArgs)
       {
-        switch (dataRank_)
+        return view.getWritableEntry(intArgs...);
+      }
+    };
+    
+    //! Functor that stores binaryOperator(A,B) into "this", one entry per call; each ArgExtractor selects, from the loop indices, the arguments used to access its container.  If includeInnerLoop is true, operator() additionally loops over one final index.
+    template<class BinaryOperator, class ThisUnderlyingViewType, class AUnderlyingViewType, class BUnderlyingViewType,
+             class ArgExtractorThis, class ArgExtractorA, class ArgExtractorB, bool includeInnerLoop=false>
+    struct InPlaceCombinationFunctor
+    {
+    private:
+      ThisUnderlyingViewType this_underlying_;
+      AUnderlyingViewType A_underlying_;
+      BUnderlyingViewType B_underlying_;
+      BinaryOperator binaryOperator_;
+      int innerLoopSize_ = 0; // trip count of the inner loop; only read by the includeInnerLoop == true operator(), but always initialized so the member is never indeterminate
+    public:
+      //! Constructor for includeInnerLoop == false.  Reports std::invalid_argument (through INTREPID2_TEST_FOR_EXCEPTION) if includeInnerLoop is true, because no inner loop size is supplied here.
+      InPlaceCombinationFunctor(ThisUnderlyingViewType this_underlying, AUnderlyingViewType A_underlying, BUnderlyingViewType B_underlying,
+                                BinaryOperator binaryOperator)
+      :
+      this_underlying_(this_underlying),
+      A_underlying_(A_underlying),
+      B_underlying_(B_underlying),
+      binaryOperator_(binaryOperator)
+      {
+        INTREPID2_TEST_FOR_EXCEPTION(includeInnerLoop,std::invalid_argument,"If includeInnerLoop is true, must specify the size of the inner loop");
+      }
+      //! Constructor that supplies innerLoopSize, the number of iterations that the includeInnerLoop == true operator() performs over the final index.
+      InPlaceCombinationFunctor(ThisUnderlyingViewType this_underlying, AUnderlyingViewType A_underlying, BUnderlyingViewType B_underlying,
+                                BinaryOperator binaryOperator, int innerLoopSize)
+      :
+      this_underlying_(this_underlying),
+      A_underlying_(A_underlying),
+      B_underlying_(B_underlying),
+      binaryOperator_(binaryOperator),
+      innerLoopSize_(innerLoopSize)
+      {} // deliberately no includeInnerLoop check: the size has been supplied, so rejecting includeInnerLoop == true here would make the inner-loop variant impossible to construct
+      
+      //! Variant without inner loop: the loop indices are passed unchanged to the three extractors, and a single entry is written.
+      template<class ...IntArgs, bool M=includeInnerLoop>
+      KOKKOS_INLINE_FUNCTION
+      enable_if_t<!M, void>
+      operator()(const IntArgs&... args) const
+      {
+        auto      & result = ArgExtractorThis::get( this_underlying_, args... );
+        const auto & A_val =    ArgExtractorA::get(    A_underlying_, args... );
+        const auto & B_val =    ArgExtractorB::get(    B_underlying_, args... );
+        result = binaryOperator_(A_val,B_val);
+      }
+      
+      //! Variant with inner loop: for each iFinal in [0, innerLoopSize_), iFinal is appended to the loop indices as the last argument and one entry is written.
+      template<class ...IntArgs, bool M=includeInnerLoop>
+      KOKKOS_INLINE_FUNCTION
+      enable_if_t<M, void>
+      operator()(const IntArgs&... args) const
+      {
+        using int_type = std::tuple_element_t<0, std::tuple<IntArgs...>>; // the inner index takes the type of the first loop index
+        for (int_type iFinal=0; iFinal<static_cast<int_type>(innerLoopSize_); iFinal++)
         {
-          case 1: return data1_.access(i0,i1,i2,i3,i4,i5,i6);;
-          case 2: return data2_.access(i0,i1,i2,i3,i4,i5,i6);;
-          case 3: return data3_.access(i0,i1,i2,i3,i4,i5,i6);;
-          case 4: return data4_.access(i0,i1,i2,i3,i4,i5,i6);;
-          case 5: return data5_.access(i0,i1,i2,i3,i4,i5,i6);;
-          case 6: return data6_.access(i0,i1,i2,i3,i4,i5,i6);;
-          case 7: return data7_.access(i0,i1,i2,i3,i4,i5,i6);;
-          default:
-            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "invalid dataRank_");
+          auto      & result = ArgExtractorThis::get( this_underlying_, args..., iFinal );
+          const auto & A_val =    ArgExtractorA::get(    A_underlying_, args..., iFinal );
+          const auto & B_val =    ArgExtractorB::get(    B_underlying_, args..., iFinal );
+          result = binaryOperator_(A_val,B_val);
         }
       }
+    };
+    
+    //! Launches the entrywise computation of binaryOperator(A,B) into "this" over the given policy, for rank < 7.  Container types and argument interpretation are fixed at compile time; the extractor objects are passed only so that their types are deduced.  Intended for internal and expert use.
+    template<class BinaryOperator, class PolicyType, class ThisUnderlyingViewType, class AUnderlyingViewType, class BUnderlyingViewType,
+             class ArgExtractorThis, class ArgExtractorA, class ArgExtractorB>
+    void storeInPlaceCombination(PolicyType &policy, ThisUnderlyingViewType &this_underlying,
+                                 AUnderlyingViewType &A_underlying, BUnderlyingViewType &B_underlying,
+                                 BinaryOperator &binaryOperator, ArgExtractorThis argThis, ArgExtractorA argA, ArgExtractorB argB)
+    {
+      using CombinationFunctor = InPlaceCombinationFunctor<BinaryOperator, ThisUnderlyingViewType, AUnderlyingViewType, BUnderlyingViewType,
+                                                           ArgExtractorThis, ArgExtractorA, ArgExtractorB>;
+      Kokkos::parallel_for("compute in-place", policy, CombinationFunctor(this_underlying, A_underlying, B_underlying, binaryOperator));
+    }
+    
+    //! storeInPlaceCombination with compile-time rank -- implementation for rank < 7.  Stores binaryOperator(A,B) into this container, dispatching on how this, A and B hold their data so that underlying Views are accessed directly where possible, and falling back to access through the Data objects otherwise.
+    template<class BinaryOperator, int rank>
+    enable_if_t<rank != 7, void>
+    storeInPlaceCombination(const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B, BinaryOperator binaryOperator)
+    {
+      auto policy = dataExtentRangePolicy<rank>();
+      
+      // shallow copy of this to avoid implicit references to this in calls to getWritableEntry() below
+      Data<DataScalar,DeviceType> thisData = *this;
       
-      const Kokkos::Array<int,7> args {i0,i1,i2,i3,i4,i5,i6};
-      Kokkos::Array<int,7> refEntry;
+      const bool A_1D          = A.getUnderlyingViewRank() == 1;
+      const bool B_1D          = B.getUnderlyingViewRank() == 1;
+      const bool this_1D       = this->getUnderlyingViewRank() == 1;
+      const bool A_constant    = A_1D && (A.getUnderlyingViewSize() == 1);
+      const bool B_constant    = B_1D && (B.getUnderlyingViewSize() == 1);
+      const bool this_constant = this_1D && (this->getUnderlyingViewSize() == 1);
+      const bool A_full        = A.underlyingMatchesLogical();
+      const bool B_full        = B.underlyingMatchesLogical();
+      const bool this_full     = this->underlyingMatchesLogical();
       
-      for (int d=0; d<7; d++)
+      const ConstantArgExtractor<reference_type> constArg;
+      
+      const FullArgExtractor<reference_type> fullArgs;
+      const FullArgExtractor<const_reference_type> fullArgsConst;
+      const FullArgExtractorWritableData fullArgsWritable;
+      
+      const SingleArgExtractor<reference_type,0> arg0;
+      const SingleArgExtractor<reference_type,1> arg1;
+      const SingleArgExtractor<reference_type,2> arg2;
+      const SingleArgExtractor<reference_type,3> arg3;
+      const SingleArgExtractor<reference_type,4> arg4;
+      const SingleArgExtractor<reference_type,5> arg5;
+      
+      // this lambda returns the logical index of the first of the leading "rank" dimensions whose variation type is GENERAL, or -1 if there is none.
+      // It does not itself check the rank of the underlying View; callers combine the result with a rank-1 check (A_1D, B_1D, this_1D) where that matters.
+      auto get1DArgIndex = [](const Data<DataScalar,DeviceType> &data) -> int
+      {
+        const auto & variationTypes = data.getVariationTypes();
+        for (int d=0; d<rank; d++)
+        {
+          if (variationTypes[d] == GENERAL)
+          {
+            return d;
+          }
+        }
+        return -1;
+      };
+      if (this_constant)
+      {
+        // then A, B are constant, too
+        auto thisAE = constArg;
+        auto AAE    = constArg;
+        auto BAE    = constArg;
+        auto & this_underlying = this->getUnderlyingView<1>();
+        auto & A_underlying    = A.getUnderlyingView<1>();
+        auto & B_underlying    = B.getUnderlyingView<1>();
+        storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, BAE);
+      }
+      else if (this_full && A_full && B_full)
+      {
+        auto thisAE = fullArgs;
+        auto AAE    = fullArgs;
+        auto BAE    = fullArgs;
+        auto & this_underlying = this->getUnderlyingView<rank>();
+        auto & A_underlying    = A.getUnderlyingView<rank>();
+        auto & B_underlying    = B.getUnderlyingView<rank>();
+        storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, BAE);
+      }
+      else if (A_constant)
+      {
+        auto AAE = constArg;
+        auto & A_underlying = A.getUnderlyingView<1>();
+        if (this_full)
+        {
+          auto thisAE = fullArgs;
+          auto & this_underlying = this->getUnderlyingView<rank>();
+          
+          if (B_full)
+          {
+            auto BAE = fullArgs;
+            auto & B_underlying = B.getUnderlyingView<rank>();
+            storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, BAE);
+          }
+          else // this_full, not B_full: B may have modular data, etc.
+          {
+            auto BAE = fullArgsConst;
+            storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, AAE, BAE);
+          }
+        }
+        else // this is not full
+        {
+          // below, we optimize for the case of 1D data in B, when A is constant.  Still need to handle other cases…
+          if (B_1D && (get1DArgIndex(B) != -1) )
+          {
+            // since A is constant, that implies that this_1D is true, and has the same 1DArgIndex
+            const int argIndex = get1DArgIndex(B);
+            auto & B_underlying    = B.getUnderlyingView<1>();
+            auto & this_underlying = this->getUnderlyingView<1>();
+            switch (argIndex)
+            {
+              case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg0, AAE, arg0); break;
+              case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg1, AAE, arg1); break;
+              case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg2, AAE, arg2); break;
+              case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg3, AAE, arg3); break;
+              case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg4, AAE, arg4); break;
+              case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg5, AAE, arg5); break;
+              default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index");
+            }
+          }
+          else
+          {
+            // since storing to Data object requires a call to getWritableEntry(), we use FullArgExtractorWritableData
+            auto thisAE = fullArgsWritable;
+            auto BAE    = fullArgsConst;
+            storeInPlaceCombination(policy, thisData, A_underlying, B, binaryOperator, thisAE, AAE, BAE);
+          }
+        }
+      }
+      else if (B_constant)
       {
-        if (variationType_[d] == GENERAL)
+        auto BAE = constArg;
+        auto & B_underlying = B.getUnderlyingView<1>();
+        if (this_full)
         {
-          refEntry[d] = args[d];
+          auto thisAE = fullArgs;
+          auto & this_underlying = this->getUnderlyingView<rank>();
+          if (A_full)
+          {
+            auto AAE = fullArgs;
+            auto & A_underlying = A.getUnderlyingView<rank>();
+            
+            storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, BAE);
+          }
+          else  // this_full, not A_full: A may have modular data, etc.
+          {
+            // use A (the Data object).  This could be further optimized by using A's underlying View and an appropriately-defined ArgExtractor.
+            auto AAE = fullArgsConst;
+            storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, thisAE, AAE, BAE);
+          }
         }
-        else if (variationType_[d] == MODULAR)
+        else // this is not full
         {
-          refEntry[d] = args[d] % variationModulus_[d];
+          // below, we optimize for the case of 1D data in A, when B is constant.  Still need to handle other cases…
+          if (A_1D && (get1DArgIndex(A) != -1) )
+          {
+            // since B is constant, that implies that this_1D is true, and has the same 1DArgIndex as A
+            const int argIndex = get1DArgIndex(A);
+            auto & A_underlying    = A.getUnderlyingView<1>();
+            auto & this_underlying = this->getUnderlyingView<1>();
+            switch (argIndex)
+            {
+              case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg0, arg0, BAE); break;
+              case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg1, arg1, BAE); break;
+              case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg2, arg2, BAE); break;
+              case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg3, arg3, BAE); break;
+              case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg4, arg4, BAE); break;
+              case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg5, arg5, BAE); break;
+              default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index");
+            }
+          }
+          else
+          {
+            // since storing to Data object requires a call to getWritableEntry(), we use FullArgExtractorWritableData
+            auto thisAE = fullArgsWritable;
+            auto AAE    = fullArgsConst;
+            storeInPlaceCombination(policy, thisData, A, B_underlying, binaryOperator, thisAE, AAE, BAE);
+          }
         }
-        else if (variationType_[d] == BLOCK_PLUS_DIAGONAL)
+      }
+      else // neither A nor B constant
+      {
+        if (this_1D && (get1DArgIndex(thisData) != -1))
         {
-          const int numNondiagonalEntries = blockPlusDiagonalNumNondiagonalEntries(blockPlusDiagonalLastNonDiagonal_);
+          // possible ways that "this" could have full-extent, 1D data: (1) A constant, B 1D; (2) A 1D, B constant; (3) A 1D, B 1D.
+          // The constant possibilities are already addressed above, leaving us with (3).
+          // Note that A and B don't have to be full-extent, however: either of them, or both, may yield -1 below.
+          const int argThis = get1DArgIndex(thisData);
+          const int argA    = get1DArgIndex(A); // if not full-extent, will be -1
+          const int argB    = get1DArgIndex(B); // ditto
           
-          const int &i = args[d];
-          const int &j = args[d+1];
+          auto & A_underlying    = A.getUnderlyingView<1>();
+          auto & B_underlying    = B.getUnderlyingView<1>();
+          auto & this_underlying = this->getUnderlyingView<1>();
+          if ((argA != -1) && (argB != -1))
+          {
+#ifdef INTREPID2_HAVE_DEBUG
+            INTREPID2_TEST_FOR_EXCEPTION(argA != argThis, std::logic_error, "Unexpected 1D arg combination.");
+            INTREPID2_TEST_FOR_EXCEPTION(argB != argThis, std::logic_error, "Unexpected 1D arg combination.");
+#endif
+            switch (argThis)
+            {
+              case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg0, arg0, arg0); break;
+              case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg1, arg1, arg1); break;
+              case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg2, arg2, arg2); break;
+              case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg3, arg3, arg3); break;
+              case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg4, arg4, arg4); break;
+              case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, arg5, arg5, arg5); break;
+              default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index");
+            }
+          }
+          else if (argA != -1)
+          {
+            // B is not full-extent in dimension argThis; use the Data object
+            switch (argThis)
+            {
+              case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg0, arg0, fullArgsConst); break;
+              case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg1, arg1, fullArgsConst); break;
+              case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg2, arg2, fullArgsConst); break;
+              case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg3, arg3, fullArgsConst); break;
+              case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg4, arg4, fullArgsConst); break;
+              case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg5, arg5, fullArgsConst); break;
+              default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index");
+            }
+          }
+          else if (argB != -1)
+          {
+            // A is not full-extent in dimension argThis; use the Data object
+            switch (argThis)
+            {
+              case 0: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg0, fullArgsConst, arg0); break;
+              case 1: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg1, fullArgsConst, arg1); break;
+              case 2: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg2, fullArgsConst, arg2); break;
+              case 3: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg3, fullArgsConst, arg3); break;
+              case 4: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg4, fullArgsConst, arg4); break;
+              case 5: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg5, fullArgsConst, arg5); break;
+              default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index");
+            }
+          }
+          else
+          {
+            // neither A nor B is full-extent in dimension argThis, so neither underlying View may be indexed with the logical index: go through the Data objects
+            storeInPlaceCombination(policy, thisData, A, B, binaryOperator, fullArgsWritable, fullArgsConst, fullArgsConst);
+          }
+        }
+        else if (this_full)
+        {
+          // This case uses A,B Data objects; could be optimized by dividing into subcases and using underlying Views with appropriate ArgExtractors.
+          auto & this_underlying = this->getUnderlyingView<rank>();
+          auto thisAE = fullArgs;
           
-          if ((i > blockPlusDiagonalLastNonDiagonal_) || (j > blockPlusDiagonalLastNonDiagonal_))
+          if (A_full)
           {
-            if (i != j)
+            auto & A_underlying = A.getUnderlyingView<rank>();
+            auto AAE = fullArgs;
+            
+            if (B_1D && (get1DArgIndex(B) != -1))
             {
-              // off diagonal: zero
-              return zeroView_(0); // NOTE: this branches in an argument-dependent way; this is not great for CUDA performance.  When using BLOCK_PLUS_DIAGONAL, should generally avoid calls to this getEntry() method.  (Use methods that directly take advantage of the data packing instead.)
+              const int argIndex = get1DArgIndex(B);
+              auto & B_underlying = B.getUnderlyingView<1>();
+              switch (argIndex)
+              {
+                case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, arg0); break;
+                case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, arg1); break;
+                case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, arg2); break;
+                case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, arg3); break;
+                case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, arg4); break;
+                case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, AAE, arg5); break;
+                default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index");
+              }
             }
             else
             {
-              refEntry[d] = blockPlusDiagonalDiagonalEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i);
+              // A is full; B is not full, but not constant or full-extent 1D
+              // unoptimized in B access:
+              auto BAE = fullArgsConst;
+              storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, AAE, BAE);
             }
           }
-          else
+          else // A is not full
           {
-            refEntry[d] = blockPlusDiagonalBlockEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i, j);
+            if (A_1D && (get1DArgIndex(A) != -1))
+            {
+              const int argIndex = get1DArgIndex(A);
+              auto & A_underlying  = A.getUnderlyingView<1>();
+              if (B_full)
+              {
+                auto & B_underlying = B.getUnderlyingView<rank>();
+                auto BAE = fullArgs;
+                switch (argIndex)
+                {
+                  case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, arg0, BAE); break;
+                  case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, arg1, BAE); break;
+                  case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, arg2, BAE); break;
+                  case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, arg3, BAE); break;
+                  case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, arg4, BAE); break;
+                  case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B_underlying, binaryOperator, thisAE, arg5, BAE); break;
+                  default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index");
+                }
+              }
+              else
+              {
+                auto BAE = fullArgsConst;
+                switch (argIndex)
+                {
+                  case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg0, BAE); break;
+                  case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg1, BAE); break;
+                  case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg2, BAE); break;
+                  case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg3, BAE); break;
+                  case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg4, BAE); break;
+                  case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg5, BAE); break;
+                  default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index");
+                }
+              }
+            }
+            else // A not full, and not full-extent 1D
+            {
+              // unoptimized in A, B accesses.
+              auto AAE    = fullArgsConst;
+              auto BAE    = fullArgsConst;
+              storeInPlaceCombination(policy, this_underlying, A, B, binaryOperator, thisAE, AAE, BAE);
+            }
           }
+        }
+        else
+        {
+          // completely un-optimized case: we use Data objects for this, A, B.
+          auto thisAE = fullArgsWritable;
+          auto AAE    = fullArgsConst;
+          auto BAE    = fullArgsConst;
+          storeInPlaceCombination(policy, thisData, A, B, binaryOperator, thisAE, AAE, BAE);
+        }
+      }
+    }
+    
+    //! Rank-7 implementation of storeInPlaceCombination (compile-time rank).  Not optimized: this, A, and B are all accessed as Data containers, on the expectation that this case will be rarely used.
+    template<class BinaryOperator, int rank>
+    enable_if_t<rank == 7, void>
+    storeInPlaceCombination(const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B, BinaryOperator binaryOperator)
+    {
+      using Container       = Data<DataScalar,DeviceType>;
+      using ResultExtractor = FullArgExtractorWritableData;
+      using InputExtractor  = FullArgExtractor<const_reference_type>; // same extractor type for A and B
+      
+      // the rank-7 policy covers six dimensions; presumably the functor iterates the seventh itself when this flag is set
+      const bool functorHasInnerLoop = true;
+      using CombinationFunctor = InPlaceCombinationFunctor<BinaryOperator, Container, Container, Container, ResultExtractor, InputExtractor, InputExtractor, functorHasInnerLoop>;
+      
+      const ordinal_type innermostExtent = getDataExtent(6);
+      CombinationFunctor combine(*this, A, B, binaryOperator, innermostExtent);
+      
+      Kokkos::parallel_for("compute in-place", dataExtentRangePolicy<rank>(), combine);
+    }
+  public:
+    //! Applies the specified unary operator in place to each entry of the underlying view (entry = unaryOperator(entry)), dispatching on dataRank_ (1 through 7); any other data rank is reported as std::invalid_argument via INTREPID2_TEST_FOR_EXCEPTION.
+    template<class UnaryOperator>
+    void applyOperator(UnaryOperator unaryOperator)
+    {
+      using ExecutionSpace = typename DeviceType::execution_space;
+      
+      switch (dataRank_)
+      {
+        case 1:
+        {
+          const int dataRank = 1;
+          auto view = getUnderlyingView<dataRank>();
+          
+          const int dataExtent = this->getDataExtent(0);
+          Kokkos::RangePolicy<ExecutionSpace> policy(ExecutionSpace(),0,dataExtent);
+          Kokkos::parallel_for("apply operator in-place", policy,
+          KOKKOS_LAMBDA (const int &i0) {
+            view(i0) = unaryOperator(view(i0));
+          });
           
-          // skip next d (this is required also to be BLOCK_PLUS_DIAGONAL, and we've consumed its arg as j above)
-          refEntry[d+1] = 0;
-          d++;
         }
-        else if (variationType_[d] == CONSTANT)
+        break;
+        case 2:
         {
-          refEntry[d] = 0;
+          const int dataRank = 2;
+          auto policy = dataExtentRangePolicy<dataRank>();
+          auto view = getUnderlyingView<dataRank>();
+          
+          Kokkos::parallel_for("apply operator in-place", policy,
+          KOKKOS_LAMBDA (const int &i0, const int &i1) {
+            view(i0,i1) = unaryOperator(view(i0,i1));
+          });
         }
+        break;
+        case 3:
+        {
+          const int dataRank = 3;
+          auto policy = dataExtentRangePolicy<dataRank>();
+          auto view = getUnderlyingView<dataRank>();
+          
+          Kokkos::parallel_for("apply operator in-place", policy,
+          KOKKOS_LAMBDA (const int &i0, const int &i1, const int &i2) {
+            view(i0,i1,i2) = unaryOperator(view(i0,i1,i2));
+          });
+        }
+        break;
+        case 4:
+        {
+          const int dataRank = 4;
+          auto policy = dataExtentRangePolicy<dataRank>();
+          auto view = getUnderlyingView<dataRank>();
+          
+          Kokkos::parallel_for("apply operator in-place", policy,
+          KOKKOS_LAMBDA (const int &i0, const int &i1, const int &i2, const int &i3) {
+            view(i0,i1,i2,i3) = unaryOperator(view(i0,i1,i2,i3));
+          });
+        }
+        break;
+        case 5:
+        {
+          const int dataRank = 5;
+          auto policy = dataExtentRangePolicy<dataRank>();
+          auto view = getUnderlyingView<dataRank>();
+          
+          Kokkos::parallel_for("apply operator in-place", policy,
+          KOKKOS_LAMBDA (const int &i0, const int &i1, const int &i2, const int &i3, const int &i4) {
+            view(i0,i1,i2,i3,i4) = unaryOperator(view(i0,i1,i2,i3,i4));
+          });
+        }
+        break;
+        case 6:
+        {
+          const int dataRank = 6;
+          auto policy = dataExtentRangePolicy<dataRank>();
+          auto view = getUnderlyingView<dataRank>();
+          
+          Kokkos::parallel_for("apply operator in-place", policy,
+          KOKKOS_LAMBDA (const int &i0, const int &i1, const int &i2, const int &i3, const int &i4, const int &i5) {
+            view(i0,i1,i2,i3,i4,i5) = unaryOperator(view(i0,i1,i2,i3,i4,i5));
+          });
+        }
+        break;
+        case 7:
+        {
+          const int dataRank = 7;
+          auto policy6 = dataExtentRangePolicy<6>(); // parallelize over the first six extents only; the seventh is looped serially inside the lambda below
+          auto view = getUnderlyingView<dataRank>();
+          
+          const int dim_i6 = view.extent_int(6);
+          
+          Kokkos::parallel_for("apply operator in-place", policy6,
+          KOKKOS_LAMBDA (const int &i0, const int &i1, const int &i2, const int &i3, const int &i4, const int &i5) {
+            for (int i6=0; i6<dim_i6; i6++)
+            {
+              view(i0,i1,i2,i3,i4,i5,i6) = unaryOperator(view(i0,i1,i2,i3,i4,i5,i6));
+            }
+          });
+        }
+        break;
+        default:
+          INTREPID2_TEST_FOR_EXCEPTION(true,std::invalid_argument,"Unsupported data rank");
+      }
+    }
+    
+    //! Returns an l-value reference to the specified nominal entry in the underlying view.  Note that for variation types other than GENERAL, multiple valid argument sets will refer to the same memory location.  Intended for Intrepid2 developers and expert users only.
+    template<class ...IntArgs>
+    KOKKOS_INLINE_FUNCTION
+    reference_type getWritableEntry(const IntArgs... intArgs) const
+    {
+#ifdef INTREPID2_HAVE_DEBUG
+      INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(numArgs != rank_, std::invalid_argument, "getWritableEntry() should have the same number of arguments as the logical rank.");
+#endif
+      constexpr int numArgs = sizeof...(intArgs);
+      if (underlyingMatchesLogical_)
+      {
+        // in this case, we require that numArgs == dataRank_
+        return getUnderlyingView<numArgs>()(intArgs...);
+      }
+      
+      // extract the type of the first argument; use that for the arrays below
+      using int_type = std::tuple_element_t<0, std::tuple<IntArgs...>>;
+      
+      const Kokkos::Array<int_type, numArgs> args {intArgs...};
+      Kokkos::Array<int_type, 7> refEntry;
+      for (int d=0; d<numArgs; d++)
+      {
+        switch (variationType_[d])
+        {
+          case CONSTANT: refEntry[d] = 0;                              break;
+          case GENERAL:  refEntry[d] = args[d];                        break;
+          case MODULAR:  refEntry[d] = args[d] % variationModulus_[d]; break;
+          case BLOCK_PLUS_DIAGONAL:
+          {
+            const int numNondiagonalEntries = blockPlusDiagonalNumNondiagonalEntries(blockPlusDiagonalLastNonDiagonal_);
+            
+            const int_type &i = args[d];
+            if (d+1 >= numArgs)
+            {
+              INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::invalid_argument, "BLOCK_PLUS_DIAGONAL must be present for two dimensions; here, encountered only one");
+            }
+            else
+            {
+              const int_type &j = args[d+1];
+              
+              if ((i > static_cast<int_type>(blockPlusDiagonalLastNonDiagonal_)) || (j > static_cast<int_type>(blockPlusDiagonalLastNonDiagonal_)))
+              {
+                if (i != j)
+                {
+                  // off diagonal: zero
+                  return zeroView_(0); // NOTE: this branches in an argument-dependent way; this is not great for CUDA performance.  When using BLOCK_PLUS_DIAGONAL, should generally avoid calls to this getEntry() method.  (Use methods that directly take advantage of the data packing instead.)
+                }
+                else
+                {
+                  refEntry[d] = blockPlusDiagonalDiagonalEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i);
+                }
+              }
+              else
+              {
+                refEntry[d] = blockPlusDiagonalBlockEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i, j);
+              }
+
+              // skip next d (this is required also to be BLOCK_PLUS_DIAGONAL, and we've consumed its arg as j above)
+              refEntry[d+1] = 0;
+            }
+            d++;
+          }
+        }
+      }
+       // refEntry should be zero-filled beyond numArgs, for cases when rank_ < dataRank_ (this only is allowed if the extra dimensions each has extent 1).
+      for (int d=numArgs; d<7; d++)
+      {
+        refEntry[d] = 0;
       }
       
       if (dataRank_ == 1)
@@ -489,6 +1023,57 @@ namespace Intrepid2 {
       }
     }
     
+    //! Constructor in terms of DimensionInfo for each nominal dimension; does not require a View to be specified.  Will allocate a View of appropriate rank, zero-filled.  An empty dimInfoVector yields default-constructor state.
+    Data(std::vector<DimensionInfo> dimInfoVector)
+    :
+    // initialize member variables as if default constructor; if dimInfoVector is empty, we want default constructor behavior.
+    dataRank_(0), extents_({0,0,0,0,0,0,0}), variationType_({CONSTANT,CONSTANT,CONSTANT,CONSTANT,CONSTANT,CONSTANT,CONSTANT}), blockPlusDiagonalLastNonDiagonal_(-1), rank_(dimInfoVector.size())
+    {
+      // If dimInfoVector is empty, the member initialization above is correct; otherwise, we set as below.
+      // Either way, once members are initialized, we must call setActiveDims().
+      if (dimInfoVector.size() != 0)
+      {
+        std::vector<int> dataExtents;
+        // BLOCK_PLUS_DIAGONAL occupies two logical dimensions but one data dimension, so only the first of the pair contributes a data extent; nothing has been seen yet, hence false.
+        bool blockPlusDiagonalEncountered = false;
+        for (int d=0; d<rank_; d++)
+        {
+          const DimensionInfo & dimInfo = dimInfoVector[d];
+          extents_[d] = dimInfo.nominalExtent;
+          variationType_[d] = dimInfo.variationType;
+          const bool isBlockPlusDiagonal = (variationType_[d] == BLOCK_PLUS_DIAGONAL);
+          const bool isSecondBlockPlusDiagonal = isBlockPlusDiagonal && blockPlusDiagonalEncountered;
+          if (isBlockPlusDiagonal)
+          {
+            blockPlusDiagonalEncountered = true;
+            blockPlusDiagonalLastNonDiagonal_ = dimInfo.blockPlusDiagonalLastNonDiagonal;
+          }
+          if ((variationType_[d] != CONSTANT) && (!isSecondBlockPlusDiagonal))
+          {
+            dataExtents.push_back(dimInfo.dataExtent);
+          }
+        }
+        if (dataExtents.size() == 0)
+        {
+          // constant data: store the single value in a rank-1 View of extent 1
+          dataExtents.push_back(1);
+        }
+        dataRank_ = dataExtents.size();
+        switch (dataRank_)
+        {
+          case 1: data1_ = Kokkos::View<DataScalar*,       DeviceType>("Intrepid2 Data", dataExtents[0]); break;
+          case 2: data2_ = Kokkos::View<DataScalar**,      DeviceType>("Intrepid2 Data", dataExtents[0], dataExtents[1]); break;
+          case 3: data3_ = Kokkos::View<DataScalar***,     DeviceType>("Intrepid2 Data", dataExtents[0], dataExtents[1], dataExtents[2]); break;
+          case 4: data4_ = Kokkos::View<DataScalar****,    DeviceType>("Intrepid2 Data", dataExtents[0], dataExtents[1], dataExtents[2], dataExtents[3]); break;
+          case 5: data5_ = Kokkos::View<DataScalar*****,   DeviceType>("Intrepid2 Data", dataExtents[0], dataExtents[1], dataExtents[2], dataExtents[3], dataExtents[4]); break;
+          case 6: data6_ = Kokkos::View<DataScalar******,  DeviceType>("Intrepid2 Data", dataExtents[0], dataExtents[1], dataExtents[2], dataExtents[3], dataExtents[4], dataExtents[5]); break;
+          case 7: data7_ = Kokkos::View<DataScalar*******, DeviceType>("Intrepid2 Data", dataExtents[0], dataExtents[1], dataExtents[2], dataExtents[3], dataExtents[4], dataExtents[5], dataExtents[6]); break;
+          default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid data rank");
+        }
+      }
+      setActiveDims();
+    }
+    
     //! DynRankView constructor.  Will copy to a View of appropriate rank.
     Data(const ScalarView<DataScalar,DeviceType> &data, int rank, Kokkos::Array<int,7> extents, Kokkos::Array<DataVariationType,7> variationType, const int blockPlusDiagonalLastNonDiagonal = -1)
     :
@@ -779,14 +1364,14 @@ namespace Intrepid2 {
       
       if (dimInfo.variationType == BLOCK_PLUS_DIAGONAL)
       {
-        dimInfo.blockPlusDiagonalFirstNonDiagonal = blockPlusDiagonalLastNonDiagonal_;
+        dimInfo.blockPlusDiagonalLastNonDiagonal = blockPlusDiagonalLastNonDiagonal_;
       }
       return dimInfo;
     }
     
     //! Returns (DataVariationType, data extent) in the specified dimension for a Data container that combines (through multiplication, say, or addition) this container with otherData.
     KOKKOS_INLINE_FUNCTION
-    DimensionInfo combinedDimensionInfo(const Data &otherData, const int &dim) const
+    DimensionInfo combinedDataDimensionInfo(const Data &otherData, const int &dim) const
     {
       const DimensionInfo myDimInfo    = getDimensionInfo(dim);
       const DimensionInfo otherDimInfo = otherData.getDimensionInfo(dim);
@@ -999,7 +1584,7 @@ namespace Intrepid2 {
       return dataRank_;
     }
     
-    //! returns the rank of the View that stores the unique data
+    //! returns the number of entries in the View that stores the unique data
     KOKKOS_INLINE_FUNCTION
     ordinal_type getUnderlyingViewSize() const
     {
@@ -1077,7 +1662,7 @@ namespace Intrepid2 {
       }
     }
     
-    //! returns the true extent of the data corresponding to the notional dimension provided; if the data does not vary in that dimension, returns 1
+    //! returns the true extent of the data corresponding to the logical dimension provided; if the data does not vary in that dimension, returns 1
     KOKKOS_INLINE_FUNCTION int getDataExtent(const ordinal_type &d) const
     {
       for (unsigned i=0; i<activeDims_.size(); i++)
@@ -1095,7 +1680,7 @@ namespace Intrepid2 {
     }
     
     /** \brief  Variation modulus accessor.
-       \param [in] d - the notional dimension whose variation modulus is requested.
+       \param [in] d - the logical dimension whose variation modulus is requested.
        \return the variation modulus.
      
      The variation modulus is defined as the number of unique entries in the specified dimension.
@@ -1103,7 +1688,7 @@ namespace Intrepid2 {
      - for CONSTANT variation, the variation modulus is 1
      - for MODULAR variation, the variation modulus is exactly the modulus by which the data repeats in the specified dimension
      - for GENERAL variation, the variation modulus is the extent in the specified dimension
-     - for BLOCK_PLUS_DIAGONAL, the variation modulus in the first notional dimension of the matrix is the number of nonzeros in the matrix; in the second notional dimension the variation modulus is 1.
+     - for BLOCK_PLUS_DIAGONAL, the variation modulus in the first logical dimension of the matrix is the number of nonzeros in the matrix; in the second logical dimension the variation modulus is 1.
     */
     KOKKOS_INLINE_FUNCTION
     int getVariationModulus(const int &d) const
@@ -1111,217 +1696,47 @@ namespace Intrepid2 {
       return variationModulus_[d];
     }
     
-    //! Returns an array with the variation types in each notional dimension.
+    //! Returns a const reference to the member array holding the variation type of each logical dimension (seven entries).
     KOKKOS_INLINE_FUNCTION
     const Kokkos::Array<DataVariationType,7> & getVariationTypes() const
     {
       return variationType_;
     }
     
-    //! Returns a value corresponding to the specified notional data location.
-    template <typename iType0, typename iType1, typename iType2, typename iType3,
-              typename iType4, typename iType5, typename iType6>
-    KOKKOS_INLINE_FUNCTION typename std::enable_if<
-        (std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
-         std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
-         std::is_integral<iType4>::value && std::is_integral<iType5>::value &&
-         std::is_integral<iType6>::value),
-        return_type>::type
-    getEntry(const iType0& i0, const iType1& i1, const iType2& i2,
-             const iType3& i3, const iType4& i4, const iType5& i5,
-             const iType6& i6) const
-    {
-      const Kokkos::Array<int,7> args {static_cast<int>(i0),static_cast<int>(i1),static_cast<int>(i2),
-                                       static_cast<int>(i3),static_cast<int>(i4),static_cast<int>(i5),
-                                       static_cast<int>(i6)};
-      Kokkos::Array<int,7> refEntry;
-      
-      for (int d=0; d<7; d++)
-      {
-        if (variationType_[d] == GENERAL)
-        {
-          refEntry[d] = args[d];
-        }
-        else if (variationType_[d] == MODULAR)
-        {
-          refEntry[d] = args[d] % variationModulus_[d];
-        }
-        else if (variationType_[d] == BLOCK_PLUS_DIAGONAL)
-        {
-          const int numNondiagonalEntries = blockPlusDiagonalNumNondiagonalEntries(blockPlusDiagonalLastNonDiagonal_);
-          
-          const int &i = args[d];
-          const int &j = args[d+1];
-          
-          if ((i > blockPlusDiagonalLastNonDiagonal_) || (j > blockPlusDiagonalLastNonDiagonal_))
-          {
-            if (i != j)
-            {
-              // off diagonal: zero
-              return zeroView_(0); // NOTE: this branches in an argument-dependent way; this is not great for CUDA performance.  When using BLOCK_PLUS_DIAGONAL, should generally avoid calls to this getEntry() method.  (Use methods that directly take advantage of the data packing instead.)
-            }
-            else
-            {
-              refEntry[d] = blockPlusDiagonalDiagonalEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i);
-            }
-          }
-          else
-          {
-            refEntry[d] = blockPlusDiagonalBlockEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i, j);
-          }
-          
-          // skip next d (this is required also to be BLOCK_PLUS_DIAGONAL, and we've consumed its arg as j above)
-          refEntry[d+1] = 0;
-          d++;
-        }
-        else if (variationType_[d] == CONSTANT)
-        {
-          refEntry[d] = 0;
-        }
-      }
-      
-      if (dataRank_ == 1)
-      {
-        return data1_(refEntry[activeDims_[0]]);
-      }
-      else if (dataRank_ == 2)
-      {
-        return data2_(refEntry[activeDims_[0]],refEntry[activeDims_[1]]);
-      }
-      else if (dataRank_ == 3)
-      {
-        return data3_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]]);
-      }
-      else if (dataRank_ == 4)
-      {
-        return data4_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]]);
-      }
-      else if (dataRank_ == 5)
-      {
-        return data5_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]],
-                      refEntry[activeDims_[4]]);
-      }
-      else if (dataRank_ == 6)
-      {
-        return data6_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]],
-                      refEntry[activeDims_[4]],refEntry[activeDims_[5]]);
-      }
-      else // dataRank_ == 7
-      {
-        return data7_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]],
-                      refEntry[activeDims_[4]],refEntry[activeDims_[5]],refEntry[activeDims_[6]]);
-      }
-    }
-    
-    //! Returns a value corresponding to the specified notional data location.
-    template <typename iType>
-    KOKKOS_INLINE_FUNCTION typename std::enable_if<
-        (std::is_integral<iType>::value),
-        return_type>::type
-    operator()(const iType& i0) const {
-      if (underlyingMatchesNotional_)
-      {
-        return data1_(i0);
-      }
-      return getEntry(i0,0,0,0,0,0,0);
-    }
-    
-    //! Returns a value corresponding to the specified notional data location.
-    template <typename iType0, typename iType1>
-    KOKKOS_INLINE_FUNCTION typename std::enable_if<
-        (std::is_integral<iType0>::value && std::is_integral<iType1>::value),
-        return_type>::type
-    operator()(const iType0& i0, const iType1& i1) const {
-      if (underlyingMatchesNotional_)
-      {
-        return data2_(i0,i1);
-      }
-      return getEntry(i0,i1,0,0,0,0,0);
-    }
-    
-    //! Returns a value corresponding to the specified notional data location.
-    template <typename iType0, typename iType1, typename iType2>
-    KOKKOS_INLINE_FUNCTION typename std::enable_if<
-        (std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
-         std::is_integral<iType2>::value),
-        return_type>::type
-    operator()(const iType0& i0, const iType1& i1, const iType2& i2) const {
-      if (underlyingMatchesNotional_)
-      {
-        return data3_(i0,i1,i2);
-      }
-      return getEntry(i0,i1,i2,0,0,0,0);
-    }
-    
-    //! Returns a value corresponding to the specified notional data location.
-    template <typename iType0, typename iType1, typename iType2, typename iType3>
-    KOKKOS_INLINE_FUNCTION typename std::enable_if<
-        (std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
-         std::is_integral<iType2>::value && std::is_integral<iType3>::value),
-        return_type>::type
-    operator()(const iType0& i0, const iType1& i1, const iType2& i2,
-               const iType3& i3) const {
-      if (underlyingMatchesNotional_)
-      {
-        return data4_(i0,i1,i2,i3);
-      }
-      return getEntry(i0,i1,i2,i3,0,0,0);
+    //! Returns a (read-only) value corresponding to the specified logical data location; forwards all arguments to getWritableEntry() and returns the result as return_type.
+    template<class ...IntArgs>
+    KOKKOS_INLINE_FUNCTION
+    return_type getEntry(const IntArgs&... intArgs) const
+    {
+      return getWritableEntry(intArgs...);
     }
     
-    //! Returns a value corresponding to the specified notional data location.
-    template <typename iType0, typename iType1, typename iType2, typename iType3,
-              typename iType4>
-    KOKKOS_INLINE_FUNCTION typename std::enable_if<
-        (std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
-         std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
-         std::is_integral<iType4>::value),
-        return_type>::type
-    operator()(const iType0& i0, const iType1& i1, const iType2& i2,
-               const iType3& i3, const iType4& i4) const {
-      if (underlyingMatchesNotional_)
-      {
-        return data5_(i0,i1,i2,i3,i4);
-      }
-      return getEntry(i0,i1,i2,i3,i4,0,0);
-    }
+    template <bool...> struct bool_pack; // declaration only here; serves as a type-level list of bools for all_true
+
+    template <bool... v>
+    using all_true = std::is_same<bool_pack<true, v...>, bool_pack<v..., true>>; // the two shifted packs are the same type exactly when every v is true
     
-    //! Returns a value corresponding to the specified notional data location.
-    template <typename iType0, typename iType1, typename iType2, typename iType3,
-              typename iType4, typename iType5>
-    KOKKOS_INLINE_FUNCTION typename std::enable_if<
-        (std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
-         std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
-         std::is_integral<iType4>::value && std::is_integral<iType5>::value),
-        return_type>::type
-    operator()(const iType0& i0, const iType1& i1, const iType2& i2,
-               const iType3& i3, const iType4& i4, const iType5& i5) const {
-      if (underlyingMatchesNotional_)
-      {
-        return data6_(i0,i1,i2,i3,i4,i5);
-      }
-      return getEntry(i0,i1,i2,i3,i4,i5,0);
-    }
+    template <class ...IntArgs>
+    using valid_args = all_true<std::is_integral<IntArgs>{}...>; // ::value is true when every type in IntArgs is an integral type
     
-    //! Returns a value corresponding to the specified notional data location.
-    template <typename iType0, typename iType1, typename iType2, typename iType3,
-              typename iType4, typename iType5, typename iType6>
-    KOKKOS_INLINE_FUNCTION typename std::enable_if<
-        (std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
-         std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
-         std::is_integral<iType4>::value && std::is_integral<iType5>::value &&
-         std::is_integral<iType6>::value),
-        return_type>::type
-    operator()(const iType0& i0, const iType1& i1, const iType2& i2,
-               const iType3& i3, const iType4& i4, const iType5& i5,
-               const iType6& i6) const {
-      if (underlyingMatchesNotional_)
-      {
-        return data7_(i0,i1,i2,i3,i4,i5,i6);
-      }
-      return getEntry(i0,i1,i2,i3,i4,i5,i6);
+    static_assert(valid_args<int,long,unsigned>::value, "valid args works");
+
+    //! Returns a value corresponding to the specified logical data location (forwards to getEntry()).  Except under icc, this overload is enabled only for at most 7 arguments, all of integral type.
+    template <class ...IntArgs>
+    KOKKOS_INLINE_FUNCTION
+#ifndef __INTEL_COMPILER
+    // icc has a bug that prevents compilation with this enable_if_t
+    // (possibly the same as https://community.intel.com/t5/Intel-C-Compiler/Intel-Compiler-bug-while-deducing-template-arguments-inside/m-p/1164358)
+    // so with icc we'll just skip the argument type/count check
+    enable_if_t<valid_args<IntArgs...>::value && (sizeof...(IntArgs) <= 7),return_type>
+#else
+    return_type
+#endif
+    operator()(const IntArgs&... intArgs) const {
+      return getEntry(intArgs...);
     }
-    
-    //! Returns the notional extent in the specified dimension.
+
+    //! Returns the logical extent in the specified dimension.
     KOKKOS_INLINE_FUNCTION
     int extent_int(const int& r) const
     {
@@ -1352,6 +1767,25 @@ namespace Intrepid2 {
       return false; // statement should be unreachable; included because compilers don't necessarily recognize that fact...
     }
     
+    //! Allocates a container able to hold the result of an in-place combination (e.g., A + B) of the two provided data containers, which must have the same nominal shape.
+    //! \see storeInPlaceCombination()
+    //! \param A  [in] - the first data container.
+    //! \param B  [in] - the second data container.  Must match A in rank and in every extent.
+    //! \return A container with the nominal shape of A and B, whose per-dimension layout is given by A.combinedDataDimensionInfo(B, d).
+    static Data<DataScalar,DeviceType> allocateInPlaceCombinationResult( const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B )
+    {
+      INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(A.rank() != B.rank(), std::invalid_argument, "A and B must have the same nominal shape");
+      const int numDims = A.rank();
+      std::vector<DimensionInfo> combinedDimInfo(numDims);
+      for (int dim=0; dim<numDims; ++dim)
+      {
+        INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(A.extent_int(dim) != B.extent_int(dim), std::invalid_argument, "A and B must have the same nominal shape");
+        combinedDimInfo[dim] = A.combinedDataDimensionInfo(B, dim);
+      }
+      // constructing from the DimensionInfo vector allocates the underlying View
+      return Data<DataScalar,DeviceType>(combinedDimInfo);
+    }
+    
     //! Constructs a container suitable for storing the result of a matrix-vector multiply corresponding to the two provided containers.
     //! \see storeMatMat()
     //! \param A_MatData                                            [in] - nominally (...,D1,D2)-dimensioned container, where D1,D2 correspond to matrix dimensions.
@@ -1664,6 +2098,144 @@ namespace Intrepid2 {
       return Data<DataScalar,DeviceType>(data,resultRank,resultExtents,resultVariationTypes);
     }
     
+    //! returns an MDRangePolicy with <rank> dimensions; dimension d runs from 0 up to getDataExtent(d), i.e., over the underlying data extents rather than the logical extents.  Enabled for every rank other than 1 and 7, which have their own overloads.
+    template<int rank>
+    enable_if_t<(rank!=1) && (rank!=7), Kokkos::MDRangePolicy<typename DeviceType::execution_space,Kokkos::Rank<rank>> >
+    dataExtentRangePolicy()
+    {
+      using ExecutionSpace = typename DeviceType::execution_space;
+      Kokkos::Array<int,rank> startingOrdinals; // lower bounds of the policy (all zero)
+      Kokkos::Array<int,rank> extents;          // upper bounds of the policy
+      
+      for (int d=0; d<rank; d++)
+      {
+        startingOrdinals[d] = 0;
+        extents[d] = getDataExtent(d);
+      }
+      auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<rank>>(startingOrdinals,extents);
+      return policy;
+    }
+    
+    //! returns a six-dimensional MDRangePolicy over the first six underlying data extents.  Enabled only for rank 7; the seventh extent is not part of the policy.  NOTE(review): presumably because Kokkos::MDRangePolicy supports at most rank 6, so the caller must iterate over the final dimension itself -- confirm against the rank-7 caller.
+    template<int rank>
+    enable_if_t<rank==7, Kokkos::MDRangePolicy<typename DeviceType::execution_space,Kokkos::Rank<6>> >
+    dataExtentRangePolicy()
+    {
+      using ExecutionSpace = typename DeviceType::execution_space;
+      Kokkos::Array<int,6> startingOrdinals; // lower bounds of the policy (all zero)
+      Kokkos::Array<int,6> extents;          // upper bounds of the policy
+      
+      for (int d=0; d<6; d++) // dimension 6 (the seventh) is deliberately left out
+      {
+        startingOrdinals[d] = 0;
+        extents[d] = getDataExtent(d);
+      }
+      auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<6>>(startingOrdinals,extents);
+      return policy;
+    }
+    
+    //! returns a one-dimensional RangePolicy over [0, getDataExtent(0)), i.e., over the single underlying data extent.  Enabled only for rank 1, for which an MDRangePolicy is not used.
+    template<int rank>
+    enable_if_t<rank==1, Kokkos::RangePolicy<typename DeviceType::execution_space> >
+    dataExtentRangePolicy()
+    {
+      using ExecutionSpace = typename DeviceType::execution_space;
+      Kokkos::RangePolicy<ExecutionSpace> policy(ExecutionSpace(),0,getDataExtent(0));
+      return policy;
+    }
+    
+    //! Places the result of an in-place combination (e.g., entrywise sum) into this data container.  A and B are the operands and binaryOperator(a,b) produces each result entry.  In debug builds the nominal extents of A and B are checked against those of this container.  A container whose underlying view is rank 1 with a single entry is handled directly here; every other case is dispatched on rank_ to the rank-templated overload of storeInPlaceCombination().
+    template<class BinaryOperator>
+    void storeInPlaceCombination(const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B, BinaryOperator binaryOperator)
+    {
+      using ExecutionSpace = typename DeviceType::execution_space;
+
+#ifdef INTREPID2_HAVE_DEBUG
+      // check nominal extents
+      for (int d=0; d<rank_; d++)
+      {
+        INTREPID2_TEST_FOR_EXCEPTION(A.extent_int(d) != this->extent_int(d), std::invalid_argument, "A, B, and this must agree on all nominal extents");
+        INTREPID2_TEST_FOR_EXCEPTION(B.extent_int(d) != this->extent_int(d), std::invalid_argument, "A, B, and this must agree on all nominal extents");
+      }
+      // TODO: add some checks that data extent of this suffices to accept combined A + B data.
+#endif
+      
+      const bool this_constant = (this->getUnderlyingViewRank() == 1) && (this->getUnderlyingViewSize() == 1); // a single stored entry
+
+      // we special-case for constant output here; since the constant case is essentially all overhead, we want to avoid as much of the overhead of storeInPlaceCombination() as possible...
+      if (this_constant)
+      {
+        // constant data
+        Kokkos::RangePolicy<ExecutionSpace> policy(ExecutionSpace(),0,1); // just 1 entry
+        auto this_underlying = this->getUnderlyingView<1>();
+        auto A_underlying = A.getUnderlyingView<1>(); // NOTE(review): assumes A and B also store rank-1 underlying views whenever this container is constant -- confirm
+        auto B_underlying = B.getUnderlyingView<1>();
+        Kokkos::parallel_for("compute in-place", policy,
+        KOKKOS_LAMBDA (const int &i0) { // i0 is unused: the policy has exactly one iteration
+          auto & result = this_underlying(0);
+          const auto & A_val = A_underlying(0);
+          const auto & B_val = B_underlying(0);
+          
+          result = binaryOperator(A_val,B_val);
+        });
+      }
+      else
+      {
+        switch (rank_) // runtime rank selects the compile-time rank of the templated overload
+        {
+          case 1: storeInPlaceCombination<BinaryOperator, 1>(A, B, binaryOperator); break;
+          case 2: storeInPlaceCombination<BinaryOperator, 2>(A, B, binaryOperator); break;
+          case 3: storeInPlaceCombination<BinaryOperator, 3>(A, B, binaryOperator); break;
+          case 4: storeInPlaceCombination<BinaryOperator, 4>(A, B, binaryOperator); break;
+          case 5: storeInPlaceCombination<BinaryOperator, 5>(A, B, binaryOperator); break;
+          case 6: storeInPlaceCombination<BinaryOperator, 6>(A, B, binaryOperator); break;
+          case 7: storeInPlaceCombination<BinaryOperator, 7>(A, B, binaryOperator); break;
+          default: // any rank outside 1..7 is reported as a logic error
+            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "unhandled rank in switch");
+        }
+      }
+    }
+    
+    //! stores the in-place (entrywise) sum, A .+ B, into this container, by forwarding an (a + b) device lambda to storeInPlaceCombination().
+    void storeInPlaceSum(const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B)
+    {
+      auto sum = KOKKOS_LAMBDA(const DataScalar &a, const DataScalar &b) -> DataScalar
+      {
+        return a + b;
+      };
+      storeInPlaceCombination(A, B, sum); // extent checks and rank dispatch happen there
+    }
+    
+    //! stores the in-place (entrywise) product, A .* B, into this container, by forwarding an (a * b) device lambda to storeInPlaceCombination().
+    void storeInPlaceProduct(const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B)
+    {
+      auto product = KOKKOS_LAMBDA(const DataScalar &a, const DataScalar &b) -> DataScalar
+      {
+        return a * b;
+      };
+      storeInPlaceCombination(A, B, product); // extent checks and rank dispatch happen there
+    }
+    
+    //! stores the in-place (entrywise) difference, A .- B, into this container, by forwarding an (a - b) device lambda to storeInPlaceCombination().  Operand order matters: A is the minuend, B the subtrahend.
+    void storeInPlaceDifference(const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B)
+    {
+      auto difference = KOKKOS_LAMBDA(const DataScalar &a, const DataScalar &b) -> DataScalar
+      {
+        return a - b;
+      };
+      storeInPlaceCombination(A, B, difference); // extent checks and rank dispatch happen there
+    }
+    
+    //! stores the in-place (entrywise) quotient, A ./ B, into this container, by forwarding an (a / b) device lambda to storeInPlaceCombination().  Operand order matters: A is the numerator, B the denominator.  No check for zero entries in B is made here.
+    void storeInPlaceQuotient(const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B)
+    {
+      auto quotient = KOKKOS_LAMBDA(const DataScalar &a, const DataScalar &b) -> DataScalar
+      {
+        return a / b;
+      };
+      storeInPlaceCombination(A, B, quotient); // extent checks and rank dispatch happen there
+    }
+    
     //! Places the result of a matrix-vector multiply corresponding to the two provided containers into this Data container.  This Data container should have been constructed by a call to allocateMatVecResult(), or should match such a container in underlying data extent and variation types.
     void storeMatVec( const Data<DataScalar,DeviceType> &matData, const Data<DataScalar,DeviceType> &vecData )
     {
@@ -1684,7 +2256,7 @@ namespace Intrepid2 {
         auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{getDataExtent(0),getDataExtent(1),matRows});
         Kokkos::parallel_for("compute mat-vec", policy,
         KOKKOS_LAMBDA (const int &cellOrdinal, const int &pointOrdinal, const int &i) {
-          auto & val_i = thisData.getWritableEntry(cellOrdinal, pointOrdinal, i, 0, 0, 0, 0);
+          auto & val_i = thisData.getWritableEntry(cellOrdinal, pointOrdinal, i);
           val_i = 0;
           for (int j=0; j<matCols; j++)
           {
@@ -1698,7 +2270,7 @@ namespace Intrepid2 {
         auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{getDataExtent(0),matRows});
         Kokkos::parallel_for("compute mat-vec", policy,
         KOKKOS_LAMBDA (const int &vectorOrdinal, const int &i) {
-          auto & val_i = thisData.getWritableEntry(vectorOrdinal, i, 0, 0, 0, 0, 0);
+          auto & val_i = thisData.getWritableEntry(vectorOrdinal, i);
           val_i = 0;
           for (int j=0; j<matCols; j++)
           {
@@ -1712,7 +2284,7 @@ namespace Intrepid2 {
         Kokkos::RangePolicy<ExecutionSpace> policy(0,matRows);
         Kokkos::parallel_for("compute mat-vec", policy,
         KOKKOS_LAMBDA (const int &i) {
-          auto & val_i = thisData.getWritableEntry(i, 0, 0, 0, 0, 0, 0);
+          auto & val_i = thisData.getWritableEntry(i);
           val_i = 0;
           for (int j=0; j<matCols; j++)
           {
@@ -1775,7 +2347,7 @@ namespace Intrepid2 {
           {
             for (int j=0; j<rightCols; j++)
             {
-              auto & val_ij = thisData.getWritableEntry(matrixOrdinal, i, j, 0, 0, 0, 0);
+              auto & val_ij = thisData.getWritableEntry(matrixOrdinal, i, j);
               val_ij = 0;
               for (int k=0; k<leftCols; k++)
               {
@@ -1787,7 +2359,7 @@ namespace Intrepid2 {
           }
           for (int i=diagonalStart; i<leftRows; i++)
           {
-            auto & val_ii = thisData.getWritableEntry(matrixOrdinal, i, i, 0, 0, 0, 0);
+            auto & val_ii = thisData.getWritableEntry(matrixOrdinal, i, i);
             const auto & left  = A_MatData(matrixOrdinal,i,i);
             const auto & right = B_MatData(matrixOrdinal,i,i);
             val_ii = left * right;
@@ -1798,7 +2370,7 @@ namespace Intrepid2 {
       {
         // (C,P,D,D), perhaps
         auto policy = Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2> >({0,0},{getDataExtent(0),getDataExtent(1)});
-        if (underlyingMatchesNotional_) // receiving data object is completely expanded
+        if (underlyingMatchesLogical_) // receiving data object is completely expanded
         {
           Kokkos::parallel_for("compute mat-mat", policy,
           KOKKOS_LAMBDA (const int &cellOrdinal, const int &pointOrdinal) {
@@ -1826,7 +2398,7 @@ namespace Intrepid2 {
             {
               for (int j=0; j<rightCols; j++)
               {
-                auto & val_ij = thisData.getWritableEntry(cellOrdinal,pointOrdinal, i, j, 0, 0, 0);
+                auto & val_ij = thisData.getWritableEntry(cellOrdinal,pointOrdinal, i, j);
                 val_ij = 0;
                 for (int k=0; k<leftCols; k++)
                 {
@@ -1838,7 +2410,7 @@ namespace Intrepid2 {
             }
             for (int i=diagonalStart; i<leftRows; i++)
             {
-              auto & val_ii = thisData.getWritableEntry(cellOrdinal,pointOrdinal, i, i, 0, 0, 0);
+              auto & val_ii = thisData.getWritableEntry(cellOrdinal,pointOrdinal, i, i);
               const auto & left  = A_MatData(cellOrdinal,pointOrdinal,i,i);
               const auto & right = B_MatData(cellOrdinal,pointOrdinal,i,i);
               val_ii = left * right;
@@ -1859,15 +2431,15 @@ namespace Intrepid2 {
       return extents_[0] > 0;
     }
     
-    //! Returns the notional rank of the Data container.
+    //! Returns the logical rank of the Data container.
     KOKKOS_INLINE_FUNCTION
     unsigned rank() const
     {
       return rank_;
     }
     
- /** \brief sets the notional extent in the specified dimension.  If needed, the underlying data container is resized.
-     \param [in] d - the notional dimension in which the extent is to be changed
+ /** \brief sets the logical extent in the specified dimension.  If needed, the underlying data container is resized.
+     \param [in] d - the logical dimension in which the extent is to be changed
      \param [in] newExtent - the new extent
      \note Not supported for dimensions in which the variation type is BLOCK_PLUS_DIAGONAL.
      \note If the variation type is MODULAR, the existing modulus must evenly divide the new extent; the underlying data structure will not be resized in this case.
@@ -1923,11 +2495,11 @@ namespace Intrepid2 {
       extents_[d] = newExtent;
     }
     
-    //! Returns true if the underlying container has exactly the same rank and extents as the notional container.
+    //! Returns true if the underlying container has exactly the same rank and extents as the logical container.
     KOKKOS_INLINE_FUNCTION
-    bool underlyingMatchesNotional() const
+    bool underlyingMatchesLogical() const
     {
-      return underlyingMatchesNotional_;
+      return underlyingMatchesLogical_;
     }
   };
 }
diff --git a/packages/intrepid2/src/Shared/Intrepid2_PointToolsDef.hpp b/packages/intrepid2/src/Shared/Intrepid2_PointToolsDef.hpp
index 1065bd240e09..86ff6541142e 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_PointToolsDef.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_PointToolsDef.hpp
@@ -48,6 +48,13 @@
 #ifndef __INTREPID2_POINTTOOLS_DEF_HPP__
 #define __INTREPID2_POINTTOOLS_DEF_HPP__
 
+#if defined(_MSC_VER) || defined(_WIN32) && defined(__ICL)
+// M_PI, M_SQRT2, etc. are hidden in MSVC by #ifdef _USE_MATH_DEFINES
+  #ifndef _USE_MATH_DEFINES
+  #define _USE_MATH_DEFINES
+  #endif
+  #include <math.h>
+#endif
 
 namespace Intrepid2 {
 
diff --git a/packages/intrepid2/src/Shared/Intrepid2_PolylibDef.hpp b/packages/intrepid2/src/Shared/Intrepid2_PolylibDef.hpp
index 5c5ee9b1894e..b58d37c02316 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_PolylibDef.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_PolylibDef.hpp
@@ -95,6 +95,14 @@
 #ifndef __INTREPID2_POLYLIB_DEF_HPP__
 #define __INTREPID2_POLYLIB_DEF_HPP__
 
+#if defined(_MSC_VER) || defined(_WIN32) && defined(__ICL)
+// M_PI, M_SQRT2, etc. are hidden in MSVC by #ifdef _USE_MATH_DEFINES
+  #ifndef _USE_MATH_DEFINES
+  #define _USE_MATH_DEFINES
+  #endif
+  #include <math.h>
+#endif
+
 namespace Intrepid2 {
 
   // -----------------------------------------------------------------------
diff --git a/packages/intrepid2/src/Shared/Intrepid2_TensorArgumentIterator.hpp b/packages/intrepid2/src/Shared/Intrepid2_TensorArgumentIterator.hpp
index c6754633e558..377dd2e649f5 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_TensorArgumentIterator.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_TensorArgumentIterator.hpp
@@ -91,6 +91,32 @@ namespace Intrepid2
       }
     }
     
+    //! Host-side constructor in which only the bounds of the tensor components are required; every argument starts at 0.  The vector is taken by const reference to avoid copying it.  NOTE(review): assumes tensorComponentBounds.size() does not exceed the capacity of arguments_ and bounds_ -- confirm.
+    TensorArgumentIterator(const std::vector<ordinal_type> &tensorComponentBounds)
+    :
+    numTensorComponents_(tensorComponentBounds.size())
+    {
+      for (ordinal_type r=0; r<numTensorComponents_; r++)
+      {
+        arguments_[r] = 0;
+        bounds_[r]    = tensorComponentBounds[r];
+      }
+    }
+    
+    //! Device-callable constructor taking the bounds of the tensor components as a fixed-size Kokkos::Array; every argument starts at 0.  The loop is bounded by numTensorComponents_ (as in the std::vector constructor) rather than by the size_t template parameter, which avoids a signed/unsigned comparison.
+    template<size_t rank>
+    KOKKOS_INLINE_FUNCTION
+    TensorArgumentIterator(const Kokkos::Array<ordinal_type,rank> &tensorComponentBounds)
+    :
+    numTensorComponents_(rank)
+    {
+      for (ordinal_type r=0; r<numTensorComponents_; r++)
+      {
+        arguments_[r] = 0;
+        bounds_[r]    = tensorComponentBounds[r];
+      }
+    }
+    
     //! Proceed to next entry.
     KOKKOS_INLINE_FUNCTION ordinal_type increment()
     {
diff --git a/packages/intrepid2/src/Shared/Intrepid2_TensorData.hpp b/packages/intrepid2/src/Shared/Intrepid2_TensorData.hpp
index ba060539321a..4a9293e26c3f 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_TensorData.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_TensorData.hpp
@@ -355,7 +355,7 @@ namespace Intrepid2
     
     //! return the index into the specified tensorial component in the dimension specified corresponding to the enumerationIndex given for that dimension.
     KOKKOS_INLINE_FUNCTION
-    ordinal_type getTensorComponentIndex(const ordinal_type &tensorComponent, const ordinal_type &dim, const ordinal_type &enumerationIndex)
+    ordinal_type getTensorComponentIndex(const ordinal_type &tensorComponent, const ordinal_type &dim, const ordinal_type &enumerationIndex) const
     {
       ordinal_type remainingEntryOrdinal = enumerationIndex;
       for (ordinal_type r=0; r<tensorComponent; r++)
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp
index 0d8cf9a03980..63d9fe7c75bf 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp
@@ -57,6 +57,115 @@ namespace
 {
   using namespace Intrepid2;
 
+/** \brief Data has facilities for in-place combinations of logical data.  Suppose you have two containers of nominal shape (C,P), one of which is constant across cells, the other of which is constant across points.  To combine these (e.g., sum them together entrywise), you want a container that varies in both cells and points.  The test below exercises the facility for allocation of the combined container: the allocated container is compared against a hand-constructed expected container in rank, variation types, variation moduli, logical extents, and underlying data extents.  Entry values are not examined.
+*/
+  TEUCHOS_UNIT_TEST( Data, AllocateInPlaceCombinationResult )
+  {
+    // test allocateInPlaceCombinationResult()
+    // Use two Data objects A and B, each with nominal shape (5,9,15) -- (C,F,P), say.
+    // with A having variation types of GENERAL, MODULAR, and CONSTANT,
+    // and B having variation types of CONSTANT, CONSTANT, and GENERAL.
+    // Result should have variation types of GENERAL, MODULAR, GENERAL.
+    using DeviceType = DefaultTestDeviceType;
+    using Scalar = double;
+    
+    const int rank        = 3;
+    const int cellCount   = 5;
+    const int fieldCount  = 9;
+    const int pointCount  = 15;
+    
+    const int fieldCountA = 3; // A is modular in field dimension, with variation mod 3.
+    auto AView = getView<Scalar,DeviceType>("A", cellCount, fieldCountA); // stored as (C, F mod 3)
+    auto BView = getView<Scalar,DeviceType>("B", pointCount);             // stored as (P)
+    
+    auto ABView = getView<Scalar,DeviceType>("A+B", cellCount, fieldCountA, pointCount); // expected storage: (C, F mod 3, P)
+    
+    Kokkos::Array<int,rank> extents {cellCount, fieldCount, pointCount};
+    Kokkos::Array<DataVariationType,rank> A_variation {GENERAL, MODULAR, CONSTANT};
+    Kokkos::Array<DataVariationType,rank> B_variation {CONSTANT, CONSTANT, GENERAL};
+    
+    Data<Scalar,DeviceType> A(AView,extents,A_variation);
+    Data<Scalar,DeviceType> B(BView,extents,B_variation);
+    
+    // expected variation for A+B:
+    Kokkos::Array<DataVariationType,3> AB_variation {GENERAL, MODULAR, GENERAL};
+    // expected Data object for A+B:
+    Data<Scalar,DeviceType> AB_expected(ABView,extents,AB_variation);
+    
+    auto AB_actual = Data<Scalar,DeviceType>::allocateInPlaceCombinationResult(A, B);
+    
+    TEST_EQUALITY(AB_actual.rank(), AB_expected.rank());
+    for (int d=0; d<rank; d++) // per logical dimension: variation type, modulus, and extent must match
+    {
+      const auto actualVariationType   = AB_actual.getVariationTypes()[d];
+      const auto expectedVariationType = AB_expected.getVariationTypes()[d];
+      TEST_EQUALITY(actualVariationType, expectedVariationType);
+      
+      const auto actualVariationModulus   = AB_actual.getVariationModulus(d);
+      const auto expectedVariationModulus = AB_expected.getVariationModulus(d);
+      TEST_EQUALITY(actualVariationModulus, expectedVariationModulus);
+      
+      const auto actualExtent   = AB_actual.extent_int(d);
+      const auto expectedExtent = AB_expected.extent_int(d);
+      TEST_EQUALITY(actualExtent, expectedExtent);
+    }
+    
+    TEST_EQUALITY(AB_actual.getUnderlyingViewRank(), AB_expected.getUnderlyingViewRank());
+    const int dataRank = AB_expected.getUnderlyingViewRank();
+    if (AB_actual.getUnderlyingViewRank() == dataRank) // only compare data extents when the underlying ranks agree
+    {
+      for (int d=0; d<dataRank; d++)
+      {
+        const auto actualDataExtent   = AB_actual.getDataExtent(d);
+        const auto expectedDataExtent = AB_expected.getDataExtent(d);
+        TEST_EQUALITY(actualDataExtent, expectedDataExtent);
+      }
+    }
+  }
+
+  TEUCHOS_UNIT_TEST( Data, CombinedDimensionInfo )
+  {
+    // test free function, combinedDimensionInfo(): combining a GENERAL dimension (A) with a CONSTANT dimension (B) of the same nominal extent should give a GENERAL dimension, regardless of argument order.
+    
+    DimensionInfo A_dimInfo;
+    DimensionInfo B_dimInfo;
+    DimensionInfo AB_dimInfo; // the expected combination
+    
+    A_dimInfo.nominalExtent = 15;
+    B_dimInfo.nominalExtent = 15;
+    AB_dimInfo.nominalExtent = 15;
+    
+    A_dimInfo.blockPlusDiagonalLastNonDiagonal = -1; // set on all three, but not compared in the checks below
+    B_dimInfo.blockPlusDiagonalLastNonDiagonal = -1;
+    AB_dimInfo.blockPlusDiagonalLastNonDiagonal = -1;
+    
+    A_dimInfo.variationModulus = 15;
+    B_dimInfo.variationModulus = 1;
+    AB_dimInfo.variationModulus = 15;
+    
+    A_dimInfo.variationType = GENERAL;
+    B_dimInfo.variationType = CONSTANT;
+    AB_dimInfo.variationType = GENERAL;
+    
+    A_dimInfo.dataExtent  =  A_dimInfo.nominalExtent / ( A_dimInfo.nominalExtent /  A_dimInfo.variationModulus); // 15 / (15 / 15) = 15
+    B_dimInfo.dataExtent  =  B_dimInfo.nominalExtent / ( B_dimInfo.nominalExtent /  B_dimInfo.variationModulus); // 15 / (15 / 1)  = 1
+    AB_dimInfo.dataExtent = AB_dimInfo.nominalExtent / (AB_dimInfo.nominalExtent / AB_dimInfo.variationModulus); // 15 / (15 / 15) = 15
+    
+    // combinedDimensionInfo should commute, so let's test both directions:
+    DimensionInfo AB_dimInfoActual_LR = combinedDimensionInfo(A_dimInfo, B_dimInfo);
+    DimensionInfo AB_dimInfoActual_RL = combinedDimensionInfo(B_dimInfo, A_dimInfo);
+    
+    std::vector<DimensionInfo> actualCombinations {AB_dimInfoActual_LR, AB_dimInfoActual_RL};
+    
+    for (const auto & dimInfoActual : actualCombinations) // both argument orders are held to the same expectation
+    {
+      TEST_EQUALITY(dimInfoActual.nominalExtent, AB_dimInfo.nominalExtent);
+      TEST_EQUALITY(dimInfoActual.dataExtent, AB_dimInfo.dataExtent);
+      TEST_EQUALITY(dimInfoActual.variationType, AB_dimInfo.variationType);
+      TEST_EQUALITY(dimInfoActual.variationModulus, AB_dimInfo.variationModulus);
+    }
+  }
+
 // #pragma mark Data: EmptyDataMarkedAsInvalid
 /** \brief When Data containers are constructed without arguments, the isValid() method should return false.  This test confirms that that is the case.
  */
@@ -91,7 +200,7 @@ namespace
     Kokkos::RangePolicy<ExecSpaceType> policy(0,1); // trivial policy: 1 entry
     Kokkos::parallel_for("set lastVal", policy,
     KOKKOS_LAMBDA (const int &i) {
-      auto & lastVal = data.getWritableEntry(numRows-1, numCols-1, 0, 0, 0, 0, 0);
+      auto & lastVal = data.getWritableEntry(numRows-1, numCols-1);
       lastVal = lastValueToSet;
     });
     
@@ -104,6 +213,90 @@ namespace
     testFloatingEquality2(expectedView, data, relTol, absTol, out, success);
   }
 
+/** \brief Data has facilities for in-place combinations of logical data.  Suppose you have two containers of nominal shape (C,P), one of which is constant across cells, the other of which is constant across points.  To combine these (e.g., sum them together entrywise), you want a container that varies in both cells and points.  The test below exercises storeInPlaceSum(): the result container is allocated with allocateInPlaceCombinationResult(), filled with A + B, and compared entrywise against values computed on the host.
+*/
+
+  TEUCHOS_UNIT_TEST( Data, InPlaceSum )
+  {
+    double relTol = 1e-13;
+    double absTol = 1e-13;
+    
+    // Use two Data objects A and B, each with nominal shape (5,9,15) -- (C,F,P), say.
+    // with A having variation types of GENERAL, MODULAR, and CONSTANT,
+    // and B having variation types of CONSTANT, CONSTANT, and GENERAL.
+    // Result should have variation types of GENERAL, MODULAR, GENERAL.
+    using DeviceType = DefaultTestDeviceType;
+    using Scalar = double;
+    
+    const int rank        = 3;
+    const int cellCount   = 5;
+    const int fieldCount  = 9;
+    const int pointCount  = 15;
+    
+    auto formula_A = [] (int cellOrdinal, int fieldOrdinal, int pointOrdinal) -> double
+    {
+      // varies modulus 3 in fieldOrdinal; constant pointwise
+      return double(cellOrdinal) + double(fieldOrdinal % 3);
+    };
+    
+    auto formula_B = [] (int cellOrdinal, int fieldOrdinal, int pointOrdinal) -> double
+    {
+      // constant in cell, field; varies pointwise
+      return double(pointOrdinal);
+    };
+    
+    auto sum = [] (const Scalar &a, const Scalar &b) -> Scalar
+    {
+      return a + b;
+    };
+    
+    const int fieldCountA = 3; // A is modular in field dimension, with variation mod 3.
+    auto AView = getView<Scalar,DeviceType>("A", cellCount, fieldCountA);
+    auto BView = getView<Scalar,DeviceType>("B", pointCount);
+    
+    auto ABView = getView<Scalar,DeviceType>("A+B", cellCount, fieldCountA, pointCount);
+    
+    // fill host mirrors with the formulas above, then copy to the device views
+    auto AViewHost  = Kokkos::create_mirror(AView);
+    auto BViewHost  = Kokkos::create_mirror(BView);
+    auto ABViewHost = Kokkos::create_mirror(ABView);
+    for (int cellOrdinal=0; cellOrdinal<cellCount; cellOrdinal++)
+    {
+      for (int fieldOrdinal=0; fieldOrdinal<fieldCountA; fieldOrdinal++)
+      {
+        for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
+        {
+          auto a = formula_A(cellOrdinal,fieldOrdinal,pointOrdinal);
+          auto b = formula_B(cellOrdinal,fieldOrdinal,pointOrdinal);
+          AViewHost (cellOrdinal,fieldOrdinal) = a; // rewritten for every pointOrdinal with the same value
+          BViewHost (pointOrdinal) = b;             // rewritten for every (cell, field) with the same value
+          ABViewHost(cellOrdinal,fieldOrdinal,pointOrdinal) = sum(a,b);
+        }
+      }
+    }
+    Kokkos::deep_copy( AView,  AViewHost);
+    Kokkos::deep_copy( BView,  BViewHost);
+    Kokkos::deep_copy(ABView, ABViewHost);
+    
+    Kokkos::Array<int,rank> extents {cellCount, fieldCount, pointCount};
+    Kokkos::Array<DataVariationType,rank> A_variation {GENERAL, MODULAR, CONSTANT};
+    Kokkos::Array<DataVariationType,rank> B_variation {CONSTANT, CONSTANT, GENERAL};
+    
+    Data<Scalar,DeviceType> A(AView,extents,A_variation);
+    Data<Scalar,DeviceType> B(BView,extents,B_variation);
+    
+    // expected variation for A+B:
+    Kokkos::Array<DataVariationType,3> AB_variation {GENERAL, MODULAR, GENERAL};
+    // expected Data object for A+B:
+    Data<Scalar,DeviceType> AB_expected(ABView,extents,AB_variation);
+    
+    auto AB_actual = Data<Scalar,DeviceType>::allocateInPlaceCombinationResult(A, B);
+    
+    AB_actual.storeInPlaceSum(A, B); // the operation under test
+    
+    // test AB_actual equals AB_expected.  (This will iterate over the nominal extents.)
+    testFloatingEquality3(AB_actual, AB_expected, relTol, absTol, out, success);
+  }
+
 // #pragma mark Data: MatVec
 /** \brief Data provides matrix-vector multiplication support.  This method checks correctness of the computed mat-vec for a particular case involving a 2x2 matrix and a 2x1 vector.
 */
@@ -209,7 +402,7 @@ namespace
   }
 
 // #pragma mark Data: MatMatExplicitIdentity_PDD
-/** \brief Data provides matrix-matrix multiplication support.  This method checks correctness of the computed mat-mat for several cases involving 3x3 identity matrices.  Here, the notional dimensions (C,P,D,D) differ from the stored dimensions of (P,D,D).  We test each possible transpose combination.
+/** \brief Data provides matrix-matrix multiplication support.  This method checks correctness of the computed mat-mat for several cases involving 3x3 identity matrices.  Here, the logical dimensions (C,P,D,D) differ from the stored dimensions of (P,D,D).  We test each possible transpose combination.
 */
 TEUCHOS_UNIT_TEST( Data, MatMatExplicitIdentity_PDD ) // (P,D,D) underlying; notionally (C,P,D,D)
 {
@@ -280,7 +473,7 @@ TEUCHOS_UNIT_TEST( Data, MatMatExplicitIdentity_PDD ) // (P,D,D) underlying; not
 }
 
   // #pragma mark Data: MatMatBlockPlusDiagonal
-/** \brief Data provides matrix-matrix multiplication support.  This method checks correctness of the computed mat-mat for a case involving one 3x3 matrix that has a 2x2 upper left block, and diagonal entry in the (3,3) position, and one 3x3 matrix that is entirely diagonal.  Here, the notional dimensions (C,D,D) match the stored dimensions.
+/** \brief Data provides matrix-matrix multiplication support.  This method checks correctness of the computed mat-mat for a case involving one 3x3 matrix that has a 2x2 upper left block, and diagonal entry in the (3,3) position, and one 3x3 matrix that is entirely diagonal.  Here, the logical dimensions (C,D,D) match the stored dimensions.
 */
   TEUCHOS_UNIT_TEST( Data, MatMatBlockPlusDiagonal )
   {
diff --git a/packages/intrepid2/unit-test/performance/CMakeLists.txt b/packages/intrepid2/unit-test/performance/CMakeLists.txt
index 190c839e5cb5..13a28a738f4c 100644
--- a/packages/intrepid2/unit-test/performance/CMakeLists.txt
+++ b/packages/intrepid2/unit-test/performance/CMakeLists.txt
@@ -1,2 +1,3 @@
+ADD_SUBDIRECTORY(DataCombination)
 ADD_SUBDIRECTORY(StructuredIntegration)
 
diff --git a/packages/intrepid2/unit-test/performance/DataCombination/CMakeLists.txt b/packages/intrepid2/unit-test/performance/DataCombination/CMakeLists.txt
new file mode 100644
index 000000000000..6a73853732ba
--- /dev/null
+++ b/packages/intrepid2/unit-test/performance/DataCombination/CMakeLists.txt
@@ -0,0 +1,13 @@
+SET(SOURCES "") # cleared here; overwritten by the GLOB below
+# Every .cpp file in this directory is compiled into the executable; the glob is evaluated when CMake runs, so re-run CMake after adding a source file.
+FILE(GLOB SOURCES *.cpp)
+
+SET(LIBRARIES intrepid2) # NOTE(review): not referenced by the call below -- confirm whether it is still needed
+# Builds the DataCombinationPerformance executable and registers it as a test run on one MPI process with no command-line arguments.
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  DataCombinationPerformance
+  SOURCES ${SOURCES}
+  ARGS 
+  NUM_MPI_PROCS 1
+  ADD_DIR_TO_NAME
+  )
diff --git a/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp b/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp
new file mode 100644
index 000000000000..b965e28b1cb4
--- /dev/null
+++ b/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp
@@ -0,0 +1,399 @@
+// @HEADER
+// ************************************************************************
+//
+//                           Intrepid2 Package
+//                 Copyright (2007) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Kyungjoo Kim  (kyukim@sandia.gov),
+//                    Mauro Perego  (mperego@sandia.gov), or
+//                    Nate Roberts  (nvrober@sandia.gov)
+//
+// ************************************************************************
+// @HEADER
+
+/** \file   DataCombinationPerformance.cpp
+    \brief  Main for performance tests comparing performance when combining Intrepid2 Data objects (as sums and products) with the performance of (expanded) Kokkos View objects.
+ 
+ Specifically, we consider a few use cases, each with nominal shape (C,P):
+ 1. Constant data.  This case favors Data objects most heavily, since redundancy in the Views will be maximal.
+ 2. "Affine" data.  This has shape (C,P), but only varies in the cell dimension.
+ 3. General data.  There is no redundancy in the data.  This case favors the View objects most heavily, and will maximally expose overhead from the Data implementation.
+ 
+ We can define an ideal speedup as the relative reduction in flop count.  Any write to a general data container will have an ideal speedup of 1.0 (no reduction in flop count); affine data will have
+ an ideal speedup of P; constant will have an ideal speedup of C*P.
+ 
+ In addition to combinations of "like" Data (e.g., constant plus constant), we can also test combinations of "unlike" Data (e.g., constant plus affine).  We expect these tests to have performance
+ characteristics somewhere between the corresponding "like" tests, but the ideal speedups for these tests will correspond to that of the more general container (e.g., constant plus affine will have
+ the same ideal speedup as affine plus affine: P).
+ 
+ We fix the cell count at 16,000, and allow the point count to vary.  We expect (and observe) that constant/constant operations will not come close to the ideal speedup (because the overhead
+ surrounding the operation dwarfs the single-flop cost); affine/affine operations fare somewhat better, with some non-negligible fraction of the ideal speedup; general/general operations are quite
+ close to the 1.0 ideal speedup.  The mixed operations that result in general containers (e.g., general/constant) often do substantially better than the ideal speedup, thanks to enhanced data locality.
+ 
+ After measuring timings, we also confirm that the two algorithms agree on the results.
+ */
+
+#include "Teuchos_GlobalMPISession.hpp"
+
+#include "Teuchos_StackedTimer.hpp"
+#include "Teuchos_TimeMonitor.hpp"
+#include "Teuchos_DefaultComm.hpp"
+
+#include "Kokkos_Core.hpp"
+
+#include "Intrepid2_Data.hpp"
+#include "Intrepid2_TestUtils.hpp"
+#include "Intrepid2_Types.hpp"
+
+enum CaseChoice
+{
+  Constant,
+  Affine,
+  General
+};
+
+std::string to_string(CaseChoice choice)
+{
+  switch (choice) {
+    case Constant: return "Constant";
+    case Affine:   return "Affine";
+    case General:  return "General";
+    
+    default:       return "Unknown CaseChoice";
+  }
+}
+
+using namespace Intrepid2;
+
+static const int NUM_CELLS = 16000;
+
+template< typename Scalar, typename DeviceType >
+inline
+Data<Scalar, DeviceType> getData(CaseChoice caseChoice, const int numPoints, const double baseValue)
+{
+  using ExecutionSpace = typename DeviceType::execution_space;
+  const int numCells = NUM_CELLS;
+  Kokkos::Array<ordinal_type,2> extents {numCells, numPoints};
+  Kokkos::Array<DataVariationType,2> variationTypes {GENERAL,GENERAL};
+  // Builds a Data with logical extents (NUM_CELLS, numPoints) for the requested case; an unknown case yields a default-constructed Data.
+  switch (caseChoice) {
+    case Constant:
+      return Data<Scalar, DeviceType>(baseValue,extents);
+    case Affine:
+    {
+      // (C,P); varies in C dimension
+      variationTypes[1] = CONSTANT;
+      Kokkos::View<Scalar*,DeviceType> cellView("affine case - underlying view",numCells);
+      Kokkos::RangePolicy<ExecutionSpace> policy(ExecutionSpace(), 0, numCells);
+      Kokkos::parallel_for("initialize underlying view data", policy,
+      KOKKOS_LAMBDA (const int &i0) {
+        cellView(i0) = i0 * baseValue;
+      });
+      return Data<Scalar, DeviceType>(cellView,extents,variationTypes);
+    }
+    case General:
+    {
+      // (C,P); varies in C and P dimensions
+      variationTypes[1] = GENERAL;
+      Kokkos::View<Scalar**,DeviceType> cellView("general case - underlying view",numCells,numPoints); // label names the general case, not the affine one
+      Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>> policy({0,0},{numCells,numPoints});
+      Kokkos::parallel_for("initialize underlying view data", policy,
+      KOKKOS_LAMBDA (const int &i0, const int &i1) {
+        cellView(i0,i1) = i0 * baseValue + i1;
+      });
+      return Data<Scalar, DeviceType>(cellView,extents,variationTypes);
+    }
+    default:
+      return Data<Scalar, DeviceType>();
+  }
+}
+
+double idealSpeedup(CaseChoice caseChoice, const int numPoints) // ideal speedup (relative flop-count reduction) for a (C,P) container; -1.0 for an unknown case
+{
+  switch (caseChoice) {
+    case Constant:
+      return static_cast<double>(NUM_CELLS) * numPoints; // multiply in double: the int product overflows for large user-supplied point counts
+    case Affine:
+      return numPoints;
+    case General:
+      return 1.0;
+    default:
+      return -1.0;
+  }
+}
+
+template< typename Scalar, typename DeviceType >
+Kokkos::View<Scalar**, DeviceType> allocateView(const int numPoints)
+{
+  Kokkos::View<Scalar**,DeviceType> view("DataCombinationPerformance - View", NUM_CELLS, numPoints);
+  return view;
+}
+
+template< typename Scalar, typename DeviceType >
+inline
+void fillView(CaseChoice caseChoice, Kokkos::View<Scalar**,DeviceType> view, const double baseValue)
+{
+  using ExecutionSpace = typename DeviceType::execution_space;
+  
+  switch (caseChoice) {
+    case Constant:
+      Kokkos::deep_copy(view, baseValue);
+      break;
+    case Affine:
+    {
+      Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>> policy({0,0},{view.extent_int(0),view.extent_int(1)});
+      // (C,P); varies in C dimension
+      Kokkos::parallel_for("initialize underlying view data", policy,
+      KOKKOS_LAMBDA (const int &i0, const int &i1) {
+        view(i0,i1) = i0 * baseValue;
+      });
+    }
+      break;
+    case General:
+    {
+      Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>> policy({0,0},{view.extent_int(0),view.extent_int(1)});
+      // (C,P); varies in C and P dimensions
+      Kokkos::parallel_for("initialize underlying view data", policy,
+      KOKKOS_LAMBDA (const int &i0, const int &i1) {
+        view(i0,i1) = i0 * baseValue + i1;
+      });
+    }
+    break;
+    default:
+      break;
+  }
+  ExecutionSpace().fence();
+}
+
+template< typename Scalar, typename DeviceType >
+void sumViews(Kokkos::View<Scalar**,DeviceType> resultView,
+              Kokkos::View<Scalar**,DeviceType> view1, Kokkos::View<Scalar**,DeviceType> view2)
+{
+  using ExecutionSpace = typename DeviceType::execution_space;
+  Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>> policy({0,0},{resultView.extent_int(0),resultView.extent_int(1)});
+  // Entrywise resultView = view1 + view2 over the extents of resultView; no fence is issued here, the caller fences around the call.
+  Kokkos::parallel_for("sum views", policy,
+  KOKKOS_LAMBDA (const int &i0, const int &i1) {
+    resultView(i0,i1) = view1(i0,i1) + view2(i0,i1);
+  });
+}
+
+int main( int argc, char* argv[] )
+{
+  // Note that the dtor for GlobalMPISession will call Kokkos::finalize_all() but does not call Kokkos::initialize()...
+  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  Kokkos::initialize(argc,argv);
+  
+  using std::cout;
+  using std::endl;
+  using std::string;
+  using std::vector;
+  
+  bool success = true;
+  
+  {
+    vector<CaseChoice> allCaseChoices {Constant, Affine, General};
+    
+    Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options
+    
+    string caseChoiceString = "All"; // alternatives: Constant, Affine, General
+    
+    int pointCountFixed = -1;
+    int pointCountMin = 16;
+    int pointCountMax = 1024;
+    
+    cmdp.setOption("case", &caseChoiceString, "Options: All, Constant, Affine, General");
+    cmdp.setOption("pointCount", &pointCountFixed, "Single point count to run with");
+    cmdp.setOption("minPointCount", &pointCountMin, "Starting point count (will double until max count is reached)");
+    cmdp.setOption("maxPointCount", &pointCountMax, "Maximum point count");
+    
+    if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
+    {
+  #ifdef HAVE_MPI
+      MPI_Finalize();
+  #endif
+      return -1;
+    }
+
+    vector<CaseChoice> caseChoices;
+    if (caseChoiceString == "All")
+    {
+      caseChoices = allCaseChoices;
+    }
+    else if (caseChoiceString == "Constant")
+    {
+      caseChoices = vector<CaseChoice>{Constant};
+    }
+    else if (caseChoiceString == "Affine")
+    {
+      caseChoices = vector<CaseChoice>{Affine};
+    }
+    else if (caseChoiceString == "General")
+    {
+      caseChoices = vector<CaseChoice>{General};
+    }
+    else
+    {
+      cout << "Unrecognized case choice: " << caseChoiceString << endl;
+#ifdef HAVE_MPI
+      MPI_Finalize();
+#endif
+      return -1;
+    }
+    
+    if (pointCountFixed > 0)
+    {
+      pointCountMin = pointCountFixed;
+      pointCountMax = pointCountFixed;
+    }
+    
+    using Scalar = double;
+    using DeviceType = Kokkos::DefaultExecutionSpace::device_type;
+    
+    using DataType = Data<Scalar, DeviceType>;
+    
+    const int charWidth = 15;
+    using std::vector;
+    using std::map;
+    using std::pair;
+    using std::make_pair;
+    using std::tuple;
+    using std::cout;
+    using std::endl;
+    using std::setw;
+    using std::scientific;
+    using std::fixed;
+    
+    const double absTol = 1e-15, relTol = 1e-15;
+    
+    for (CaseChoice caseChoice1 : caseChoices)
+    {
+      for (CaseChoice caseChoice2 : caseChoices)
+      {
+//        {
+//          // DEBUGGING:
+//          if ((caseChoice1 != General) && (caseChoice2 == General))
+//          {
+//            cout << "Set breakpoint here.\n";
+//          }
+//        }
+        
+        // since constant takes so little time (and measurement is therefore noisy), we do a bunch of measurements and use their average
+        const bool bothConstant   = (caseChoice1 == Constant) && (caseChoice2 == Constant);
+        const int numMeasurements = bothConstant ? 1000 : 1;
+        
+        cout << "\n\n*******************************************\n";
+        cout <<     "******   " << setw(12) << to_string(caseChoice1) << "/" << to_string(caseChoice2) << setw(14) << "   ******\n";
+        cout << "*******************************************\n";
+        for (int pointCount=pointCountMin; pointCount<=pointCountMax; pointCount *= 2)
+        {
+          const double baseValue1 = M_PI;
+          const double baseValue2 = 1.0;
+
+          Data<Scalar, DeviceType> result;
+          auto dataTimer = Teuchos::TimeMonitor::getNewTimer("Data sum");
+          for (int i=0; i<numMeasurements; i++)
+          {
+            auto data1 = getData<Scalar, DeviceType>(caseChoice1, pointCount, baseValue1);
+            auto data2 = getData<Scalar, DeviceType>(caseChoice2, pointCount, baseValue2);
+            
+            result = DataType::allocateInPlaceCombinationResult(data1, data2);
+            
+            DeviceType::execution_space().fence();
+            dataTimer->start();
+            result.storeInPlaceSum(data1, data2);
+            DeviceType::execution_space().fence();
+            dataTimer->stop();
+          }
+          double dataElapsedTimeSeconds = dataTimer->totalElapsedTime() / numMeasurements;
+          
+          cout << "Point count:          " << setw(charWidth) << pointCount << endl;
+          cout << "Time (sum - data):    " << setw(charWidth) << std::setprecision(2) << scientific << dataElapsedTimeSeconds << endl;
+          
+          dataTimer->reset();
+          
+          auto viewTimer = Teuchos::TimeMonitor::getNewTimer("View sum");
+          auto view1 = allocateView<Scalar, DeviceType>(pointCount);
+          auto view2 = allocateView<Scalar, DeviceType>(pointCount);
+          auto resultView = allocateView<Scalar, DeviceType>(pointCount);
+          
+          fillView(caseChoice1, view1, baseValue1);
+          fillView(caseChoice2, view2, baseValue2);
+          
+          DeviceType::execution_space().fence();
+          viewTimer->start();
+          sumViews(resultView, view1, view2);
+          DeviceType::execution_space().fence();
+          viewTimer->stop();
+          double viewElapsedTimeSeconds = viewTimer->totalElapsedTime();
+          cout << "Time (sum - view):    " << setw(charWidth) << std::setprecision(2) << scientific << viewElapsedTimeSeconds << endl;
+          
+          viewTimer->reset();
+          
+          const double maxSpeedup = std::min(idealSpeedup(caseChoice1, pointCount),idealSpeedup(caseChoice2, pointCount));
+          const double actualSpeedup = viewElapsedTimeSeconds / dataElapsedTimeSeconds;
+          const double percentage = actualSpeedup / maxSpeedup * 100.0;
+          cout << "Ideal speedup:        " << setw(charWidth) << std::setprecision(2) << scientific << maxSpeedup << endl;
+          cout << "Actual speedup:       " << setw(charWidth) << std::setprecision(2) << scientific << actualSpeedup << endl;
+          cout << "Percentage of ideal:  " << setw(charWidth) << std::setprecision(2) << fixed << percentage << "%" << endl;
+          cout << endl;
+          
+          // to optimize for the case where the test passes, we output to a Teuchos::oblackholestream first.
+          // if the test fails, we repeat the comparison to std::cout.
+          Teuchos::oblackholestream  outNothing;
+          Teuchos::basic_FancyOStream<char> out(Teuchos::rcp(&outNothing,false));
+          bool localSuccess = true;
+          testFloatingEquality2(resultView, result, relTol, absTol, out, localSuccess);
+          
+          if (!localSuccess)
+          {
+            cout << "Error: results do not match.  Comparison details:\n";
+            
+            Teuchos::oblackholestream  outNothing;
+            Teuchos::basic_FancyOStream<char> out(Teuchos::rcp(&outNothing,false));
+            
+            Teuchos::basic_FancyOStream<char> std_out(Teuchos::rcp(&std::cout,false));
+            testFloatingEquality2(resultView, result, relTol, absTol, std_out, localSuccess);
+            
+            success = false;
+          }
+        }
+      }
+    }
+  }
+  
+  if (success)
+    return 0;
+  else
+    return -1;
+}
diff --git a/packages/kokkos-kernels/CHANGELOG.md b/packages/kokkos-kernels/CHANGELOG.md
index 911bb3219754..4326f3ee5f19 100644
--- a/packages/kokkos-kernels/CHANGELOG.md
+++ b/packages/kokkos-kernels/CHANGELOG.md
@@ -1,39 +1,50 @@
 # Change Log
 
+## [3.4.01](https://github.com/kokkos/kokkos-kernels/tree/3.4.01) (2021-05-19)
+[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.4.00...3.4.01)
+
+**Fixed Bugs:**
+- Windows: Fixes for Windows [\#981](https://github.com/kokkos/kokkos-kernels/pull/981)
+- Sycl: ArithTraits fixes for Sycl [\#959](https://github.com/kokkos/kokkos-kernels/pull/959)
+- Sparse: Added code to allow KokkosKernels coloring to accept partial colorings [\#938](https://github.com/kokkos/kokkos-kernels/pull/938)
+- Sparse: Include sorting within spiluk [\#972](https://github.com/kokkos/kokkos-kernels/pull/972)
+- Sparse: Fix CrsMatrix raw pointer constructor [\#971](https://github.com/kokkos/kokkos-kernels/pull/971)
+- Sparse: Fix spmv Serial beta==-1 code path [\#947](https://github.com/kokkos/kokkos-kernels/pull/947)
+
 ## [3.4.00](https://github.com/kokkos/kokkos-kernels/tree/3.4.00) (2021-04-25)
 [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.3.01...3.4.00)
 
 **Features:**
-- SYCL: adding ETI and CMake logic for SYCL backend [\#924](https://github.com/kokkos/kokkos/pull/924)
+- SYCL: adding ETI and CMake logic for SYCL backend [\#924](https://github.com/kokkos/kokkos-kernels/pull/924)
 
 **Implemented enhancements Algorithms and Archs:**
-- Two-stage GS: add damping factors [\#921](https://github.com/kokkos/kokkos/pull/921)
-- Supernodal SpTRSV, improve symbolic performance [\#899](https://github.com/kokkos/kokkos/pull/899)
-- Add MKL SpMV wrapper [\#895](https://github.com/kokkos/kokkos/pull/895)
-- Serial code path for spmv [\#893](https://github.com/kokkos/kokkos/pull/893)
+- Two-stage GS: add damping factors [\#921](https://github.com/kokkos/kokkos-kernels/pull/921)
+- Supernodal SpTRSV, improve symbolic performance [\#899](https://github.com/kokkos/kokkos-kernels/pull/899)
+- Add MKL SpMV wrapper [\#895](https://github.com/kokkos/kokkos-kernels/pull/895)
+- Serial code path for spmv [\#893](https://github.com/kokkos/kokkos-kernels/pull/893)
 
 **Implemented enhancements BuildSystem:**
-- Cmake: Update ArmPL support [\#901](https://github.com/kokkos/kokkos/pull/901)
-- Cmake: Add ARMPL TPL support [\#880](https://github.com/kokkos/kokkos/pull/880)
-- IntelClang guarding __assume_aligned with !defined(__clang__) [\#878](https://github.com/kokkos/kokkos/pull/878)
+- Cmake: Update ArmPL support [\#901](https://github.com/kokkos/kokkos-kernels/pull/901)
+- Cmake: Add ARMPL TPL support [\#880](https://github.com/kokkos/kokkos-kernels/pull/880)
+- IntelClang guarding `__assume_aligned` with `!defined(__clang__)` [\#878](https://github.com/kokkos/kokkos-kernels/pull/878)
 
 **Implemented enhancements Other:**
-- Add static_assert/throw in batched eigendecomp [\#931](https://github.com/kokkos/kokkos/pull/931)
-- Workaround using new/delete in kernel code [\#925](https://github.com/kokkos/kokkos/pull/925)
-- Blas perf_test updates [\#892](https://github.com/kokkos/kokkos/pull/892)
+- Add static_assert/throw in batched eigendecomp [\#931](https://github.com/kokkos/kokkos-kernels/pull/931)
+- Workaround using new/delete in kernel code [\#925](https://github.com/kokkos/kokkos-kernels/pull/925)
+- Blas perf_test updates [\#892](https://github.com/kokkos/kokkos-kernels/pull/892)
 
 **Fixed bugs:**
-- Fix ctor CrsMat mirror with CrsGraph mirror [\#918](https://github.com/kokkos/kokkos/pull/918)
-- Fix nrm1, removed cublas nrminf, improved blas tests [\#915](https://github.com/kokkos/kokkos/pull/915)
-- Fix and testing coverage mainly in graph coarsening [\#910](https://github.com/kokkos/kokkos/pull/910)
-- Fix KokkosSparse for nightly test failure [\#898](https://github.com/kokkos/kokkos/pull/898)
-- Fix view types across ternary operator [\#894](https://github.com/kokkos/kokkos/pull/894)
-- Make work_view_t typedef consistent [\#885](https://github.com/kokkos/kokkos/pull/885)
-- Fix supernodal SpTRSV build with serial+openmp+cuda [\#884](https://github.com/kokkos/kokkos/pull/884)
-- Construct SpGEMM C with correct ncols [\#883](https://github.com/kokkos/kokkos/pull/883)
-- Matrix Converter: fixing issue with deallocation after Kokkos::fininalize [\#882](https://github.com/kokkos/kokkos/pull/882)
-- Fix >1024 team size error in sort_crs_* [\#872](https://github.com/kokkos/kokkos/pull/872)
-- Fixing seg fault with empty matrix in kspiluk [\#871](https://github.com/kokkos/kokkos/pull/871)
+- Fix ctor CrsMat mirror with CrsGraph mirror [\#918](https://github.com/kokkos/kokkos-kernels/pull/918)
+- Fix nrm1, removed cublas nrminf, improved blas tests [\#915](https://github.com/kokkos/kokkos-kernels/pull/915)
+- Fix and testing coverage mainly in graph coarsening [\#910](https://github.com/kokkos/kokkos-kernels/pull/910)
+- Fix KokkosSparse for nightly test failure [\#898](https://github.com/kokkos/kokkos-kernels/pull/898)
+- Fix view types across ternary operator [\#894](https://github.com/kokkos/kokkos-kernels/pull/894)
+- Make work_view_t typedef consistent [\#885](https://github.com/kokkos/kokkos-kernels/pull/885)
+- Fix supernodal SpTRSV build with serial+openmp+cuda [\#884](https://github.com/kokkos/kokkos-kernels/pull/884)
+- Construct SpGEMM C with correct ncols [\#883](https://github.com/kokkos/kokkos-kernels/pull/883)
+- Matrix Converter: fixing issue with deallocation after Kokkos::finalize [\#882](https://github.com/kokkos/kokkos-kernels/pull/882)
+- Fix >1024 team size error in sort_crs_* [\#872](https://github.com/kokkos/kokkos-kernels/pull/872)
+- Fixing seg fault with empty matrix in kspiluk [\#871](https://github.com/kokkos/kokkos-kernels/pull/871)
 
 ## [3.3.01](https://github.com/kokkos/kokkos-kernels/tree/3.3.01) (2021-01-18)
 [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.3.00...3.3.01)
diff --git a/packages/kokkos-kernels/CMakeLists.txt b/packages/kokkos-kernels/CMakeLists.txt
index 1f698db6683a..88292bdd0c06 100644
--- a/packages/kokkos-kernels/CMakeLists.txt
+++ b/packages/kokkos-kernels/CMakeLists.txt
@@ -25,7 +25,7 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS)
   ENDIF()
   SET(KokkosKernels_VERSION_MAJOR 3)
   SET(KokkosKernels_VERSION_MINOR 4)
-  SET(KokkosKernels_VERSION_PATCH 0)
+  SET(KokkosKernels_VERSION_PATCH 01)
 ENDIF()
 
 IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0")
diff --git a/packages/kokkos-kernels/master_history.txt b/packages/kokkos-kernels/master_history.txt
index a113e3619f79..5c63ba453d97 100644
--- a/packages/kokkos-kernels/master_history.txt
+++ b/packages/kokkos-kernels/master_history.txt
@@ -12,3 +12,5 @@ tag: 3.1.01     date: 05/04/2020  master: 43773523    release: 6fce7502
 tag: 3.2.00     date: 08/19/2020  master: 07a60bcc    release: ea3f2b77
 tag: 3.3.00     date: 12/16/2020  master: 42defc56    release: e5279e55
 tag: 3.3.01     date: 01/18/2021  master: f64b1c57    release: 4e1cc00b
+tag: 3.4.00     date: 04/26/2021  master: fe439b21    release: d3c33910
+tag: 3.4.01     date: 05/20/2021  master: 564dccb3    release: 4c62eb86
diff --git a/packages/kokkos-kernels/src/Kokkos_ArithTraits.hpp b/packages/kokkos-kernels/src/Kokkos_ArithTraits.hpp
index f96ffc49c39c..17d3f568fe10 100644
--- a/packages/kokkos-kernels/src/Kokkos_ArithTraits.hpp
+++ b/packages/kokkos-kernels/src/Kokkos_ArithTraits.hpp
@@ -729,7 +729,13 @@ class ArithTraits<Kokkos::Experimental::half_t> {
     return Kokkos::Experimental::cast_to_half(::sqrt (Kokkos::Experimental::cast_from_half<float>(x)));
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) {
-    return Kokkos::Experimental::cast_to_half(::cbrt (Kokkos::Experimental::cast_from_half<float>(x)));
+    return Kokkos::Experimental::cast_to_half(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::cbrt(Kokkos::Experimental::cast_from_half<float>(x))
+#else
+        ::cbrt(Kokkos::Experimental::cast_from_half<float>(x))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) {
     return Kokkos::Experimental::cast_to_half(::exp (Kokkos::Experimental::cast_from_half<float>(x)));
@@ -762,10 +768,22 @@ class ArithTraits<Kokkos::Experimental::half_t> {
     return Kokkos::Experimental::cast_to_half(::asin (Kokkos::Experimental::cast_from_half<float>(x)));
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) {
-    return Kokkos::Experimental::cast_to_half(::acos (Kokkos::Experimental::cast_from_half<float>(x)));
+    return Kokkos::Experimental::cast_to_half(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::acos(Kokkos::Experimental::cast_from_half<float>(x))
+#else
+        ::acos(Kokkos::Experimental::cast_from_half<float>(x))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-    return Kokkos::Experimental::cast_to_half(::atan (Kokkos::Experimental::cast_from_half<float>(x)));
+    return Kokkos::Experimental::cast_to_half(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::atan(Kokkos::Experimental::cast_from_half<float>(x))
+#else
+        ::atan(Kokkos::Experimental::cast_from_half<float>(x))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon () {
     //return ::pow(2, -KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS);
@@ -858,16 +876,16 @@ class ArithTraits<float> {
   static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const float x) {
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
     using std::isinf;
-#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    using sycl::isinf
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
+    using sycl::isinf;
 #endif
     return isinf (x);
   }
   static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const float x) {
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
     using std::isnan;
-#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    using sycl::isnan
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
+    using sycl::isnan;
 #endif
     return isnan (x);
   }
@@ -899,10 +917,18 @@ class ArithTraits<float> {
     return ::pow (x, y);
   }
   static KOKKOS_FORCEINLINE_FUNCTION float sqrt (const float x) {
-    return ::sqrt (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::sqrt(x);
+#else
+    return ::sqrt(x);
+#endif
   }
   static KOKKOS_FORCEINLINE_FUNCTION float cbrt (const float x) {
-    return ::cbrt (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::cbrt(x);
+#else
+    return ::cbrt(x);
+#endif
   }
   static KOKKOS_FORCEINLINE_FUNCTION float exp (const float x) {
     return ::exp (x);
@@ -938,7 +964,11 @@ class ArithTraits<float> {
     return ::acos (x);
   }
   static KOKKOS_FORCEINLINE_FUNCTION float atan (const float x) {
-    return ::atan (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::atan(x);
+#else
+    return ::atan(x);
+#endif
   }
   static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon () {
     return FLT_EPSILON;
@@ -1039,8 +1069,8 @@ class ArithTraits<std::complex<RealFloatType> > {
   static bool isInf(const std::complex<Dummy>& x) {
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
     using std::isinf;
-#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    using sycl::isinf
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
+    using sycl::isinf;
 #endif
     return isinf (real (x)) || isinf (imag (x));
   }
@@ -1062,8 +1092,8 @@ class ArithTraits<std::complex<RealFloatType> > {
   static bool isNan(const std::complex<Dummy>& x) {
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
     using std::isnan;
-#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-    using sycl::isnan
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
+    using sycl::isnan;
 #endif
     return isnan (real (x)) || isnan (imag (x));
   }
@@ -1130,7 +1160,11 @@ class ArithTraits<std::complex<RealFloatType> > {
     return std::sqrt (x);
   }
   static std::complex<RealFloatType> cbrt (const std::complex<RealFloatType>& x) {
-    return std::cbrt (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::cbrt(x);
+#else
+    return ::cbrt(x);
+#endif
   }
   static std::complex<RealFloatType> exp (const std::complex<RealFloatType>& x) {
     return std::exp (x);
@@ -1166,7 +1200,12 @@ class ArithTraits<std::complex<RealFloatType> > {
     return std::acos (x);
   }
   static std::complex<RealFloatType> atan (const std::complex<RealFloatType>& x) {
-    return std::atan (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    using sycl::atan;
+#else
+    using std::atan;
+#endif
+    return atan(x);
   }
   static std::complex<RealFloatType> nan () {
     const mag_type mag_nan = ArithTraits<mag_type>::nan ();
@@ -1251,17 +1290,17 @@ class ArithTraits<double> {
   static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) {
     #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
     using std::isinf;
-    #elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
     using sycl::isinf;
-    #endif
+#endif
     return isinf (x);
   }
   static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) {
     #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
     using std::isnan;
-    #elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
     using sycl::isnan;
-    #endif
+#endif
     return isnan (x);
   }
   static KOKKOS_FORCEINLINE_FUNCTION mag_type abs (const val_type x) {
@@ -1292,10 +1331,18 @@ class ArithTraits<double> {
     return ::pow (x, y);
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) {
-    return ::sqrt (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::sqrt(x);
+#else
+    return ::sqrt(x);
+#endif
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) {
-    return ::cbrt (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::cbrt(x);
+#else
+    return ::cbrt(x);
+#endif
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) {
     return ::exp (x);
@@ -1331,7 +1378,11 @@ class ArithTraits<double> {
     return ::acos (x);
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-    return ::atan (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::atan(x);
+#else
+    return ::atan(x);
+#endif
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type nan () {
 #if defined(__CUDA_ARCH__)
@@ -2224,10 +2275,22 @@ class ArithTraits<char> {
     // some reasonable value (like 0), though this might be more
     // expensive than the absolute value interpreted using the ternary
     // operator.
-    return static_cast<val_type> ( ::sqrt (static_cast<float> (abs (x))));
+    return static_cast<val_type>(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::sqrt(static_cast<float>(abs(x)))
+#else
+        ::sqrt(static_cast<float>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) {
-    return static_cast<val_type> ( ::cbrt (static_cast<float> (abs (x))));
+    return static_cast<val_type>(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::cbrt(static_cast<float>(abs(x)))
+#else
+        ::cbrt(static_cast<float>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) {
     return static_cast<val_type> ( ::exp (static_cast<float> (abs (x))));
@@ -2346,10 +2409,22 @@ class ArithTraits<signed char> {
     return intPowSigned<val_type> (x, y);
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt (const val_type x) {
-    return static_cast<val_type> ( ::sqrt (static_cast<float> (abs (x))));
+    return static_cast<val_type>(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::sqrt(static_cast<float>(abs(x)))
+#else
+        ::sqrt(static_cast<float>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) {
-    return static_cast<val_type> ( ::cbrt (static_cast<float> (abs (x))));
+    return static_cast<val_type>(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::cbrt(static_cast<float>(abs(x)))
+#else
+        ::cbrt(static_cast<float>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) {
     return static_cast<val_type> ( ::exp (static_cast<float> (abs (x))));
@@ -2471,10 +2546,22 @@ class ArithTraits<unsigned char> {
     // This will result in no loss of accuracy, though it might be
     // more expensive than it should, if we were clever about using
     // bit operations.
-    return static_cast<val_type> ( ::sqrt (static_cast<float> (x)));
+    return static_cast<val_type>(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::sqrt(static_cast<float>(abs(x)))
+#else
+        ::sqrt(static_cast<float>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) {
-    return static_cast<val_type> ( ::cbrt (static_cast<float> (x)));
+    return static_cast<val_type>(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::cbrt(static_cast<float>(abs(x)))
+#else
+        ::cbrt(static_cast<float>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) {
     return static_cast<val_type> ( ::exp (static_cast<float> (x)));
@@ -2604,10 +2691,22 @@ class ArithTraits<short> {
     // This will result in no loss of accuracy, though it might be
     // more expensive than it should, if we were clever about using
     // bit operations.
-    return static_cast<val_type> ( ::sqrt (static_cast<float> (abs (x))));
+    return static_cast<val_type>(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::sqrt(static_cast<float>(abs(x)))
+#else
+        ::sqrt(static_cast<float>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) {
-    return static_cast<val_type> ( ::cbrt (static_cast<float> (abs (x))));
+    return static_cast<val_type>(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::cbrt(static_cast<float>(abs(x)))
+#else
+        ::cbrt(static_cast<float>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) {
     return static_cast<val_type> ( ::exp (static_cast<float> (abs (x))));
@@ -2735,10 +2834,22 @@ class ArithTraits<unsigned short> {
     // This will result in no loss of accuracy, though it might be
     // more expensive than it should, if we were clever about using
     // bit operations.
-    return static_cast<val_type> ( ::sqrt (static_cast<float> (x)));
+    return static_cast<val_type>(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::sqrt(static_cast<float>(abs(x)))
+#else
+        ::sqrt(static_cast<float>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) {
-    return static_cast<val_type> ( ::cbrt (static_cast<float> (x)));
+    return static_cast<val_type>(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::cbrt(static_cast<float>(abs(x)))
+#else
+        ::cbrt(static_cast<float>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) {
     return static_cast<val_type> ( ::exp (static_cast<float> (x)));
@@ -2874,10 +2985,22 @@ class ArithTraits<int> {
     // This will result in no loss of accuracy, though it might be
     // more expensive than it should, if we were clever about using
     // bit operations.
-    return static_cast<val_type> ( ::sqrt (static_cast<double> (abs (x))));
+    return static_cast<val_type>(  // go through double: float's 24-bit significand cannot hold every int exactly
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::sqrt(static_cast<double>(abs(x)))
+#else
+        ::sqrt(static_cast<double>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) {
-    return static_cast<val_type> ( ::cbrt (static_cast<double> (abs (x))));
+    return static_cast<val_type>(  // go through double (as before this change), not float, to avoid rounding x
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::cbrt(static_cast<double>(abs(x)))
+#else
+        ::cbrt(static_cast<double>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) {
     return static_cast<val_type> ( ::exp (static_cast<double> (abs (x))));
@@ -3005,10 +3128,22 @@ class ArithTraits<unsigned int> {
     // This will result in no loss of accuracy, though it might be
     // more expensive than it should, if we were clever about using
     // bit operations.
-    return static_cast<val_type> ( ::sqrt (static_cast<double> (x)));
+    return static_cast<val_type>(  // go through double: float's 24-bit significand cannot hold every unsigned int exactly
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::sqrt(static_cast<double>(abs(x)))
+#else
+        ::sqrt(static_cast<double>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) {
-    return static_cast<val_type> ( ::cbrt (static_cast<double> (x)));
+    return static_cast<val_type>(  // go through double (as before this change), not float, to avoid rounding x
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::cbrt(static_cast<double>(abs(x)))
+#else
+        ::cbrt(static_cast<double>(abs(x)))
+#endif
+    );
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) {
     return static_cast<val_type> ( ::exp (static_cast<double> (x)));
@@ -3272,7 +3407,13 @@ class ArithTraits<unsigned long> {
     using std::cbrtl;
     return static_cast<val_type> ( ::cbrtl (static_cast<long double> (x)));
 #else
-    return static_cast<val_type> ( ::cbrt (static_cast<double> (x)));
+    return static_cast<val_type>(
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+        sycl::cbrt(static_cast<double>(abs(x)))
+#else
+        ::cbrt(static_cast<double>(abs(x)))
+#endif
+    );
 #endif
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) {
@@ -3406,7 +3547,7 @@ class ArithTraits<long long> {
     // 64-bit integer type exactly.  However, CUDA does not implement
     // long double for device functions.
     return static_cast<val_type> ( sqrt (static_cast<long double> (abs (x))));
-#else
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
     // Casting from a 64-bit integer type to double does result in a
     // loss of accuracy.  However, it gives us a good first
     // approximation.  For very large numbers, we may lose some
@@ -3417,6 +3558,8 @@ class ArithTraits<long long> {
     // which it has to be, so we don't have to check) to ensure
     // correctness.  It actually should suffice to check numbers
     // within 1 of the result.
+    return static_cast<val_type>(sycl::sqrt(static_cast<double>(abs(x))));
+#else
     return static_cast<val_type> ( ::sqrt (static_cast<double> (abs (x))));
 #endif
   }
@@ -3425,6 +3568,8 @@ class ArithTraits<long long> {
     using std::cbrtl;
     using std::abs;
     return static_cast<val_type> ( cbrtl (static_cast<long double> (abs (x))));
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
+    return static_cast<val_type>(sycl::cbrt(static_cast<double>(abs(x))));
 #else
     return static_cast<val_type> ( ::cbrt (static_cast<double> (abs (x))));
 #endif
@@ -3555,6 +3700,8 @@ class ArithTraits<unsigned long long> {
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
     using std::sqrt;
     return static_cast<val_type> ( sqrt (static_cast<long double> (x)));
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
+    return static_cast<val_type>(sycl::sqrt(static_cast<double>(x)));
 #else
     return static_cast<val_type> ( ::sqrt (static_cast<double> (x)));
 #endif
@@ -3563,6 +3710,8 @@ class ArithTraits<unsigned long long> {
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
     using std::cbrtl;
     return static_cast<val_type> ( cbrtl (static_cast<long double> (x)));
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
+    return static_cast<val_type>(sycl::cbrt(static_cast<double>(x)));
 #else
     return static_cast<val_type> ( ::cbrt (static_cast<double> (x)));
 #endif
@@ -3700,10 +3849,18 @@ struct ArithTraits<dd_real>
     return ::pow(x,y);
   }
   static inline val_type sqrt (const val_type& x) {
-      return ::sqrt (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::sqrt(x);
+#else
+    return ::sqrt(x);
+#endif
   }
   static inline val_type cbrt (const val_type& x) {
-      return ::cbrt (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::cbrt(x);
+#else
+    return ::cbrt(x);
+#endif
   }
   static inline val_type exp (const val_type& x) {
       return ::exp (x);
@@ -3740,7 +3897,11 @@ struct ArithTraits<dd_real>
     return ::acos (x);
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-    return ::atan (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::atan(x);
+#else
+    return ::atan(x);
+#endif
   }
   static inline val_type nan () {
     return val_type::_nan;
@@ -3801,7 +3962,11 @@ struct ArithTraits<dd_real>
   }
   static std::string name () { return "dd_real"; }
   static val_type squareroot (const val_type& x) {
-    return ::sqrt (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::sqrt(x);
+#else
+    return ::sqrt(x);
+#endif
   }
 };
 
@@ -3852,10 +4017,18 @@ struct ArithTraits<qd_real>
     return ::pow (x, y);
   }
   static inline val_type sqrt (const val_type& x) {
-    return ::sqrt (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::sqrt(x);
+#else
+    return ::sqrt(x);
+#endif
   }
   static inline val_type cbrt (const val_type& x) {
-    return ::cbrt (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::cbrt(x);
+#else
+    return ::cbrt(x);
+#endif
   }
   static inline val_type exp (const val_type& x) {
     return ::exp (x);
@@ -3892,7 +4065,11 @@ struct ArithTraits<qd_real>
     return ::acos (x);
   }
   static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) {
-    return ::atan (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::atan(x);
+#else
+    return ::atan(x);
+#endif
   }
   static inline val_type nan () {
     return val_type::_nan;
@@ -3957,7 +4134,11 @@ struct ArithTraits<qd_real>
   }
   static std::string name () { return "qd_real"; }
   static val_type squareroot (const val_type& x) {
-    return ::sqrt (x);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
+    return sycl::sqrt(x);
+#else
+    return ::sqrt(x);
+#endif
   }
 };
 #endif // HAVE_KOKKOS_QD
diff --git a/packages/kokkos-kernels/src/blas/KokkosBlas3_gemm.hpp b/packages/kokkos-kernels/src/blas/KokkosBlas3_gemm.hpp
index 9005c3a6b549..d06c714e19b9 100644
--- a/packages/kokkos-kernels/src/blas/KokkosBlas3_gemm.hpp
+++ b/packages/kokkos-kernels/src/blas/KokkosBlas3_gemm.hpp
@@ -48,73 +48,12 @@
 
 #include <KokkosKernels_Macros.hpp>
 #include <KokkosBlas3_gemm_spec.hpp>
-#include <KokkosBlas2_gemv.hpp>
 #include <KokkosKernels_helpers.hpp>
 #include <sstream>
 #include <type_traits>
 
 namespace KokkosBlas {
 
-namespace Impl {
-  // Special codepath for when B/C have 1 column: use GEMV (matrix-vector) instead.
-  // GEMV performs better than tiled GEMM in this case.
-  //
-  // Returns true if the criteria are met and GEMV was run, false otherwise.
-  //
-  // This case must be intercepted here rather than impl in order to call TPL
-  // GEMV instead of TPL GEMM. This codepath was measured to be profitable with cuBLAS.
-  template<class AViewType,
-           class BViewType,
-           class CViewType>
-  bool
-  gemv_based_gemm
-       (const char transA[],
-        const char transB[],
-        typename AViewType::const_value_type& alpha,
-        const AViewType& A,
-        const BViewType& B,
-        typename CViewType::const_value_type& beta,
-        const CViewType& C,
-        typename std::enable_if<
-          !std::is_same<typename BViewType::array_layout, Kokkos::LayoutStride>::value &&
-          !std::is_same<typename CViewType::array_layout, Kokkos::LayoutStride>::value>::type* = nullptr)
-  {
-    if(toupper(transA[0]) == 'N' && toupper(transB[0]) == 'N' && B.extent(1) == size_t(1))
-    {
-      // since B/C both have a single column and are not LayoutStride,
-      // can create a raw contiguous rank-1 vector from them rather than using subview.
-      Kokkos::View<typename BViewType::value_type*, typename BViewType::array_layout,
-        typename BViewType::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>> Bvec(B.data(), B.extent(0));
-      Kokkos::View<typename CViewType::value_type*, typename CViewType::array_layout,
-        typename CViewType::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>> Cvec(C.data(), C.extent(0));
-      KokkosBlas::gemv("N", alpha, A, Bvec, beta, Cvec);
-      return true;
-    }
-    return false;
-  }
-
-  // Don't attempt to call GEMV with LayoutStride vectors.
-  // GEMV is not ETI'd for this case, so there would be undefined symbol errors in tests.
-  template<class AViewType,
-           class BViewType,
-           class CViewType>
-  bool
-  gemv_based_gemm
-       (const char transA[],
-        const char transB[],
-        typename AViewType::const_value_type& alpha,
-        const AViewType& A,
-        const BViewType& B,
-        typename CViewType::const_value_type& beta,
-        const CViewType& C,
-        typename std::enable_if<
-          std::is_same<typename BViewType::array_layout, Kokkos::LayoutStride>::value ||
-          std::is_same<typename CViewType::array_layout, Kokkos::LayoutStride>::value>::type* = nullptr)
-  {
-    return false;
-  }
-}
-
 /// \brief Dense matrix-matrix multiply: C = beta*C + alpha*op(A)*op(B).
 ///
 /// \tparam AViewType Input matrix, as a 2-D Kokkos::View
@@ -203,10 +142,6 @@ gemm (const char transA[],
   if((A.extent(0) == 0) || (A.extent(1) == 0) || (C.extent(1) == 0))
     return;
 
-  // Check if gemv code path is allowed and profitable, and if so run it.
-  if(Impl::gemv_based_gemm(transA, transB, alpha, A, B, beta, C))
-    return;
-
   // Minimize the number of Impl::GEMV instantiations, by
   // standardizing on particular View specializations for its template
   // parameters.
diff --git a/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_impl.hpp
index 4fa19959cdc1..db5bc9fbca33 100644
--- a/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_impl.hpp
+++ b/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_impl.hpp
@@ -46,7 +46,6 @@
 
 #include "KokkosKernels_config.h"
 #include "Kokkos_Core.hpp"
-#include "KokkosKernels_ExecSpaceUtils.hpp"
 #include "Kokkos_ArithTraits.hpp"
 
 namespace KokkosBlas {
@@ -96,17 +95,17 @@ struct SingleLevelNontransposeGEMV {
   KOKKOS_INLINE_FUNCTION void
   operator () (const IndexType& i) const
   {
-    using y_value_type = typename YViewType::non_const_value_type;
+    using y_value_type = typename std::decay<decltype (y_[i]) >::type;
 
     y_value_type y_i;
     if (betaPreset == 0) {
-      y_i = Kokkos::ArithTraits<y_value_type>::zero ();
+      y_i = Kokkos::Details::ArithTraits<y_value_type>::zero ();
     }
     else if (betaPreset == 1) {
-      y_i = y_(i);
+      y_i = y_[i];
     }
     else { // beta_ != 0 and beta != 1
-      y_i = beta_ * y_(i);
+      y_i = beta_ * y_[i];
     }
 
     const IndexType numCols = A_.extent(1);
@@ -124,7 +123,7 @@ struct SingleLevelNontransposeGEMV {
       }
     }
 
-    y_(i) = y_i;
+    y_[i] = y_i;
   }
 
 private:
@@ -212,10 +211,11 @@ struct SingleLevelTransposeGEMV {
 
     for (IndexType j = 0; j < value_count; ++j) {
       // Sum into initial y_ values; use beta as a pre-multiplier if nonzero.
-      if(betaPreset == 0)
-        y_(j) = y_result[j];
-      else
-        y_(j) = beta_ * y_(j) + y_result[j];
+      const y_value_type y_j =
+        beta_ == ArithTraits<BetaCoeffType>::zero () ?
+        ArithTraits<y_value_type>::zero () :
+        beta_ * y_[j];
+      y_[j] = y_j + y_result[j];
     }
   }
 
@@ -480,136 +480,6 @@ singleLevelGemv (const char trans[],
   }
 }
 
-struct TwoLevelGEMV_LayoutLeftTag {};
-struct TwoLevelGEMV_LayoutRightTag {};
-
-// ---------------------------------------------------------------------------------------------
-// Functor for a two-level parallel_reduce version of GEMV (non-transpose),
-// designed for performance on GPU. Kernel depends on the layout of A.
-template<class AViewType,
-         class XViewType,
-         class YViewType,
-         class IndexType = typename AViewType::size_type>
-struct TwoLevelGEMV {
-  using y_value_type   = typename YViewType::non_const_value_type;
-  using AlphaCoeffType = typename AViewType::non_const_value_type;
-  using BetaCoeffType  = typename YViewType::non_const_value_type;
-
-
-  using execution_space = typename AViewType::execution_space;
-  using policy_type = Kokkos::TeamPolicy<execution_space>;
-  using member_type = typename policy_type::member_type;
-
-  TwoLevelGEMV (const AlphaCoeffType& alpha,
-                         const AViewType& A,
-                         const XViewType& x,
-                         const BetaCoeffType& beta,
-                         const YViewType& y) :
-    alpha_ (alpha), A_ (A), x_ (x), beta_ (beta), y_ (y)
-  {
-    static_assert (Kokkos::Impl::is_view<AViewType>::value,
-                   "AViewType must be a Kokkos::View.");
-    static_assert (Kokkos::Impl::is_view<XViewType>::value,
-                   "XViewType must be a Kokkos::View.");
-    static_assert (Kokkos::Impl::is_view<YViewType>::value,
-                   "YViewType must be a Kokkos::View.");
-    static_assert (static_cast<int> (AViewType::rank) == 2,
-                   "AViewType must have rank 2.");
-    static_assert (static_cast<int> (XViewType::rank) == 1,
-                   "XViewType must have rank 1.");
-    static_assert (static_cast<int> (YViewType::rank) == 1,
-                   "YViewType must have rank 1.");
-    static_assert (std::is_integral<IndexType>::value,
-                   "IndexType must be an integer.");
-  }
-
-public:
-  //LayoutLeft version: 32xK blocks.
-  //  -Each team handles block rows. 
-  //  -Groups of 32 threads handle N/teamsize columns sequentially, placing results into shared.
-  //  -Then individual thread results are combined with parallel_reduce.
-  KOKKOS_INLINE_FUNCTION void
-  operator () (TwoLevelGEMV_LayoutLeftTag, const member_type& team) const
-  {
-    using Kokkos::Details::ArithTraits;
-    using Scalar = typename YViewType::non_const_value_type;
-    using KAT = ArithTraits<Scalar>;
-    //Allocate a Scalar in shared for each thread
-    Scalar* blockResult = (Scalar*) team.team_shmem().get_shmem(32 * sizeof(Scalar));
-    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32),
-    [&](int i)
-    {
-      blockResult[i] = KAT::zero();
-    });
-    team.team_barrier();
-    //Which block this thread will work on
-    int block = team.team_rank() / 32;
-    //Which row in the block this thread will work on
-    IndexType row = team.league_rank() * 32 + team.team_rank() % 32;
-    IndexType blockColStart = columnsPerThread * block;
-    Scalar localSum = KAT::zero();
-    //compute local sum
-    if(row < (IndexType) A_.extent(0))
-    {
-      for(IndexType col = blockColStart; col < blockColStart + columnsPerThread && col < A_.extent(1); col++)
-      {
-        //A access is coalesced, x access is a broadcast
-        localSum += A_(row, col) * x_(col);
-      }
-    }
-    //atomically combine local result into shared
-    Kokkos::atomic_add(&blockResult[team.team_rank() % 32], localSum);
-    team.team_barrier();
-    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32),
-    [&](int i)
-    {
-      IndexType yrow = team.league_rank() * 32 + i;
-      if(yrow < (IndexType) A_.extent(0))
-      {
-        if(beta_ == KAT::zero())
-          y_(yrow) = alpha_ * blockResult[i];
-        else
-          y_(yrow) = beta_ * y_(yrow) + alpha_ * blockResult[i];
-      }
-    });
-  }
-
-  //LayoutRight version: one team per row
-  KOKKOS_INLINE_FUNCTION void
-  operator () (TwoLevelGEMV_LayoutRightTag, const member_type& team) const
-  {
-    using Kokkos::Details::ArithTraits;
-    using KAT = ArithTraits<typename YViewType::non_const_value_type>;
-
-    const IndexType N = A_.extent(1);
-    const int i = team.league_rank(); // batch id
-
-    // parallel-reduce to compute val += A(:,j)' * x
-    y_value_type val = KAT::zero();
-    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, N ), [&] ( const int j, y_value_type &update ) {
-      update += A_(i, j) * x_(j);
-    }, val);
-
-    // compute yj = beta*yj + alpha*val
-    Kokkos::single(Kokkos::PerTeam(team),
-    [=]()
-    {
-      if(beta_ == KAT::zero())
-        y_(i) = alpha_ * val;
-      else
-        y_(i) = beta_ * y_(i) + alpha_ * val;
-    });
-  }
-
-  IndexType columnsPerThread;
-private:
-  AlphaCoeffType alpha_;
-  typename AViewType::const_type A_;
-  typename XViewType::const_type x_;
-  BetaCoeffType beta_;
-  YViewType y_;
-};
-
 
 // ---------------------------------------------------------------------------------------------
 // Functor for a two-level parallel_reduce version of (conjugate)
@@ -659,29 +529,23 @@ struct TwoLevelTransposeGEMV {
   operator () (const member_type & team) const
   {
     using Kokkos::Details::ArithTraits;
-    using KAT_A = ArithTraits<typename AViewType::non_const_value_type>;
-    using KAT_Y = ArithTraits<typename YViewType::non_const_value_type>;
+    using KAT = ArithTraits<typename AViewType::non_const_value_type>;
 
     const IndexType M = A_.extent(0);
     const int j = team.league_rank(); // batch id
 
     // parallel-reduce to compute val += A(:,j)' * x
-    y_value_type val = KAT_Y::zero();
+    y_value_type val = KAT:: zero();
     Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, M ), [&] ( const int i, y_value_type &update ) {
       const auto x_i = x_(i);
-      const auto A_ij = conj ? KAT_A::conj (A_(i,j)) : A_(i,j);
+      const auto A_ij = conj ? KAT::conj (A_(i,j)) : A_(i,j);
       update += A_ij * x_i;
     }, val);
 
     // compute yj = beta*yj + alpha*val
-    Kokkos::single(Kokkos::PerTeam(team),
-    [&]()
-    {
-      if(beta_ == KAT_Y::zero())
-        y_(j) = alpha_ * val;
-      else
-        y_(j) = beta_ * y_(j) + alpha_ * val;
-    });
+    // beta == 0 means overwrite: skip the beta_*y_[j] term so NaN/Inf already stored in y cannot propagate.
+    const bool overwrite_y = (beta_ == ArithTraits<typename YViewType::non_const_value_type>::zero ());
+    if (team.team_rank() == 0) { if (overwrite_y) { y_[j] = alpha_ * val; } else { y_[j] = beta_ * y_[j] + alpha_ * val; } }
   }
 
 private:
@@ -725,68 +589,38 @@ twoLevelGemv (const char trans[],
   using team_policy_type  = Kokkos::TeamPolicy<execution_space>;
   using range_policy_type = Kokkos::RangePolicy<execution_space, IndexType>;
 
+  using BetaCoeffType = typename YViewType::non_const_value_type;
+
   using Kokkos::Details::ArithTraits;
   using KAT = ArithTraits<typename AViewType::non_const_value_type>;
-  using YKAT = ArithTraits<typename YViewType::non_const_value_type>;
 
-  const char tr = toupper(trans[0]);
+  const char tr = trans[0];
 
   // The transpose and conjugate transpose cases where A has zero rows
   // need special handling.  These are equivalent to y := beta*y.  We
   // could implement this using KokkosBlas::scal, but we don't want to
   // depend on that or its implementation details.  Instead, we reuse
   // an instantiation of the non-transpose case for alpha=0.
-  if (y.extent(0) == 0)
-  {
-    //no entries to update
-    return;
-  }
-  else if (x.extent(0) == 0)
-  {
-    if (beta == YKAT::zero ()) {
+  if (A.extent(0) == 0 && (tr != 'N' && tr != 'n')) {
+    if (beta == KAT::zero ()) {
       Kokkos::deep_copy (y, KAT::zero ());
     }
-    else if (beta != YKAT::one ()) {
+    else if (beta != Kokkos::Details::ArithTraits<BetaCoeffType>::one ()) {
       // "Fake out" a scal() by using the non-transpose alpha=0,
       // general beta case.  This assumes that the functor doesn't
       // check dimensions.
       using functor_type = SingleLevelNontransposeGEMV<AViewType, XViewType, YViewType,
                                                        0, -1, IndexType>;
       functor_type functor (alpha, A, x, beta, y);
-      Kokkos::parallel_for ("KokkosBlas::gemv[SingleLevel]",range_policy_type (0, y.extent(0)), functor);
+      Kokkos::parallel_for ("KokkosBlas::gemv[SingleLevel]",range_policy_type (0, A.extent(1)), functor);
     }
     return;
   }
 
-  if (tr == 'N') {
-    constexpr bool isLayoutLeft = std::is_same<typename AViewType::array_layout, Kokkos::LayoutLeft>::value;
-    using layout_tag = typename std::conditional<isLayoutLeft,
-      TwoLevelGEMV_LayoutLeftTag, TwoLevelGEMV_LayoutRightTag>::type;
-    using tagged_policy = Kokkos::TeamPolicy<execution_space, layout_tag>;
-    using functor_type = TwoLevelGEMV<AViewType, XViewType, YViewType, IndexType>;
-    functor_type functor (alpha, A, x, beta, y);
-    tagged_policy team;
-    if(isLayoutLeft)
-    {
-      size_t sharedPerTeam = 32 * sizeof(y_value_type);
-      IndexType numTeams = (A.extent(0) + 31) / 32;
-      tagged_policy temp(1, 1);
-      int teamSize = temp.team_size_max(functor, Kokkos::ParallelForTag());
-      //make sure teamSize is a multiple of 32
-      teamSize -= teamSize % 32;
-      //don't make teamSize larger than what's useful
-      if((size_t) teamSize > 32 * A.extent(1))
-        teamSize = 32 * A.extent(1);
-      int numBlocks = teamSize / 32;
-      functor.columnsPerThread = (A.extent(1) + numBlocks - 1) / numBlocks;
-      team = tagged_policy(numTeams, teamSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam));
-    }
-    else
-    {
-      //LayoutRight: one team per row
-      team = tagged_policy(A.extent(0), Kokkos::AUTO);
-    }
-    Kokkos::parallel_for ("KokkosBlas::gemv[twoLevel]", team, functor);
+  if (tr == 'N' || tr == 'n') {
+    // NOTE: not implemented, so just call single-level version
+    singleLevelGemv<AViewType, XViewType, YViewType, IndexType>
+         (trans, alpha, A, x, beta, y);
   }
   else {
     if (alpha == KAT::zero () && beta == KAT::zero ()) {
@@ -796,7 +630,7 @@ twoLevelGemv (const char trans[],
     else if (alpha == KAT::zero () && beta == KAT::one ()) {
       // Do nothing (y := 1 * y)
     }
-    else if (tr == 'T') {
+    else if (tr == 'T' || tr == 't') {
       // transpose, and not conj transpose
       team_policy_type  team (A.extent(1), Kokkos::AUTO);
       using functor_type = TwoLevelTransposeGEMV<AViewType, XViewType, YViewType,
@@ -804,7 +638,7 @@ twoLevelGemv (const char trans[],
       functor_type functor (alpha, A, x, beta, y);
       Kokkos::parallel_for ("KokkosBlas::gemv[twoLevelTranspose]", team, functor);
     }
-    else if (tr == 'C' || tr == 'H') {
+    else if (tr == 'C' || tr == 'c' || tr == 'H' || tr == 'h') {
       // conjugate transpose
       team_policy_type  team (A.extent(1), Kokkos::AUTO);
       using functor_type = TwoLevelTransposeGEMV<AViewType, XViewType, YViewType,
@@ -815,43 +649,6 @@ twoLevelGemv (const char trans[],
   }
 }
 
-//generalGemv: use 1 level (Range) or 2 level (Team) implementation,
-//depending on whether execution space is CPU or GPU. enable_if makes sure
-//unused kernels are not instantiated.
-template<class AViewType,
-         class XViewType,
-         class YViewType,
-         class IndexType,
-         typename std::enable_if<!KokkosKernels::Impl::kk_is_gpu_exec_space
-           <typename AViewType::execution_space>()>::type* = nullptr>
-void
-generalGemvImpl (const char trans[],
-                 typename AViewType::const_value_type& alpha,
-                 const AViewType& A,
-                 const XViewType& x,
-                 typename YViewType::const_value_type& beta,
-                 const YViewType& y)
-{
-  singleLevelGemv (trans, alpha, A, x, beta, y);
-}
-
-template<class AViewType,
-         class XViewType,
-         class YViewType,
-         class IndexType,
-         typename std::enable_if<KokkosKernels::Impl::kk_is_gpu_exec_space
-           <typename AViewType::execution_space>()>::type* = nullptr>
-void
-generalGemvImpl (const char trans[],
-                 typename AViewType::const_value_type& alpha,
-                 const AViewType& A,
-                 const XViewType& x,
-                 typename YViewType::const_value_type& beta,
-                 const YViewType& y)
-{
-  twoLevelGemv (trans, alpha, A, x, beta, y);
-}
-
 } // namespace Impl
 } // namespace KokkosBlas
 
diff --git a/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_spec.hpp b/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_spec.hpp
index da7983b07a79..76d98c65bc16 100644
--- a/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_spec.hpp
+++ b/packages/kokkos-kernels/src/blas/impl/KokkosBlas2_gemv_spec.hpp
@@ -136,12 +136,22 @@ struct GEMV {
     // Prefer int as the index type, but use a larger type if needed.
     if (numRows < static_cast<size_type> (INT_MAX) &&
         numCols < static_cast<size_type> (INT_MAX)) {
-      generalGemvImpl<AViewType, XViewType, YViewType, int>
+      #if 1
+      twoLevelGemv<AViewType, XViewType, YViewType, int>
          (trans, alpha, A, x, beta, y);
+      #else
+      singleLevelGemv<AViewType, XViewType, YViewType, int>
+         (trans, alpha, A, x, beta, y);
+      #endif
     }
     else {
-      generalGemvImpl<AViewType, XViewType, YViewType, int64_t>
+      #if 1
+      twoLevelGemv<AViewType, XViewType, YViewType, int64_t>
+         (trans, alpha, A, x, beta, y);
+      #else
+      singleLevelGemv<AViewType, XViewType, YViewType, int64_t>
          (trans, alpha, A, x, beta, y);
+      #endif
     }
     Kokkos::Profiling::popRegion();
   }
diff --git a/packages/kokkos-kernels/src/common/KokkosKernels_BitUtils.hpp b/packages/kokkos-kernels/src/common/KokkosKernels_BitUtils.hpp
index c845e37c5318..7c343ff5a458 100644
--- a/packages/kokkos-kernels/src/common/KokkosKernels_BitUtils.hpp
+++ b/packages/kokkos-kernels/src/common/KokkosKernels_BitUtils.hpp
@@ -46,6 +46,10 @@
 #define _KOKKOSKERNELS_BITUTILS_HPP
 #include "Kokkos_Core.hpp"
 
+#if defined (KOKKOS_COMPILER_MSVC)
+#include <intrin.h>
+#endif
+
 namespace KokkosKernels{
 
 namespace Impl{
@@ -203,6 +207,36 @@ int pop_count( long long i ){
   return __popcnt8(i);
 }
 
+#elif defined (KOKKOS_COMPILER_MSVC)
+KOKKOS_FORCEINLINE_FUNCTION
+int pop_count( unsigned i ){
+    return __popcnt(i);
+}
+KOKKOS_FORCEINLINE_FUNCTION
+int pop_count( unsigned long i ){
+    return __popcnt(i);
+}
+
+KOKKOS_FORCEINLINE_FUNCTION
+int pop_count( unsigned long long i ){
+    return __popcnt64(i);
+}
+
+KOKKOS_FORCEINLINE_FUNCTION
+int pop_count(int i ){
+    return __popcnt(i);
+}
+
+KOKKOS_FORCEINLINE_FUNCTION
+int pop_count( long i ){
+    return __popcnt(i);
+}
+
+KOKKOS_FORCEINLINE_FUNCTION
+int pop_count( long long i ){
+    return __popcnt64(i);
+}
+
 #else
   #error "Popcount function is not defined for this compiler. Please report this with the compiler you are using to KokkosKernels."
 #endif
@@ -328,6 +362,35 @@ int least_set_bit(  long long i ){
   return __builtin_ffsll(i);
 }
 
+#elif defined (KOKKOS_COMPILER_MSVC)
+// MSVC has no ffs builtin. _BitScanForward stores the 0-based index of the lowest set bit and
+// returns 0 for a zero mask, so "index + 1, or 0" matches the __builtin_ffs* results used above.
+KOKKOS_FORCEINLINE_FUNCTION
+int least_set_bit( unsigned long i ){
+    unsigned long idx;
+    return _BitScanForward(&idx, i) ? static_cast<int>(idx) + 1 : 0;
+}
+KOKKOS_FORCEINLINE_FUNCTION
+int least_set_bit( unsigned long long i ){
+    unsigned long idx;
+    return _BitScanForward64(&idx, i) ? static_cast<int>(idx) + 1 : 0;
+}
+KOKKOS_FORCEINLINE_FUNCTION
+int least_set_bit( unsigned i ){
+    return least_set_bit(static_cast<unsigned long>(i));
+}
+KOKKOS_FORCEINLINE_FUNCTION
+int least_set_bit( int i ){
+    return least_set_bit(static_cast<unsigned long>(i));
+}
+KOKKOS_FORCEINLINE_FUNCTION
+int least_set_bit(  long i ){
+    return least_set_bit(static_cast<unsigned long>(i));
+}
+KOKKOS_FORCEINLINE_FUNCTION
+int least_set_bit(  long long i ){
+    return least_set_bit(static_cast<unsigned long long>(i));
+}
 #else
   #error "least_set_bit function is not defined for this compiler. Please report this with the compiler you are using to KokkosKernels."
 #endif
diff --git a/packages/kokkos-kernels/src/common/KokkosKernels_Handle.hpp b/packages/kokkos-kernels/src/common/KokkosKernels_Handle.hpp
index 39ac62267c2b..fb557c6f5192 100644
--- a/packages/kokkos-kernels/src/common/KokkosKernels_Handle.hpp
+++ b/packages/kokkos-kernels/src/common/KokkosKernels_Handle.hpp
@@ -635,9 +635,6 @@ class KokkosKernelsHandle
   }
   void destroy_gs_handle(){
     if (is_owner_of_the_gs_handle && this->gsHandle != NULL){
-      if (this->gsHandle->is_owner_of_coloring()){
-        this->destroy_graph_coloring_handle();
-      }
       delete this->gsHandle;
       this->gsHandle = NULL;
     }
diff --git a/packages/kokkos-kernels/src/common/KokkosKernels_IOUtils.hpp b/packages/kokkos-kernels/src/common/KokkosKernels_IOUtils.hpp
index b74834db5fcf..351480c3a3e9 100644
--- a/packages/kokkos-kernels/src/common/KokkosKernels_IOUtils.hpp
+++ b/packages/kokkos-kernels/src/common/KokkosKernels_IOUtils.hpp
@@ -422,11 +422,13 @@ struct Edge{
 ////////////////////////////////////////////////////////////////////////////////
 inline size_t kk_get_file_size(const char* file)
 {
-  struct stat stat_buf;
+  // struct stat stat_buf;
 
 #ifdef _WIN32
+  struct _stat stat_buf;
   int retval = _stat(file, &stat_buf);
 #else
+  struct stat stat_buf;
   int retval = stat(file, &stat_buf);
 #endif
 
diff --git a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1Color.hpp b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1Color.hpp
index b3dcc411660e..f33d6b757f38 100644
--- a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1Color.hpp
+++ b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1Color.hpp
@@ -73,7 +73,7 @@ void graph_color_symbolic(
 
   gch->set_tictoc(handle->get_verbose());
 
-  color_view_type colors_out; //= color_view_type("Graph Colors", num_rows);
+  color_view_type colors_out;
   if(gch->get_vertex_colors().use_count() > 0){
     colors_out = gch->get_vertex_colors();
   } else {
diff --git a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1ColorHandle.hpp
index d75b359b961d..826d0da962f0 100644
--- a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1ColorHandle.hpp
+++ b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance1ColorHandle.hpp
@@ -192,8 +192,8 @@ class GraphColoringHandle
     overall_coloring_time_phase4(0),
     overall_coloring_time_phase5(0),
     coloring_time(0),
-    num_phases(0), size_of_edge_list(0), lower_triangle_src(), lower_triangle_dst(), use_vtx_list(false),
-    vertex_colors(), is_coloring_called_before(false), num_colors(0)
+    num_phases(0), size_of_edge_list(0), lower_triangle_src(), lower_triangle_dst(),
+    use_vtx_list(false), vertex_colors(), is_coloring_called_before(false), num_colors(0)
   {
     this->choose_default_algorithm();
     this->set_defaults(this->coloring_algorithm_type);
@@ -651,9 +651,9 @@ class GraphColoringHandle
   int get_num_phases() const { return this->num_phases;}
   color_view_t get_vertex_colors() const {return this->vertex_colors;}
   bool is_coloring_called() const {return this->is_coloring_called_before;}
-  bool get_use_vtx_list() const{return this->use_vtx_list;}
+  bool get_use_vtx_list() const {return this->use_vtx_list;}
   nnz_lno_temp_work_view_t get_vertex_list() const {return this->vertex_list;}
-  size_type get_vertex_list_size() const{return this->vertex_list_size;}
+  size_type get_vertex_list_size() const {return this->vertex_list_size;}
   //setters
   void set_vertex_list(nnz_lno_temp_work_view_t vertex_list_, size_type vertex_list_size_){
     this->vertex_list = vertex_list_;
diff --git a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2Color.hpp b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2Color.hpp
index 53f2b4a26b2e..59a4f474393a 100644
--- a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2Color.hpp
+++ b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2Color.hpp
@@ -90,7 +90,7 @@ void graph_color_distance2(
   InternalEntries rowentries_internal(row_entries.data(), nnz);
   auto gch_d2 = handle->get_distance2_graph_coloring_handle();
   //note: last template argument 'false' means do distance-2, not bipartite
-  Impl::GraphColorDistance2
+  KokkosGraph::Impl::GraphColorDistance2
     <typename KernelHandle::GraphColorDistance2HandleType, InternalRowmap, InternalEntries, false>
     gc(num_verts, num_verts, rowmap_internal, rowentries_internal, rowmap_internal, rowentries_internal, gch_d2);
   gc.compute_distance2_color();
@@ -174,7 +174,7 @@ void bipartite_color_rows(
   }
   auto gch_d2 = handle->get_distance2_graph_coloring_handle();
   //note: last template argument 'true' means do bipartite one-sided
-  Impl::GraphColorDistance2
+  KokkosGraph::Impl::GraphColorDistance2
     <typename KernelHandle::GraphColorDistance2HandleType, InternalRowmap, InternalEntries, true>
     gc(num_rows, num_columns, rowmap_internal, rowentries_internal, colmap_internal, colentries_internal, gch_d2);
   gc.compute_distance2_color();
@@ -237,7 +237,7 @@ void bipartite_color_columns(
   InternalEntries rowentries_internal(row_entries.data(), nnz);
   auto gch_d2 = handle->get_distance2_graph_coloring_handle();
   //note: last template argument 'true' means do bipartite one-sided
-  Impl::GraphColorDistance2
+  KokkosGraph::Impl::GraphColorDistance2
     <typename KernelHandle::GraphColorDistance2HandleType, InternalRowmap, InternalEntries, true>
     gc(num_columns, num_rows, colmap_internal, colentries_internal, rowmap_internal, rowentries_internal, gch_d2);
   gc.compute_distance2_color();
diff --git a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2ColorHandle.hpp
index 95b46e87079c..35402a72ffb6 100644
--- a/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2ColorHandle.hpp
+++ b/packages/kokkos-kernels/src/graph/KokkosGraph_Distance2ColorHandle.hpp
@@ -122,7 +122,7 @@ class GraphColorDistance2Handle
 
     bool use_vtx_list;
     nnz_lno_temp_work_view_type vertex_list;
-    size_type vertex_list_size;
+    size_type vertex_list_size;    
 
     int num_phases;      // Number of phases used by the coloring algorithm
 
@@ -148,7 +148,7 @@ class GraphColorDistance2Handle
         , overall_coloring_time_phase4(0)
         , overall_coloring_time_phase5(0)
         , coloring_time(0)
-	, use_vtx_list(false)
+        , use_vtx_list(false)
         , num_phases(0)
         , vertex_colors()
         , is_coloring_called_before(false)
@@ -287,9 +287,9 @@ class GraphColorDistance2Handle
 
     bool is_coloring_called() const { return this->is_coloring_called_before; }
 
-    bool get_use_vtx_list() const {return this->use_vtx_list;}
-    nnz_lno_temp_work_view_type get_vertex_list() const {return this->vertex_list;}
-    size_type get_vertex_list_size() const {return this->vertex_list_size;}
+    bool get_use_vtx_list() const { return this->use_vtx_list; }
+    nnz_lno_temp_work_view_type get_vertex_list() const { return this->vertex_list; }
+    size_type get_vertex_list_size() const { return this->vertex_list_size; }
 
     // setters
     void set_vertex_list(nnz_lno_temp_work_view_type vertex_list_, size_type vertex_list_size_){
diff --git a/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp
index 340bc3fc2f07..22ca44cc11d5 100644
--- a/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp
+++ b/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp
@@ -368,12 +368,13 @@ class GraphColor_VB:public GraphColor <HandleType,lno_row_view_t_,lno_nnz_view_t
     nnz_lno_temp_work_view_t current_vertexList =
         nnz_lno_temp_work_view_t(Kokkos::ViewAllocateWithoutInitializing("vertexList"), this->nv);
     nnz_lno_t current_vertexListLength = this->nv;
-
-    //init vertexList sequentially.
+    
     if(this->cp->get_use_vtx_list()){
+      //get the vertexList from the color handle, if it exists.
       current_vertexList = this->cp->get_vertex_list();
       current_vertexListLength = this->cp->get_vertex_list_size();
     } else {
+      //init vertexList sequentially.
       Kokkos::parallel_for("KokkosGraph::GraphColoring::InitList",
           my_exec_space(0, this->nv), functorInitList<nnz_lno_temp_work_view_t> (current_vertexList));
     }
@@ -2526,6 +2527,7 @@ class GraphColor_EB:public GraphColor <HandleType,in_row_index_view_type_,in_non
     color_temp_work_view_type color_ban; //colors
     color_t hash; //the number of colors to be assigned initially.
     nnz_lno_temp_work_view_t color_set;
+
     //the value to initialize the color_ban_. We avoid using the first bit representing the sign.
     //Therefore if idx is int, it can represent 32-1 colors. Use color_set to represent more.
     color_t color_ban_init_val;
@@ -2539,11 +2541,11 @@ class GraphColor_EB:public GraphColor <HandleType,in_row_index_view_type_,in_non
 
     KOKKOS_INLINE_FUNCTION
     void operator()(const size_type &ii) const {
-      //set colors based on their indices.
+      //set colors based on their input colors.
       if(kokcolors(ii) > 0){
         color_t colorsize = sizeof(color_t) * 8 - 1;
-	color_set(ii) = (kokcolors(ii) - 1) / colorsize;
-	kokcolors(ii) = 1 << ((kokcolors(ii) - 1) % colorsize);
+        color_set(ii) = (kokcolors(ii) - 1) / colorsize;
+        kokcolors(ii) = 1 << ((kokcolors(ii) - 1) % colorsize);
       }
       color_ban(ii) = color_ban_init_val;
     }
diff --git a/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp b/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp
index 70760943213d..72a617dc4b3a 100644
--- a/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp
+++ b/packages/kokkos-kernels/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp
@@ -191,13 +191,12 @@ class GraphColorDistance2
     {
         //Delegate to different coloring functions, depending on algorithm
         using_edge_filtering = false;
-        //color_view_type colors_out("Graph Colors", this->nr);
-	color_view_type colors_out;
-	if(gc_handle->get_vertex_colors().use_count() > 0){
-	  colors_out = gc_handle->get_vertex_colors();
-	} else {
-	  colors_out = color_view_type("Graph Colors", this->nr);
-	}
+        color_view_type colors_out;
+        if(gc_handle->get_vertex_colors().use_count() > 0){
+          colors_out = gc_handle->get_vertex_colors();
+        } else {
+          colors_out = color_view_type("Graph Colors", this->nr);
+        }
         switch(this->gc_handle->get_coloring_algo_type())
         {
           case COLORING_D2_VB_BIT_EF:
@@ -251,15 +250,15 @@ class GraphColorDistance2
             Kokkos::ViewAllocateWithoutInitializing("vertexList"), this->nr);
 
         lno_t current_vertexListLength = this->nr;
-
-        // init conflictlist sequentially.
+        
         if(this->gc_handle->get_use_vtx_list()){
-	  current_vertexList = this->gc_handle->get_vertex_list();
-	  current_vertexListLength = this->gc_handle->get_vertex_list_size();
-	} else {
-	  Kokkos::parallel_for("InitList", range_policy_type(0, this->nr), functorInitList<lno_view_t>(current_vertexList));
-	}
-
+          //init conflict list from coloring handle
+          current_vertexList = this->gc_handle->get_vertex_list();
+          current_vertexListLength = this->gc_handle->get_vertex_list_size();
+        } else {
+          // init conflictlist sequentially.
+          Kokkos::parallel_for("InitList", range_policy_type(0, this->nr), functorInitList<lno_view_t>(current_vertexList));
+        }
         // Next iteratons's conflictList
         lno_view_t next_iteration_recolorList(Kokkos::ViewAllocateWithoutInitializing("recolorList"), this->nr);
 
@@ -457,7 +456,6 @@ class GraphColorDistance2
               break;
             }
           }
-	  //make sure vertices with a valid color do not get recolored
           if(color && (colors(v) == 0 || colors(v) == CONFLICTED || colors(v) == UNCOLORABLE))
           {
             //Color v
@@ -732,7 +730,7 @@ class GraphColorDistance2
       }
       const lno_t numVerts = this->nr;
       const lno_t numCols = this->nc;
-      //note: initializing forbidden to account for previously-colored vertices
+      //note: relying on forbidden and colors_out being initialized to 0
       forbidden_view forbidden("Forbidden", batch * numCols);
       int iter = 0;
       Kokkos::Impl::Timer timer;
@@ -750,8 +748,9 @@ class GraphColorDistance2
           lno_t vertsPerThread = 1;
           lno_t workBatches = (currentWork + vertsPerThread - 1) / vertsPerThread;
           timer.reset();
-	  //refresh forbidden before coloring, to ensure previously-colored vertices do not get recolored unnecessarily.
-          //This avoids using too many colors, by relying on forbidden from before conflict resolution (which is now stale).
+          //If still using this color set, refresh forbidden.
+          //Otherwise coloring would rely on forbidden from before the previous conflict resolution (now stale) and use too many colors.
+          //Refreshing forbidden before conflict resolution ensures that previously-colored vertices do not get recolored.
           switch(batch)
           {
             case 1:
@@ -772,8 +771,8 @@ class GraphColorDistance2
               break;
             default:;
           }
-	  forbiddenTime += timer.seconds();
-	  timer.reset();
+          forbiddenTime += timer.seconds();
+          timer.reset();
           switch(batch)
           {
             case 1:
diff --git a/packages/kokkos-kernels/src/sparse/KokkosSparse_CrsMatrix.hpp b/packages/kokkos-kernels/src/sparse/KokkosSparse_CrsMatrix.hpp
index d734d9ac3ac5..3ce574602cf2 100644
--- a/packages/kokkos-kernels/src/sparse/KokkosSparse_CrsMatrix.hpp
+++ b/packages/kokkos-kernels/src/sparse/KokkosSparse_CrsMatrix.hpp
@@ -554,7 +554,37 @@ class CrsMatrix {
              OrdinalType* rowmap,
              OrdinalType* cols)
   {
-    ctor_impl (label, nrows, ncols, annz, val, rowmap, cols);
+    using Kokkos::Unmanaged;
+    using HostRowmap = Kokkos::View<SizeType*, Kokkos::HostSpace>;
+    using UnmanagedRowmap = Kokkos::View<const SizeType*, Kokkos::HostSpace, Kokkos::MemoryTraits<Unmanaged>>;
+    using UnmanagedEntries = Kokkos::View<const OrdinalType*, Kokkos::HostSpace, Kokkos::MemoryTraits<Unmanaged>>;
+    using UnmanagedValues = Kokkos::View<const ScalarType*, Kokkos::HostSpace, Kokkos::MemoryTraits<Unmanaged>>;
+    //Allocate device rowmap, entries, values views
+    typename row_map_type::non_const_type rowmapDevice(Kokkos::ViewAllocateWithoutInitializing("rowmap"), nrows + 1);
+    index_type entriesDevice(Kokkos::ViewAllocateWithoutInitializing("entries"), annz);
+    //given rowmap in ordinal_type, so may need to convert to size_type explicitly
+    HostRowmap rowmapConverted;
+    UnmanagedRowmap rowmapRaw;
+    if(!std::is_same<OrdinalType, SizeType>::value)
+    {
+      rowmapConverted = HostRowmap(Kokkos::ViewAllocateWithoutInitializing("rowmap raw"), nrows + 1);
+      for(OrdinalType i = 0; i <= nrows; i++)
+        rowmapConverted(i) = rowmap[i];
+      rowmapRaw = rowmapConverted;
+    }
+    else
+    {
+      rowmapRaw = UnmanagedRowmap((const SizeType*) rowmap, nrows + 1);
+    }
+    Kokkos::deep_copy(rowmapDevice, rowmapRaw);
+    UnmanagedEntries entriesRaw(cols, annz);
+    Kokkos::deep_copy(entriesDevice, entriesRaw);
+    //Construct graph and populate all members
+    this->numCols_ = ncols;
+    this->graph = StaticCrsGraphType(entriesDevice, rowmapDevice);
+    this->values = values_type(Kokkos::ViewAllocateWithoutInitializing("values"), annz);
+    UnmanagedValues valuesRaw(val, annz);
+    Kokkos::deep_copy(this->values, valuesRaw);
 
     // FIXME (mfh 09 Aug 2013) Specialize this on the Device type.
     // Only use cuSPARSE for the Cuda Device.
@@ -646,15 +676,6 @@ class CrsMatrix {
 #endif // KOKKOS_USE_CUSPARSE
   }
 
-  void
-  ctor_impl (const std::string &label,
-          const OrdinalType nrows,
-          const OrdinalType ncols,
-          const size_type annz,
-          ScalarType* val,
-          OrdinalType* rows,
-          OrdinalType* cols);
-
   KOKKOS_INLINE_FUNCTION
   OrdinalType
   sumIntoValues (const OrdinalType rowi,
@@ -883,50 +904,5 @@ class CrsMatrix {
   ordinal_type numCols_;
 };
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-template< typename ScalarType , typename OrdinalType, class Device, class MemoryTraits, typename SizeType >
-void
-CrsMatrix<ScalarType , OrdinalType, Device, MemoryTraits, SizeType >::
-ctor_impl (const std::string &label,
-           const OrdinalType nrows,
-           const OrdinalType ncols,
-           const size_type annz,
-           ScalarType* val,
-           OrdinalType* rows,
-           OrdinalType* cols)
-{
-  std::string str = label;
-  values = values_type (str.append (".values"), annz);
-
-  numCols_ = ncols;
-
-  // FIXME (09 Aug 2013) CrsArray only takes std::vector for now.
-  // We'll need to fix that.
-  std::vector<int> row_lengths (nrows, 0);
-
-  // FIXME (mfh 21 Jun 2013) This calls for a parallel_for kernel.
-  for (OrdinalType i = 0; i < nrows; ++i) {
-    row_lengths[i] = rows[i + 1] - rows[i];
-  }
-
-  graph = Kokkos::create_staticcrsgraph<staticcrsgraph_type> (str.append (".graph"), row_lengths);
-  typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values);
-  typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries);
-
-  // FIXME (mfh 21 Jun 2013) This needs to be a parallel copy.
-  // Furthermore, why are the arrays copied twice? -- once here, to a
-  // host view, and once below, in the deep copy?
-  for (size_type i = 0; i < annz; ++i) {
-    if (val) {
-      h_values(i) = val[i];
-    }
-    h_entries(i) = cols[i];
-  }
-
-  Kokkos::deep_copy (values, h_values);
-  Kokkos::deep_copy (graph.entries, h_entries);
-}
 }
 #endif
diff --git a/packages/kokkos-kernels/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/packages/kokkos-kernels/src/sparse/KokkosSparse_gauss_seidel_handle.hpp
index 917680911556..efb9e0c62e70 100644
--- a/packages/kokkos-kernels/src/sparse/KokkosSparse_gauss_seidel_handle.hpp
+++ b/packages/kokkos-kernels/src/sparse/KokkosSparse_gauss_seidel_handle.hpp
@@ -135,8 +135,6 @@ namespace KokkosSparse{
     //getters
     GSAlgorithm get_algorithm_type() const {return this->algorithm_type;}
 
-    virtual bool is_owner_of_coloring() const {return false;}
-
     nnz_lno_persistent_work_host_view_t get_color_xadj() const {
       return this->color_xadj;
     }
@@ -245,11 +243,18 @@ namespace KokkosSparse{
     scalar_persistent_work_view_t permuted_inverse_diagonal;
     nnz_lno_t block_size; //this is for block sgs
 
-    nnz_lno_t max_nnz_input_row;
-
     nnz_lno_t num_values_in_l1, num_values_in_l2, num_big_rows;
     size_t level_1_mem, level_2_mem;
-    bool owner_of_coloring;
+
+    //Option set by user: rows with at least this many nonzeros are handled by a separate kernel
+    nnz_lno_t long_row_threshold;
+    //Number of long rows per color set. They are all grouped at the end of each color set.
+    nnz_lno_persistent_work_host_view_t long_rows_per_color;
+    //Maximum row length in each color set.
+    nnz_lno_persistent_work_host_view_t max_row_length_per_color;
+    //Temporary space for matvec over long rows - size is only max num long rows in a color.
+    scalar_persistent_work_view_t long_row_x;
+
   public:
 
     /**
@@ -260,17 +265,13 @@ namespace KokkosSparse{
       permuted_xadj(), permuted_adj(), permuted_adj_vals(), old_to_new_map(),
       permuted_y_vector(), permuted_x_vector(),
       permuted_inverse_diagonal(), block_size(1),
-      max_nnz_input_row(-1),
       num_values_in_l1(-1), num_values_in_l2(-1),num_big_rows(0), level_1_mem(0), level_2_mem(0),
-      owner_of_coloring(false)
+      long_row_threshold(0)
     {
       if (gs == GS_DEFAULT)
         this->choose_default_algorithm();
     }
 
-    bool is_owner_of_coloring() const override {return this->owner_of_coloring;}
-    void set_owner_of_coloring(bool owner = true) {this->owner_of_coloring = owner;}
-
     void set_block_size(nnz_lno_t bs){this->block_size = bs; }
     nnz_lno_t get_block_size() const {return this->block_size;}
 
@@ -363,14 +364,44 @@ namespace KokkosSparse{
       return this->num_big_rows;
     }
 
-    nnz_lno_t get_max_nnz() const {
-      if(max_nnz_input_row == static_cast<nnz_lno_t>(-1))
-        throw std::runtime_error("Requested max nnz per input row, but this has not been set in the PointGS handle.");
-      return this->max_nnz_input_row;
+    nnz_lno_t get_long_row_threshold() const
+    {
+      return long_row_threshold;
+    }
+
+    void set_long_row_threshold(nnz_lno_t lrt)
+    {
+      long_row_threshold = lrt;
+    }
+
+    nnz_lno_persistent_work_host_view_t get_long_rows_per_color() const
+    {
+      return long_rows_per_color;
+    }
+
+    void set_long_rows_per_color(const nnz_lno_persistent_work_host_view_t& long_rows_per_color_)
+    {
+      long_rows_per_color = long_rows_per_color_;
     }
 
-    void set_max_nnz(nnz_lno_t num_result_nnz_) {
-      this->max_nnz_input_row = num_result_nnz_;
+    nnz_lno_persistent_work_host_view_t get_max_row_length_per_color() const
+    {
+      return max_row_length_per_color;
+    }
+
+    void set_max_row_length_per_color(const nnz_lno_persistent_work_host_view_t& max_row_length_per_color_)
+    {
+      max_row_length_per_color = max_row_length_per_color_;
+    }
+
+    scalar_persistent_work_view_t get_long_row_x() const
+    {
+      return long_row_x;
+    }
+
+    void set_long_row_x(const scalar_persistent_work_view_t& long_row_x_)
+    {
+      long_row_x = long_row_x_;
     }
 
     void allocate_x_y_vectors(nnz_lno_t num_rows, nnz_lno_t num_cols, nnz_lno_t num_vecs){
@@ -514,7 +545,7 @@ namespace KokkosSparse{
         throw std::runtime_error("inverse diagonal does not exist until after numeric setup.");
       return inverse_diagonal;
     }
-    
+
     bool use_teams() const
     {
       return KokkosKernels::Impl::kk_is_gpu_exec_space<ExecutionSpace>();
diff --git a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
index d5c111862fba..a4d74614d741 100644
--- a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
+++ b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
@@ -50,12 +50,12 @@
 #include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_Timer.hpp>
 #include <Kokkos_Bitset.hpp>
-#include <Kokkos_Sort.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include "KokkosGraph_Distance1Color.hpp"
 #include "KokkosKernels_Uniform_Initialized_MemoryPool.hpp"
 #include "KokkosKernels_BitUtils.hpp"
 #include "KokkosKernels_SimpleUtils.hpp"
+#include "KokkosKernels_Sorting.hpp"
 
 //FOR DEBUGGING
 #include "KokkosBlas1_nrm2.hpp"
@@ -103,9 +103,9 @@ namespace KokkosSparse{
       typedef typename HandleType::scalar_persistent_work_view2d_t scalar_persistent_work_view2d_t;
       typedef typename HandleType::scalar_persistent_work_view_t scalar_persistent_work_view_t;
 
-      typedef Kokkos::RangePolicy<MyExecSpace> my_exec_space;
-      typedef nnz_lno_t color_t;
-      typedef Kokkos::View<color_t *, MyTempMemorySpace> color_view_t;
+      typedef Kokkos::RangePolicy<MyExecSpace> range_pol;
+      typedef typename HandleType::GraphColoringHandleType::color_view_t color_view_t;
+      typedef typename HandleType::GraphColoringHandleType::color_t color_t;
       typedef Kokkos::Bitset<MyExecSpace> bitset_t;
       typedef Kokkos::ConstBitset<MyExecSpace> const_bitset_t;
 
@@ -114,9 +114,12 @@ namespace KokkosSparse{
 
       struct BlockTag{};
       struct BigBlockTag{};
+      struct LongRowTag{};
 
-      typedef Kokkos::TeamPolicy<BlockTag, MyExecSpace> block_team_fill_policy_t ;
-      typedef Kokkos::TeamPolicy<BigBlockTag, MyExecSpace> bigblock_team_fill_policy_t ;
+      typedef Kokkos::TeamPolicy<BlockTag, MyExecSpace> block_apply_team_policy_t ;
+      typedef Kokkos::TeamPolicy<BigBlockTag, MyExecSpace> bigblock_apply_team_policy_t ;
+      typedef Kokkos::RangePolicy<LongRowTag, MyExecSpace> longrow_apply_range_policy_t ;
+      typedef Kokkos::TeamPolicy<LongRowTag, MyExecSpace> longrow_apply_team_policy_t ;
       typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_scalar_t> pool_memory_space;
 
     private:
@@ -145,7 +148,7 @@ namespace KokkosSparse{
       bool is_symmetric;
 
       //Batch size for column applies. Used as a stack array size, so must be a compile-time constant.
-      static constexpr nnz_lno_t apply_batch_size = 16;
+      static constexpr nnz_lno_t apply_batch_size = 8;
 
     public:
 
@@ -161,6 +164,11 @@ namespace KokkosSparse{
 
         nnz_scalar_t omega;
 
+        scalar_persistent_work_view_t _long_row_x;  //Results of simple Ax matvec over long rows.
+        nnz_lno_t _long_row_col;  //Which X/Y column is now being processed for long rows.
+        nnz_lno_t _color_set_begin; //(only used for long rows): where the current set of rows begins
+        nnz_lno_t _long_row_par;
+
         PSGS(row_lno_persistent_work_view_t xadj_, nnz_lno_persistent_work_view_t adj_, scalar_persistent_work_view_t adj_vals_,
              scalar_persistent_work_view2d_t Xvector_, scalar_persistent_work_view2d_t Yvector_, nnz_lno_persistent_work_view_t /* color_adj_ */,
              nnz_scalar_t omega_,
@@ -197,6 +205,25 @@ namespace KokkosSparse{
               _Xvector(ii, batch_start + i) += omega * sum[i] * invDiagonalVal;
           }
         }
+
+        //Work item i sums chunk (i % _long_row_par) of row _color_set_begin + i / _long_row_par and adds it to that row's _long_row_x slot.
+        KOKKOS_INLINE_FUNCTION
+        void operator()(const LongRowTag&, const nnz_lno_t i) const {
+          const nnz_lno_t localRow = i / _long_row_par;
+          const nnz_lno_t row = _color_set_begin + localRow;
+          const nnz_lno_t chunk = i % _long_row_par;
+          const size_type row_begin = _xadj(row);
+          const size_type row_end = _xadj(row + 1);
+          const size_type rowLength = row_end - row_begin;
+          const size_type chunk_begin = row_begin + rowLength * chunk / _long_row_par;
+          size_type chunk_end = row_begin + rowLength * (chunk + 1) / _long_row_par;
+          if(chunk_end > row_end) chunk_end = row_end;
+          nnz_scalar_t localSum{};
+          for(size_type j = chunk_begin; j < chunk_end; j++)
+            localSum += _adj_vals(j) * _Xvector(_adj(j), _long_row_col);
+          //Every chunk of a row adds into the same slot, so the update is atomic.
+          Kokkos::atomic_add(&_long_row_x(localRow), localSum);
+        }
       };
 
       struct Team_PSGS{
@@ -226,6 +253,11 @@ namespace KokkosSparse{
 
         typedef typename KokkosKernels::Impl::array_sum_reduce<nnz_scalar_t, apply_batch_size> batch_sum;
 
+        nnz_lno_persistent_work_view_t _long_rows;
+        scalar_persistent_work_view_t _long_row_x;
+        nnz_lno_t _long_row_col;  //Which X/Y column is now being processed for long rows.
+        nnz_lno_t _long_row_par;
+
         Team_PSGS(row_lno_persistent_work_view_t xadj_, nnz_lno_persistent_work_view_t adj_, scalar_persistent_work_view_t adj_vals_,
                   scalar_persistent_work_view2d_t Xvector_, scalar_persistent_work_view2d_t Yvector_,
                   nnz_lno_t color_set_begin, nnz_lno_t color_set_end,
@@ -579,11 +611,118 @@ namespace KokkosSparse{
             });
         }
 
+        //Each team reduces one chunk of one row and adds the partial sum to that row's _long_row_x slot.
+        KOKKOS_INLINE_FUNCTION
+        void operator()(const LongRowTag&, const team_member_t& teamMember) const {
+          const nnz_lno_t localRow = teamMember.league_rank() / _long_row_par;
+          const nnz_lno_t row = _color_set_begin + localRow;
+          const nnz_lno_t chunk = teamMember.league_rank() % _long_row_par;
+          const size_type row_begin = _xadj(row);
+          const size_type row_end = _xadj(row + 1);
+          const size_type rowLength = row_end - row_begin;
+          const size_type chunk_begin = row_begin + rowLength * chunk / _long_row_par;
+          size_type chunk_end = row_begin + rowLength * (chunk + 1) / _long_row_par;
+          if(chunk_end > row_end) chunk_end = row_end;
+          //The team's threads share the chunk; the reduction leaves the total in localSum.
+          nnz_scalar_t localSum;
+          Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, chunk_begin, chunk_end),
+            [&](size_type j, nnz_scalar_t& lsum)
+            {
+              lsum += _adj_vals(j) * _Xvector(_adj(j), _long_row_col);
+            }, localSum);
+          //One thread per team publishes the sum; other teams may target the same row, hence the atomic.
+          Kokkos::single(Kokkos::PerTeam(teamMember),
+            [&]() { Kokkos::atomic_add(&_long_row_x(localRow), localSum); });
+        }
+
         size_t team_shmem_size (int /* team_size */) const {
           return shared_memory_size;
         }
       };
 
+      //Orders row IDs: short rows before long rows (length >= longRowThreshold), ascending ID within each group.
+      struct LongRowComparator
+      {
+        KOKKOS_DEFAULTED_FUNCTION LongRowComparator() = default;
+        KOKKOS_INLINE_FUNCTION LongRowComparator(const in_lno_row_view_t& xadj_, nnz_lno_t longRowThreshold_)
+          : xadj(xadj_), longRowThreshold(longRowThreshold_)
+        {}
+
+        KOKKOS_INLINE_FUNCTION bool operator()(nnz_lno_t lhs, nnz_lno_t rhs) const
+        {
+          const bool lhsIsLong = xadj(lhs + 1) - xadj(lhs) >= longRowThreshold;
+          const bool rhsIsLong = xadj(rhs + 1) - xadj(rhs) >= longRowThreshold;
+          //Rows in different groups: lhs goes first exactly when rhs is the long one.
+          if(lhsIsLong != rhsIsLong)
+            return rhsIsLong;
+          //Same group: fall back to ascending row ID.
+          return lhs < rhs;
+        }
+
+        in_lno_row_view_t xadj;   //row map; row r has xadj(r + 1) - xadj(r) entries
+        size_type longRowThreshold;
+      };
+
+      //Team functor, one team per color set: sorts the set's rows with LongRowComparator (short rows first, then long rows, ascending ID in each group).
+      //Also populates long_rows_per_color and max_row_length_per_color, and raises the reduction value to this set's long-row count if that is larger.
+      struct SortIntoLongRowsFunctor
+      {
+        SortIntoLongRowsFunctor(
+            const in_lno_row_view_t& xadj_, nnz_lno_t longRowThreshold_,
+            const nnz_lno_persistent_work_view_t& color_xadj_, const nnz_lno_persistent_work_view_t& color_adj_,
+            const nnz_lno_persistent_work_view_t& long_rows_per_color_, const nnz_lno_persistent_work_view_t& max_row_length_per_color_)
+          : xadj(xadj_), longRowThreshold(longRowThreshold_), color_xadj(color_xadj_), color_adj(color_adj_),
+          long_rows_per_color(long_rows_per_color_), max_row_length_per_color(max_row_length_per_color_)
+        {}
+
+        KOKKOS_INLINE_FUNCTION void operator()(const team_member_t& t, nnz_lno_t& lmostPerColor) const
+        {
+          LongRowComparator comp(xadj, longRowThreshold);
+          nnz_lno_t color = t.league_rank();
+          nnz_lno_t colorBegin = color_xadj(color);
+          nnz_lno_t colorLen = color_xadj(color + 1) - colorBegin;
+          KokkosKernels::Impl::TeamBitonicSort(color_adj.data() + colorBegin, colorLen, t, comp);
+          t.team_barrier();
+          //Now that the color set is sorted, count how many long rows there were
+          nnz_lno_t numLongRows;
+          Kokkos::parallel_reduce(Kokkos::TeamThreadRange(t, colorBegin, colorBegin + colorLen),
+            [&](nnz_lno_t i, nnz_lno_t& lnumLongRows)
+            {
+              nnz_lno_t row = color_adj(i);
+              if(xadj(row + 1) - xadj(row) >= longRowThreshold)
+                lnumLongRows++;
+            }, numLongRows);
+          Kokkos::single(Kokkos::PerTeam(t),
+            [&]()
+            {
+              long_rows_per_color(color) = numLongRows;
+              if(numLongRows > lmostPerColor)
+                lmostPerColor = numLongRows;
+            });
+          nnz_lno_t max_row_length = 0;
+          Kokkos::parallel_reduce(Kokkos::TeamThreadRange(t, colorBegin, colorBegin + colorLen),
+            [&](nnz_lno_t i, nnz_lno_t& lmaxLength)
+            {
+              nnz_lno_t row = color_adj(i);
+              nnz_lno_t len = xadj(row + 1) - xadj(row);
+              if(len > lmaxLength)
+                lmaxLength = len;
+            }, Kokkos::Max<nnz_lno_t>(max_row_length));
+          Kokkos::single(Kokkos::PerTeam(t),
+            [&]()
+            {
+              max_row_length_per_color(color) = max_row_length;
+            });
+        }
+
+        in_lno_row_view_t xadj;
+        size_type longRowThreshold;
+        nnz_lno_persistent_work_view_t color_xadj;
+        nnz_lno_persistent_work_view_t color_adj;
+        nnz_lno_persistent_work_view_t long_rows_per_color;
+        nnz_lno_persistent_work_view_t max_row_length_per_color;
+      };
+
       /**
        * \brief constructor
        */
@@ -615,7 +754,6 @@ namespace KokkosSparse{
         is_symmetric(is_symmetric_){}
 
 
-
       /**
        * \brief constructor
        */
@@ -651,14 +789,11 @@ namespace KokkosSparse{
       void initialize_symbolic()
       {
         auto gsHandle = get_gs_handle();
-        typename HandleType::GraphColoringHandleType *gchandle = this->handle->get_graph_coloring_handle();
+        const size_type longRowThreshold = gsHandle->get_long_row_threshold();
 
-        if (gchandle == NULL)
-        {
-          this->handle->create_graph_coloring_handle();
-          gsHandle->set_owner_of_coloring(true);
-          gchandle = this->handle->get_graph_coloring_handle();
-        }
+        //Validate settings
+        if(gsHandle->get_block_size() > 1 && longRowThreshold > 0)
+          throw std::runtime_error("Can't use MTGS long row algorithm with blocks.");
 
         const_lno_row_view_t xadj = this->row_map;
         const_lno_nnz_view_t adj = this->entries;
@@ -669,31 +804,36 @@ namespace KokkosSparse{
 #endif
         typename HandleType::GraphColoringHandleType::color_view_t colors;
         color_t numColors;
-        if (!is_symmetric) {
-          if (gchandle->get_coloring_algo_type() == KokkosGraph::COLORING_EB) {
-
-            gchandle->symmetrize_and_calculate_lower_diagonal_edge_list(num_rows, xadj, adj);
-            KokkosGraph::Experimental::graph_color_symbolic <HandleType, const_lno_row_view_t, const_lno_nnz_view_t>
-              (this->handle, num_rows, num_rows, xadj, adj);
+        {
+          HandleType coloringHandle;
+          coloringHandle.create_graph_coloring_handle();
+          auto gchandle = coloringHandle.get_graph_coloring_handle();
+          if (!is_symmetric) {
+            if (gchandle->get_coloring_algo_type() == KokkosGraph::COLORING_EB) {
+
+              gchandle->symmetrize_and_calculate_lower_diagonal_edge_list(num_rows, xadj, adj);
+              KokkosGraph::Experimental::graph_color_symbolic <HandleType, const_lno_row_view_t, const_lno_nnz_view_t>
+                (&coloringHandle, num_rows, num_rows, xadj, adj);
+            }
+            else {
+              row_lno_temp_work_view_t tmp_xadj;
+              nnz_lno_temp_work_view_t tmp_adj;
+              KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap
+                < const_lno_row_view_t, const_lno_nnz_view_t,
+                  row_lno_temp_work_view_t, nnz_lno_temp_work_view_t,
+                  MyExecSpace>
+                (num_rows, xadj, adj, tmp_xadj, tmp_adj);
+              KokkosGraph::Experimental::graph_color_symbolic <HandleType, row_lno_temp_work_view_t, nnz_lno_temp_work_view_t>
+                (&coloringHandle, num_rows, num_rows, tmp_xadj, tmp_adj);
+            }
           }
           else {
-            row_lno_temp_work_view_t tmp_xadj;
-            nnz_lno_temp_work_view_t tmp_adj;
-            KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap
-              < const_lno_row_view_t, const_lno_nnz_view_t,
-                row_lno_temp_work_view_t, nnz_lno_temp_work_view_t,
-                MyExecSpace>
-              (num_rows, xadj, adj, tmp_xadj, tmp_adj);
-            KokkosGraph::Experimental::graph_color_symbolic <HandleType, row_lno_temp_work_view_t, nnz_lno_temp_work_view_t>
-              (this->handle, num_rows, num_rows, tmp_xadj, tmp_adj);
+            KokkosGraph::Experimental::graph_color_symbolic <HandleType, const_lno_row_view_t, const_lno_nnz_view_t>
+              (&coloringHandle, num_rows, num_rows, xadj, adj);
           }
+          colors =  gchandle->get_vertex_colors();
+          numColors = gchandle->get_num_colors();
         }
-        else {
-          KokkosGraph::Experimental::graph_color_symbolic <HandleType, const_lno_row_view_t, const_lno_nnz_view_t>
-            (this->handle, num_rows, num_rows, xadj, adj);
-        }
-        colors =  gchandle->get_vertex_colors();
-        numColors = gchandle->get_num_colors();
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
         std::cout << "COLORING_TIME:" << timer.seconds() << std::endl;
         timer.reset();
@@ -718,48 +858,64 @@ namespace KokkosSparse{
           <typename HandleType::GraphColoringHandleType::color_view_t,
            nnz_lno_persistent_work_view_t, MyExecSpace>
           (num_rows, numColors, colors, color_xadj, color_adj);
-        MyExecSpace().fence();
 
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+        MyExecSpace().fence();
         std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() << std::endl;
         timer.reset();
 #endif
 
         nnz_lno_persistent_work_host_view_t  h_color_xadj = Kokkos::create_mirror_view (color_xadj);
         Kokkos::deep_copy (h_color_xadj , color_xadj);
-        MyExecSpace().fence();
-
 
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+        MyExecSpace().fence();
         std::cout << "DEEP_COPY:" << timer.seconds() << std::endl;
         timer.reset();
 #endif
-
-        // TODO BMK: Why are the vertices in each color set only being sorted on GPU?
-        // Wouldn't it have a locality benefit on CPU too?
-        if(KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
+        if(longRowThreshold > 0)
+        {
+          //Count long rows per color set, and sort color sets so that long rows come after regular rows
+          nnz_lno_persistent_work_view_t long_rows_per_color(Kokkos::ViewAllocateWithoutInitializing("long_rows_per_color"), numColors);
+          nnz_lno_persistent_work_view_t max_row_length_per_color(Kokkos::ViewAllocateWithoutInitializing("max_row_length_per_color"), numColors);
+          nnz_lno_t mostLongRowsInColor = 0;
+          Kokkos::parallel_reduce(team_policy_t(numColors, Kokkos::AUTO()),
+              SortIntoLongRowsFunctor(xadj, longRowThreshold, color_xadj, color_adj, long_rows_per_color, max_row_length_per_color),
+              Kokkos::Max<nnz_lno_t>(mostLongRowsInColor));
+          auto host_long_rows_per_color = Kokkos::create_mirror_view(long_rows_per_color);
+          Kokkos::deep_copy(host_long_rows_per_color, long_rows_per_color);
+          gsHandle->set_long_rows_per_color(host_long_rows_per_color);
+          auto host_max_row_length_per_color = Kokkos::create_mirror_view(max_row_length_per_color);
+          Kokkos::deep_copy(host_max_row_length_per_color, max_row_length_per_color);
+          gsHandle->set_max_row_length_per_color(host_max_row_length_per_color);
+          scalar_persistent_work_view_t long_row_x(Kokkos::ViewAllocateWithoutInitializing("long_row_x"), mostLongRowsInColor);
+          gsHandle->set_long_row_x(long_row_x);
+        }
+        else
+        {
+          //Just sort rows by ID.
           KokkosKernels::Impl::sort_crs_graph<MyExecSpace, decltype(color_xadj), decltype(color_adj)>(color_xadj, color_adj);
-          MyExecSpace().fence();
+        }
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
-          std::cout << "SORT_TIME:" << timer.seconds() << std::endl;
-          timer.reset();
+        MyExecSpace().fence();
+        std::cout << "SORT_TIME:" << timer.seconds() << std::endl;
+        timer.reset();
 #endif
-        }
 
         row_lno_persistent_work_view_t permuted_xadj ("new xadj", num_rows + 1);
         nnz_lno_persistent_work_view_t old_to_new_map ("old_to_new_index_", num_rows );
         nnz_lno_persistent_work_view_t permuted_adj ("newadj_", nnz );
 
-        Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::create_permuted_xadj", my_exec_space(0,num_rows),
+        Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::create_permuted_xadj", range_pol(0,num_rows),
                               create_permuted_xadj(
                                                    color_adj,
                                                    xadj,
                                                    permuted_xadj,
                                                    old_to_new_map));
         //std::cout << "create_permuted_xadj" << std::endl;
-        MyExecSpace().fence();
 
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+        MyExecSpace().fence();
         std::cout << "CREATE_PERMUTED_XADJ:" << timer.seconds() << std::endl;
 
         timer.reset();
@@ -768,15 +924,15 @@ namespace KokkosSparse{
         KokkosKernels::Impl::inclusive_parallel_prefix_sum
           <row_lno_persistent_work_view_t, MyExecSpace>
           (num_rows + 1, permuted_xadj);
-        MyExecSpace().fence();
 
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+        MyExecSpace().fence();
         std::cout << "INCLUSIVE_PPS:" << timer.seconds() << std::endl;
         timer.reset();
 #endif
 
 
-        Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::fill_matrix_symbolic",my_exec_space(0,num_rows),
+        Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::fill_matrix_symbolic",range_pol(0,num_rows),
                               fill_matrix_symbolic(
                                                    num_rows,
                                                    color_adj,
@@ -787,9 +943,9 @@ namespace KokkosSparse{
                                                    permuted_adj,
                                                    //newvals_,
                                                    old_to_new_map));
-        MyExecSpace().fence();
 
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+        MyExecSpace().fence();
         std::cout << "SYMBOLIC_FILL:" << timer.seconds() << std::endl;
         timer.reset();
 #endif
@@ -803,7 +959,6 @@ namespace KokkosSparse{
           //first calculate max row size.
           size_type max_row_size = 0;
           KokkosKernels::Impl::kk_view_reduce_max_row_size<size_type, MyExecSpace>(num_rows, permuted_xadj.data(), permuted_xadj.data() + 1, max_row_size);
-          gsHandle->set_max_nnz(max_row_size);
 
           nnz_lno_t brows = permuted_xadj.extent(0) - 1;
           size_type bnnz =  permuted_adj.extent(0) * block_size * block_size;
@@ -884,7 +1039,6 @@ namespace KokkosSparse{
             }
           }
 
-          gsHandle->set_max_nnz(max_row_size);
           gsHandle->set_level_1_mem(level_1_mem);
           gsHandle->set_level_2_mem(level_2_mem);
 
@@ -899,10 +1053,6 @@ namespace KokkosSparse{
         gsHandle->set_new_xadj(permuted_xadj);
         gsHandle->set_new_adj(permuted_adj);
         gsHandle->set_old_to_new_map(old_to_new_map);
-        if(gsHandle->is_owner_of_coloring()) {
-          this->handle->destroy_graph_coloring_handle();
-          gsHandle->set_owner_of_coloring(false);
-        }
         gsHandle->set_call_symbolic(true);
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
         std::cout << "ALLOC:" << timer.seconds() << std::endl;
@@ -1147,7 +1297,7 @@ namespace KokkosSparse{
 
           if (KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
             Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_fill_matrix_numeric",
-                                  team_policy_t(num_rows / rows_per_team + 1 , suggested_team_size, suggested_vector_size),
+                                  team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size),
                                   fill_matrix_numeric(
                                                       color_adj,
                                                       xadj,
@@ -1163,7 +1313,7 @@ namespace KokkosSparse{
                                                       ));
           }
           else {
-            Kokkos::parallel_for( "KokkosSparse::GaussSeidel::fill_matrix_numeric",my_exec_space(0,num_rows),
+            Kokkos::parallel_for( "KokkosSparse::GaussSeidel::fill_matrix_numeric",range_pol(0,num_rows),
                                   fill_matrix_numeric(
                                                       color_adj,
                                                       xadj,
@@ -1178,7 +1328,6 @@ namespace KokkosSparse{
                                                       block_matrix_size
                                                       ));
           }
-          MyExecSpace().fence();
           gsHandle->set_new_adj_val(permuted_adj_vals);
 
           scalar_persistent_work_view_t permuted_inverse_diagonal (Kokkos::ViewAllocateWithoutInitializing("permuted_inverse_diagonal"), num_rows * block_size );
@@ -1196,7 +1345,7 @@ namespace KokkosSparse{
             }
             else {
               Kokkos::parallel_for("KokkosSparse::GaussSeidel::get_matrix_diagonals",
-                                   my_exec_space(0,num_rows),
+                                   range_pol(0,num_rows),
                                    gmd );
             }
 
@@ -1225,15 +1374,43 @@ namespace KokkosSparse{
 
           }
 
-          MyExecSpace().fence();
           gsHandle->set_permuted_inverse_diagonal(permuted_inverse_diagonal);
           gsHandle->set_call_numeric(true);
         }
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
+        MyExecSpace().fence();
         std::cout << "NUMERIC:" << timer.seconds() << std::endl;
 #endif
       }
 
+      //Functor to update unknown entries corresponding to long rows (in the permuted x). Launched over a 1D index range; each index i updates permuted_x(i) in place.
+      template<typename x_value_array_type, typename y_value_array_type>
+      struct LongRowUpdateFunctor
+      {
+        LongRowUpdateFunctor(
+            const x_value_array_type& permuted_x_,  //view updated in place, indexed by a single index i
+            const y_value_array_type& permuted_y_,  //view read at the same index i as permuted_x_
+            const scalar_persistent_work_view_t& long_row_x_,  //per-long-row values, read at (i - long_row_begin_)
+            const scalar_persistent_work_view_t& permuted_inverse_diagonal_,  //read at the same index i as permuted_x_
+            nnz_scalar_t omega_,  //scalar factor multiplying every update
+            nnz_lno_t long_row_begin_)  //offset subtracted from i when indexing long_row_x_
+          : permuted_x(permuted_x_), permuted_y(permuted_y_), long_row_x(long_row_x_),
+          permuted_inverse_diagonal(permuted_inverse_diagonal_), omega(omega_), long_row_begin(long_row_begin_)
+        {}
+
+        KOKKOS_INLINE_FUNCTION void operator()(nnz_lno_t i) const  //i indexes permuted_x/permuted_y/permuted_inverse_diagonal directly
+        {
+          permuted_x(i) += omega * permuted_inverse_diagonal(i) * (permuted_y(i) - long_row_x(i - long_row_begin));  //long_row_x is addressed relative to long_row_begin
+        }
+
+        x_value_array_type permuted_x;  //output: modified in place by operator()
+        y_value_array_type permuted_y;  //input: only read by operator()
+        scalar_persistent_work_view_t long_row_x;  //input: presumably the per-row result accumulated by the preceding long-row kernel - TODO confirm
+        scalar_persistent_work_view_t permuted_inverse_diagonal;  //input: only read by operator()
+        nnz_scalar_t omega;  //scalar factor applied to each update
+        nnz_lno_t long_row_begin;  //value of i that maps to long_row_x(0)
+      };
+
       template <typename x_value_array_type, typename y_value_array_type>
       void block_apply(
                        x_value_array_type x_lhs_output_vec,
@@ -1275,7 +1452,6 @@ namespace KokkosSparse{
                                                           Permuted_Yvector
                                                           );
         }
-        MyExecSpace().fence();
         if(init_zero_x_vector) {
           KokkosKernels::Impl::zero_vector<scalar_persistent_work_view2d_t, MyExecSpace>(num_cols * block_size, Permuted_Xvector);
         }
@@ -1288,7 +1464,6 @@ namespace KokkosSparse{
                 Permuted_Xvector
                 );
         }
-        MyExecSpace().fence();
 
 #if KOKKOSSPARSE_IMPL_PRINTDEBUG
         std::cout << "Y:";
@@ -1346,7 +1521,7 @@ namespace KokkosSparse{
                             apply_backward);
 
 
-        //Kokkos::parallel_for( my_exec_space(0,nr), PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj));
+        //Kokkos::parallel_for( range_pol(0,nr), PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj));
 
 
         KokkosKernels::Impl::permute_block_vector
@@ -1356,8 +1531,6 @@ namespace KokkosSparse{
                                                                                                            Permuted_Xvector,
                                                                                                            x_lhs_output_vec
                                                                                                            );
-        MyExecSpace().fence();
-
 #if KOKKOSSPARSE_IMPL_PRINTDEBUG
         std::cout << "After X:";
         KokkosKernels::Impl::print_1Dview(Permuted_Xvector);
@@ -1404,7 +1577,6 @@ namespace KokkosSparse{
                  Permuted_Yvector
                  );
         }
-        MyExecSpace().fence();
         if(init_zero_x_vector) {
           KokkosKernels::Impl::zero_vector<scalar_persistent_work_view2d_t, MyExecSpace>(num_cols, Permuted_Xvector);
         }
@@ -1417,9 +1589,6 @@ namespace KokkosSparse{
                 Permuted_Xvector
                 );
         }
-        MyExecSpace().fence();
-
-        nnz_lno_persistent_work_host_view_t h_color_xadj = gsHandle->get_color_xadj();
 
 #if KOKKOSSPARSE_IMPL_PRINTDEBUG
         std::cout << "--point Before X:";
@@ -1428,6 +1597,7 @@ namespace KokkosSparse{
         KokkosKernels::Impl::print_1Dview(Permuted_Yvector,true);
 #endif
 
+        nnz_lno_persistent_work_host_view_t h_color_xadj = gsHandle->get_color_xadj();
         if(gsHandle->get_algorithm_type() == GS_PERMUTED) {
           PSGS gs(newxadj, newadj, newadj_vals,
                   Permuted_Xvector, Permuted_Yvector, color_adj, omega, permuted_inverse_diagonal);
@@ -1454,7 +1624,7 @@ namespace KokkosSparse{
                               apply_backward);
         }
 
-        //Kokkos::parallel_for( my_exec_space(0,nr), PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj));
+        //Kokkos::parallel_for( range_pol(0,nr), PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj));
 
         KokkosKernels::Impl::permute_vector
           <scalar_persistent_work_view2d_t, x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>(
@@ -1463,7 +1633,6 @@ namespace KokkosSparse{
               Permuted_Xvector,
               x_lhs_output_vec
               );
-        MyExecSpace().fence();
 #if KOKKOSSPARSE_IMPL_PRINTDEBUG
         std::cout << "--point After X:";
         KokkosKernels::Impl::print_1Dview(Permuted_Xvector);
@@ -1516,79 +1685,95 @@ namespace KokkosSparse{
                          nnz_lno_persistent_work_host_view_t h_color_xadj,
                          int num_iteration,
                          bool apply_forward,
-                         bool apply_backward){
-
-        for (int i = 0; i < num_iteration; ++i){
-          this->DoPSGS(gs, numColors, h_color_xadj, apply_forward, apply_backward);
+                         bool apply_backward)
+      {
+        auto gsHandle = this->get_gs_handle();
+        nnz_lno_persistent_work_host_view_t long_rows_per_color;
+        nnz_lno_persistent_work_host_view_t max_row_length_per_color;
+        scalar_persistent_work_view_t long_row_x;
+        bool haveLongRows = false;
+        int longRowTeamSize = 1;
+        if(gsHandle->get_long_row_threshold() > 0)
+        {
+          long_rows_per_color = gsHandle->get_long_rows_per_color();
+          max_row_length_per_color = gsHandle->get_max_row_length_per_color();
+          long_row_x = gsHandle->get_long_row_x();
+          haveLongRows = true;
+          longrow_apply_team_policy_t tempPolicy(1, 1);
+          longRowTeamSize = tempPolicy.team_size_recommended(gs, Kokkos::ParallelForTag());
         }
-      }
-
-      void DoPSGS(Team_PSGS &gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj,
-                  bool apply_forward,
-                  bool apply_backward){
-
-        nnz_lno_t suggested_team_size = gs.suggested_team_size;
-        nnz_lno_t team_row_chunk_size = gs.team_work_size;
-        int vector_size = gs.vector_size;
-        nnz_lno_t block_size = get_gs_handle()->get_block_size();
 
-        if (apply_forward){
-          gs.is_backward = false;
-
-          for (color_t i = 0; i < numColors; ++i){
-            nnz_lno_t color_index_begin = h_color_xadj(i);
-            nnz_lno_t color_index_end = h_color_xadj(i + 1);
-            int overall_work = color_index_end - color_index_begin;// /256 + 1;
-            gs._color_set_begin = color_index_begin;
-            gs._color_set_end = color_index_end;
-
-            if (block_size == 1){
-              Kokkos::parallel_for("KokkosSparse::GaussSeidel::Team_PSGS::forward",
-                                   team_policy_t(overall_work / team_row_chunk_size + 1 , suggested_team_size, vector_size),
-                                   gs );
-            } else if (gs.num_max_vals_in_l2 == 0){
-              Kokkos::parallel_for("KokkosSparse::GaussSeidel::BLOCK_Team_PSGS::forward",
-                                   block_team_fill_policy_t(overall_work / team_row_chunk_size + 1 , suggested_team_size, vector_size),
-                                   gs );
-            }
-            else {
-              Kokkos::parallel_for("KokkosSparse::GaussSeidel::BIGBLOCK_Team_PSGS::forward",
-                                   bigblock_team_fill_policy_t(overall_work / team_row_chunk_size + 1 , suggested_team_size, vector_size),
-                                   gs );
-            }
+        for (int iter = 0; iter < num_iteration; ++iter){
+          nnz_lno_t suggested_team_size = gs.suggested_team_size;
+          nnz_lno_t team_row_chunk_size = gs.team_work_size;
+          int vector_size = gs.vector_size;
+          nnz_lno_t block_size = gsHandle->get_block_size();
 
-            MyExecSpace().fence();
-          }
-        }
-        if (apply_backward){
-          gs.is_backward = true;
-          if (numColors > 0)
-            for (color_t i = numColors - 1;  ; --i){
+          for (int doingBackward = 0; doingBackward < 2; doingBackward++) {
+            const char* labelRegular = doingBackward ? "KokkosSparse::GaussSeidel::Team_PSGS::backward" :
+              "KokkosSparse::GaussSeidel::Team_PSGS::forward";
+            const char* labelBlock = doingBackward ? "KokkosSparse::GaussSeidel::BLOCK_Team_PSGS::backward" :
+              "KokkosSparse::GaussSeidel::BLOCK_Team_PSGS::forward";
+            const char* labelBigBlock = doingBackward ? "KokkosSparse::GaussSeidel::BIGBLOCK_Team_PSGS::backward" :
+              "KokkosSparse::GaussSeidel::BIGBLOCK_Team_PSGS::forward";
+            const char* labelLong = doingBackward ? "KokkosSparse::GaussSeidel::Team_PSGS::backwardLongRows" :
+              "KokkosSparse::GaussSeidel::Team_PSGS::forwardLongRows";
+
+            if(!doingBackward && !apply_forward)
+              continue;
+            if(doingBackward && !apply_backward)
+              continue;
+            gs.is_backward = doingBackward;
+
+            for (color_t colorIter = 0; colorIter < numColors; ++colorIter){
+              //i is just the color set now being processed
+              color_t i = doingBackward ? (numColors - colorIter - 1) : colorIter;
               nnz_lno_t color_index_begin = h_color_xadj(i);
               nnz_lno_t color_index_end = h_color_xadj(i + 1);
-              nnz_lno_t numberOfTeams = color_index_end - color_index_begin;// /256 + 1;
+              nnz_lno_t numLongRows = haveLongRows ? long_rows_per_color(i) : 0;
+              nnz_lno_t numRegularRows = color_index_end - color_index_begin - numLongRows;
+
               gs._color_set_begin = color_index_begin;
-              gs._color_set_end = color_index_end;
-              if (block_size == 1){
-                Kokkos::parallel_for("KokkosSparse::GaussSeidel::Team_PSGS::backward",
-                                     team_policy_t(numberOfTeams / team_row_chunk_size + 1 , suggested_team_size, vector_size),
-                                     gs );
-              }
-              else if ( gs.num_max_vals_in_l2 == 0){
-                Kokkos::parallel_for("KokkosSparse::GaussSeidel::BLOCK_Team_PSGS::backward",
-                                     block_team_fill_policy_t(numberOfTeams / team_row_chunk_size + 1 , suggested_team_size, vector_size),
-                                     gs );
-              }
-              else {
-                Kokkos::parallel_for("KokkosSparse::GaussSeidel::BIGBLOCK_Team_PSGS::backward",
-                                     bigblock_team_fill_policy_t(numberOfTeams / team_row_chunk_size + 1 , suggested_team_size, vector_size),
-                                     gs );
+              gs._color_set_end = color_index_end - numLongRows;
+
+              if (numRegularRows) {
+                if (block_size == 1){
+                  Kokkos::parallel_for(labelRegular,
+                                       team_policy_t((numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size),
+                                       gs );
+                } else if (gs.num_max_vals_in_l2 == 0){
+                  Kokkos::parallel_for(labelBlock,
+                                       block_apply_team_policy_t((numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size),
+                                       gs );
+                }
+                else {
+                  Kokkos::parallel_for(labelBigBlock,
+                                       bigblock_apply_team_policy_t((numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size),
+                                       gs );
+                }
               }
-              MyExecSpace().fence();
-              if (i == 0){
-                break;
+              if (numLongRows) {
+                gs._color_set_begin = color_index_end - numLongRows;
+                gs._color_set_end = color_index_end;
+                gs._long_row_x = long_row_x;
+                nnz_lno_t max_par = max_row_length_per_color(i);
+                nnz_lno_t teams_per_row = ((max_par + 3) / 4 + longRowTeamSize - 1) / longRowTeamSize;
+                gs._long_row_par = teams_per_row;
+                for(nnz_lno_t long_row_col = 0; long_row_col < gs._Xvector.extent_int(1); long_row_col++) {
+                  auto Xcol = Kokkos::subview(gs._Xvector, Kokkos::ALL(), long_row_col);
+                  auto Ycol = Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col);
+                  gs._long_row_col = long_row_col;
+                  Kokkos::deep_copy(long_row_x, nnz_scalar_t());
+                  Kokkos::parallel_for(labelLong,
+                      longrow_apply_team_policy_t(numLongRows * teams_per_row, longRowTeamSize), gs);
+                  Kokkos::parallel_for("KokkosSparse::GaussSeidel::LongRows::x_update",
+                      range_pol(color_index_end - numLongRows, color_index_end),
+                      LongRowUpdateFunctor<decltype(Xcol), decltype(Ycol)>
+                      (Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows));
+                }
               }
             }
+          }
         }
       }
 
@@ -1598,34 +1783,60 @@ namespace KokkosSparse{
                          nnz_lno_persistent_work_host_view_t h_color_xadj,
                          int num_iteration,
                          bool apply_forward,
-                         bool apply_backward){
-
-        for (int i = 0; i < num_iteration; ++i){
-          this->DoPSGS(gs, numColors, h_color_xadj, apply_forward, apply_backward);
+                         bool apply_backward)
+      {
+        auto gsHandle = this->get_gs_handle();
+        nnz_lno_persistent_work_host_view_t long_rows_per_color;
+        nnz_lno_persistent_work_host_view_t max_row_length_per_color;
+        scalar_persistent_work_view_t long_row_x;
+        bool haveLongRows = false;
+        if(gsHandle->get_long_row_threshold() > 0)
+        {
+          long_rows_per_color = gsHandle->get_long_rows_per_color();
+          max_row_length_per_color = gsHandle->get_max_row_length_per_color();
+          long_row_x = gsHandle->get_long_row_x();
+          gs._long_row_x = long_row_x;
+          haveLongRows = true;
         }
-      }
 
-      void DoPSGS(PSGS &gs, color_t numColors, nnz_lno_persistent_work_host_view_t h_color_xadj,
-                  bool apply_forward,
-                  bool apply_backward){
-        if (apply_forward){
-          for (color_t i = 0; i < numColors; ++i){
-            nnz_lno_t color_index_begin = h_color_xadj(i);
-            nnz_lno_t color_index_end = h_color_xadj(i + 1);
-            Kokkos::parallel_for ("KokkosSparse::GaussSeidel::PSGS::forward",
-                                  my_exec_space (color_index_begin, color_index_end) , gs);
-            MyExecSpace().fence();
-          }
-        }
-        if (apply_backward && numColors){
-          for (size_type i = numColors - 1; ; --i){
-            nnz_lno_t color_index_begin = h_color_xadj(i);
-            nnz_lno_t color_index_end = h_color_xadj(i + 1);
-            Kokkos::parallel_for ("KokkosSparse::GaussSeidel::PSGS::backward",
-                                  my_exec_space (color_index_begin, color_index_end), gs);
-            MyExecSpace().fence();
-            if (i == 0){
-              break;
+        for (int iter = 0; iter < num_iteration; ++iter) {
+          for (int doingBackward = 0; doingBackward < 2; doingBackward++) {
+            if(!doingBackward && !apply_forward)
+              continue;
+            if(doingBackward && !apply_backward)
+              continue;
+
+            for (color_t colorIter = 0; colorIter < numColors; ++colorIter) {
+              //i is just the color set now being processed
+              color_t i = doingBackward ? (numColors - colorIter - 1) : colorIter;
+              const char* labelShort = doingBackward ? "KokkosSparse::GaussSeidel::PSGS::backward" :
+                "KokkosSparse::GaussSeidel::PSGS::forward";
+              const char* labelLong = doingBackward ? "KokkosSparse::GaussSeidel::PSGS::backwardLongRows" :
+                "KokkosSparse::GaussSeidel::PSGS::forwardLongRows";
+              nnz_lno_t color_index_begin = h_color_xadj(i);
+              nnz_lno_t color_index_end = h_color_xadj(i + 1);
+              nnz_lno_t numLongRows = haveLongRows ? long_rows_per_color(i) : 0;
+              nnz_lno_t numRegularRows = color_index_end - color_index_begin - numLongRows;
+              if(numRegularRows) {
+                Kokkos::parallel_for (labelShort, range_pol (color_index_begin, color_index_end - numLongRows) , gs);
+              }
+              if(numLongRows) {
+                gs._color_set_begin = color_index_end - numLongRows;
+                nnz_lno_t max_par = max_row_length_per_color(i);
+                nnz_lno_t par_per_row = (max_par + 1023) / 1024;
+                gs._long_row_par = par_per_row;
+                for(nnz_lno_t long_row_col = 0; long_row_col < gs._Xvector.extent_int(1); long_row_col++) {
+                  auto Xcol = Kokkos::subview(gs._Xvector, Kokkos::ALL(), long_row_col);
+                  auto Ycol = Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col);
+                  gs._long_row_col = long_row_col;
+                  Kokkos::deep_copy(long_row_x, nnz_scalar_t());
+                  Kokkos::parallel_for (labelLong, Kokkos::RangePolicy<MyExecSpace, LongRowTag>(0, numLongRows * par_per_row), gs);
+                  Kokkos::parallel_for("KokkosSparse::GaussSeidel::LongRows::x_update",
+                      range_pol(color_index_end - numLongRows, color_index_end),
+                      LongRowUpdateFunctor<decltype(Xcol), decltype(Ycol)>
+                      (Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows));
+                }
+              }
             }
           }
         }
diff --git a/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp b/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp
index f9936cc4d439..fa3d7194abfd 100644
--- a/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp
+++ b/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp
@@ -47,6 +47,8 @@
 
 #include "KokkosKernels_Utils.hpp"
 #include "Kokkos_ArithTraits.hpp"
+#include "KokkosSparse_spmv.hpp"
+#include "gtest/gtest.h"  //for EXPECT_**
 
 namespace Test {
   template<class ViewType, bool strided = std::is_same<typename ViewType::array_layout, Kokkos::LayoutStride>::value>
@@ -214,46 +216,6 @@ namespace Test {
     }
   };
 
-  template<class ViewTypeA, class ViewTypeX, class ViewTypeY>
-  void vanillaGEMV(char mode,
-      typename ViewTypeA::non_const_value_type alpha, const ViewTypeA& A, const ViewTypeX& x, 
-      typename ViewTypeY::non_const_value_type beta, const ViewTypeY& y)
-  {
-    using ScalarY = typename ViewTypeY::non_const_value_type;
-    using KAT_A = Kokkos::ArithTraits<typename ViewTypeA::non_const_value_type>;
-    using KAT_Y = Kokkos::ArithTraits<ScalarY>;
-    int M = A.extent(0);
-    int N = A.extent(1);
-    if(beta == KAT_Y::zero())
-      Kokkos::deep_copy(y, KAT_Y::zero());
-    if(mode == 'N') {
-      for(int i = 0; i < M; i++) {
-        ScalarY y_i = beta * y(i);
-        for(int j = 0; j < N; j++) {
-           y_i += alpha * A(i,j) * x(j);
-        }
-        y(i) = y_i;
-      }
-    } else if(mode == 'T') {
-      for(int j = 0; j < N; j++) {
-        ScalarY y_j = beta * y(j);
-        for(int i = 0; i < M; i++) {
-           y_j += alpha * A(i,j) * x(i);
-        }
-        y(j) = y_j;
-      }
-    } else if(mode == 'C') {
-      for(int j = 0; j < N; j++) {
-        ScalarY y_j = beta * y(j);
-        for(int i = 0; i < M; i++) {
-           y_j += alpha * KAT_A::conj (A(i,j)) * x(i);
-        }
-        y(j) = y_j;
-      }
-    }
-  }
-
-
   template<class T>
   class epsilon {
     public:
@@ -292,5 +254,103 @@ namespace Test {
     start = Kokkos::complex<double>(-mag, -mag);
     end = Kokkos::complex<double>(mag, mag);
   }
+
+  template<typename scalar_t, typename lno_t, typename size_type, typename device, typename crsMat_t>
+  crsMat_t symmetrize(crsMat_t A)
+  {
+    typedef typename crsMat_t::StaticCrsGraphType graph_t;
+    typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
+    typedef typename graph_t::row_map_type::non_const_type lno_view_t;
+    typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
+    auto host_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map);
+    auto host_entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries);
+    auto host_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values);
+    lno_t numRows = A.numRows();
+    //symmetrize as input_mat + input_mat^T, to still have a diagonally dominant matrix
+    typedef std::map<lno_t, scalar_t> Row;
+    std::vector<Row> symRows(numRows);
+    for(lno_t r = 0; r < numRows; r++)
+    {
+      auto& row = symRows[r];
+      for(size_type i = host_rowmap(r); i < host_rowmap(r + 1); i++)
+      {
+        lno_t c = host_entries(i);
+        auto& col = symRows[c];
+        auto it = row.find(c);
+        if(it == row.end())
+          row[c] = host_values(i);
+        else
+          row[c] += host_values(i);
+        it = col.find(r);
+        if(it == col.end())
+          col[r] = host_values(i);
+        else
+          col[r] += host_values(i);
+      }
+    }
+    //Count entries
+    Kokkos::View<size_type*, Kokkos::LayoutLeft, Kokkos::HostSpace> new_host_rowmap("Rowmap", numRows + 1);
+    size_t accum = 0;
+    for(lno_t r = 0; r <= numRows; r++)
+    {
+      new_host_rowmap(r) = accum;
+      if(r < numRows)
+        accum += symRows[r].size();
+    }
+    //Allocate new entries/values
+    Kokkos::View<lno_t*, Kokkos::LayoutLeft, Kokkos::HostSpace> new_host_entries("Entries", accum);
+    Kokkos::View<scalar_t*, Kokkos::LayoutLeft, Kokkos::HostSpace> new_host_values("Values", accum);
+    for(lno_t r = 0; r < numRows; r++)
+    {
+      auto rowIt = symRows[r].begin();
+      for(size_type i = new_host_rowmap(r); i < new_host_rowmap(r + 1); i++)
+      {
+        new_host_entries(i) = rowIt->first;
+        new_host_values(i) = rowIt->second;
+        rowIt++;
+      }
+    }
+    lno_view_t new_rowmap("Rowmap", numRows + 1);
+    lno_nnz_view_t new_entries("Entries", accum);
+    scalar_view_t new_values("Values", accum);
+    Kokkos::deep_copy(new_rowmap, new_host_rowmap);
+    Kokkos::deep_copy(new_entries, new_host_entries);
+    Kokkos::deep_copy(new_values, new_host_values);
+    return crsMat_t("SymA", numRows, numRows, accum, new_values, new_rowmap, new_entries);
+  }
+
+  //create_random_x_vector and create_random_y_vector can be used together to generate a random 
+  //linear system Ax = y.
+  template<typename vec_t>
+  vec_t create_random_x_vector(vec_t& kok_x, double max_value = 10.0) {
+    typedef typename vec_t::value_type scalar_t;
+    auto h_x = Kokkos::create_mirror_view (kok_x);
+    for (size_t j = 0; j < h_x.extent(1); ++j){
+      for (size_t i = 0; i < h_x.extent(0); ++i){
+        scalar_t r =
+            static_cast <scalar_t> (rand()) /
+            static_cast <scalar_t> (RAND_MAX / max_value);
+        h_x.access(i, j) = r;
+      }
+    }
+    Kokkos::deep_copy (kok_x, h_x);
+    return kok_x;
+  }
+
+  template <typename crsMat_t, typename vector_t>
+  vector_t create_random_y_vector(crsMat_t crsMat, vector_t x_vector){
+    vector_t y_vector (Kokkos::ViewAllocateWithoutInitializing("Y VECTOR"),
+        crsMat.numRows());
+    KokkosSparse::spmv("N", 1, crsMat, x_vector, 0, y_vector);
+    return y_vector;
+  }
+
+  template <typename crsMat_t, typename vector_t>
+  vector_t create_random_y_vector_mv(crsMat_t crsMat, vector_t x_vector){
+    vector_t y_vector (Kokkos::ViewAllocateWithoutInitializing("Y VECTOR"),
+        crsMat.numRows(), x_vector.extent(1));
+    KokkosSparse::spmv("N", 1, crsMat, x_vector, 0, y_vector);
+    return y_vector;
+  }
 }
 #endif
diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas2_gemv.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas2_gemv.hpp
index c9c01761244c..9ae63b5f8f17 100644
--- a/packages/kokkos-kernels/unit_test/blas/Test_Blas2_gemv.hpp
+++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas2_gemv.hpp
@@ -12,7 +12,6 @@ namespace Test {
     typedef typename ViewTypeA::value_type ScalarA;
     typedef typename ViewTypeX::value_type ScalarX;
     typedef typename ViewTypeY::value_type ScalarY;
-    typedef Kokkos::ArithTraits<ScalarY> KAT_Y;
 
     typedef multivector_layout_adapter<ViewTypeA> vfA_type;
     typedef Kokkos::View<ScalarX*[2],
@@ -26,8 +25,8 @@ namespace Test {
 
 
     ScalarA alpha = 3;
-    ScalarY beta = 5;
-    double eps = (std::is_same<typename KAT_Y::mag_type, float>::value ? 1e-3 : 5e-10);
+    ScalarX beta = 5;
+    double eps = (std::is_same<typename Kokkos::ArithTraits<ScalarY>::mag_type, float>::value ? 1e-3 : 1e-10);
 
     int ldx;
     int ldy;
@@ -43,6 +42,7 @@ namespace Test {
     BaseTypeY b_y("Y", ldy);
     BaseTypeY b_org_y("Org_Y", ldy);
     
+
     ViewTypeA A = vfA_type::view(b_A);
     ViewTypeX x = Kokkos::subview(b_x,Kokkos::ALL(),0);
     ViewTypeY y = Kokkos::subview(b_y,Kokkos::ALL(),0);
@@ -85,56 +85,56 @@ namespace Test {
     Kokkos::deep_copy(h_b_y,b_y);
     Kokkos::deep_copy(h_b_A,b_A);
 
+    typedef Kokkos::Details::ArithTraits<typename ViewTypeA::non_const_value_type> KAT;
     Kokkos::View<ScalarY*, Kokkos::HostSpace> expected("expected aAx+by", ldy);
-    Kokkos::deep_copy(expected, h_org_y);
-    vanillaGEMV(mode[0], alpha, h_A, h_x, beta, expected);
+    if(mode[0] == 'N') {
+      for(int i = 0; i < M; i++) {
+        ScalarY y_i = beta * h_org_y(i);
+        for(int j = 0; j < N; j++) {
+           y_i += alpha * h_A(i,j) * h_x(j);
+        }
+        expected(i) = y_i;
+      }
+    } else if(mode[0] == 'T') {
+      for(int j = 0; j < N; j++) {
+        ScalarY y_j = beta * h_org_y(j);
+        for(int i = 0; i < M; i++) {
+           y_j += alpha * h_A(i,j) * h_x(i);
+        }
+        expected(j) = y_j;
+      }
+    } else if(mode[0] == 'C') {
+      for(int j = 0; j < N; j++) {
+        ScalarY y_j = beta * h_org_y(j);
+        for(int i = 0; i < M; i++) {
+           y_j += alpha * KAT::conj (h_A(i,j)) * h_x(i);
+        }
+        expected(j) = y_j;
+      }
+    }
 
     KokkosBlas::gemv(mode, alpha, A, x, beta, y);
     Kokkos::deep_copy(h_b_y, b_y);
-    int numErrors = 0;
     for(int i = 0; i < ldy; i++)
     {
-      if(KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i)))
-        numErrors++;
+      EXPECT_NEAR_KK(expected(i), h_y(i), eps * expected(i));
     }
-    EXPECT_EQ(numErrors, 0) << "Nonconst input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect";
  
     Kokkos::deep_copy(b_y, b_org_y);
     KokkosBlas::gemv(mode, alpha,A ,c_x, beta, y);
     Kokkos::deep_copy(h_b_y, b_y);
-    numErrors = 0;
     for(int i = 0; i < ldy; i++)
     {
-      if(KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i)))
-        numErrors++;
+      EXPECT_NEAR_KK(expected(i), h_y(i), eps);
     }
-    EXPECT_EQ(numErrors, 0) << "Const vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect";
 
     Kokkos::deep_copy(b_y, b_org_y);
     KokkosBlas::gemv(mode, alpha, c_A, c_x, beta, y);
     Kokkos::deep_copy(h_b_y, b_y);
-    numErrors = 0;
-    for(int i = 0; i < ldy; i++)
-    {
-      if(KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i)))
-        numErrors++;
-    }
-    EXPECT_EQ(numErrors, 0) << "Const matrix/vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect";
-    //Test once with beta = 0, but with y initially filled with NaN.
-    //This should overwrite the NaNs with the correct result.
-    beta = KAT_Y::zero();
-    //beta changed, so update the correct answer
-    vanillaGEMV(mode[0], alpha, h_A, h_x, beta, expected);
-    Kokkos::deep_copy(b_y, KAT_Y::nan());
-    KokkosBlas::gemv(mode, alpha, A, x, beta, y);
-    Kokkos::deep_copy(h_b_y, b_y);
-    numErrors = 0;
     for(int i = 0; i < ldy; i++)
     {
-      if(KAT_Y::isNan(h_y(i)) || KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i)))
-        numErrors++;
+      EXPECT_NEAR_KK(expected(i), h_y(i), eps);
     }
-    EXPECT_EQ(numErrors, 0) << "beta = 0, input contains NaN, A is " << M << 'x' << N << ", mode " << mode << ": gemv incorrect";
   }
 }
 
@@ -156,12 +156,8 @@ int test_gemv(const char* mode) {
   Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,200,10);
   #endif
   Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,0,1024);
-  Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,1024,0);
-  Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,13,13);
   Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,13,1024);
-  Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,50,40);
   Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,1024,1024);
-  Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,4321,4321);
   //Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,132231,1024);
 #endif
 
@@ -170,12 +166,8 @@ int test_gemv(const char* mode) {
   typedef Kokkos::View<ScalarX*, Kokkos::LayoutRight, Device> view_type_b_lr;
   typedef Kokkos::View<ScalarY*, Kokkos::LayoutRight, Device> view_type_c_lr;
   Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,0,1024);
-  Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,1024,0);
-  Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,13,13);
   Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,13,1024);
-  Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,50,40);
   Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,1024,1024);
-  Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,4321,4321);
   //Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,132231,1024);
 #endif
 
@@ -184,12 +176,8 @@ int test_gemv(const char* mode) {
   typedef Kokkos::View<ScalarX*, Kokkos::LayoutStride, Device> view_type_b_ls;
   typedef Kokkos::View<ScalarY*, Kokkos::LayoutStride, Device> view_type_c_ls;
   Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,0,1024);
-  Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,1024,0);
-  Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,13,13);
   Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,13,1024);
-  Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,50,40);
   Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,1024,1024);
-  Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,4321,4321);
   //Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,132231,1024);
 #endif
 
diff --git a/packages/kokkos-kernels/unit_test/blas/Test_Blas3_gemm.hpp b/packages/kokkos-kernels/unit_test/blas/Test_Blas3_gemm.hpp
index 4ef8dea47781..580de25397e4 100644
--- a/packages/kokkos-kernels/unit_test/blas/Test_Blas3_gemm.hpp
+++ b/packages/kokkos-kernels/unit_test/blas/Test_Blas3_gemm.hpp
@@ -7,7 +7,7 @@
 namespace Test {
 
   template<class ViewTypeA, class ViewTypeB, class ViewTypeC, class ExecutionSpace>
-  struct gemm_VanillaGEMM {
+  struct VanillaGEMM {
     bool A_t, B_t, A_c, B_c;
     int N,K;
     ViewTypeA A;
@@ -114,9 +114,8 @@ namespace Test {
     // Kokkos::fill_random(C,rand_pool,ScalarC(10));
     
     Kokkos::deep_copy(C2,C);
-    Kokkos::fence();
- 
-    struct gemm_VanillaGEMM<ViewTypeA,ViewTypeB,ViewTypeC,execution_space> vgemm;
+
+    struct VanillaGEMM<ViewTypeA,ViewTypeB,ViewTypeC,execution_space> vgemm;
     vgemm.A_t = A_t; vgemm.B_t = B_t;
     vgemm.A_c = A_c; vgemm.B_c = B_c;
     vgemm.N = N;     vgemm.K = K;
@@ -125,7 +124,7 @@ namespace Test {
     vgemm.alpha = alpha;
     vgemm.beta = beta;
 
-    Kokkos::parallel_for("KokkosBlas::Test::gemm_VanillaGEMM", Kokkos::TeamPolicy<execution_space>(M,Kokkos::AUTO,16), vgemm);
+    Kokkos::parallel_for("KokkosBlas::Test::VanillaGEMM", Kokkos::TeamPolicy<execution_space>(M,Kokkos::AUTO,16), vgemm);
 
     KokkosBlas::gemm(TA,TB,alpha,A,B,beta,C);
 
@@ -152,49 +151,67 @@ namespace Test {
   }
 }
 
-template<typename Scalar, typename Layout>
-void test_gemm()
-{
-  typedef Kokkos::View<Scalar**, Layout, TestExecSpace> view_type_a;
-  typedef Kokkos::View<Scalar**, Layout, TestExecSpace> view_type_b;
-  typedef Kokkos::View<Scalar**, Layout, TestExecSpace> view_type_c;
-  std::vector<const char*> modes = {"N", "T"};
-  if(std::is_same<Scalar, Kokkos::complex<float>>::value || std::is_same<Scalar, Kokkos::complex<double>>::value)
-    modes.push_back("C");
-  Scalar alpha = 4.5;
-  std::vector<Scalar> betas = {0.0, 3.0};
-  for(Scalar beta : betas)
-  {
-    for(auto amode : modes)
-    {
-      for(auto bmode : modes)
-      {
-        Test::impl_test_gemm<view_type_a, view_type_b, view_type_c, TestExecSpace>(amode,bmode,0,0,0,alpha,beta);
-        //BMK: N = 1 exercises the special GEMV code path in GEMM (currently, only for modes N/N)
-        Test::impl_test_gemm<view_type_a, view_type_b, view_type_c, TestExecSpace>(amode,bmode,50,1,40,alpha,beta);
-        Test::impl_test_gemm<view_type_a, view_type_b, view_type_c, TestExecSpace>(amode,bmode,13,15,17,alpha,beta);
-        Test::impl_test_gemm<view_type_a, view_type_b, view_type_c, TestExecSpace>(amode,bmode,179,15,211,alpha,beta);
-        Test::impl_test_gemm<view_type_a, view_type_b, view_type_c, TestExecSpace>(amode,bmode,12,3071,517,alpha,beta);
-      }
-    }
-  }
-}
 
-template<typename Scalar>
-void test_gemm_enabled_layouts()
-{
+
+template<class ScalarA, class ScalarB, class ScalarC, class Device>
+int test_gemm(const char* mode, ScalarA alpha, ScalarB beta) {
+
 #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-  test_gemm<Scalar, Kokkos::LayoutLeft>();
+  typedef Kokkos::View<ScalarA**, Kokkos::LayoutLeft, Device> view_type_a_ll;
+  typedef Kokkos::View<ScalarB**, Kokkos::LayoutLeft, Device> view_type_b_ll;
+  typedef Kokkos::View<ScalarC**, Kokkos::LayoutLeft, Device> view_type_c_ll;
+  Test::impl_test_gemm<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(&mode[0],&mode[1],0,0,0,alpha,beta);
+  Test::impl_test_gemm<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(&mode[0],&mode[1],13,15,17,alpha,beta);
+  Test::impl_test_gemm<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(&mode[0],&mode[1],179,15,211,alpha,beta);
+  Test::impl_test_gemm<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(&mode[0],&mode[1],12,3071,517,alpha,beta);
+  //Test::impl_test_gemm<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(&mode[0],&mode[1],1024,1024,2048,alpha,beta);
 #endif
+
 #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
-  test_gemm<Scalar, Kokkos::LayoutRight>();
+  typedef Kokkos::View<ScalarA**, Kokkos::LayoutRight, Device> view_type_a_lr;
+  typedef Kokkos::View<ScalarB**, Kokkos::LayoutRight, Device> view_type_b_lr;
+  typedef Kokkos::View<ScalarC**, Kokkos::LayoutRight, Device> view_type_c_lr;
+  Test::impl_test_gemm<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(&mode[0],&mode[1],0,0,0,alpha,beta);
+  Test::impl_test_gemm<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(&mode[0],&mode[1],13,15,17,alpha,beta);
+  Test::impl_test_gemm<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(&mode[0],&mode[1],179,15,211,alpha,beta);
+  Test::impl_test_gemm<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(&mode[0],&mode[1],12,3071,517,alpha,beta);
+  //Test::impl_test_gemm<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(&mode[0],&mode[1],1024,1024,2048,alpha,beta);
+#endif
+/*
+#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
+  typedef Kokkos::View<ScalarA**, Kokkos::LayoutStride, Device> view_type_a_ls;
+  typedef Kokkos::View<ScalarX*, Kokkos::LayoutStride, Device> view_type_b_ls;
+  typedef Kokkos::View<ScalarY*, Kokkos::LayoutStride, Device> view_type_c_ls;
+  Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,0,1024);
+  Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,13,1024);
+  Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,1024,1024);
+  Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,132231,1024);
+#endif
+
+#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)
+  Test::impl_test_gemv<view_type_a_ls, view_type_b_ll, view_type_c_lr, Device>(mode,1024,1024);
+  Test::impl_test_gemv<view_type_a_ll, view_type_b_ls, view_type_c_lr, Device>(mode,1024,1024);
 #endif
+*/
+  return 1;
 }
 
 #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
 TEST_F( TestCategory, gemm_float ) {
   Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_float");
-  test_gemm_enabled_layouts<float>();
+    float alpha = 5.0f;
+    float beta = 3.0f;
+    test_gemm<float,float,float,TestExecSpace> ("NN",alpha,beta);
+    test_gemm<float,float,float,TestExecSpace> ("TN",alpha,beta);
+    test_gemm<float,float,float,TestExecSpace> ("NT",alpha,beta);
+    test_gemm<float,float,float,TestExecSpace> ("TT",alpha,beta);
+
+    alpha = 4.5f;
+    beta = 0.0f;
+    test_gemm<float,float,float,TestExecSpace> ("NN",alpha,beta);
+    test_gemm<float,float,float,TestExecSpace> ("TN",alpha,beta);
+    test_gemm<float,float,float,TestExecSpace> ("NT",alpha,beta);
+    test_gemm<float,float,float,TestExecSpace> ("TT",alpha,beta);
   Kokkos::Profiling::popRegion();
 }
 #endif
@@ -202,7 +219,19 @@ TEST_F( TestCategory, gemm_float ) {
 #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
 TEST_F( TestCategory, gemm_double ) {
   Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_double");
-    test_gemm_enabled_layouts<double>();
+    double alpha = 5.0;
+    double beta = 3.0;
+    test_gemm<double,double,double,TestExecSpace> ("NN",alpha,beta);
+    test_gemm<double,double,double,TestExecSpace> ("TN",alpha,beta);
+    test_gemm<double,double,double,TestExecSpace> ("NT",alpha,beta);
+    test_gemm<double,double,double,TestExecSpace> ("TT",alpha,beta);
+
+    alpha = 4.5;
+    beta = 0.0;
+    test_gemm<double,double,double,TestExecSpace> ("NN",alpha,beta);
+    test_gemm<double,double,double,TestExecSpace> ("TN",alpha,beta);
+    test_gemm<double,double,double,TestExecSpace> ("NT",alpha,beta);
+    test_gemm<double,double,double,TestExecSpace> ("TT",alpha,beta);
   Kokkos::Profiling::popRegion();
 }
 #endif
@@ -210,7 +239,19 @@ TEST_F( TestCategory, gemm_double ) {
 #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
 TEST_F( TestCategory, gemm_complex_double ) {
   Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_complex_double");
-    test_gemm_enabled_layouts<Kokkos::complex<double>>();
+    Kokkos::complex<double> alpha = 5.0;
+    Kokkos::complex<double> beta = 3.0;
+    test_gemm<Kokkos::complex<double>,Kokkos::complex<double>,Kokkos::complex<double>,TestExecSpace> ("NN",alpha,beta);
+    test_gemm<Kokkos::complex<double>,Kokkos::complex<double>,Kokkos::complex<double>,TestExecSpace> ("CN",alpha,beta);
+    test_gemm<Kokkos::complex<double>,Kokkos::complex<double>,Kokkos::complex<double>,TestExecSpace> ("NC",alpha,beta);
+    test_gemm<Kokkos::complex<double>,Kokkos::complex<double>,Kokkos::complex<double>,TestExecSpace> ("CC",alpha,beta);
+
+    alpha = Kokkos::complex<double>(4.5,0.0);
+    beta = 0.0;
+    test_gemm<Kokkos::complex<double>,Kokkos::complex<double>,Kokkos::complex<double>,TestExecSpace> ("NN",alpha,beta);
+    test_gemm<Kokkos::complex<double>,Kokkos::complex<double>,Kokkos::complex<double>,TestExecSpace> ("CN",alpha,beta);
+    test_gemm<Kokkos::complex<double>,Kokkos::complex<double>,Kokkos::complex<double>,TestExecSpace> ("NC",alpha,beta);
+    test_gemm<Kokkos::complex<double>,Kokkos::complex<double>,Kokkos::complex<double>,TestExecSpace> ("CC",alpha,beta);
   Kokkos::Profiling::popRegion();
 }
 #endif
@@ -218,8 +259,33 @@ TEST_F( TestCategory, gemm_complex_double ) {
 #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
 TEST_F( TestCategory, gemm_complex_float ) {
   Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_complex_float");
-    test_gemm_enabled_layouts<Kokkos::complex<float>>();
+    Kokkos::complex<float> alpha = 5.0f;
+    Kokkos::complex<float> beta = 3.0f;
+    test_gemm<Kokkos::complex<float>,Kokkos::complex<float>,Kokkos::complex<float>,TestExecSpace> ("NN",alpha,beta);
+    test_gemm<Kokkos::complex<float>,Kokkos::complex<float>,Kokkos::complex<float>,TestExecSpace> ("CN",alpha,beta);
+    test_gemm<Kokkos::complex<float>,Kokkos::complex<float>,Kokkos::complex<float>,TestExecSpace> ("NC",alpha,beta);
+    test_gemm<Kokkos::complex<float>,Kokkos::complex<float>,Kokkos::complex<float>,TestExecSpace> ("CC",alpha,beta);
+
+    alpha = Kokkos::complex<float>(4.5f,0.0f);
+    beta = 0.0;
+    test_gemm<Kokkos::complex<float>,Kokkos::complex<float>,Kokkos::complex<float>,TestExecSpace> ("NN",alpha,beta);
+    test_gemm<Kokkos::complex<float>,Kokkos::complex<float>,Kokkos::complex<float>,TestExecSpace> ("CN",alpha,beta);
+    test_gemm<Kokkos::complex<float>,Kokkos::complex<float>,Kokkos::complex<float>,TestExecSpace> ("NC",alpha,beta);
+    test_gemm<Kokkos::complex<float>,Kokkos::complex<float>,Kokkos::complex<float>,TestExecSpace> ("CC",alpha,beta);
   Kokkos::Profiling::popRegion();
 }
 #endif
 
+/*
+#if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS))
+TEST_F( TestCategory, gemm_int ) {
+    test_gemm<int,int,int,TestExecSpace> ("N");
+}
+#endif
+
+#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)
+TEST_F( TestCategory, gemm_double_int ) {
+    test_gemm<double,int,float,TestExecSpace> ("N");
+}
+#endif
+*/
diff --git a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_CrsMatrix.hpp b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_CrsMatrix.hpp
index 85b427d445d6..6caa9d96a150 100644
--- a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_CrsMatrix.hpp
+++ b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_CrsMatrix.hpp
@@ -47,11 +47,15 @@
 #include <Kokkos_Core.hpp>
 #include <stdexcept>
 #include "KokkosSparse_CrsMatrix.hpp"
+#include "Kokkos_ArithTraits.hpp"
 
-#ifndef kokkos_complex_double
-#define kokkos_complex_double Kokkos::complex<double>
-#define kokkos_complex_float Kokkos::complex<float>
-#endif
+// #ifndef kokkos_complex_double
+// #define kokkos_complex_double Kokkos::complex<double>
+// #define kokkos_complex_float Kokkos::complex<float>
+// #endif
+
+typedef Kokkos::complex<double> kokkos_complex_double;
+typedef Kokkos::complex<float> kokkos_complex_float;
 
 namespace Test{ // anonymous
 
@@ -189,6 +193,40 @@ testCrsMatrix ()
   //printf ("A is %d by %d\n", A.numRows (), A.numCols ());
 }
 
+template <typename scalar_t, typename lno_t, typename size_type, typename device>
+void
+testCrsMatrixRawConstructor()
+{
+  int nrows = 5;
+  //note: last 2 columns will be empty.
+  //This makes sure the ncols provided to constructor is preserved.
+  int ncols = 7;
+  int nnz = 9;
+  //NOTE: this is not a mistake, the raw ptr constructor takes rowmap as ordinal.
+  std::vector<lno_t> rowmap = {0, 0, 2, 5, 6, 9};
+  std::vector<lno_t> entries = {3, 4, 0, 1, 2, 2, 0, 3, 4};
+  std::vector<scalar_t> values;
+  for(int i = 0; i < nnz; i++)
+    values.push_back(Kokkos::ArithTraits<scalar_t>::one() * (1.0 * rand() / RAND_MAX));
+  KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type> A(
+      "A", nrows, ncols, nnz, values.data(), rowmap.data(), entries.data());
+  EXPECT_EQ(A.numRows(), nrows);
+  EXPECT_EQ(A.numCols(), ncols);
+  EXPECT_EQ(A.nnz(), nnz);
+  //verify rowmap, entries, values: should all be identical to original raw arrays
+  //(except the rowmap elements are now size_type)
+  auto checkRowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map);
+  auto checkEntries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries);
+  auto checkValues = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values);
+  for(int i = 0; i < nrows + 1; i++)
+    EXPECT_EQ(checkRowmap(i), (size_type) rowmap[i]);
+  for(int i = 0; i < nnz; i++)
+  {
+    EXPECT_EQ(checkEntries(i), entries[i]);
+    EXPECT_EQ(checkValues(i), values[i]);
+  }
+}
+
 template <typename scalar_t, typename lno_t, typename size_type, typename device>
 void
 testCrsMatrixHostMirror ()
@@ -226,6 +264,7 @@ testCrsMatrixHostMirror ()
 #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \
 TEST_F( TestCategory, sparse ## _ ## crsmatrix ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
   testCrsMatrix<SCALAR, ORDINAL, OFFSET, DEVICE> (); \
+  testCrsMatrixRawConstructor<SCALAR, ORDINAL, OFFSET, DEVICE> (); \
 } \
 TEST_F( TestCategory, sparse ## _ ## crsmatrix_host_mirror ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
   testCrsMatrixHostMirror<SCALAR, ORDINAL, OFFSET, DEVICE> (); \
@@ -329,4 +368,4 @@ TEST_F( TestCategory, sparse ## _ ## crsmatrix_host_mirror ## _ ## SCALAR ## _ #
  EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
 #endif
 
-
+#undef EXECUTE_TEST
diff --git a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp
index 421e1a08889e..14187d3243bd 100644
--- a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp
+++ b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp
@@ -46,6 +46,7 @@
 #include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
+#include "KokkosKernels_TestUtils.hpp"
 #include "KokkosKernels_Handle.hpp"
 #include "KokkosKernels_IOUtils.hpp"
 //#include <Kokkos_Sparse_CrsMatrix.hpp>
@@ -58,10 +59,13 @@
 #include <complex>
 #include "KokkosSparse_gauss_seidel.hpp"
 
-#ifndef kokkos_complex_double
-#define kokkos_complex_double Kokkos::complex<double>
-#define kokkos_complex_float Kokkos::complex<float>
-#endif
+// #ifndef kokkos_complex_double
+// #define kokkos_complex_double Kokkos::complex<double>
+// #define kokkos_complex_float Kokkos::complex<float>
+// #endif
+
+typedef Kokkos::complex<double> kokkos_complex_double;
+typedef Kokkos::complex<float> kokkos_complex_float;
 
 using namespace KokkosKernels;
 using namespace KokkosKernels::Experimental;
@@ -139,37 +143,6 @@ int run_block_gauss_seidel_1(
   return 0;
 }
 
-template<typename vec_t>
-vec_t create_x_vector(vec_t& kok_x, double max_value = 10.0) {
-  typedef typename vec_t::value_type scalar_t;
-  auto h_x = Kokkos::create_mirror_view (kok_x);
-  for (size_t j = 0; j < h_x.extent(1); ++j){
-    for (size_t i = 0; i < h_x.extent(0); ++i){
-      scalar_t r =
-          static_cast <scalar_t> (rand()) /
-          static_cast <scalar_t> (RAND_MAX / max_value);
-      h_x.access(i, j) = r;
-    }
-  }
-  Kokkos::deep_copy (kok_x, h_x);
-  return kok_x;
-}
-
-template <typename crsMat_t, typename vector_t>
-vector_t create_y_vector(crsMat_t crsMat, vector_t x_vector){
-  vector_t y_vector (Kokkos::ViewAllocateWithoutInitializing("Y VECTOR"),
-      crsMat.numRows());
-  KokkosSparse::spmv("N", 1, crsMat, x_vector, 0, y_vector);
-  return y_vector;
-}
-
-template <typename crsMat_t, typename vector_t>
-vector_t create_y_vector_mv(crsMat_t crsMat, vector_t x_vector){
-  vector_t y_vector (Kokkos::ViewAllocateWithoutInitializing("Y VECTOR"),
-      crsMat.numRows(), x_vector.extent(1));
-  KokkosSparse::spmv("N", 1, crsMat, x_vector, 0, y_vector);
-  return y_vector;
-}
 }
 
 template <typename scalar_t, typename lno_t, typename size_type, typename device>
@@ -226,10 +199,10 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth
   lno_t nv = ((crsmat2.numRows() + block_size - 1) / block_size) * block_size;
 
   const scalar_view_t solution_x(Kokkos::ViewAllocateWithoutInitializing("X"), nv);
-  //create_x_vector operates on host mirror, then copies to device. But create_y does everything on device.
-  create_x_vector(solution_x);
+  //create_random_x_vector operates on host mirror, then copies to device. But create_y does everything on device.
+  create_random_x_vector(solution_x);
   exec_space().fence();
-  scalar_view_t y_vector = create_y_vector(crsmat2, solution_x);
+  scalar_view_t y_vector = create_random_y_vector(crsmat2, solution_x);
   mag_t initial_norm_res = KokkosBlas::nrm2(solution_x);
 #ifdef gauss_seidel_testmore
   GSAlgorithm gs_algorithms[] ={GS_DEFAULT, GS_TEAM, GS_PERMUTED};
@@ -252,7 +225,7 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth
 
     bool is_symmetric_graph = true;
     size_t shmem_size = 32128;
-    
+
     for(int i = 0; i < 2; ++i)
     {
       if (i == 1) shmem_size = 2008; //make the shmem small on gpus so that it will test 2 level algorithm.
@@ -335,8 +308,8 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth
   constexpr lno_t numVecs = 2;
 
   scalar_view2d_t solution_x(Kokkos::ViewAllocateWithoutInitializing("X"), nv, numVecs);
-  create_x_vector(solution_x);
-  scalar_view2d_t y_vector = create_y_vector_mv(crsmat2, solution_x);
+  create_random_x_vector(solution_x);
+  scalar_view2d_t y_vector = create_random_y_vector_mv(crsmat2, solution_x);
   exec_space().fence();
   auto solution_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), solution_x);
   //Need to fence before reading from solution_host
@@ -375,7 +348,7 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth
 
     scalar_view_t res_norms("Residuals", numVecs);
     auto h_res_norms = Kokkos::create_mirror_view(res_norms);
-    
+
     for(int i = 0; i < 2; ++i)
     {
       if (i == 1) shmem_size = 2008; //make the shmem small on gpus so that it will test 2 level algorithm.
@@ -518,6 +491,6 @@ TEST_F( TestCategory, sparse ## _ ## block_gauss_seidel_rank2 ## _ ## SCALAR ##
  EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
 #endif
 
-
+#undef EXECUTE_TEST
 
 
diff --git a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_gauss_seidel.hpp
index cbdb673bb168..83823dc14153 100644
--- a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_gauss_seidel.hpp
+++ b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_gauss_seidel.hpp
@@ -60,20 +60,68 @@
 #include "KokkosSparse_gauss_seidel.hpp"
 #include "KokkosSparse_partitioning_impl.hpp"
 #include "KokkosSparse_sor_sequential_impl.hpp"
+#include "KokkosKernels_Sorting.hpp"
+#include "KokkosKernels_TestUtils.hpp"
 
-#ifndef kokkos_complex_double
-#define kokkos_complex_double Kokkos::complex<double>
-#define kokkos_complex_float Kokkos::complex<float>
-#endif
+// #ifndef kokkos_complex_double
+// #define kokkos_complex_double Kokkos::complex<double>
+// #define kokkos_complex_float Kokkos::complex<float>
+// #endif
+
+typedef Kokkos::complex<double> kokkos_complex_double;
+typedef Kokkos::complex<float> kokkos_complex_float;
 
 using namespace KokkosKernels;
 using namespace KokkosKernels::Experimental;
 using namespace KokkosSparse;
 using namespace KokkosSparse::Experimental;
+
+
 namespace Test {
 
-template <typename crsMat_t, typename vec_t, typename device>
-int run_gauss_seidel(
+//Run GS on the given vectors, where the handle is already set up: performs symbolic + numeric setup on kh, then one apply call chosen by apply_type. The caller keeps ownership of kh (nothing is destroyed here).
+template <typename Handle, typename crsMat_t, typename vec_t>
+void run_gauss_seidel(
+    Handle& kh,
+    crsMat_t input_mat,
+    vec_t x_vector,
+    vec_t y_vector,
+    bool is_symmetric_graph,
+    typename crsMat_t::value_type omega,
+    int apply_type = 0 // 0 for symmetric, 1 for forward, 2 for backward; any other value is treated like 0.
+    )
+{
+  const size_t num_rows = input_mat.numRows();
+  const size_t num_cols = input_mat.numCols();
+  const int apply_count = 2; // forwarded as the last argument of every *_apply call below (presumably the sweep count -- confirm against the apply API)
+
+  gauss_seidel_symbolic // setup that receives only the sparsity pattern (row_map + entries)
+    (&kh, num_rows, num_cols, input_mat.graph.row_map, input_mat.graph.entries, is_symmetric_graph);
+  gauss_seidel_numeric // setup that additionally receives the matrix values
+    (&kh, num_rows, num_cols, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, is_symmetric_graph);
+
+  switch (apply_type){
+  case 0:
+    symmetric_gauss_seidel_apply
+      (&kh, num_rows, num_cols, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count);
+    break;
+  case 1:
+    forward_sweep_gauss_seidel_apply
+    (&kh, num_rows, num_cols, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count);
+    break;
+  case 2:
+    backward_sweep_gauss_seidel_apply
+    (&kh, num_rows, num_cols, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count);
+    break;
+  default: // unrecognized apply_type: same call as case 0 (symmetric)
+    symmetric_gauss_seidel_apply
+    (&kh, num_rows, num_cols, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count);
+    break;
+  }
+}
+
+template <typename crsMat_t, typename vec_t>
+void run_gauss_seidel(
     crsMat_t input_mat,
     GSAlgorithm gs_algorithm,
     vec_t x_vector,
@@ -82,16 +130,12 @@ int run_gauss_seidel(
     int apply_type = 0, // 0 for symmetric, 1 for forward, 2 for backward.
     int cluster_size = 1,
     bool classic = false, // only with two-stage, true for sptrsv instead of richardson
-    ClusteringAlgorithm clusterAlgo = CLUSTER_DEFAULT) 
+    ClusteringAlgorithm clusterAlgo = CLUSTER_DEFAULT)
 {
-  typedef typename crsMat_t::StaticCrsGraphType graph_t;
-  typedef typename graph_t::row_map_type lno_view_t;
-  typedef typename graph_t::entries_type lno_nnz_view_t;
-  typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
-
-  typedef typename lno_view_t::value_type size_type;
-  typedef typename lno_nnz_view_t::value_type lno_t;
-  typedef typename scalar_view_t::value_type scalar_t;
+  using size_type = typename crsMat_t::size_type;
+  using lno_t = typename crsMat_t::ordinal_type;
+  using scalar_t = typename crsMat_t::value_type;
+  using device = typename crsMat_t::device_type;
 
   typedef KokkosKernelsHandle
       <size_type,lno_t, scalar_t,
@@ -100,8 +144,6 @@ int run_gauss_seidel(
   scalar_t omega(0.9);
 
   KernelHandle kh;
-  kh.set_team_work_size(16);
-  kh.set_dynamic_scheduling(true);
   if(gs_algorithm == GS_CLUSTER)
     kh.create_gs_handle(clusterAlgo, cluster_size);
   else if(gs_algorithm == GS_TWOSTAGE) {
@@ -110,141 +152,18 @@ int run_gauss_seidel(
     kh.set_gs_twostage(!classic, input_mat.numRows());
     if (classic) {
       // two-stage with SpTRSV supports only omega = one
-      const scalar_t one = Kokkos::Details::ArithTraits<scalar_t>::one ();
-      omega = one;
+      omega = Kokkos::ArithTraits<scalar_t>::one ();
     }
   }
   else
     kh.create_gs_handle(GS_DEFAULT);
 
-  const size_t num_rows_1 = input_mat.numRows();
-  const size_t num_cols_1 = input_mat.numCols();
-  //const int apply_count = 100;
-  const int apply_count = 1;
-
-  gauss_seidel_symbolic
-    (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, is_symmetric_graph);
-  gauss_seidel_numeric
-    (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, is_symmetric_graph);
+  run_gauss_seidel(kh, input_mat, x_vector, y_vector, is_symmetric_graph, omega, apply_type);
 
-  switch (apply_type){
-  case 0:
-    symmetric_gauss_seidel_apply
-      (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count);
-    break;
-  case 1:
-    forward_sweep_gauss_seidel_apply
-    (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count);
-    break;
-  case 2:
-    backward_sweep_gauss_seidel_apply
-    (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count);
-    break;
-  default:
-    symmetric_gauss_seidel_apply
-    (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count);
-    break;
-  }
   kh.destroy_gs_handle();
-  return 0;
-}
-
-template<typename vec_t>
-vec_t create_x_vector(vec_t& kok_x, double max_value = 10.0) {
-  typedef typename vec_t::value_type scalar_t;
-  auto h_x = Kokkos::create_mirror_view (kok_x);
-  for (size_t j = 0; j < h_x.extent(1); ++j){
-    for (size_t i = 0; i < h_x.extent(0); ++i){
-      scalar_t r =
-          static_cast <scalar_t> (rand()) /
-          static_cast <scalar_t> (RAND_MAX / max_value);
-      h_x.access(i, j) = r;
-    }
-  }
-  Kokkos::deep_copy (kok_x, h_x);
-  return kok_x;
 }
 
-template <typename crsMat_t, typename vector_t>
-vector_t create_y_vector(crsMat_t crsMat, vector_t x_vector){
-  vector_t y_vector (Kokkos::ViewAllocateWithoutInitializing("Y VECTOR"),
-      crsMat.numRows());
-  KokkosSparse::spmv("N", 1, crsMat, x_vector, 0, y_vector);
-  return y_vector;
-}
-
-template <typename crsMat_t, typename vector_t>
-vector_t create_y_vector_mv(crsMat_t crsMat, vector_t x_vector){
-  vector_t y_vector (Kokkos::ViewAllocateWithoutInitializing("Y VECTOR"),
-      crsMat.numRows(), x_vector.extent(1));
-  KokkosSparse::spmv("N", 1, crsMat, x_vector, 0, y_vector);
-  return y_vector;
-}
-}
-
-template<typename scalar_t, typename lno_t, typename size_type, typename device, typename crsMat_t>
-crsMat_t symmetrize(crsMat_t A)
-{
-  typedef typename crsMat_t::StaticCrsGraphType graph_t;
-  typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
-  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
-  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
-  auto host_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map);
-  auto host_entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries);
-  auto host_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values);
-  lno_t numRows = A.numRows();
-  //symmetrize as input_mat + input_mat^T, to still have a diagonally dominant matrix
-  typedef std::map<lno_t, scalar_t> Row;
-  std::vector<Row> symRows(numRows);
-  for(lno_t r = 0; r < numRows; r++)
-  {
-    auto& row = symRows[r];
-    for(size_type i = host_rowmap(r); i < host_rowmap(r + 1); i++)
-    {
-      lno_t c = host_entries(i);
-      auto& col = symRows[c];
-      auto it = row.find(c);
-      if(it == row.end())
-        row[c] = host_values(i);
-      else
-        row[c] += host_values(i);
-      it = col.find(r);
-      if(it == col.end())
-        col[r] = host_values(i);
-      else
-        col[r] += host_values(i);
-    }
-  }
-  //Count entries
-  Kokkos::View<size_type*, Kokkos::LayoutLeft, Kokkos::HostSpace> new_host_rowmap("Rowmap", numRows + 1);
-  size_t accum = 0;
-  for(lno_t r = 0; r <= numRows; r++)
-  {
-    new_host_rowmap(r) = accum;
-    if(r < numRows)
-      accum += symRows[r].size();
-  }
-  //Allocate new entries/values
-  Kokkos::View<lno_t*, Kokkos::LayoutLeft, Kokkos::HostSpace> new_host_entries("Entries", accum);
-  Kokkos::View<scalar_t*, Kokkos::LayoutLeft, Kokkos::HostSpace> new_host_values("Values", accum);
-  for(lno_t r = 0; r < numRows; r++)
-  {
-    auto rowIt = symRows[r].begin();
-    for(size_type i = new_host_rowmap(r); i < new_host_rowmap(r + 1); i++)
-    {
-      new_host_entries(i) = rowIt->first;
-      new_host_values(i) = rowIt->second;
-      rowIt++;
-    }
-  }
-  lno_view_t new_rowmap("Rowmap", numRows + 1);
-  lno_nnz_view_t new_entries("Entries", accum);
-  scalar_view_t new_values("Values", accum);
-  Kokkos::deep_copy(new_rowmap, new_host_rowmap);
-  Kokkos::deep_copy(new_entries, new_host_entries);
-  Kokkos::deep_copy(new_values, new_host_values);
-  return crsMat_t("SymA", numRows, numRows, accum, new_values, new_rowmap, new_entries);
-}
+} // namespace Test
 
 template <typename scalar_t, typename lno_t, typename size_type, typename device>
 void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, bool symmetric)
@@ -259,13 +178,13 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_
   if(symmetric)
   {
     //Symmetrize on host, rather than relying on the parallel versions (those can be tested for symmetric=false)
-    input_mat = symmetrize<scalar_t, lno_t, size_type, device, crsMat_t>(input_mat);
+    input_mat = Test::symmetrize<scalar_t, lno_t, size_type, device, crsMat_t>(input_mat);
   }
   lno_t nv = input_mat.numRows();
   scalar_view_t solution_x(Kokkos::ViewAllocateWithoutInitializing("X (correct)"), nv);
-  create_x_vector(solution_x);
+  create_random_x_vector(solution_x);
   mag_t initial_norm_res = KokkosBlas::nrm2(solution_x);
-  scalar_view_t y_vector = create_y_vector(input_mat, solution_x);
+  scalar_view_t y_vector = create_random_y_vector(input_mat, solution_x);
   //GS_DEFAULT is GS_TEAM on CUDA and GS_PERMUTED on other spaces, and the behavior
   //of each algorithm _should be_ the same on every execution space, which is why
   //we just test GS_DEFAULT.
@@ -278,7 +197,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_
   {
     Kokkos::Impl::Timer timer1;
     Kokkos::deep_copy(x_vector, zero);
-    run_gauss_seidel<crsMat_t, scalar_view_t, device>(input_mat, GS_DEFAULT, x_vector, y_vector, symmetric, apply_type);
+    run_gauss_seidel(input_mat, GS_DEFAULT, x_vector, y_vector, symmetric, apply_type);
     //double gs = timer1.seconds();
     //KokkosKernels::Impl::print_1Dview(x_vector);
     KokkosBlas::axpby(one, solution_x, -one, x_vector);
@@ -297,7 +216,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_
         Kokkos::Impl::Timer timer1;
         //Zero out X before solving
         Kokkos::deep_copy(x_vector, zero);
-        run_gauss_seidel<crsMat_t, scalar_view_t, device>(
+        run_gauss_seidel(
             input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize], false, clusterAlgo);
         KokkosBlas::axpby(one, solution_x, -one, x_vector);
         mag_t result_norm_res = KokkosBlas::nrm2(x_vector);
@@ -309,8 +228,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_
   for (int apply_type = 0; apply_type < apply_count; ++apply_type)
   {
     Kokkos::deep_copy(x_vector, zero);
-    run_gauss_seidel<crsMat_t, scalar_view_t, device>
-      (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type);
+    run_gauss_seidel(input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type);
     KokkosBlas::axpby(one, solution_x, -one, x_vector);
     mag_t result_norm_res = KokkosBlas::nrm2(x_vector);
     EXPECT_LT(result_norm_res, initial_norm_res);
@@ -319,8 +237,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_
   for (int apply_type = 0; apply_type < apply_count; ++apply_type)
   {
     Kokkos::deep_copy(x_vector, zero);
-    run_gauss_seidel<crsMat_t, scalar_view_t, device>
-      (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true);
+    run_gauss_seidel(input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true);
     KokkosBlas::axpby(one, solution_x, -one, x_vector);
     mag_t result_norm_res = KokkosBlas::nrm2(x_vector);
     EXPECT_LT(result_norm_res, initial_norm_res);
@@ -342,14 +259,14 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_
   if(symmetric)
   {
     //Symmetrize on host, rather than relying on the parallel versions (those can be tested for symmetric=false)
-    input_mat = symmetrize<scalar_t, lno_t, size_type, device, crsMat_t>(input_mat);
+    input_mat = Test::symmetrize<scalar_t, lno_t, size_type, device, crsMat_t>(input_mat);
   }
   lno_t nv = input_mat.numRows();
   host_scalar_view2d_t solution_x(Kokkos::ViewAllocateWithoutInitializing("X (correct)"), nv, numVecs);
-  create_x_vector(solution_x);
+  create_random_x_vector(solution_x);
   scalar_view2d_t x_vector(Kokkos::ViewAllocateWithoutInitializing("X"), nv, numVecs);
   Kokkos::deep_copy(x_vector, solution_x);
-  scalar_view2d_t y_vector = create_y_vector_mv(input_mat, x_vector);
+  scalar_view2d_t y_vector = create_random_y_vector_mv(input_mat, x_vector);
   auto x_host = Kokkos::create_mirror_view(x_vector);
   std::vector<mag_t> initial_norms(numVecs);
   for(lno_t i = 0; i < numVecs; i++)
@@ -370,8 +287,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_
     Kokkos::Impl::Timer timer1;
     //Zero out X before solving
     Kokkos::deep_copy(x_vector, zero);
-    run_gauss_seidel<crsMat_t, scalar_view2d_t, device>(
-        input_mat, GS_DEFAULT, x_vector, y_vector, symmetric, apply_type);
+    run_gauss_seidel(input_mat, GS_DEFAULT, x_vector, y_vector, symmetric, apply_type);
     Kokkos::deep_copy(x_host, x_vector);
     for(lno_t i = 0; i < numVecs; i++)
     {
@@ -397,7 +313,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_
         Kokkos::Impl::Timer timer1;
         //Zero out X before solving
         Kokkos::deep_copy(x_vector, zero);
-        run_gauss_seidel<crsMat_t, scalar_view2d_t, device>(
+        run_gauss_seidel(
             input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, apply_type, clusterSizes[csize], (ClusteringAlgorithm) algo);
         Kokkos::deep_copy(x_host, x_vector);
         for(lno_t i = 0; i < numVecs; i++)
@@ -420,8 +336,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_
   {
     //Zero out X before solving
     Kokkos::deep_copy(x_vector, zero);
-    run_gauss_seidel<crsMat_t, scalar_view2d_t, device>
-      (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type);
+    run_gauss_seidel(input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type);
     Kokkos::deep_copy(x_host, x_vector);
     for(lno_t i = 0; i < numVecs; i++)
     {
@@ -441,8 +356,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_
   {
     //Zero out X before solving
     Kokkos::deep_copy(x_vector, zero);
-    run_gauss_seidel<crsMat_t, scalar_view2d_t, device>
-      (input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true);
+    run_gauss_seidel(input_mat, GS_TWOSTAGE, x_vector, y_vector, symmetric, apply_type, 0, true);
     Kokkos::deep_copy(x_host, x_vector);
     for(lno_t i = 0; i < numVecs; i++)
     {
@@ -484,7 +398,7 @@ void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t ro
   //record the correct solution, to compare against at the end
   vector_t xgold("X gold", numRows);
   Kokkos::deep_copy(xgold, x);
-  vector_t y = Test::create_y_vector(input_mat, x);
+  vector_t y = Test::create_random_y_vector(input_mat, x);
   exec_space().fence();
   auto y_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), y);
   //initial solution is zero
@@ -610,6 +524,96 @@ void test_sgs_zero_rows()
   }
 }
 
+template <typename scalar_t, typename lno_t, typename size_type, typename device>
+void test_gauss_seidel_long_rows(lno_t numRows, lno_t numLongRows, lno_t nnzPerShortRow, bool symmetric)
+{ // Builds a square matrix where numLongRows rows get numRows entries and the rest get nnzPerShortRow, then checks that GS_DEFAULT with a long-row threshold reduces the error.
+  using namespace Test;
+  typedef typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type> crsMat_t;
+  typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
+  typedef typename crsMat_t::index_type::non_const_type entries_view_t;
+  typedef typename crsMat_t::row_map_type::non_const_type rowmap_view_t;
+  typedef typename Kokkos::Details::ArithTraits<scalar_t>::mag_type mag_t;
+  const scalar_t one = Kokkos::ArithTraits<scalar_t>::one();
+  srand(245); // fixed seed for the rand() calls below
+  std::vector<size_type> rowmap = {0};
+  std::vector<lno_t> entries;
+  std::vector<scalar_t> values;
+  std::vector<lno_t> rowLengths;
+  for(lno_t i = 0; i < numRows; i++)
+  {
+    if(i < numLongRows)
+      rowLengths.push_back(numRows); // long row: as many entries as the matrix has rows
+    else
+      rowLengths.push_back(nnzPerShortRow);
+  }
+  std::random_shuffle(rowLengths.begin(), rowLengths.end()); // scatter the long rows instead of leaving them at the top
+  size_type totalEntries = 0;
+  int randSteps = 1000000;
+  scalar_t offDiagBase;
+  {
+    scalar_t unused; // second output of getRandomBounds is discarded; only offDiagBase is used below
+    Test::getRandomBounds(0.6, unused, offDiagBase);
+  }
+  for(lno_t i = 0; i < numRows; i++)
+  {
+    for(lno_t ent = 0; ent < rowLengths[i]; ent++)
+    {
+      if(ent == 0)
+      {
+        entries.push_back(i); // first entry of every row is the diagonal, with value 2.5
+        values.push_back(2.5 * one);
+      }
+      else
+      {
+        entries.push_back(rand() % numRows); // random column; may repeat within a row
+        values.push_back((-0.3 + (0.6 * (rand() % randSteps) / randSteps)) * offDiagBase); // factor in [-0.3, 0.3) times offDiagBase
+      }
+    }
+    totalEntries += rowLengths[i];
+    rowmap.push_back(totalEntries);
+  }
+  scalar_view_t valuesView(Kokkos::ViewAllocateWithoutInitializing("Values"), totalEntries);
+  entries_view_t entriesView(Kokkos::ViewAllocateWithoutInitializing("Entries"), totalEntries);
+  rowmap_view_t rowmapView(Kokkos::ViewAllocateWithoutInitializing("Rowmap"), numRows + 1);
+  Kokkos::deep_copy(valuesView, Kokkos::View<scalar_t*, Kokkos::HostSpace>(values.data(), totalEntries)); // wrap the std::vector storage in a host View and copy it over
+  Kokkos::deep_copy(entriesView, Kokkos::View<lno_t*, Kokkos::HostSpace>(entries.data(), totalEntries));
+  Kokkos::deep_copy(rowmapView, Kokkos::View<size_type*, Kokkos::HostSpace>(rowmap.data(), numRows + 1));
+  crsMat_t input_mat("A", numRows, numRows, totalEntries, valuesView, rowmapView, entriesView);
+  input_mat = KokkosKernels::Impl::sort_and_merge_matrix(input_mat); // presumably sorts each row and combines the repeated columns generated above -- confirm
+  if(symmetric)
+  {
+    //Symmetrize on host, rather than relying on the parallel versions (those can be tested for symmetric=false)
+    input_mat = Test::symmetrize<scalar_t, lno_t, size_type, device, crsMat_t>(input_mat);
+  }
+  lno_t nv = input_mat.numRows();
+  scalar_view_t solution_x(Kokkos::ViewAllocateWithoutInitializing("X (correct)"), nv);
+  create_random_x_vector(solution_x);
+  mag_t initial_norm_res = KokkosBlas::nrm2(solution_x);
+  scalar_view_t y_vector = create_random_y_vector(input_mat, solution_x);
+  //GS_DEFAULT is GS_TEAM on CUDA and GS_PERMUTED on other spaces, and the behavior
+  //of each algorithm _should be_ the same on every execution space, which is why
+  //we just test GS_DEFAULT.
+  int apply_count = 1;  //only apply_type 0 (symmetric) runs here; raise to 3 to also cover forward (1) and backward (2)
+  scalar_view_t x_vector(Kokkos::ViewAllocateWithoutInitializing("x vector"), nv);
+  for (int apply_type = 0; apply_type < apply_count; ++apply_type)
+  {
+    typedef KokkosKernelsHandle
+      <size_type,lno_t, scalar_t,
+      typename device::execution_space, typename device::memory_space,typename device::memory_space > KernelHandle;
+
+    KernelHandle kh;
+    kh.create_gs_handle(GS_DEFAULT);
+    auto gsHandle = kh.get_point_gs_handle();
+    gsHandle->set_long_row_threshold(3 * nnzPerShortRow); // threshold is 3x the short-row length; presumably rows above it take the long-row path -- confirm
+    //Reset x vector to 0
+    Kokkos::deep_copy(x_vector, scalar_t());
+    run_gauss_seidel(kh, input_mat, x_vector, y_vector, symmetric, 0.9, apply_type);
+    KokkosBlas::axpby(one, solution_x, -one, x_vector); // presumably x_vector = solution_x - x_vector, i.e. the error -- confirm against KokkosBlas::axpby
+    mag_t result_norm_res = KokkosBlas::nrm2(x_vector);
+    EXPECT_LT(result_norm_res, 0.25 * initial_norm_res); // norm after the solve must be under a quarter of the norm of solution_x
+  }
+}
+
 #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \
 TEST_F( TestCategory, sparse ## _ ## gauss_seidel_asymmetric_rank1 ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
   test_gauss_seidel_rank1<SCALAR,ORDINAL,OFFSET,DEVICE>(2000, 2000 * 20, 200, 10, false); \
@@ -631,7 +635,10 @@ TEST_F( TestCategory, sparse ## _ ## balloon_clustering ## _ ## SCALAR ## _ ## O
 } \
 TEST_F( TestCategory, sparse ## _ ## sequential_sor ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
   test_sequential_sor<SCALAR,ORDINAL,OFFSET,DEVICE>(1000, 1000 * 15, 50, 10); \
-}
+} \
+TEST_F( TestCategory, sparse ## _ ## gauss_seidel_long_rows ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
+  test_gauss_seidel_long_rows<SCALAR,ORDINAL,OFFSET,DEVICE>(500, 10, 20, true); \
+} \
 
 #if (defined (KOKKOSKERNELS_INST_DOUBLE) \
  && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \
@@ -730,3 +737,4 @@ TEST_F( TestCategory, sparse ## _ ## sequential_sor ## _ ## SCALAR ## _ ## ORDIN
  EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace)
 #endif
 
+#undef EXECUTE_TEST
diff --git a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_spmv.hpp b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_spmv.hpp
index 5a033fdf344d..5d1b6305614a 100644
--- a/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_spmv.hpp
+++ b/packages/kokkos-kernels/unit_test/sparse/Test_Sparse_spmv.hpp
@@ -359,7 +359,7 @@ Kokkos::complex<float> randomUpperBound<Kokkos::complex<float>>(int mag)
 }
 
 template <typename scalar_t, typename lno_t, typename size_type, class Device>
-void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance){
+void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance, bool heavy){
 
   typedef typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device, void, size_type> crsMat_t;
   typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
@@ -390,24 +390,40 @@ void test_spmv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_vari
   Kokkos::fill_random(input_xt,rand_pool,randomUpperBound<ScalarX>(10));
   Kokkos::fill_random(output_yt,rand_pool,randomUpperBound<ScalarY>(10));
 
-  std::vector<char> nonTransModes = {'N', 'C'};
-  std::vector<char> transModes = {'T', 'H'};
+  std::vector<char> nonTransModes = {'N'};
+  std::vector<char> transModes = {'T'};
+  std::vector<double> testAlphaBeta = {0.0, 1.0};
+  if(heavy)
+  {
+    nonTransModes.push_back('C');
+    transModes.push_back('H');
+    testAlphaBeta.push_back(-1.0);
+    testAlphaBeta.push_back(2.5);
+  }
   for(auto mode : nonTransModes)
   {
-    Test::check_spmv(input_mat, input_x, output_y, 1.0, 0.0, mode);
-    Test::check_spmv(input_mat, input_x, output_y, 0.0, 1.0, mode);
-    Test::check_spmv(input_mat, input_x, output_y, 1.0, 1.0, mode);
+    for(double alpha : testAlphaBeta)
+    {
+      for(double beta : testAlphaBeta)
+      {
+        Test::check_spmv(input_mat, input_x, output_y, alpha, beta, mode);
+      }
+    }
   }
   for(auto mode : transModes)
   {
-    Test::check_spmv(input_mat, input_xt, output_yt, 1.0, 0.0, mode);
-    Test::check_spmv(input_mat, input_xt, output_yt, 0.0, 1.0, mode);
-    Test::check_spmv(input_mat, input_xt, output_yt, 1.0, 1.0, mode);
+    for(double alpha : testAlphaBeta)
+    {
+      for(double beta : testAlphaBeta)
+      {
+        Test::check_spmv(input_mat, input_xt, output_yt, alpha, beta, mode);
+      }
+    }
   }
 }
 
 template <typename scalar_t, typename lno_t, typename size_type, typename layout, class Device>
-void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance, int numMV){
+void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance, bool heavy, int numMV){
   lno_t numCols = numRows;
 
   typedef typename KokkosSparse::CrsMatrix<scalar_t, lno_t, Device, void, size_type> crsMat_t;
@@ -435,19 +451,35 @@ void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_v
   Kokkos::deep_copy(b_y_copy, b_y);
   Kokkos::deep_copy(b_yt_copy, b_yt);
 
-  std::vector<char> nonTransModes = {'N', 'C'};
-  std::vector<char> transModes = {'T', 'H'};
+  std::vector<char> nonTransModes = {'N'};
+  std::vector<char> transModes = {'T'};
+  std::vector<double> testAlphaBeta = {0.0, 1.0};
+  if(heavy)
+  {
+    nonTransModes.push_back('C');
+    transModes.push_back('H');
+    testAlphaBeta.push_back(-1.0);
+    testAlphaBeta.push_back(2.5);
+  }
   for(auto mode : nonTransModes)
   {
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, numMV, mode);
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, numMV, mode);
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, numMV, mode);
+    for(double alpha : testAlphaBeta)
+    {
+      for(double beta : testAlphaBeta)
+      {
+        Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, numMV, mode);
+      }
+    }
   }
   for(auto mode : transModes)
   {
-    Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 1.0, 0.0, numMV, mode);
-    Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 0.0, 1.0, numMV, mode);
-    Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, 1.0, 1.0, numMV, mode);
+    for(double alpha : testAlphaBeta)
+    {
+      for(double beta : testAlphaBeta)
+      {
+        Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, alpha, beta, numMV, mode);
+      }
+    }
   }
 }
 
@@ -477,7 +509,19 @@ void test_spmv_mv_heavy(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_
     Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N');
     Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T');
     Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T');
-    Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'T');
+    //Testing all modes together, since matrix is square
+    std::vector<char> modes = {'N', 'C', 'T', 'H'};
+    std::vector<double> testAlphaBeta = {0.0, 1.0, -1.0, 2.5};
+    for(auto mode : modes)
+    {
+      for(double alpha : testAlphaBeta)
+      {
+        for(double beta : testAlphaBeta)
+        {
+          Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, nv, mode);
+        }
+      }
+    }
   }
 }
 
@@ -836,17 +880,23 @@ TEST_F( TestCategory,sparse ## _ ## spmv_issue_101 ## _ ## OFFSET ## _ ## DEVICE
 
 #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \
 TEST_F( TestCategory,sparse ## _ ## spmv ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
-  test_spmv<SCALAR,ORDINAL,OFFSET,DEVICE> (50000, 50000 * 30, 200, 10); \
-  test_spmv<SCALAR,ORDINAL,OFFSET,DEVICE> (50000, 50000 * 30, 100, 10); \
-  test_spmv<SCALAR,ORDINAL,OFFSET,DEVICE> (10000, 10000 * 20, 100, 5); \
+  test_spmv<SCALAR,ORDINAL,OFFSET,DEVICE> (1000, 1000 * 30, 200, 10, true); \
+  test_spmv<SCALAR,ORDINAL,OFFSET,DEVICE> (1000, 1000 * 30, 100, 10, true); \
+  test_spmv<SCALAR,ORDINAL,OFFSET,DEVICE> (1000, 1000 * 20, 100, 5, true); \
+  test_spmv<SCALAR,ORDINAL,OFFSET,DEVICE> (50000, 50000 * 30, 200, 10, false); \
+  test_spmv<SCALAR,ORDINAL,OFFSET,DEVICE> (50000, 50000 * 30, 100, 10, false); \
+  test_spmv<SCALAR,ORDINAL,OFFSET,DEVICE> (10000, 10000 * 20, 100, 5, false); \
   test_spmv_controls<SCALAR,ORDINAL,OFFSET,DEVICE> (10000, 10000 * 20, 100, 5); \
 }
 
 #define EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \
 TEST_F( TestCategory,sparse ## _ ## spmv_mv ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## LAYOUT ## _ ## DEVICE ) { \
-  test_spmv_mv<SCALAR,ORDINAL,OFFSET,Kokkos::LAYOUT,DEVICE> (50000, 50000 * 30, 100, 10, 5); \
-  test_spmv_mv<SCALAR,ORDINAL,OFFSET,Kokkos::LAYOUT,DEVICE> (50000, 50000 * 30, 200, 10, 1); \
-  test_spmv_mv<SCALAR,ORDINAL,OFFSET,Kokkos::LAYOUT,DEVICE> (10000, 10000 * 20, 100, 5, 10); \
+  test_spmv_mv<SCALAR,ORDINAL,OFFSET,Kokkos::LAYOUT,DEVICE> (1000, 1000 * 30, 200, 10, true, 1); \
+  test_spmv_mv<SCALAR,ORDINAL,OFFSET,Kokkos::LAYOUT,DEVICE> (1000, 1000 * 30, 100, 10, true, 5); \
+  test_spmv_mv<SCALAR,ORDINAL,OFFSET,Kokkos::LAYOUT,DEVICE> (1000, 1000 * 20, 100, 5, true, 10); \
+  test_spmv_mv<SCALAR,ORDINAL,OFFSET,Kokkos::LAYOUT,DEVICE> (50000, 50000 * 30, 200, 10, false, 1); \
+  test_spmv_mv<SCALAR,ORDINAL,OFFSET,Kokkos::LAYOUT,DEVICE> (50000, 50000 * 30, 100, 10, false, 5); \
+  test_spmv_mv<SCALAR,ORDINAL,OFFSET,Kokkos::LAYOUT,DEVICE> (10000, 10000 * 20, 100, 5, false, 10); \
   test_spmv_mv_heavy<SCALAR,ORDINAL,OFFSET,Kokkos::LAYOUT,DEVICE> (200, 200 * 10, 60, 4, 30); \
 }
 
diff --git a/packages/kokkos/BUILD.md b/packages/kokkos/BUILD.md
index e1f0e3e47276..bb1a31f266ec 100644
--- a/packages/kokkos/BUILD.md
+++ b/packages/kokkos/BUILD.md
@@ -262,6 +262,9 @@ Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_
 * Kokkos_ARCH_ZEN2
     * Whether to optimize for the Zen2 architecture
     * BOOL Default: OFF
+* Kokkos_ARCH_ZEN3
+    * Whether to optimize for the Zen3 architecture
+    * BOOL Default: OFF
 * Kokkos_ARCH_HSW
     * Whether to optimize for the HSW architecture
     * BOOL Default: OFF
diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md
index 5859fe32c432..7bb6de4cd924 100644
--- a/packages/kokkos/CHANGELOG.md
+++ b/packages/kokkos/CHANGELOG.md
@@ -1,8 +1,31 @@
 # Change Log
 
+## [3.4.01](https://github.com/kokkos/kokkos/tree/3.4.01) (2021-05-19)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.00...3.4.01)
+
+**Bug Fixes:**
+- Windows: Remove atomic_compare_exchange_strong overload conflicts with Windows [\#4024](https://github.com/kokkos/kokkos/pull/4024)
+- OpenMPTarget: Fixup allocation headers with OpenMPTarget backend [\#4020](https://github.com/kokkos/kokkos/pull/4020)
+- OpenMPTarget: Add missing specialization for OMPT to Kokkos Random [\#4022](https://github.com/kokkos/kokkos/pull/4022)
+- AMD: Add support for AMD Zen3 CPU architecture [\#4021](https://github.com/kokkos/kokkos/pull/4021)
+- SYCL: Implement SYCL::print_configuration [\#4012](https://github.com/kokkos/kokkos/pull/4012)
+- Containers: staticcsrgraph: use device type instead of execution space to construct views [\#3998](https://github.com/kokkos/kokkos/pull/3998)
+- nvcc_wrapper: fix errors in argument handling, suppress duplicates of GPU architecture and RDC flags [\#4006](https://github.com/kokkos/kokkos/pull/4006)
+- CI: Add icpx testing to intel container [\#4004](https://github.com/kokkos/kokkos/pull/4004)
+- CMake/TRIBITS: Keep quoted compiler flags when passing to Trilinos [\#4007](https://github.com/kokkos/kokkos/pull/4007)
+- CMake: Rename IntelClang to IntelLLVM [\#3945](https://github.com/kokkos/kokkos/pull/3945)
+
 ## [3.4.00](https://github.com/kokkos/kokkos/tree/3.4.00) (2021-04-25)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.01...3.4.00)
 
+**Highlights:**
+- SYCL Backend Almost Feature Complete
+- OpenMPTarget Backend Almost Feature Complete
+- Performance Improvements for HIP backend
+- Require CMake 3.16 or newer
+- Tool Callback Interface Enhancements
+- cmath wrapper functions available now in Kokkos::Experimental
+
 **Features:**
 - Implement parallel_scan with ThreadVectorRange and Reducer [\#3861](https://github.com/kokkos/kokkos/pull/3861)
 - Implement SYCL Random [\#3849](https://github.com/kokkos/kokkos/pull/3849)
@@ -48,7 +71,6 @@
 - Change SYCLInternal::m_queue std::unique_ptr -> std::optional [\#3677](https://github.com/kokkos/kokkos/pull/3677)
 - Use alternative SYCL parallel_reduce implementation [\#3671](https://github.com/kokkos/kokkos/pull/3671)
 - Use runtime values in KokkosExp_MDRangePolicy.hpp [\#3626](https://github.com/kokkos/kokkos/pull/3626)
-- Introduce KOKKOS_PRINTF [\#3615](https://github.com/kokkos/kokkos/pull/3615)
 - Clean up AnalyzePolicy [\#3564](https://github.com/kokkos/kokkos/pull/3564)
 - Changes for indirect launch of SYCL parallel reduce [\#3511](https://github.com/kokkos/kokkos/pull/3511)
 
diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt
index 6fc1bf7d2f7f..d154aebc289f 100644
--- a/packages/kokkos/CMakeLists.txt
+++ b/packages/kokkos/CMakeLists.txt
@@ -112,7 +112,7 @@ ENDIF()
 
 set(Kokkos_VERSION_MAJOR 3)
 set(Kokkos_VERSION_MINOR 4)
-set(Kokkos_VERSION_PATCH 00)
+set(Kokkos_VERSION_PATCH 01)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
 
@@ -206,8 +206,17 @@ ENDIF()
 IF (KOKKOS_HAS_TRILINOS)
   # Overwrite the old flags at the top-level
   # Because Tribits doesn't use lists, it uses spaces for the list of CXX flags
-  # we have to match the annoying behavior
-  STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS}")
+  # we have to match the annoying behavior, also we have to preserve quotes
+  # which needs another workaround.
+  IF (KOKKOS_ENABLE_SYCL)
+    SET(KOKKOS_COMPILE_OPTIONS_TMP)
+    FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
+      LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP \"${OPTION}\")
+    ENDFOREACH()
+    STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}")
+  ELSE()
+    STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS}")
+  ENDIF()
   LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS})
   IF (KOKKOS_ENABLE_CUDA)
     LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_CUDA_OPTIONS})
diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos
index 2599121d70ad..bda8572073a3 100644
--- a/packages/kokkos/Makefile.kokkos
+++ b/packages/kokkos/Makefile.kokkos
@@ -2,7 +2,7 @@
 
 KOKKOS_VERSION_MAJOR = 3
 KOKKOS_VERSION_MINOR = 4
-KOKKOS_VERSION_PATCH = 00
+KOKKOS_VERSION_PATCH = 01
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
 
 # Options: Cuda,HIP,OpenMP,Pthread,Serial
@@ -14,7 +14,7 @@ KOKKOS_DEVICES ?= "Pthread"
 # ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
 # IBM:      BGQ,Power7,Power8,Power9
 # AMD-GPUS: Vega900,Vega906,Vega908
-# AMD-CPUS: AMDAVX,Zen,Zen2
+# AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
 KOKKOS_ARCH ?= ""
 # Options: yes,no
 KOKKOS_DEBUG ?= "no"
@@ -372,6 +372,7 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
 
 # AMD based.
 KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
+KOKKOS_INTERNAL_USE_ARCH_ZEN3 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen3)
 KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2)
 KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
 KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900)
@@ -381,12 +382,12 @@ KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega
 # Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_SSE42      := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
 KOKKOS_INTERNAL_USE_ARCH_AVX        := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
-KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2))
+KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC  := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
 KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX))
 
 # Decide what ISA level we are able to support.
-KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2))
+KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ISA_KNC       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC))
 KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9))
 KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7))
@@ -780,6 +781,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 1)
   endif
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN3")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -mavx2
+    KOKKOS_LDFLAGS += -mavx2
+  else
+    KOKKOS_CXXFLAGS += -march=znver3 -mtune=znver3
+    KOKKOS_LDFLAGS += -march=znver3 -mtune=znver3
+  endif
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80")
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX")
diff --git a/packages/kokkos/algorithms/CMakeLists.txt b/packages/kokkos/algorithms/CMakeLists.txt
index fd099054ba45..4df76a1dbbd1 100644
--- a/packages/kokkos/algorithms/CMakeLists.txt
+++ b/packages/kokkos/algorithms/CMakeLists.txt
@@ -5,10 +5,12 @@ KOKKOS_SUBPACKAGE(Algorithms)
 IF (NOT Kokkos_INSTALL_TESTING)
   ADD_SUBDIRECTORY(src)
 ENDIF()
-
-KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
+IF(NOT (KOKKOS_ENABLE_OPENMPTARGET
+        AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR
+             KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)))
+  KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
+ENDIF()
 
 KOKKOS_SUBPACKAGE_POSTPROCESS()
 
 
-
diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
index 904cf5ccb967..55ce19971faf 100644
--- a/packages/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -687,6 +687,24 @@ struct Random_UniqueIndex<Kokkos::Experimental::SYCL> {
 };
 #endif
 
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+template <>
+struct Random_UniqueIndex<Kokkos::Experimental::OpenMPTarget> {
+  using locks_view_type = View<int*, Kokkos::Experimental::OpenMPTarget>;
+  KOKKOS_FUNCTION
+  static int get_state_idx(const locks_view_type& locks) {
+    const int team_size = omp_get_num_threads();
+    int i               = omp_get_team_num() * team_size + omp_get_thread_num();
+    const int lock_size = locks.extent_int(0);
+
+    while (Kokkos::atomic_compare_exchange(&locks(i), 0, 1)) {
+      i = (i + 1) % lock_size;
+    }
+    return i;
+  }
+};
+#endif
+
 }  // namespace Impl
 
 template <class DeviceType>
diff --git a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
index 9109837985a9..50f8f0a332a6 100644
--- a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
+++ b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
@@ -44,7 +44,7 @@ IF(Kokkos_ENABLE_OPENMP)
   )
 ENDIF()
 
-foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL)
+foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
   # Because there is always an exception to the rule
   if(Tag STREQUAL "Threads")
     set(DEVICE "PTHREAD")
diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
index 1f14875096dd..c37e779c9927 100644
--- a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
@@ -109,6 +109,16 @@ struct RandomProperties {
   }
 };
 
+// FIXME_OPENMPTARGET: Needed for OpenMPTarget because, contrary to the
+// standard, LLVM requires binary operator+ to be defined, not just operator+=.
+KOKKOS_INLINE_FUNCTION
+RandomProperties operator+(const RandomProperties& org,
+                           const RandomProperties& add) {
+  RandomProperties val = org;
+  val += add;
+  return val;
+}
+
 template <class GeneratorPool, class Scalar>
 struct test_random_functor {
   using rnd_type = typename GeneratorPool::generator_type;
diff --git a/packages/kokkos/algorithms/unit_tests/TestSort.hpp b/packages/kokkos/algorithms/unit_tests/TestSort.hpp
index a3c362ec201b..9c6308c84347 100644
--- a/packages/kokkos/algorithms/unit_tests/TestSort.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestSort.hpp
@@ -370,7 +370,10 @@ template <class ExecutionSpace, typename KeyType>
 void test_sort(unsigned int N) {
   test_1D_sort<ExecutionSpace, KeyType>(N);
   test_3D_sort<ExecutionSpace, KeyType>(N);
+// FIXME_OPENMPTARGET: OpenMPTarget doesn't support DynamicView yet.
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
   test_dynamic_view_sort<ExecutionSpace, KeyType>(N);
+#endif
   test_issue_1160_sort<ExecutionSpace>();
 }
 }  // namespace Impl
diff --git a/packages/kokkos/bin/nvcc_wrapper b/packages/kokkos/bin/nvcc_wrapper
index 5556e888e34b..4e52e4d09f4f 100755
--- a/packages/kokkos/bin/nvcc_wrapper
+++ b/packages/kokkos/bin/nvcc_wrapper
@@ -67,6 +67,11 @@ shared_versioned_libraries=""
 
 # Does the User set the architecture
 arch_set=0
+arch_flag=""
+
+# Does the user set RDC?
+rdc_set=0
+rdc_flag=""
 
 # Does the user overwrite the host compiler
 ccbin_set=0
@@ -190,8 +195,34 @@ do
     host_only_args="$host_only_args $1 $2"
     shift
     ;;
+  # Handle nvcc args controlling whether to generate relocatable device code
+  --relocatable-device-code=*|-rdc=*)
+    if [ "$rdc_set" -eq 0 ]; then
+        rdc_set=1
+        rdc_flag="$1"
+        cuda_args="$cuda_args $rdc_flag"
+    elif [  "$rdc_flag" != "$1" ]; then
+        echo "RDC is being set twice with different flags, which is not handled"
+        echo "$rdc_flag"
+        echo "$1"
+        exit 1
+    fi
+    ;;
+  -rdc)
+    if [ "$rdc_set" -eq 0 ]; then
+        rdc_set=1
+        rdc_flag="$1 $2"
+        cuda_args="$cuda_args $rdc_flag"
+    elif [ "$rdc_flag" != "$1 $2" ]; then
+        echo "RDC is being set twice with different flags, which is not handled"
+        echo "$rdc_flag"
+        echo "$1 $2"
+        exit 1
+    fi
+    shift  # always consume the value, even when an identical duplicate is dropped
+    ;;
   #Handle known nvcc args
-  --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this)
+  --dryrun|--verbose|--keep|--keep-dir*|-G|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad=*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this)
     cuda_args="$cuda_args $1"
     ;;
   #Handle more known nvcc args
@@ -199,13 +230,13 @@ do
     cuda_args="$cuda_args $1"
     ;;
   #Handle known nvcc args that have an argument
-  -rdc|-maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include)
+  -maxrregcount=*|--maxrregcount=*)
+    cuda_args="$cuda_args $1"
+    ;;
+  -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include)
     cuda_args="$cuda_args $1 $2"
     shift
     ;;
-  -rdc=*|-maxrregcount*|--maxrregcount*)
-    cuda_args="$cuda_args $1"
-    ;;
   #Handle unsupported standard flags
   --std=c++1y|-std=c++1y|--std=gnu++1y|-std=gnu++1y|--std=c++1z|-std=c++1z|--std=gnu++1z|-std=gnu++1z|--std=c++2a|-std=c++2a)
     fallback_std_flag="-std=c++14"
@@ -323,20 +354,36 @@ do
     ;;
 
   #Handle -arch argument (if its not set use a default) this is the version with = sign
-  -arch*|-gencode*)
-    cuda_args="$cuda_args $1"
-    arch_set=1
+  -arch=*|-gencode=*)
+    if [ "$arch_set" -eq 0 ]; then
+        arch_set=1
+        arch_flag="$1"
+        cuda_args="$cuda_args $arch_flag"
+    elif [  "$arch_flag" != "$1" ]; then
+        echo "ARCH is being set twice with different flags, which is not handled"
+        echo "$arch_flag"
+        echo "$1"
+        exit 1
+    fi
+    ;;
+  #Handle -arch argument (if its not set use a default) this is the version without = sign
+  -arch|-gencode)
+    if [ "$arch_set" -eq 0 ]; then
+        arch_set=1
+        arch_flag="$1 $2"
+        cuda_args="$cuda_args $arch_flag"
+    elif [ "$arch_flag" != "$1 $2" ]; then
+        echo "ARCH is being set twice with different flags, which is not handled"
+        echo "$arch_flag"
+        echo "$1 $2"
+        exit 1
+    fi
+    shift  # always consume the value, even when an identical duplicate is dropped
     ;;
   #Handle -code argument (if its not set use a default) this is the version with = sign
   -code*)
     cuda_args="$cuda_args $1"
     ;;
-  #Handle -arch argument (if its not set use a default) this is the version without = sign
-  -arch|-gencode)
-    cuda_args="$cuda_args $1 $2"
-    arch_set=1
-    shift
-    ;;
   #Handle -code argument (if its not set use a default) this is the version without = sign
   -code)
     cuda_args="$cuda_args $1 $2"
diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in
index fbfae3711ec1..3455b0cb42e7 100644
--- a/packages/kokkos/cmake/KokkosCore_config.h.in
+++ b/packages/kokkos/cmake/KokkosCore_config.h.in
@@ -99,5 +99,6 @@
 #cmakedefine KOKKOS_ARCH_AMPERE86
 #cmakedefine KOKKOS_ARCH_AMD_ZEN
 #cmakedefine KOKKOS_ARCH_AMD_ZEN2
+#cmakedefine KOKKOS_ARCH_AMD_ZEN3
 
 #cmakedefine KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF
diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake
index ec18e70a36a3..e8b85542c633 100644
--- a/packages/kokkos/cmake/kokkos_arch.cmake
+++ b/packages/kokkos/cmake/kokkos_arch.cmake
@@ -63,6 +63,7 @@ KOKKOS_ARCH_OPTION(AMPERE80        GPU  "NVIDIA Ampere generation CC 8.0")
 KOKKOS_ARCH_OPTION(AMPERE86        GPU  "NVIDIA Ampere generation CC 8.6")
 KOKKOS_ARCH_OPTION(ZEN             HOST "AMD Zen architecture")
 KOKKOS_ARCH_OPTION(ZEN2            HOST "AMD Zen2 architecture")
+KOKKOS_ARCH_OPTION(ZEN3            HOST "AMD Zen3 architecture")
 KOKKOS_ARCH_OPTION(VEGA900         GPU  "AMD GPU MI25 GFX900")
 KOKKOS_ARCH_OPTION(VEGA906         GPU  "AMD GPU MI50/MI60 GFX906")
 KOKKOS_ARCH_OPTION(VEGA908         GPU  "AMD GPU MI100 GFX908")
@@ -215,6 +216,15 @@ IF (KOKKOS_ARCH_ZEN2)
   SET(KOKKOS_ARCH_AMD_AVX2 ON)
 ENDIF()
 
+IF (KOKKOS_ARCH_ZEN3)
+  COMPILER_SPECIFIC_FLAGS(
+    Intel   -mavx2
+    DEFAULT -march=znver3 -mtune=znver3
+  )
+  SET(KOKKOS_ARCH_AMD_ZEN3 ON)
+  SET(KOKKOS_ARCH_AMD_AVX2 ON)
+ENDIF()
+
 IF (KOKKOS_ARCH_WSM)
   COMPILER_SPECIFIC_FLAGS(
     Intel   -xSSE4.2
@@ -284,7 +294,7 @@ IF (KOKKOS_ARCH_SKX)
   )
 ENDIF()
 
-IF (KOKKOS_ARCH_WSM OR KOKKOS_ARCH_SNB OR KOKKOS_ARCH_HSW OR KOKKOS_ARCH_BDW OR KOKKOS_ARCH_KNL OR KOKKOS_ARCH_SKX OR KOKKOS_ARCH_ZEN OR KOKKOS_ARCH_ZEN2)
+IF (KOKKOS_ARCH_WSM OR KOKKOS_ARCH_SNB OR KOKKOS_ARCH_HSW OR KOKKOS_ARCH_BDW OR KOKKOS_ARCH_KNL OR KOKKOS_ARCH_SKX OR KOKKOS_ARCH_ZEN OR KOKKOS_ARCH_ZEN2 OR KOKKOS_ARCH_ZEN3)
   SET(KOKKOS_USE_ISA_X86_64 ON)
 ENDIF()
 
@@ -457,7 +467,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
   ENDIF()
   IF (KOKKOS_ARCH_INTEL_GEN)
     COMPILER_SPECIFIC_FLAGS(
-      IntelClang -fopenmp-targets=spir64 -D__STRICT_ANSI__
+      IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__
     )
   ENDIF()
 ENDIF()
diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake
index 4434d6928f46..23847263a952 100644
--- a/packages/kokkos/cmake/kokkos_compiler_id.cmake
+++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake
@@ -101,7 +101,7 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
                   OUTPUT_STRIP_TRAILING_WHITESPACE)
   IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang
     SET(KOKKOS_CLANG_IS_INTEL TRUE)
-    SET(KOKKOS_CXX_COMPILER_ID IntelClang CACHE STRING INTERNAL FORCE)
+    SET(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE)
   ENDIF()
 ENDIF()
 
diff --git a/packages/kokkos/cmake/kokkos_enable_devices.cmake b/packages/kokkos/cmake/kokkos_enable_devices.cmake
index 445dad47ce56..d7f83ddbdf87 100644
--- a/packages/kokkos/cmake/kokkos_enable_devices.cmake
+++ b/packages/kokkos/cmake/kokkos_enable_devices.cmake
@@ -61,7 +61,7 @@ IF(KOKKOS_ENABLE_OPENMP)
     COMPILER_SPECIFIC_FLAGS(
       COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
       Clang      -Xcompiler ${ClangOpenMPFlag}
-      IntelClang -Xcompiler -fiopenmp
+      IntelLLVM  -Xcompiler -fiopenmp
       PGI        -Xcompiler -mp
       Cray       NO-VALUE-SPECIFIED
       XL         -Xcompiler -qsmp=omp
@@ -70,7 +70,7 @@ IF(KOKKOS_ENABLE_OPENMP)
   ELSE()
     COMPILER_SPECIFIC_FLAGS(
       Clang      ${ClangOpenMPFlag}
-      IntelClang -fiopenmp
+      IntelLLVM  -fiopenmp
       AppleClang -Xpreprocessor -fopenmp
       PGI        -mp
       Cray       NO-VALUE-SPECIFIED
@@ -92,7 +92,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
 
   COMPILER_SPECIFIC_FLAGS(
     Clang      ${ClangOpenMPFlag} -Wno-openmp-mapping
-    IntelClang -fiopenmp -Wno-openmp-mapping
+    IntelLLVM  -fiopenmp -Wno-openmp-mapping
     XL         -qsmp=omp -qoffload -qnoeh
     PGI        -mp=gpu
     DEFAULT    -fopenmp
diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake
index 858322394d7a..e1a3e5f8bd00 100644
--- a/packages/kokkos/cmake/kokkos_functions.cmake
+++ b/packages/kokkos/cmake/kokkos_functions.cmake
@@ -773,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET)
 ENDFUNCTION()
 
 FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER)
-  SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIPCC Fujitsu)
+  SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu)
   CMAKE_PARSE_ARGUMENTS(
     PARSE
     "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES"
diff --git a/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp b/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
index f22e5d1eca92..00d3eafd231e 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
@@ -114,15 +114,11 @@ namespace Kokkos {
 template <class StaticCrsGraphType, class InputSizeType>
 inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph(
     const std::string& label, const std::vector<InputSizeType>& input) {
-  using output_type = StaticCrsGraphType;
-  // using input_type = std::vector<InputSizeType>; // unused
-
+  using output_type  = StaticCrsGraphType;
   using entries_type = typename output_type::entries_type;
-
-  using work_type = View<typename output_type::size_type[],
-                         typename output_type::array_layout,
-                         typename output_type::execution_space,
-                         typename output_type::memory_traits>;
+  using work_type    = View<
+      typename output_type::size_type[], typename output_type::array_layout,
+      typename output_type::device_type, typename output_type::memory_traits>;
 
   output_type output;
 
@@ -161,10 +157,9 @@ inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph(
 
   static_assert(entries_type::rank == 1, "Graph entries view must be rank one");
 
-  using work_type = View<typename output_type::size_type[],
-                         typename output_type::array_layout,
-                         typename output_type::execution_space,
-                         typename output_type::memory_traits>;
+  using work_type = View<
+      typename output_type::size_type[], typename output_type::array_layout,
+      typename output_type::device_type, typename output_type::memory_traits>;
 
   output_type output;
 
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
index dc5e0194ab0a..58d723ac110a 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
@@ -179,8 +179,6 @@ class SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>
       const RecordBase::function_type arg_dealloc = &deallocate);
 
  public:
-  std::string get_label() const;
-
   KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate(
       const Kokkos::Experimental::OpenMPTargetSpace& arg_space,
       const std::string& arg_label, const size_t arg_alloc_size) {
@@ -190,10 +188,6 @@ class SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>
     return nullptr;
 #endif
   }
-
-  /**\brief  Reallocate tracked memory in the space */
-  static void* reallocate_tracked(void* const arg_alloc_ptr,
-                                  const size_t arg_alloc_size);
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/Kokkos_SYCL.hpp b/packages/kokkos/core/src/Kokkos_SYCL.hpp
index aa720371df73..8ee76b43862f 100644
--- a/packages/kokkos/core/src/Kokkos_SYCL.hpp
+++ b/packages/kokkos/core/src/Kokkos_SYCL.hpp
@@ -113,7 +113,7 @@ class SYCL {
   void fence() const;
 
   /// \brief Print configuration information to the given output stream.
-  static void print_configuration(std::ostream&, const bool detail = false);
+  void print_configuration(std::ostream&, const bool detail = false);
 
   /// \brief Free any resources being consumed by the device.
   static void impl_finalize();
@@ -131,12 +131,10 @@ class SYCL {
     sycl::device get_device() const;
 
     friend std::ostream& operator<<(std::ostream& os, const SYCLDevice& that) {
-      return that.info(os);
+      return SYCL::impl_sycl_info(os, that.m_device);
     }
 
    private:
-    std::ostream& info(std::ostream& os) const;
-
     sycl::device m_device;
   };
 
@@ -154,6 +152,9 @@ class SYCL {
   }
 
  private:
+  static std::ostream& impl_sycl_info(std::ostream& os,
+                                      const sycl::device& device);
+
   Kokkos::Impl::HostSharedPtr<Impl::SYCLInternal> m_space_instance;
 };
 
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
index 6fbb4245b8fb..b99b0017ca17 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
@@ -107,12 +107,6 @@ SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace,
                      SharedAllocationRecord<void, void>::m_alloc_size);
 }
 
-// TODO: Implement deep copy back see CudaSpace
-std::string SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace,
-                                   void>::get_label() const {
-  return std::string("OpenMPTargetAllocation");
-}
-
 SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
     SharedAllocationRecord(
         const Kokkos::Experimental::OpenMPTargetSpace &arg_space,
@@ -141,23 +135,6 @@ SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
 
 //----------------------------------------------------------------------------
 
-void *SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
-    reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) {
-  SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr);
-  SharedAllocationRecord *const r_new =
-      allocate(r_old->m_space, r_old->get_label(), arg_alloc_size);
-
-  // Kokkos::Impl::DeepCopy<OpenMPTargetSpace,OpenMPTargetSpace>( r_new->data()
-  // , r_old->data()
-  //                                           , std::min( r_old->size() ,
-  //                                           r_new->size() ) );
-
-  RecordBase::increment(r_new);
-  RecordBase::decrement(r_old);
-
-  return r_new->data();
-}
-
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 9c29eb190d17..3a09ee919540 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -105,6 +105,12 @@ bool SYCL::impl_is_initialized() {
 
 void SYCL::impl_finalize() { Impl::SYCLInternal::singleton().finalize(); }
 
+void SYCL::print_configuration(std::ostream& s, const bool detailed) {
+  s << "macro  KOKKOS_ENABLE_SYCL : defined" << '\n';
+  if (detailed)
+    SYCL::impl_sycl_info(s, m_space_instance->m_queue->get_device());
+}
+
 void SYCL::fence() const {
   Impl::SYCLInternal::fence(*m_space_instance->m_queue);
 }
@@ -143,119 +149,118 @@ void SYCL::impl_initialize(SYCL::SYCLDevice d) {
   Impl::SYCLInternal::singleton().initialize(d.get_device());
 }
 
-std::ostream& SYCL::SYCLDevice::info(std::ostream& os) const {
+std::ostream& SYCL::impl_sycl_info(std::ostream& os,
+                                   const sycl::device& device) {
   using namespace sycl::info;
-  return os << "Name: " << m_device.get_info<device::name>()
-            << "\nDriver Version: "
-            << m_device.get_info<device::driver_version>()
-            << "\nIs Host: " << m_device.is_host()
-            << "\nIs CPU: " << m_device.is_cpu()
-            << "\nIs GPU: " << m_device.is_gpu()
-            << "\nIs Accelerator: " << m_device.is_accelerator()
-            << "\nVendor Id: " << m_device.get_info<device::vendor_id>()
+  return os << "Name: " << device.get_info<device::name>()
+            << "\nDriver Version: " << device.get_info<device::driver_version>()
+            << "\nIs Host: " << device.is_host()
+            << "\nIs CPU: " << device.is_cpu()
+            << "\nIs GPU: " << device.is_gpu()
+            << "\nIs Accelerator: " << device.is_accelerator()
+            << "\nVendor Id: " << device.get_info<device::vendor_id>()
             << "\nMax Compute Units: "
-            << m_device.get_info<device::max_compute_units>()
+            << device.get_info<device::max_compute_units>()
             << "\nMax Work Item Dimensions: "
-            << m_device.get_info<device::max_work_item_dimensions>()
+            << device.get_info<device::max_work_item_dimensions>()
             << "\nMax Work Group Size: "
-            << m_device.get_info<device::max_work_group_size>()
+            << device.get_info<device::max_work_group_size>()
             << "\nPreferred Vector Width Char: "
-            << m_device.get_info<device::preferred_vector_width_char>()
+            << device.get_info<device::preferred_vector_width_char>()
             << "\nPreferred Vector Width Short: "
-            << m_device.get_info<device::preferred_vector_width_short>()
+            << device.get_info<device::preferred_vector_width_short>()
             << "\nPreferred Vector Width Int: "
-            << m_device.get_info<device::preferred_vector_width_int>()
+            << device.get_info<device::preferred_vector_width_int>()
             << "\nPreferred Vector Width Long: "
-            << m_device.get_info<device::preferred_vector_width_long>()
+            << device.get_info<device::preferred_vector_width_long>()
             << "\nPreferred Vector Width Float: "
-            << m_device.get_info<device::preferred_vector_width_float>()
+            << device.get_info<device::preferred_vector_width_float>()
             << "\nPreferred Vector Width Double: "
-            << m_device.get_info<device::preferred_vector_width_double>()
+            << device.get_info<device::preferred_vector_width_double>()
             << "\nPreferred Vector Width Half: "
-            << m_device.get_info<device::preferred_vector_width_half>()
+            << device.get_info<device::preferred_vector_width_half>()
             << "\nNative Vector Width Char: "
-            << m_device.get_info<device::native_vector_width_char>()
+            << device.get_info<device::native_vector_width_char>()
             << "\nNative Vector Width Short: "
-            << m_device.get_info<device::native_vector_width_short>()
+            << device.get_info<device::native_vector_width_short>()
             << "\nNative Vector Width Int: "
-            << m_device.get_info<device::native_vector_width_int>()
+            << device.get_info<device::native_vector_width_int>()
             << "\nNative Vector Width Long: "
-            << m_device.get_info<device::native_vector_width_long>()
+            << device.get_info<device::native_vector_width_long>()
             << "\nNative Vector Width Float: "
-            << m_device.get_info<device::native_vector_width_float>()
+            << device.get_info<device::native_vector_width_float>()
             << "\nNative Vector Width Double: "
-            << m_device.get_info<device::native_vector_width_double>()
+            << device.get_info<device::native_vector_width_double>()
             << "\nNative Vector Width Half: "
-            << m_device.get_info<device::native_vector_width_half>()
-            << "\nAddress Bits: " << m_device.get_info<device::address_bits>()
-            << "\nImage Support: " << m_device.get_info<device::image_support>()
+            << device.get_info<device::native_vector_width_half>()
+            << "\nAddress Bits: " << device.get_info<device::address_bits>()
+            << "\nImage Support: " << device.get_info<device::image_support>()
             << "\nMax Mem Alloc Size: "
-            << m_device.get_info<device::max_mem_alloc_size>()
+            << device.get_info<device::max_mem_alloc_size>()
             << "\nMax Read Image Args: "
-            << m_device.get_info<device::max_read_image_args>()
+            << device.get_info<device::max_read_image_args>()
             << "\nImage2d Max Width: "
-            << m_device.get_info<device::image2d_max_width>()
+            << device.get_info<device::image2d_max_width>()
             << "\nImage2d Max Height: "
-            << m_device.get_info<device::image2d_max_height>()
+            << device.get_info<device::image2d_max_height>()
             << "\nImage3d Max Width: "
-            << m_device.get_info<device::image3d_max_width>()
+            << device.get_info<device::image3d_max_width>()
             << "\nImage3d Max Height: "
-            << m_device.get_info<device::image3d_max_height>()
+            << device.get_info<device::image3d_max_height>()
             << "\nImage3d Max Depth: "
-            << m_device.get_info<device::image3d_max_depth>()
+            << device.get_info<device::image3d_max_depth>()
             << "\nImage Max Buffer Size: "
-            << m_device.get_info<device::image_max_buffer_size>()
+            << device.get_info<device::image_max_buffer_size>()
             << "\nImage Max Array Size: "
-            << m_device.get_info<device::image_max_array_size>()
-            << "\nMax Samplers: " << m_device.get_info<device::max_samplers>()
+            << device.get_info<device::image_max_array_size>()
+            << "\nMax Samplers: " << device.get_info<device::max_samplers>()
             << "\nMax Parameter Size: "
-            << m_device.get_info<device::max_parameter_size>()
+            << device.get_info<device::max_parameter_size>()
             << "\nMem Base Addr Align: "
-            << m_device.get_info<device::mem_base_addr_align>()
+            << device.get_info<device::mem_base_addr_align>()
             << "\nGlobal Cache Mem Line Size: "
-            << m_device.get_info<device::global_mem_cache_line_size>()
+            << device.get_info<device::global_mem_cache_line_size>()
             << "\nGlobal Mem Cache Size: "
-            << m_device.get_info<device::global_mem_cache_size>()
+            << device.get_info<device::global_mem_cache_size>()
             << "\nGlobal Mem Size: "
-            << m_device.get_info<device::global_mem_size>()
+            << device.get_info<device::global_mem_size>()
             << "\nMax Constant Buffer Size: "
-            << m_device.get_info<device::max_constant_buffer_size>()
+            << device.get_info<device::max_constant_buffer_size>()
             << "\nMax Constant Args: "
-            << m_device.get_info<device::max_constant_args>()
-            << "\nLocal Mem Size: "
-            << m_device.get_info<device::local_mem_size>()
+            << device.get_info<device::max_constant_args>()
+            << "\nLocal Mem Size: " << device.get_info<device::local_mem_size>()
             << "\nError Correction Support: "
-            << m_device.get_info<device::error_correction_support>()
+            << device.get_info<device::error_correction_support>()
             << "\nHost Unified Memory: "
-            << m_device.get_info<device::host_unified_memory>()
+            << device.get_info<device::host_unified_memory>()
             << "\nProfiling Timer Resolution: "
-            << m_device.get_info<device::profiling_timer_resolution>()
+            << device.get_info<device::profiling_timer_resolution>()
             << "\nIs Endian Little: "
-            << m_device.get_info<device::is_endian_little>()
-            << "\nIs Available: " << m_device.get_info<device::is_available>()
+            << device.get_info<device::is_endian_little>()
+            << "\nIs Available: " << device.get_info<device::is_available>()
             << "\nIs Compiler Available: "
-            << m_device.get_info<device::is_compiler_available>()
+            << device.get_info<device::is_compiler_available>()
             << "\nIs Linker Available: "
-            << m_device.get_info<device::is_linker_available>()
+            << device.get_info<device::is_linker_available>()
             << "\nQueue Profiling: "
-            << m_device.get_info<device::queue_profiling>()
+            << device.get_info<device::queue_profiling>()
             << "\nBuilt In Kernels: "
             << Container<std::vector<std::string>>(
-                   m_device.get_info<device::built_in_kernels>())
-            << "\nVendor: " << m_device.get_info<device::vendor>()
-            << "\nProfile: " << m_device.get_info<device::profile>()
-            << "\nVersion: " << m_device.get_info<device::version>()
+                   device.get_info<device::built_in_kernels>())
+            << "\nVendor: " << device.get_info<device::vendor>()
+            << "\nProfile: " << device.get_info<device::profile>()
+            << "\nVersion: " << device.get_info<device::version>()
             << "\nExtensions: "
             << Container<std::vector<std::string>>(
-                   m_device.get_info<device::extensions>())
+                   device.get_info<device::extensions>())
             << "\nPrintf Buffer Size: "
-            << m_device.get_info<device::printf_buffer_size>()
+            << device.get_info<device::printf_buffer_size>()
             << "\nPreferred Interop User Sync: "
-            << m_device.get_info<device::preferred_interop_user_sync>()
+            << device.get_info<device::preferred_interop_user_sync>()
             << "\nPartition Max Sub Devices: "
-            << m_device.get_info<device::partition_max_sub_devices>()
+            << device.get_info<device::partition_max_sub_devices>()
             << "\nReference Count: "
-            << m_device.get_info<device::reference_count>() << '\n';
+            << device.get_info<device::reference_count>() << '\n';
 }
 
 namespace Impl {
@@ -293,15 +298,13 @@ void SYCLSpaceInitializer::fence() {
 }
 
 void SYCLSpaceInitializer::print_configuration(std::ostream& msg,
-                                               const bool /*detail*/) {
+                                               const bool detail) {
   msg << "Devices:" << std::endl;
   msg << "  KOKKOS_ENABLE_SYCL: ";
   msg << "yes" << std::endl;
 
   msg << "\nRuntime Configuration:" << std::endl;
-  // FIXME_SYCL not implemented
-  std::abort();
-  // Experimental::SYCL::print_configuration(msg, detail);
+  Experimental::SYCL{}.print_configuration(msg, detail);
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
index 3f2e8914ea93..2f824566b804 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
@@ -152,12 +152,6 @@ inline T atomic_compare_exchange(
                                  ((LONGLONG*)&compare_and_result));
   return compare_and_result;
 }
-
-template <typename T>
-inline T atomic_compare_exchange_strong(volatile T* const dest,
-                                        const T& compare, const T& val) {
-  return atomic_compare_exchange(dest, compare, val);
-}
 #endif
 
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash b/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash
index 5ff781b96fc0..8fe8e2b5ecea 100755
--- a/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash
+++ b/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash
@@ -4,7 +4,7 @@ HostArch=(SNB HSW SKX KNL)
 DeviceArch=(Kepler35 Kepler37 Pascal60 Pascal61 Volta70)
 if [ ! -z "$KOKKOS_HOST_ARCH_TEST" ]; then
   export KOKKOS_ARCH_TEST=1
-  HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 ARMv8_ThunderX ARMv8_ThunderX2)
+  HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2)
   DeviceArch=()
 fi
 
diff --git a/packages/kokkos/generate_makefile.bash b/packages/kokkos/generate_makefile.bash
index e9871b436971..c601e0ee161f 100755
--- a/packages/kokkos/generate_makefile.bash
+++ b/packages/kokkos/generate_makefile.bash
@@ -157,6 +157,7 @@ display_help_text() {
       echo "                 AMDAVX          = AMD CPU"
       echo "                 ZEN             = AMD Zen-Core CPU"
       echo "                 ZEN2            = AMD Zen2-Core CPU"
+      echo "                 ZEN3            = AMD Zen3-Core CPU"
       echo "               [AMD: GPU]"
       echo "                 VEGA900         = AMD GPU MI25 GFX900"
       echo "                 VEGA906         = AMD GPU MI50/MI60 GFX906"
diff --git a/packages/kokkos/gnu_generate_makefile.bash b/packages/kokkos/gnu_generate_makefile.bash
index ea509669f068..8a463270c855 100755
--- a/packages/kokkos/gnu_generate_makefile.bash
+++ b/packages/kokkos/gnu_generate_makefile.bash
@@ -137,6 +137,7 @@ do
       echo "                 AMDAVX          = AMD CPU"
       echo "                 ZEN             = AMD Zen-Core CPU"
       echo "                 ZEN2            = AMD Zen2-Core CPU"
+      echo "                 ZEN3            = AMD Zen3-Core CPU"
       echo "               [ARM]"
       echo "                 ARMv80          = ARMv8.0 Compatible CPU"
       echo "                 ARMv81          = ARMv8.1 Compatible CPU"
diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt
index e746bd7d0103..be8a5e7da5f4 100644
--- a/packages/kokkos/master_history.txt
+++ b/packages/kokkos/master_history.txt
@@ -23,3 +23,5 @@ tag:  3.1.01     date: 05:04:2020    master: 785d19f2    release: 2be028bc
 tag:  3.2.00     date: 08:19:2020    master: 3b2fdc7e    release: 5dc6d303
 tag:  3.3.00     date: 12:16:2020    master: 734f577a    release: 1535ba5c
 tag:  3.3.01     date: 01:06:2021    master: 6d65b5a3    release: 4d23839c
+tag:  3.4.00     date: 04:26:2021    master: 1fb0c284    release: 5d7738d6
+tag:  3.4.01     date: 05:20:2021    master: 4b97a22f    release: 410b15c8
diff --git a/packages/kokkos/scripts/testing_scripts/generate_makefile.bash b/packages/kokkos/scripts/testing_scripts/generate_makefile.bash
index f21124ed6e71..ff9620efa689 100755
--- a/packages/kokkos/scripts/testing_scripts/generate_makefile.bash
+++ b/packages/kokkos/scripts/testing_scripts/generate_makefile.bash
@@ -129,6 +129,7 @@ do
       echo "                 AMDAVX          = AMD CPU"
       echo "                 ZEN             = AMD Zen-Core CPU"
       echo "                 ZEN2            = AMD Zen2-Core CPU"
+      echo "                 ZEN3            = AMD Zen3-Core CPU"
       echo "               [ARM]"
       echo "                 ARMv80          = ARMv8.0 Compatible CPU"
       echo "                 ARMv81          = ARMv8.1 Compatible CPU"
diff --git a/packages/muelu/adapters/xpetra/MueLu_RefMaxwell_def.hpp b/packages/muelu/adapters/xpetra/MueLu_RefMaxwell_def.hpp
index 2c26f75623cd..dbf655c5e2ac 100644
--- a/packages/muelu/adapters/xpetra/MueLu_RefMaxwell_def.hpp
+++ b/packages/muelu/adapters/xpetra/MueLu_RefMaxwell_def.hpp
@@ -109,7 +109,8 @@
 #include <Thyra_SolveSupportTypes.hpp>
 // Stratimikos includes
 #include <Stratimikos_DefaultLinearSolverBuilder.hpp>
-#include <Stratimikos_MueLuHelpers.hpp>
+#include <Thyra_MueLuPreconditionerFactory.hpp>
+#include "Teuchos_AbstractFactoryStd.hpp"
 // Ifpack2 includes
 #ifdef HAVE_MUELU_IFPACK2
 #include <Thyra_Ifpack2PreconditionerFactory.hpp>
@@ -171,34 +172,6 @@ namespace MueLu {
     FindNonZeros<Scalar,LocalOrdinal,GlobalOrdinal,Node>(myColsToZero->getData(0),dirichletCols);
   }
 
-
-  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
-  void ApplyRowSumCriterion(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A,
-                            const typename Teuchos::ScalarTraits<Scalar>::magnitudeType rowSumTol,
-                            Teuchos::ArrayRCP<bool>& dirichletRows)
-  {
-    typedef Teuchos::ScalarTraits<Scalar> STS;
-    RCP<const Xpetra::Map<LocalOrdinal,GlobalOrdinal,Node>> rowmap = A.getRowMap();
-    for (LocalOrdinal row = 0; row < Teuchos::as<LocalOrdinal>(rowmap->getNodeNumElements()); ++row) {
-      size_t nnz = A.getNumEntriesInLocalRow(row);
-      ArrayView<const LocalOrdinal> indices;
-      ArrayView<const Scalar> vals;
-      A.getLocalRowView(row, indices, vals);
-
-      Scalar rowsum = STS::zero();
-      Scalar diagval = STS::zero();
-      for (LocalOrdinal colID = 0; colID < Teuchos::as<LocalOrdinal>(nnz); colID++) {
-        LocalOrdinal col = indices[colID];
-        if (row == col)
-          diagval = vals[colID];
-        rowsum += vals[colID];
-      }
-      if (STS::real(rowsum) > STS::magnitude(diagval) * rowSumTol)
-        dirichletRows[row] = true;
-    }
-  }
-
-
 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
 
   template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -259,33 +232,6 @@ namespace MueLu {
     FindNonZeros<Scalar,LocalOrdinal,GlobalOrdinal,Node>(myColsToZero->getDeviceLocalView(),dirichletCols);
   }
 
-
-  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
-  void ApplyRowSumCriterion(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A,
-                            const typename Teuchos::ScalarTraits<Scalar>::magnitudeType rowSumTol,
-                            Kokkos::View<bool*, typename Node::device_type> & dirichletRows)
-  {
-    typedef Teuchos::ScalarTraits<Scalar> STS;
-    RCP<const Xpetra::Map<LocalOrdinal,GlobalOrdinal,Node>> rowmap = A.getRowMap();
-    for (LocalOrdinal row = 0; row < Teuchos::as<LocalOrdinal>(rowmap->getNodeNumElements()); ++row) {
-      size_t nnz = A.getNumEntriesInLocalRow(row);
-      ArrayView<const LocalOrdinal> indices;
-      ArrayView<const Scalar> vals;
-      A.getLocalRowView(row, indices, vals);
-
-      Scalar rowsum = STS::zero();
-      Scalar diagval = STS::zero();
-      for (LocalOrdinal colID = 0; colID < Teuchos::as<LocalOrdinal>(nnz); colID++) {
-        LocalOrdinal col = indices[colID];
-        if (row == col)
-          diagval = vals[colID];
-        rowsum += vals[colID];
-      }
-      if (STS::real(rowsum) > STS::magnitude(diagval) * rowSumTol)
-        dirichletRows(row) = true;
-    }
-  }
-
 #endif
 
   template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -416,7 +362,7 @@ namespace MueLu {
       BCrowsKokkos_ = Utilities_kokkos::DetectDirichletRows(*SM_Matrix_,Teuchos::ScalarTraits<magnitudeType>::eps(),/*count_twos_as_dirichlet=*/true);
 
       if (rowSumTol > 0.)
-        ApplyRowSumCriterion(*SM_Matrix_, rowSumTol, BCrowsKokkos_);
+        Utilities_kokkos::ApplyRowSumCriterion(*SM_Matrix_, rowSumTol, BCrowsKokkos_);
 
       BCcolsKokkos_ = Kokkos::View<bool*,typename Node::device_type>(Kokkos::ViewAllocateWithoutInitializing("dirichletCols"), D0_Matrix_->getColMap()->getNodeNumElements());
       BCdomainKokkos_ = Kokkos::View<bool*,typename Node::device_type>(Kokkos::ViewAllocateWithoutInitializing("dirichletCols"), D0_Matrix_->getDomainMap()->getNodeNumElements());
@@ -438,7 +384,7 @@ namespace MueLu {
       BCrows_ = Teuchos::arcp_const_cast<bool>(Utilities::DetectDirichletRows(*SM_Matrix_,Teuchos::ScalarTraits<magnitudeType>::eps(),/*count_twos_as_dirichlet=*/true));
 
       if (rowSumTol > 0.)
-        ApplyRowSumCriterion(*SM_Matrix_, rowSumTol, BCrows_);
+        Utilities::ApplyRowSumCriterion(*SM_Matrix_, rowSumTol, BCrows_);
 
       BCcols_.resize(D0_Matrix_->getColMap()->getNodeNumElements());
       BCdomain_.resize(D0_Matrix_->getDomainMap()->getNodeNumElements());
@@ -2408,7 +2354,7 @@ namespace MueLu {
             AHBCrows[i*dim+k] = BCdomain_[i];
       magnitudeType rowSumTol = parameterList_.get("refmaxwell: row sum drop tol (1,1)",-1.0);
       if (rowSumTol > 0.)
-        ApplyRowSumCriterion(*AH_, rowSumTol, AHBCrows);
+        Utilities::ApplyRowSumCriterion(*AH_, rowSumTol, AHBCrows);
       if (applyBCsToH_)
         Utilities::ApplyOAZToMatrixRows(AH_, AHBCrows);
     }
@@ -3010,10 +2956,11 @@ namespace MueLu {
     RCP<const Thyra::LinearOpBase<Scalar> > thyraA = Xpetra::ThyraUtils<Scalar,LocalOrdinal,GlobalOrdinal,Node>::toThyra(Teuchos::rcp_dynamic_cast<Xpetra::CrsMatrixWrap<Scalar,LocalOrdinal,GlobalOrdinal,Node>>(A)->getCrsMatrix());
 
     Stratimikos::DefaultLinearSolverBuilder linearSolverBuilder;
-    Stratimikos::enableMueLu<LocalOrdinal,GlobalOrdinal,Node>(linearSolverBuilder);
+    typedef Thyra::PreconditionerFactoryBase<Scalar>                                     Base;
+    typedef Thyra::MueLuPreconditionerFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node> ImplMueLu;
+    linearSolverBuilder.setPreconditioningStrategyFactory(Teuchos::abstractFactoryStd<Base, ImplMueLu>(), "MueLu");
 #ifdef HAVE_MUELU_IFPACK2
     // Register Ifpack2 as a Stratimikos preconditioner strategy.
-    typedef Thyra::PreconditionerFactoryBase<Scalar> Base;
     typedef Thyra::Ifpack2PreconditionerFactory<Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > Impl;
     linearSolverBuilder.setPreconditioningStrategyFactory(Teuchos::abstractFactoryStd<Base, Impl>(), "Ifpack2");
 #endif
diff --git a/packages/muelu/doc/UsersGuide/masterList.xml b/packages/muelu/doc/UsersGuide/masterList.xml
index a36a7e2ba1f7..efdcf0a2371e 100644
--- a/packages/muelu/doc/UsersGuide/masterList.xml
+++ b/packages/muelu/doc/UsersGuide/masterList.xml
@@ -411,6 +411,24 @@
       <comment-ML>parameter not existing in ML</comment-ML>
     </parameter>
 
+    <parameter>
+      <name>aggregation: classical scheme</name>
+      <type>string</type>
+      <default>"direct"</default>
+      <description>Prolongator formation option for classical coarsening.</description>
+      <visible>false</visible>
+      <comment-ML>parameter not existing in ML</comment-ML>
+    </parameter>
+
+    <parameter>
+      <name>aggregation: row sum drop tol</name>
+      <type>double</type>
+      <default>-1.0</default>
+      <description>Detection threshold for mass-dominated rows. Defaults to -1 (meaning disabled).</description>
+      <visible>false</visible>
+      <comment-ML>parameter not existing in ML</comment-ML>
+    </parameter>
+
     <parameter>
       <name>aggregation: block diagonal: interleaved blocksize</name>
       <type>int</type>
@@ -826,11 +844,10 @@
       <name>aggregation: coarsening order</name>
       <type>int</type>
       <default>0</default>
-      <description>The interpolation order used while constructing these aggregates, this value will be passed to the prolongator factory.</description>
+      <description>The interpolation order used while constructing these aggregates. This value is passed to the prolongator factory, where 0 selects piece-wise constant and 1 selects piece-wise linear interpolation for transferring values from coarse points to fine points.</description>
       <comment-ML>parameter not existing in ML</comment-ML>
     </parameter>
 
-
     <parameter>
       <name>aggregation: pairwise: size</name>
       <type>int</type>
@@ -1177,15 +1194,6 @@ Only used when tentative: calculate qr is set to false.</description>
       <name-ML>not supported by ML</name-ML>
     </parameter>
 
-    <parameter>
-      <name>interp: interpolation order</name>
-      <type>int</type>
-      <default>1</default>
-      <description>Interpolation order used to interpolate values from coarse points to fine points. Possible values are 0 for piece-wise constant interpolation and 1 for piece-wise linear interpolation. This parameter is set to 1 by default.</description>
-      <visible>true</visible>
-      <comment-ML>parameter not existing in ML</comment-ML>
-    </parameter>
-
     <parameter>
       <name>interp: build coarse coordinates</name>
       <type>bool</type>
diff --git a/packages/muelu/doc/UsersGuide/options_aggregation.tex b/packages/muelu/doc/UsersGuide/options_aggregation.tex
index aac7f46e4474..0f2c5c2a6e34 100644
--- a/packages/muelu/doc/UsersGuide/options_aggregation.tex
+++ b/packages/muelu/doc/UsersGuide/options_aggregation.tex
@@ -62,5 +62,5 @@
           
 \cbb{aggregation: number of spatial dimensions}{int}{3}{The number of spatial dimensions in the problem.}
           
-\cbb{aggregation: coarsening order}{int}{0}{The interpolation order used while constructing these aggregates, this value will be passed to the prolongator factory.}
+\cbb{aggregation: coarsening order}{int}{0}{The interpolation order used while constructing these aggregates. This value is passed to the prolongator factory, where 0 selects piece-wise constant and 1 selects piece-wise linear interpolation for transferring values from coarse points to fine points.}
           
\ No newline at end of file
diff --git a/packages/muelu/doc/UsersGuide/options_multigrid.tex b/packages/muelu/doc/UsersGuide/options_multigrid.tex
index 9b6382f689bf..fdec2ab25d34 100644
--- a/packages/muelu/doc/UsersGuide/options_multigrid.tex
+++ b/packages/muelu/doc/UsersGuide/options_multigrid.tex
@@ -9,8 +9,6 @@
           
 \cbb{sa: use filtered matrix}{bool}{true}{Matrix to use for smoothing the tentative prolongator. The two options are: to use the original matrix, and to use the filtered matrix with filtering based on filtered graph used for aggregation.}
           
-\cbb{interp: interpolation order}{int}{1}{Interpolation order used to interpolate values from coarse points to fine points. Possible values are 0 for piece-wise constant interpolation and 1 for piece-wise linear interpolation. This parameter is set to 1 by default.}
-          
 \cbb{interp: build coarse coordinates}{bool}{true}{If false, skip the calculation of coarse coordinates.}
           
 \cbb{filtered matrix: use lumping}{bool}{true}{Lump (add to diagonal) dropped entries during the construction of a filtered matrix. This allows user to preserve constant nullspace.}
diff --git a/packages/muelu/doc/UsersGuide/paramlist.tex b/packages/muelu/doc/UsersGuide/paramlist.tex
index 31c1a9baaed3..782a7615a83a 100644
--- a/packages/muelu/doc/UsersGuide/paramlist.tex
+++ b/packages/muelu/doc/UsersGuide/paramlist.tex
@@ -115,7 +115,7 @@
           
 \cbb{aggregation: number of spatial dimensions}{int}{3}{The number of spatial dimensions in the problem.}
           
-\cbb{aggregation: coarsening order}{int}{0}{The interpolation order used while constructing these aggregates, this value will be passed to the prolongator factory.}
+\cbb{aggregation: coarsening order}{int}{0}{The interpolation order used while constructing these aggregates. This value is passed to the prolongator factory, where 0 selects piece-wise constant and 1 selects piece-wise linear interpolation for transferring values from coarse points to fine points.}
           
 \cbb{aggregate qualities: check symmetry}{bool}{false}{Whether to check symmetry and use nonsymmetric aggregate quality estimate if necessary.}
           
@@ -168,8 +168,6 @@
           
 \cbb{sa: use filtered matrix}{bool}{true}{Matrix to use for smoothing the tentative prolongator. The two options are: to use the original matrix, and to use the filtered matrix with filtering based on filtered graph used for aggregation.}
           
-\cbb{interp: interpolation order}{int}{1}{Interpolation order used to interpolate values from coarse points to fine points. Possible values are 0 for piece-wise constant interpolation and 1 for piece-wise linear interpolation. This parameter is set to 1 by default.}
-          
 \cbb{interp: build coarse coordinates}{bool}{true}{If false, skip the calculation of coarse coordinates.}
           
 \cbb{filtered matrix: use lumping}{bool}{true}{Lump (add to diagonal) dropped entries during the construction of a filtered matrix. This allows user to preserve constant nullspace.}
diff --git a/packages/muelu/doc/UsersGuide/paramlist_hidden.tex b/packages/muelu/doc/UsersGuide/paramlist_hidden.tex
index 4a85d8e27376..5b5c5c8be0bb 100644
--- a/packages/muelu/doc/UsersGuide/paramlist_hidden.tex
+++ b/packages/muelu/doc/UsersGuide/paramlist_hidden.tex
@@ -70,6 +70,10 @@
       aggregation. Possible values: "classical", "distance laplacian",
       "unsupported vector smoothing"}
         
+\cbb{aggregation: classical scheme}{string}{"direct"}{Prolongator formation option for classical coarsening.}
+        
+\cbb{aggregation: row sum drop tol}{double}{-1.0}{Detection threshold for mass-dominated rows. Defaults to -1 (meaning disabled).}
+        
 \cbb{aggregation: block diagonal: interleaved blocksize}{int}{3}{Effective block size to use for
       block-diagonalization.  This assumes the PDE is interleaved}
         
@@ -173,7 +177,7 @@
         
 \cbb{aggregation: number of spatial dimensions}{int}{3}{The number of spatial dimensions in the problem.}
         
-\cbb{aggregation: coarsening order}{int}{0}{The interpolation order used while constructing these aggregates, this value will be passed to the prolongator factory.}
+\cbb{aggregation: coarsening order}{int}{0}{The interpolation order used while constructing these aggregates. This value is passed to the prolongator factory, where 0 selects piece-wise constant and 1 selects piece-wise linear interpolation for transferring values from coarse points to fine points.}
         
 \cbb{aggregation: pairwise: size}{int}{8}{Target size for pairwise aggregation.  The number
       of pairwise steps used will be log base-2 of this number.}
@@ -260,8 +264,6 @@
         
 \cbb{sa: rowsumabs diagonal replacement value}{double}{0.0}{If it's determined that a diagonal entry in prolongator smoothing is too small, replace that entry with this value.}
         
-\cbb{interp: interpolation order}{int}{1}{Interpolation order used to interpolate values from coarse points to fine points. Possible values are 0 for piece-wise constant interpolation and 1 for piece-wise linear interpolation. This parameter is set to 1 by default.}
-        
 \cbb{interp: build coarse coordinates}{bool}{true}{If false, skip the calculation of coarse coordinates.}
         
 \cba{transfer: params}{\parameterlist}{Sublist of options for use by transfer.}
diff --git a/packages/muelu/research/regionMG/example/elasticity_3d.xml b/packages/muelu/research/regionMG/example/elasticity_3d.xml
index 8f4323366bef..2cea9bc61167 100644
--- a/packages/muelu/research/regionMG/example/elasticity_3d.xml
+++ b/packages/muelu/research/regionMG/example/elasticity_3d.xml
@@ -39,7 +39,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="1"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/research/regionMG/example/poisson_3d.xml b/packages/muelu/research/regionMG/example/poisson_3d.xml
index 2d552bd84b62..8bf2429c8350 100644
--- a/packages/muelu/research/regionMG/example/poisson_3d.xml
+++ b/packages/muelu/research/regionMG/example/poisson_3d.xml
@@ -37,7 +37,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="0"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/research/regionMG/test/structured/structured_1dof-complex.xml b/packages/muelu/research/regionMG/test/structured/structured_1dof-complex.xml
index 02a6e9ef291d..dbc1d0b713ea 100644
--- a/packages/muelu/research/regionMG/test/structured/structured_1dof-complex.xml
+++ b/packages/muelu/research/regionMG/test/structured/structured_1dof-complex.xml
@@ -37,7 +37,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="0"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/research/regionMG/test/structured/structured_1dof.xml b/packages/muelu/research/regionMG/test/structured/structured_1dof.xml
index 36d028fd48a4..8d36dec5bbfd 100644
--- a/packages/muelu/research/regionMG/test/structured/structured_1dof.xml
+++ b/packages/muelu/research/regionMG/test/structured/structured_1dof.xml
@@ -37,7 +37,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="0"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/research/regionMG/test/structured/structured_1dof_3level.xml b/packages/muelu/research/regionMG/test/structured/structured_1dof_3level.xml
index d8e25a82f5cb..ce720e370686 100644
--- a/packages/muelu/research/regionMG/test/structured/structured_1dof_3level.xml
+++ b/packages/muelu/research/regionMG/test/structured/structured_1dof_3level.xml
@@ -37,7 +37,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="0"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/research/regionMG/test/structured/structured_linear_1dof.xml b/packages/muelu/research/regionMG/test/structured/structured_linear_1dof.xml
index 1dacffa64316..ff6e61c0f0ee 100644
--- a/packages/muelu/research/regionMG/test/structured/structured_linear_1dof.xml
+++ b/packages/muelu/research/regionMG/test/structured/structured_linear_1dof.xml
@@ -37,7 +37,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="1"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/research/regionMG/test/structured/structured_linear_1dof_comp.xml b/packages/muelu/research/regionMG/test/structured/structured_linear_1dof_comp.xml
index 57a77564b2e7..b83bcabee52f 100644
--- a/packages/muelu/research/regionMG/test/structured/structured_linear_1dof_comp.xml
+++ b/packages/muelu/research/regionMG/test/structured/structured_linear_1dof_comp.xml
@@ -37,7 +37,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="1"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/research/regionMG/test/structured/structured_linear_3dof.xml b/packages/muelu/research/regionMG/test/structured/structured_linear_3dof.xml
index 48f6d321d601..dc58deed4634 100644
--- a/packages/muelu/research/regionMG/test/structured/structured_linear_3dof.xml
+++ b/packages/muelu/research/regionMG/test/structured/structured_linear_3dof.xml
@@ -37,7 +37,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="1"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/src/CMakeLists.txt b/packages/muelu/src/CMakeLists.txt
index 359b0ad5b9eb..21cd0c294c10 100644
--- a/packages/muelu/src/CMakeLists.txt
+++ b/packages/muelu/src/CMakeLists.txt
@@ -44,6 +44,7 @@ INCLUDE_DIRECTORIES(${DIR}/Transfers/BaseClass)
 INCLUDE_DIRECTORIES(${DIR}/Transfers/BlackBox)
 INCLUDE_DIRECTORIES(${DIR}/Smoothers/BlockedSmoothers)
 INCLUDE_DIRECTORIES(${DIR}/Transfers/BlockedTransfers)
+INCLUDE_DIRECTORIES(${DIR}/Transfers/Classical)
 INCLUDE_DIRECTORIES(${DIR}/Transfers/Energy-Minimization)
 INCLUDE_DIRECTORIES(${DIR}/Transfers/Energy-Minimization/Solvers)
 INCLUDE_DIRECTORIES(${DIR}/Transfers/GeneralGeometric)
@@ -387,6 +388,7 @@ TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Smoothers/BlockedSmoothers NOSIER
 TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/BaseClass NOSIERRABJAM)
 TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/BlackBox NOSIERRABJAM)
 TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/BlockedTransfers NOSIERRABJAM)
+TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/Classical NOSIERRABJAM)
 TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/Generic NOSIERRABJAM)
 TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/Energy-Minimization NOSIERRABJAM)
 TRILINOS_CREATE_CLIENT_TEMPLATE_HEADERS(${DIR}/Transfers/Energy-Minimization/Solvers NOSIERRABJAM)
@@ -458,3 +460,6 @@ TRIBITS_ADD_LIBRARY(
 # touch CMakeLists.txt because a new file was created in Utils/ExplicitInstantiation of Utils/ForwardDeclaration
 # touch CMakeLists.txt because a new file was created in Utils/ExplicitInstantiation of Utils/ForwardDeclaration
 # touch CMakeLists.txt because a new file was created in Utils/ExplicitInstantiation of Utils/ForwardDeclaration
+# touch CMakeLists.txt because a new file was created in Utils/ExplicitInstantiation of Utils/ForwardDeclaration
+# touch CMakeLists.txt because a new file was created in Utils/ExplicitInstantiation of Utils/ForwardDeclaration
+# touch CMakeLists.txt because a new file was created in Utils/ExplicitInstantiation of Utils/ForwardDeclaration
diff --git a/packages/muelu/src/Graph/Containers/MueLu_GraphBase.hpp b/packages/muelu/src/Graph/Containers/MueLu_GraphBase.hpp
index 1193faa8e4c8..425cda7da535 100644
--- a/packages/muelu/src/Graph/Containers/MueLu_GraphBase.hpp
+++ b/packages/muelu/src/Graph/Containers/MueLu_GraphBase.hpp
@@ -70,6 +70,10 @@ namespace MueLu {
 #include "MueLu_UseShortNamesOrdinal.hpp"
 
   public:
+    // For Zoltan2 compatibility
+    using lno_t  = LocalOrdinal;
+    using gno_t  = GlobalOrdinal;
+    using node_t = Node;
 
     //! @name Constructors/Destructors.
     //@{
diff --git a/packages/muelu/src/Graph/Containers/MueLu_Graph_decl.hpp b/packages/muelu/src/Graph/Containers/MueLu_Graph_decl.hpp
index ab9999fefaef..09609adcf60a 100644
--- a/packages/muelu/src/Graph/Containers/MueLu_Graph_decl.hpp
+++ b/packages/muelu/src/Graph/Containers/MueLu_Graph_decl.hpp
@@ -94,6 +94,8 @@ namespace MueLu {
     //! Returns overlapping import map (nodes).
     const RCP<const Map> GetImportMap() const                                { return graph_->getColMap();    }
 
+    const RCP<const CrsGraph> GetGraph() const {return graph_;}
+
     //! Set map with local ids of boundary nodes.
     void SetBoundaryNodeMap(const ArrayRCP<const bool>& localDirichletNodes) { localDirichletNodes_ = localDirichletNodes; }
 
diff --git a/packages/muelu/src/Graph/Containers/MueLu_LWGraph_decl.hpp b/packages/muelu/src/Graph/Containers/MueLu_LWGraph_decl.hpp
index 64706667ba53..09f55a70193e 100644
--- a/packages/muelu/src/Graph/Containers/MueLu_LWGraph_decl.hpp
+++ b/packages/muelu/src/Graph/Containers/MueLu_LWGraph_decl.hpp
@@ -139,7 +139,16 @@ namespace MueLu {
     /// Return a simple one-line description of the Graph.
     std::string description() const                              { return "MueLu.description()"; } //FIXME use object's label
 
+    //! Return the row pointers of the local graph
+    const ArrayRCP<const LO> getRowPtrs() const {
+      return rows_;
+    }
 
+    //! Return the list entries in the local graph
+    const ArrayRCP<const LO> getEntries() const {
+      return columns_;
+    }
+    
     //! Print the Graph with some verbosity level to an FancyOStream object.
     //using MueLu::Describable::describe; // overloading, not hiding
     //void describe(Teuchos::FancyOStream &out, const VerbLevel verbLevel = Default) const;;
diff --git a/packages/muelu/src/Graph/Containers/MueLu_Zoltan2GraphAdapter.hpp b/packages/muelu/src/Graph/Containers/MueLu_Zoltan2GraphAdapter.hpp
new file mode 100644
index 000000000000..7014efcc3281
--- /dev/null
+++ b/packages/muelu/src/Graph/Containers/MueLu_Zoltan2GraphAdapter.hpp
@@ -0,0 +1,459 @@
+// @HEADER
+//
+// ***********************************************************************
+//
+//        MueLu: A package for multigrid based preconditioning
+//                  Copyright 2012 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact
+//                    Jonathan Hu       (jhu@sandia.gov)
+//                    Andrey Prokopenko (aprokop@sandia.gov)
+//                    Ray Tuminaro      (rstumin@sandia.gov)
+//
+// ***********************************************************************
+//
+// @HEADER
+
+#ifndef MUELU_ZOLTAN2GRAPHADAPTER_HPP_
+#define MUELU_ZOLTAN2GRAPHADAPTER_HPP_
+
+#include "MueLu_ConfigDefs.hpp"
+
+#if defined(HAVE_MUELU_ZOLTAN2)
+
+#include <Teuchos_RCP.hpp>
+#include <Teuchos_Comm.hpp>
+#include <Teuchos_ArrayView.hpp>
+#include <Xpetra_Map.hpp>
+#include <Zoltan2_InputTraits.hpp>
+#include <Zoltan2_GraphAdapter.hpp>
+#include <Zoltan2_StridedData.hpp>
+#include <Zoltan2_PartitioningSolution.hpp>
+#include "MueLu_GraphBase.hpp"
+
+
+
+// Zoltan2 InputTraits for MueLu Graph objects
+namespace Zoltan2 {
+
+template <typename LocalOrdinal,
+          typename GlobalOrdinal,
+          typename Node>
+struct InputTraits<MueLu::GraphBase<LocalOrdinal,GlobalOrdinal,Node> >
+{
+  typedef Zoltan2::default_scalar_t scalar_t;
+  typedef LocalOrdinal  lno_t;
+  typedef GlobalOrdinal gno_t;
+  typedef size_t offset_t;
+  typedef Zoltan2::default_part_t  part_t;
+  typedef Node          node_t;
+  static inline std::string name() {return "MueLu::Graph";}
+
+  Z2_STATIC_ASSERT_TYPES // validate the types
+};
+}//end namespace Zoltan2
+
+
+namespace MueLu {
+
+template <typename User, typename UserCoord=User>
+class MueLuGraphBaseAdapter : public Zoltan2::GraphAdapter<User,UserCoord> {
+public:
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+  typedef typename Zoltan2::InputTraits<User>::scalar_t    scalar_t;
+  typedef typename Zoltan2::InputTraits<User>::offset_t    offset_t;
+  typedef typename Zoltan2::InputTraits<User>::lno_t    lno_t;
+  typedef typename Zoltan2::InputTraits<User>::gno_t    gno_t;
+  typedef typename Zoltan2::InputTraits<User>::part_t   part_t;
+  typedef typename Zoltan2::InputTraits<User>::node_t   node_t;
+  typedef User xgraph_t;
+  typedef User user_t;
+  typedef UserCoord userCoord_t;
+#endif
+
+  //! MueLu::GraphBase Compatibility Layer
+  const Teuchos::RCP< const Teuchos::Comm< int > >  getComm() const { return graph_->GetComm();}
+  const Teuchos::RCP< const Xpetra::Map<lno_t, gno_t, node_t> > getRowMap() const { return graph_->GetDomainMap();}
+  const RCP< const Xpetra::Map<lno_t, gno_t, node_t> > getColMap() const { 
+    // For some GraphBases this is a ColMap; in others it is a separate map that is
+    // only non-null in parallel.
+    Teuchos::RCP<const Xpetra::Map<lno_t,gno_t,node_t> > map =  graph_->GetImportMap();
+    if(map.is_null()) map = graph_->GetDomainMap();
+    return map;
+  }
+  size_t getNodeNumEntries() const { return graph_->GetNodeNumEdges();}
+  size_t getNodeNumRows() const { return getRowMap()->getNodeNumElements();}
+  size_t getNodeNumCols() const { return getColMap()->getNodeNumElements();}
+
+  void getLocalRowView(lno_t LocalRow, Teuchos::ArrayView< const lno_t > &indices) const {
+   indices = graph_->getNeighborVertices(LocalRow);
+  }
+
+
+
+  /*! \brief Destructor
+   */
+  ~MueLuGraphBaseAdapter() { }
+
+  /*! \brief Constructor for graph with no weights or coordinates.
+   *  \param ingraph the MueLu graph (User, e.g. a MueLu::GraphBase) to adapt
+   *  \param numVtxWeights  the number of weights per vertex (default = 0)
+   *  \param numEdgeWeights the number of weights per edge  (default = 0)
+   *
+   * Most adapters do not have RCPs in their interface.  This
+   * one does because the user is obviously a Trilinos user.
+   */
+
+   MueLuGraphBaseAdapter(const RCP<const User> &ingraph, 
+                      int nVtxWeights=0, int nEdgeWeights=0);
+
+  /*! \brief Provide a pointer to weights for the primary entity type.
+   *    \param val A pointer to the weights for index \c idx.
+   *    \param stride    A stride for the \c val array.  If \c stride is
+   *             \c k, then val[n * k] is the weight for the
+   *             \c n th entity for index \c idx.
+   *    \param idx A number from 0 to one less than the
+   *          number of weights specified in the constructor.
+   *
+   *  The order of the weights should match the order that
+   *  entities appear in the input data structure.
+   */
+
+  void setWeights(const scalar_t *val, int stride, int idx);
+
+  /*! \brief Provide a pointer to vertex weights.
+   *    \param val A pointer to the weights for index \c idx.
+   *    \param stride    A stride for the \c val array.  If \c stride is
+   *             \c k, then val[n * k] is the weight for the
+   *             \c n th vertex for index \c idx.
+   *    \param idx A number from 0 to one less than 
+   *          number of vertex weights specified in the constructor.
+   *
+   *  The order of the vertex weights should match the order that
+   *  vertices appear in the input data structure.
+   *     \code
+   *       TheGraph->getRowMap()->getNodeElementList()
+   *     \endcode
+   */
+
+  void setVertexWeights(const scalar_t *val, int stride, int idx);
+
+  /*! \brief Specify an index for which the weight should be
+              the degree of the entity
+   *    \param idx Zoltan2 will use the entity's 
+   *         degree as the entity weight for index \c idx.
+   */
+  void setWeightIsDegree(int idx);
+
+  /*! \brief Specify an index for which the vertex weight should be
+              the degree of the vertex
+   *    \param idx Zoltan2 will use the vertex's 
+   *         degree as the vertex weight for index \c idx.
+   */
+  void setVertexWeightIsDegree(int idx);
+
+  /*! \brief Provide a pointer to edge weights.
+   *    \param val A pointer to the weights for index \c idx.
+   *    \param stride    A stride for the \c val array.  If \c stride is
+   *             \c k, then val[n * k] is the weight for the
+   *             \c n th edge for index \c idx.
+   *    \param idx A number from 0 to one less than the number
+   *          of edge weights specified in the constructor.
+   *
+   *  The order of the edge weights should follow the order that
+   *  the vertices and edges appear in the input data structure.
+   *
+   *  By vertex:
+   *     \code
+   *       TheGraph->getRowMap()->getNodeElementList()
+   *     \endcode
+   *
+   *  Then by vertex neighbor:
+   *     \code
+   *       TheGraph->getLocalRowView(vertexNum, neighborList);
+   *     \endcode
+   */
+
+  void setEdgeWeights(const scalar_t *val, int stride, int idx);
+
+  /*! \brief Access to Xpetra-wrapped user's graph.
+   */ 
+  RCP<const xgraph_t> getXpetraGraph() const { return graph_; }
+
+  /*! \brief Access to user's graph 
+   */ 
+  RCP<const User> getUserGraph() const { return ingraph_; }
+
+  ////////////////////////////////////////////////////
+  // The Adapter interface.
+  ////////////////////////////////////////////////////
+
+  ////////////////////////////////////////////////////
+  // The GraphAdapter interface.
+  ////////////////////////////////////////////////////
+
+  // TODO:  Assuming rows == objects; 
+  // TODO:  Need to add option for columns or nonzeros?
+  size_t getLocalNumVertices() const { return getNodeNumRows(); }
+
+  void getVertexIDsView(const gno_t *&ids) const 
+  {
+    ids = NULL;
+    if (getLocalNumVertices())
+      ids = getRowMap()->getNodeElementList().getRawPtr();
+  }
+
+  size_t getLocalNumEdges() const { return getNodeNumEntries(); }
+
+  void getEdgesView(const offset_t *&offsets, const gno_t *&adjIds) const
+  {
+    offsets = offs_.getRawPtr();
+    adjIds = (getLocalNumEdges() ? adjids_.getRawPtr() : NULL);
+  }
+
+  int getNumWeightsPerVertex() const { return nWeightsPerVertex_;}
+
+  void getVertexWeightsView(const scalar_t *&weights, int &stride,
+                            int idx) const
+  {
+    if(idx<0 || idx >= nWeightsPerVertex_)
+    {
+      std::ostringstream emsg;
+      emsg << __FILE__ << ":" << __LINE__
+           << "  Invalid vertex weight index " << idx << std::endl;
+      throw std::runtime_error(emsg.str()); 
+    }
+
+
+    size_t length;
+    vertexWeights_[idx].getStridedList(length, weights, stride);
+  }
+
+  bool useDegreeAsVertexWeight(int idx) const {return vertexDegreeWeight_[idx];}
+
+  int getNumWeightsPerEdge() const { return nWeightsPerEdge_;}
+
+  void getEdgeWeightsView(const scalar_t *&weights, int &stride, int idx) const
+  {
+    if(idx<0 || idx >= nWeightsPerEdge_)
+    {
+      std::ostringstream emsg;
+      emsg << __FILE__ << ":" << __LINE__
+           << "  Invalid edge weight index " << idx << std::endl;
+      throw std::runtime_error(emsg.str()); 
+    }
+
+
+    size_t length;
+    edgeWeights_[idx].getStridedList(length, weights, stride);
+  }
+
+
+  template <typename Adapter>
+  void applyPartitioningSolution(const User &in, User *&out,
+                                 const Zoltan2::PartitioningSolution<Adapter> &solution) const {
+    TEUCHOS_TEST_FOR_EXCEPTION(1, std::invalid_argument,"applyPartitionlingSolution not implemeneted");
+}
+
+  template <typename Adapter>
+  void applyPartitioningSolution(const User &in, RCP<User> &out,
+                                 const Zoltan2::PartitioningSolution<Adapter> &solution) const {
+    TEUCHOS_TEST_FOR_EXCEPTION(1, std::invalid_argument,"applyPartitionlingSolution not implemeneted");
+  }
+
+
+private:
+
+  RCP<const User > ingraph_;
+  RCP<const xgraph_t > graph_;
+  RCP<const Teuchos::Comm<int> > comm_;
+
+  ArrayRCP<const offset_t> offs_;
+  ArrayRCP<const gno_t> adjids_;
+
+  int nWeightsPerVertex_;
+  ArrayRCP<Zoltan2::StridedData<lno_t, scalar_t> > vertexWeights_;
+  ArrayRCP<bool> vertexDegreeWeight_;
+
+  int nWeightsPerEdge_;
+  ArrayRCP<Zoltan2::StridedData<lno_t, scalar_t> > edgeWeights_;
+
+  int coordinateDim_;
+  ArrayRCP<Zoltan2::StridedData<lno_t, scalar_t> > coords_;
+
+};
+
+
+/////////////////////////////////////////////////////////////////
+// Definitions
+/////////////////////////////////////////////////////////////////
+
+// Constructor: copies ingraph's local adjacency into offs_/adjids_ (neighbor ids
+// mapped to global ids via the column map) and allocates the weight slots.
+template <typename User, typename UserCoord>
+  MueLuGraphBaseAdapter<User,UserCoord>::MueLuGraphBaseAdapter(
+    const RCP<const User> &ingraph, int nVtxWgts, int nEdgeWgts):
+      ingraph_(ingraph), graph_(), comm_() , offs_(), adjids_(),
+      nWeightsPerVertex_(nVtxWgts), vertexWeights_(), vertexDegreeWeight_(),
+      nWeightsPerEdge_(nEdgeWgts), edgeWeights_(),
+      coordinateDim_(0), coords_()
+{
+  typedef Zoltan2::StridedData<lno_t,scalar_t> input_t;
+  graph_ = ingraph;
+  comm_ = getRowMap()->getComm();
+  size_t nvtx = getNodeNumRows();
+  size_t nedges = getNodeNumEntries();
+
+  // Unfortunately we have to copy the offsets and edge Ids
+  // because edge Ids are not usually stored in vertex id order.
+  offs_.resize(nvtx + 1);
+  offset_t* offs = const_cast<offset_t*>(offs_.getRawPtr());
+  gno_t* adjids=0;
+  if(nedges > 0) {
+    adjids_.resize(nedges);
+    adjids = const_cast<gno_t*>(adjids_.getRawPtr());
+  }
+
+  // Fetch the column map once; getColMap() builds a fresh RCP on every call.
+  const RCP<const Xpetra::Map<lno_t, gno_t, node_t> > colMap = getColMap();
+  offs[0] = 0;
+  for (size_t v=0; v < nvtx; v++){
+    ArrayView<const lno_t> nbors;
+    getLocalRowView(v, nbors);
+    offs[v+1] = offs[v] + nbors.size();
+    for (offset_t e=offs[v], i=0; e < offs[v+1]; e++)
+      adjids[e] = colMap->getGlobalElement(nbors[i++]);
+  }
+
+  if (nWeightsPerVertex_ > 0) {
+    vertexWeights_ = arcp(new input_t[nWeightsPerVertex_], 0, nWeightsPerVertex_, true);
+    vertexDegreeWeight_ = arcp(new bool[nWeightsPerVertex_], 0, nWeightsPerVertex_, true);
+    for (int i=0; i < nWeightsPerVertex_; i++)
+      vertexDegreeWeight_[i] = false;
+  }
+  // Edge weight slots must exist before setEdgeWeights()/getEdgeWeightsView() index them.
+  if (nWeightsPerEdge_ > 0)
+    edgeWeights_ = arcp(new input_t[nWeightsPerEdge_], 0, nWeightsPerEdge_, true);
+}
+
+////////////////////////////////////////////////////////////////////////////
+template <typename User, typename UserCoord>
+  void MueLuGraphBaseAdapter<User,UserCoord>::setWeights(
+    const scalar_t *weightVal, int stride, int idx)
+{
+  if (this->getPrimaryEntityType() == Zoltan2::GRAPH_VERTEX)
+    setVertexWeights(weightVal, stride, idx);
+  else 
+    setEdgeWeights(weightVal, stride, idx);
+}
+
+////////////////////////////////////////////////////////////////////////////
+template <typename User, typename UserCoord>
+  void MueLuGraphBaseAdapter<User,UserCoord>::setVertexWeights(
+    const scalar_t *weightVal, int stride, int idx)
+{
+  typedef Zoltan2::StridedData<lno_t,scalar_t> input_t;
+
+  if(idx<0 || idx >= nWeightsPerVertex_)
+  {
+      std::ostringstream emsg;
+      emsg << __FILE__ << ":" << __LINE__
+           << "  Invalid vertex weight index " << idx << std::endl;
+      throw std::runtime_error(emsg.str()); 
+  }
+
+  size_t nvtx = getLocalNumVertices();
+  ArrayRCP<const scalar_t> weightV(weightVal, 0, nvtx*stride, false);
+  vertexWeights_[idx] = input_t(weightV, stride);
+}
+
+////////////////////////////////////////////////////////////////////////////
+template <typename User, typename UserCoord>
+  void MueLuGraphBaseAdapter<User,UserCoord>::setWeightIsDegree(
+    int idx)
+{
+  if (this->getPrimaryEntityType() == Zoltan2::GRAPH_VERTEX)
+    setVertexWeightIsDegree(idx);
+  else {
+    std::ostringstream emsg;
+    emsg << __FILE__ << "," << __LINE__
+         << " error:  setWeightIsNumberOfNonZeros is supported only for"
+         << " vertices" << std::endl;
+    throw std::runtime_error(emsg.str());
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////
+template <typename User, typename UserCoord>
+  void MueLuGraphBaseAdapter<User,UserCoord>::setVertexWeightIsDegree(
+    int idx)
+{
+  if(idx<0 || idx >= nWeightsPerVertex_)
+  {
+      std::ostringstream emsg;
+      emsg << __FILE__ << ":" << __LINE__
+           << "  Invalid vertex weight index " << idx << std::endl;
+      throw std::runtime_error(emsg.str()); 
+  }
+
+  vertexDegreeWeight_[idx] = true;
+}
+
+////////////////////////////////////////////////////////////////////////////
+template <typename User, typename UserCoord>
+  void MueLuGraphBaseAdapter<User,UserCoord>::setEdgeWeights(
+    const scalar_t *weightVal, int stride, int idx)
+{
+  typedef Zoltan2::StridedData<lno_t,scalar_t> input_t;
+
+  if(idx<0 || idx >= nWeightsPerEdge_)
+  {
+      std::ostringstream emsg;
+      emsg << __FILE__ << ":" << __LINE__
+           << "  Invalid edge weight index " << idx << std::endl;
+      throw std::runtime_error(emsg.str()); 
+  }
+
+  size_t nedges = getLocalNumEdges();
+  ArrayRCP<const scalar_t> weightV(weightVal, 0, nedges*stride, false);
+  edgeWeights_[idx] = input_t(weightV, stride);
+}
+
+
+}  //namespace MueLu
+
+
+#endif // HAVE_MUELU_ZOLTAN2
+  
+#endif // MUELU_ZOLTAN2GRAPHADAPTER_HPP_
diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
index 937601632abd..4a2d2ca2e38f 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
@@ -118,13 +118,14 @@ namespace MueLu {
 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
     SET_VALID_ENTRY("aggregation: drop tol");
     SET_VALID_ENTRY("aggregation: Dirichlet threshold");
+    SET_VALID_ENTRY("aggregation: row sum drop tol");
     SET_VALID_ENTRY("aggregation: drop scheme");
     SET_VALID_ENTRY("aggregation: block diagonal: interleaved blocksize");
     SET_VALID_ENTRY("aggregation: distance laplacian directional weights");
 
     {
       typedef Teuchos::StringToIntegralParameterEntryValidator<int> validatorType;
-      validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new validatorType(Teuchos::tuple<std::string>("classical", "distance laplacian","block diagonal","block diagonal classical","block diagonal distance laplacian"), "aggregation: drop scheme")));
+      validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new validatorType(Teuchos::tuple<std::string>("classical", "distance laplacian","signed classical","block diagonal","block diagonal classical","block diagonal distance laplacian","block diagonal signed classical"), "aggregation: drop scheme")));
                                                                         
     }
     SET_VALID_ENTRY("aggregation: distance laplacian algo");
@@ -135,7 +136,7 @@ namespace MueLu {
     validParamList->set< RCP<const FactoryBase> >("A",                  Teuchos::null, "Generating factory of the matrix A");
     validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory for UnAmalgamationInfo");
     validParamList->set< RCP<const FactoryBase> >("Coordinates",        Teuchos::null, "Generating factory for Coordinates");
-    validParamList->set< RCP<const FactoryBase> >("BlockNumber",        Teuchos::null, "Generating factory for Coordinates");
+    validParamList->set< RCP<const FactoryBase> >("BlockNumber",        Teuchos::null, "Generating factory for BlockNUmber");
 
     return validParamList;
   }
@@ -154,7 +155,8 @@ namespace MueLu {
       if (algo == "distance laplacian" || algo == "block diagonal distance laplacian") {
         Input(currentLevel, "Coordinates");    
       }
-      if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian" || algo == "block diagonal")  {
+      if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian" 
+          || algo == "block diagonal" || algo == "block diagonal signed classical")  {
         Input(currentLevel, "BlockNumber");
       }
     }     
@@ -188,22 +190,32 @@ namespace MueLu {
 
     bool use_block_algorithm=false;
     LO interleaved_blocksize = as<LO>(pL.get<int>("aggregation: block diagonal: interleaved blocksize"));
+    bool useSignedClassical = false;
+
+    // NOTE:  If we're doing blockDiagonal, we'll not want to do rowSum twice (we'll do it
+    // in the block diagonalization). So we'll clobber the rowSumTol with -1.0 in this case
+    typename STS::magnitudeType rowSumTol = as<typename STS::magnitudeType>(pL.get<double>("aggregation: row sum drop tol"));
+
     if(algo == "distance laplacian" ) { 
       // Grab the coordinates for distance laplacian
       Coords = Get< RCP<RealValuedMultiVector > >(currentLevel, "Coordinates");
       A = realA;
     }
+    else if(algo == "signed classical") {
+      useSignedClassical = true;
+      algo = "classical";
+      A = realA;
+    }
     else if(algo == "block diagonal") {
       // Handle the "block diagonal" filtering and then leave
       BlockDiagonalize(currentLevel,realA,false);
       return;
     }
-    else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian")  {
+    else if (algo == "block diagonal classical" || algo == "block diagonal distance laplacian" || algo == "block diagonal signed classical")  {
       // Handle the "block diagonal" filtering, and then continue onward
       use_block_algorithm = true;
       RCP<Matrix> filteredMatrix = BlockDiagonalize(currentLevel,realA,true);
-      if(algo == "block diagonal") return;
-      else if(algo == "block diagonal distance laplacian") {  
+      if(algo == "block diagonal distance laplacian") {  
         // We now need to expand the coordinates by the interleaved blocksize
         RCP<RealValuedMultiVector> OldCoords = Get< RCP<RealValuedMultiVector > >(currentLevel, "Coordinates");
         if (OldCoords->getLocalLength() != realA->getNodeNumRows()) {
@@ -227,8 +239,13 @@ namespace MueLu {
       else if(algo == "block diagonal classical") {
         algo = "classical";
       }
-      // Both cases
+      else if(algo == "block diagonal signed classical") {
+        algo = "classical";
+        useSignedClassical = true;
+      }
+      // All cases
       A = filteredMatrix;
+      rowSumTol = -1.0;
     }
     else {
       A = realA;
@@ -275,12 +292,13 @@ namespace MueLu {
 
     if (doExperimentalWrap) {
       TEUCHOS_TEST_FOR_EXCEPTION(predrop_ != null   && algo != "classical", Exceptions::RuntimeError, "Dropping function must not be provided for \"" << algo << "\" algorithm");
-      TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian)");
+      TEUCHOS_TEST_FOR_EXCEPTION(algo != "classical" && algo != "distance laplacian" && algo != "signed classical", Exceptions::RuntimeError, "\"algorithm\" must be one of (classical|distance laplacian|signed classical)");
 
       SC threshold = as<SC>(pL.get<double>("aggregation: drop tol"));
       std::string distanceLaplacianAlgoStr = pL.get<std::string>("aggregation: distance laplacian algo");
       std::string classicalAlgoStr = pL.get<std::string>("aggregation: classical algo");
       real_type realThreshold = STS::magnitude(threshold);// CMS: Rename this to "magnitude threshold" sometime
+
       ////////////////////////////////////////////////////
       // Remove this bit once we are confident that cut-based dropping works.
 #ifdef HAVE_MUELU_DEBUG
@@ -337,6 +355,11 @@ namespace MueLu {
 
       const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
 
+
+      // NOTE: We don't support signed classical with cut drop at present
+      TEUCHOS_TEST_FOR_EXCEPTION(useSignedClassical && classicalAlgo != defaultAlgo, Exceptions::RuntimeError, "\"aggregation: classical algo\" != default is not supported for signed classical aggregation");
+
+
       GO numDropped = 0, numTotal = 0;
       std::string graphType = "unamalgamated"; //for description purposes only
       if (algo == "classical") {
@@ -359,12 +382,14 @@ namespace MueLu {
         // At this points we either have
         //     (predrop_ != null)
         // Therefore, it is sufficient to check only threshold
-        if (A->GetFixedBlockSize() == 1 && threshold == STS::zero() && A->hasCrsGraph()) {
+        if (A->GetFixedBlockSize() == 1 && threshold == STS::zero() && !useSignedClassical && A->hasCrsGraph()) {
           // Case 1:  scalar problem, no dropping => just use matrix graph
           RCP<GraphBase> graph = rcp(new Graph(A->getCrsGraph(), "graph of A"));
           // Detect and record rows that correspond to Dirichlet boundary conditions
-          ArrayRCP<const bool > boundaryNodes;
-          boundaryNodes = MueLu::Utilities<SC,LO,GO,NO>::DetectDirichletRows(*A, dirichletThreshold);
+          ArrayRCP<bool > boundaryNodes = Teuchos::arcp_const_cast<bool>(MueLu::Utilities<SC,LO,GO,NO>::DetectDirichletRows(*A, dirichletThreshold));
+          if (rowSumTol > 0.) 
+            Utilities::ApplyRowSumCriterion(*A, rowSumTol, boundaryNodes);
+
           graph->SetBoundaryNodeMap(boundaryNodes);
           numTotal = A->getNodeNumEntries();
 
@@ -383,7 +408,8 @@ namespace MueLu {
           Set(currentLevel, "Graph", graph);
 
         } else if ( (A->GetFixedBlockSize() == 1 && threshold != STS::zero()) ||
-                    (A->GetFixedBlockSize() == 1 && threshold == STS::zero() && !A->hasCrsGraph())) {
+                    (A->GetFixedBlockSize() == 1 && threshold == STS::zero() && !A->hasCrsGraph()) ||
+                    (A->GetFixedBlockSize() == 1 && useSignedClassical) ) {
           // Case 2:  scalar problem with dropping => record the column indices of undropped entries, but still use original
           //                                          graph's map information, e.g., whether index is local
           // OR a matrix without a CrsGraph
@@ -392,9 +418,20 @@ namespace MueLu {
           ArrayRCP<LO> rows   (A->getNodeNumRows()+1);
           ArrayRCP<LO> columns(A->getNodeNumEntries());
 
-          RCP<Vector> ghostedDiag = MueLu::Utilities<SC,LO,GO,NO>::GetMatrixOverlappedDiagonal(*A);
-          const ArrayRCP<const SC> ghostedDiagVals = ghostedDiag->getData(0);
-          ArrayRCP<const bool> boundaryNodes = MueLu::Utilities<SC,LO,GO,NO>::DetectDirichletRows(*A, dirichletThreshold);
+          using MT = typename STS::magnitudeType;
+          RCP<Vector> ghostedDiag;
+          ArrayRCP<const SC> ghostedDiagVals;
+          ArrayRCP<const MT> negMaxOffDiagonal;
+          if(useSignedClassical) {
+            negMaxOffDiagonal = MueLu::Utilities<SC,LO,GO,NO>::GetMatrixMaxMinusOffDiagonal(*A);
+          }
+          else {
+            ghostedDiag = MueLu::Utilities<SC,LO,GO,NO>::GetMatrixOverlappedDiagonal(*A);
+            ghostedDiagVals = ghostedDiag->getData(0);
+          }
+          ArrayRCP<bool> boundaryNodes = Teuchos::arcp_const_cast<bool>(MueLu::Utilities<SC,LO,GO,NO>::DetectDirichletRows(*A, dirichletThreshold));
+          if (rowSumTol > 0.)
+            Utilities::ApplyRowSumCriterion(*A, rowSumTol, boundaryNodes);          
 
           LO realnnz = 0;
           rows[0] = 0;
@@ -411,20 +448,36 @@ namespace MueLu {
               //FIXME For now, hardwiring the dropping in here
               
               LO rownnz = 0;
-              for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
-                LO col = indices[colID];
-                
-                // we avoid a square root by using squared values
-                typename STS::magnitudeType aiiajj = STS::magnitude(threshold*threshold * ghostedDiagVals[col]*ghostedDiagVals[row]);  // eps^2*|a_ii|*|a_jj|
-                typename STS::magnitudeType aij    = STS::magnitude(vals[colID]*vals[colID]);                                          // |a_ij|^2
-                
-                if (aij > aiiajj || row == col) {
+              if(useSignedClassical) {
+                // Signed classical
+                for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
+                  LO col = indices[colID];               
+                  MT max_neg_aik = realThreshold * STS::real(negMaxOffDiagonal[row]);
+                  MT neg_aij    = - STS::real(vals[colID]);
+                  //printf(" - a_ij = %6.4e >? %6.4e * %6.4e = alpha max(-aik)\n",neg_aij,threshold, negMaxOffDiagonal[row]);
+                  if (neg_aij > max_neg_aik || row == col) {
+                    columns[realnnz++] = col;
+                    rownnz++;
+                  } else
+                    numDropped++;
+                }
+                rows[row+1] = realnnz;
+              }
+              else {
+                // Standard abs classical
+                for (LO colID = 0; colID < Teuchos::as<LO>(nnz); colID++) {
+                  LO col = indices[colID];               
+                  MT  aiiajj = STS::magnitude(threshold*threshold * ghostedDiagVals[col]*ghostedDiagVals[row]);  // eps^2*|a_ii|*|a_jj|
+                  MT aij    = STS::magnitude(vals[colID]*vals[colID]);                                          // |a_ij|^2
+                  
+                  if (aij > aiiajj || row == col) {
                   columns[realnnz++] = col;
                   rownnz++;
-                } else
-                  numDropped++;
+                  } else
+                    numDropped++;
+                }
+                rows[row+1] = realnnz;
               }
-              rows[row+1] = realnnz;
             }
             else {
               /* Cut Algorithm */
@@ -581,8 +634,11 @@ namespace MueLu {
           // TODO If we use ArrayRCP<LO>, then we can record boundary nodes as usual.  Size
           // TODO the array one bigger than the number of local rows, and the last entry can
           // TODO hold the actual number of boundary nodes.  Clever, huh?
-          ArrayRCP<const bool > pointBoundaryNodes;
-          pointBoundaryNodes = MueLu::Utilities<SC,LO,GO,NO>::DetectDirichletRows(*A, dirichletThreshold);
+          ArrayRCP<bool > pointBoundaryNodes;
+          pointBoundaryNodes = Teuchos::arcp_const_cast<bool>(MueLu::Utilities<SC,LO,GO,NO>::DetectDirichletRows(*A, dirichletThreshold));
+          if (rowSumTol > 0.)
+            Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes);
+
 
           // extract striding information
           LO blkSize = A->GetFixedBlockSize();     //< the full block size (number of dofs per node in strided map)
@@ -676,7 +732,6 @@ namespace MueLu {
           // Case 4:  Multiple DOF/node problem with dropping
           const RCP<const Map> rowMap = A->getRowMap();
           const RCP<const Map> colMap = A->getColMap();
-
           graphType = "amalgamated";
 
           // build node row map (uniqueMap) and node column map (nonUniqueMap)
@@ -701,8 +756,11 @@ namespace MueLu {
           // TODO If we use ArrayRCP<LO>, then we can record boundary nodes as usual.  Size
           // TODO the array one bigger than the number of local rows, and the last entry can
           // TODO hold the actual number of boundary nodes.  Clever, huh?
-          ArrayRCP<const bool > pointBoundaryNodes;
-          pointBoundaryNodes = MueLu::Utilities<SC,LO,GO,NO>::DetectDirichletRows(*A, dirichletThreshold);
+          ArrayRCP<bool > pointBoundaryNodes;
+          pointBoundaryNodes = Teuchos::arcp_const_cast<bool>(MueLu::Utilities<SC,LO,GO,NO>::DetectDirichletRows(*A, dirichletThreshold));
+          if (rowSumTol > 0.)
+            Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes);
+
 
           // extract striding information
           LO blkSize = A->GetFixedBlockSize();     //< the full block size (number of dofs per node in strided map)
@@ -800,7 +858,6 @@ namespace MueLu {
       } else if (algo == "distance laplacian") {
         LO blkSize   = A->GetFixedBlockSize();
         GO indexBase = A->getRowMap()->getIndexBase();
-
         // [*0*] : FIXME
         // ap: somehow, if I move this line to [*1*], Belos throws an error
         // I'm not sure what's going on. Do we always have to Get data, if we did
@@ -811,8 +868,10 @@ namespace MueLu {
         // TODO If we use ArrayRCP<LO>, then we can record boundary nodes as usual.  Size
         // TODO the array one bigger than the number of local rows, and the last entry can
         // TODO hold the actual number of boundary nodes.  Clever, huh?
-        ArrayRCP<const bool > pointBoundaryNodes;
-        pointBoundaryNodes = MueLu::Utilities<SC,LO,GO,NO>::DetectDirichletRows(*A, dirichletThreshold);
+        ArrayRCP<bool > pointBoundaryNodes;
+        pointBoundaryNodes = Teuchos::arcp_const_cast<bool>(MueLu::Utilities<SC,LO,GO,NO>::DetectDirichletRows(*A, dirichletThreshold));
+        if (rowSumTol > 0.)
+          Utilities::ApplyRowSumCriterion(*A, rowSumTol, pointBoundaryNodes);
 
         if ( (blkSize == 1) && (threshold == STS::zero()) ) {
           // Trivial case: scalar problem, no dropping. Can return original graph
@@ -1558,6 +1617,7 @@ namespace MueLu {
  
     const ParameterList  & pL = GetParameterList();
     const typename STS::magnitudeType dirichletThreshold = STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
+    const typename STS::magnitudeType rowSumTol = as<typename STS::magnitudeType>(pL.get<double>("aggregation: row sum drop tol"));
 
     RCP<LocalOrdinalVector> BlockNumber = Get<RCP<LocalOrdinalVector> >(currentLevel, "BlockNumber");
     RCP<LocalOrdinalVector> ghostedBlockNumber;
@@ -1619,7 +1679,10 @@ namespace MueLu {
       else rows_graph[row+1] = realnnz;
     }
     
-    ArrayRCP<const bool> boundaryNodes = MueLu::Utilities<SC,LO,GO,NO>::DetectDirichletRows(*A, dirichletThreshold);
+    ArrayRCP<bool> boundaryNodes = Teuchos::arcp_const_cast<bool>(MueLu::Utilities<SC,LO,GO,NO>::DetectDirichletRows(*A, dirichletThreshold));
+    if (rowSumTol > 0.)
+      Utilities::ApplyRowSumCriterion(*A, rowSumTol, boundaryNodes);
+
         
     if(!generate_matrix) {
       // We can't resize an Arrayrcp and pass the checks for setAllValues
diff --git a/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_decl.hpp b/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_decl.hpp
index 926e5df4b554..c43758ee8352 100644
--- a/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_decl.hpp
+++ b/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_decl.hpp
@@ -56,6 +56,7 @@
 #include "MueLu_StructuredAggregationFactory_fwd.hpp"
 #include "MueLu_Level_fwd.hpp"
 #include "MueLu_Exceptions.hpp"
+#include "MueLu_AggregationStructuredAlgorithm_fwd.hpp"
 
 namespace MueLu {
 
diff --git a/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_def.hpp b/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_def.hpp
index 658fd6dd9cfb..3c44a42d6067 100644
--- a/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_def.hpp
+++ b/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_def.hpp
@@ -343,7 +343,7 @@ namespace MueLu {
     Set(currentLevel, "lCoarseNodesPerDim", geoData->getLocalCoarseNodesPerDir());
     Set(currentLevel, "coarseCoordinatesFineMap", coarseCoordinatesFineMap);
     Set(currentLevel, "coarseCoordinatesMap", coarseCoordinatesMap);
-    Set(currentLevel, "interpolationOrder", interpolationOrder);
+    Set(currentLevel, "structuredInterpolationOrder", interpolationOrder);
     Set(currentLevel, "numDimensions", numDimensions);
 
   } // Build()
diff --git a/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_kokkos_def.hpp b/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_kokkos_def.hpp
index d7398aa40001..318913a92799 100644
--- a/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_kokkos_def.hpp
+++ b/packages/muelu/src/Graph/StructuredAggregation/MueLu_StructuredAggregationFactory_kokkos_def.hpp
@@ -91,7 +91,6 @@ namespace MueLu {
                                                  "Coarsening rate per spatial dimensions");
     validParamList->set<int>                    ("aggregation: coarsening order", 0,
                                                   "The interpolation order used to construct grid transfer operators based off these aggregates.");
-
     validParamList->set<RCP<const FactoryBase> >("Graph",                   Teuchos::null,
                                                  "Graph of the matrix after amalgamation but without dropping.");
     validParamList->set<RCP<const FactoryBase> >("DofsPerNode",             Teuchos::null,
@@ -248,10 +247,10 @@ namespace MueLu {
       Set(currentLevel, "prolongatorGraph", myGraph);
     }
 
-    Set(currentLevel, "lCoarseNodesPerDim",       geoData->getCoarseNodesPerDirArray());
-    Set(currentLevel, "indexManager",             geoData);
-    Set(currentLevel, "interpolationOrder",       interpolationOrder);
-    Set(currentLevel, "numDimensions",            numDimensions);
+    Set(currentLevel, "lCoarseNodesPerDim",           geoData->getCoarseNodesPerDirArray());
+    Set(currentLevel, "indexManager",                 geoData);
+    Set(currentLevel, "structuredInterpolationOrder", interpolationOrder);
+    Set(currentLevel, "numDimensions",                numDimensions);
 
   } // Build()
 
diff --git a/packages/muelu/src/Headers/MueLu_UseShortNamesScalar.hpp b/packages/muelu/src/Headers/MueLu_UseShortNamesScalar.hpp
index ca10e38a0910..49e7b229e595 100644
--- a/packages/muelu/src/Headers/MueLu_UseShortNamesScalar.hpp
+++ b/packages/muelu/src/Headers/MueLu_UseShortNamesScalar.hpp
@@ -59,6 +59,12 @@ typedef MueLu::BraessSarazinSmoother<Scalar,LocalOrdinal,GlobalOrdinal,Node> Bra
 #ifdef MUELU_CGSOLVER_SHORT
 typedef MueLu::CGSolver<Scalar,LocalOrdinal,GlobalOrdinal,Node> CGSolver;
 #endif
+#ifdef MUELU_CLASSICALMAPFACTORY_SHORT
+typedef MueLu::ClassicalMapFactory<Scalar,LocalOrdinal,GlobalOrdinal,Node> ClassicalMapFactory;
+#endif
+#ifdef MUELU_CLASSICALPFACTORY_SHORT
+typedef MueLu::ClassicalPFactory<Scalar,LocalOrdinal,GlobalOrdinal,Node> ClassicalPFactory;
+#endif
 #ifdef MUELU_CLONEREPARTITIONINTERFACE_SHORT
 typedef MueLu::CloneRepartitionInterface<Scalar,LocalOrdinal,GlobalOrdinal,Node> CloneRepartitionInterface;
 #endif
diff --git a/packages/muelu/src/Interface/MueLu_FactoryFactory_decl.hpp b/packages/muelu/src/Interface/MueLu_FactoryFactory_decl.hpp
index efd88fdd6895..af6c4664b5ac 100644
--- a/packages/muelu/src/Interface/MueLu_FactoryFactory_decl.hpp
+++ b/packages/muelu/src/Interface/MueLu_FactoryFactory_decl.hpp
@@ -80,6 +80,8 @@
 #include "MueLu_BlockedRAPFactory.hpp"
 #include "MueLu_BraessSarazinSmoother.hpp"
 #include "MueLu_BrickAggregationFactory.hpp"
+#include "MueLu_ClassicalMapFactory.hpp"
+#include "MueLu_ClassicalPFactory.hpp"
 #include "MueLu_CloneRepartitionInterface.hpp"
 #include "MueLu_CoalesceDropFactory.hpp"
 #include "MueLu_SmooVecCoalesceDropFactory.hpp"
@@ -239,6 +241,8 @@ namespace MueLu {
       if (factoryName == "BlockedCoordinatesTransferFactory")     return Build2<BlockedCoordinatesTransferFactory>     (paramList, factoryMapIn, factoryManagersIn);
       if (factoryName == "BlockedRAPFactory")                     return BuildRAPFactory<BlockedRAPFactory>            (paramList, factoryMapIn, factoryManagersIn);
       if (factoryName == "BrickAggregationFactory")               return Build2<BrickAggregationFactory>               (paramList, factoryMapIn, factoryManagersIn);
+      if (factoryName == "ClassicalMapFactory")                   return Build2<ClassicalMapFactory>             (paramList, factoryMapIn, factoryManagersIn);
+      if (factoryName == "ClassicalPFactory")                     return Build2<ClassicalPFactory>             (paramList, factoryMapIn, factoryManagersIn);
       if (factoryName == "CloneRepartitionInterface")             return Build2<CloneRepartitionInterface>             (paramList, factoryMapIn, factoryManagersIn);
       if (factoryName == "CoarseMapFactory")                      return Build2<CoarseMapFactory>                      (paramList, factoryMapIn, factoryManagersIn);
       if (factoryName == "CoarseningVisualizationFactory")        return Build2<CoarseningVisualizationFactory>        (paramList, factoryMapIn, factoryManagersIn);
diff --git a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_decl.hpp b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_decl.hpp
index b011afc7efef..6047eef91c97 100644
--- a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_decl.hpp
+++ b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_decl.hpp
@@ -56,6 +56,8 @@
 
 #include "MueLu_AggregationExportFactory_fwd.hpp"
 #include "MueLu_BrickAggregationFactory_fwd.hpp"
+#include "MueLu_ClassicalMapFactory_fwd.hpp"
+#include "MueLu_ClassicalPFactory_fwd.hpp"
 #include "MueLu_CoalesceDropFactory_fwd.hpp"
 #include "MueLu_CoarseMapFactory_fwd.hpp"
 #include "MueLu_ConstraintFactory_fwd.hpp"
@@ -231,7 +233,7 @@ namespace MueLu {
                                         int levelID, std::vector<keep_pair>& keeps, RCP<Factory> & nullSpaceFactory) const;
     void UpdateFactoryManager_BlockNumber(Teuchos::ParameterList& paramList, const Teuchos::ParameterList& defaultList, 
                                           FactoryManager& manager,int levelID, std::vector<keep_pair>& keeps) const;
-    void UpdateFactoryManager_LocalOrdinalTransfer(const std::string VarName, Teuchos::ParameterList& paramList, const Teuchos::ParameterList& defaultList, 
+    void UpdateFactoryManager_LocalOrdinalTransfer(const std::string& VarName, const std::string& multigridAlgo, Teuchos::ParameterList& paramList, const Teuchos::ParameterList& defaultList, 
                                            FactoryManager& manager,int levelID, std::vector<keep_pair>& keeps) const;
 
     // Algorithm-specific components for UpdateFactoryManager
diff --git a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp
index 86628cbcd1f0..accb26c24d50 100644
--- a/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp
+++ b/packages/muelu/src/Interface/MueLu_ParameterListInterpreter_def.hpp
@@ -62,6 +62,8 @@
 #include "MueLu_AggregationExportFactory.hpp"
 #include "MueLu_AggregateQualityEstimateFactory.hpp"
 #include "MueLu_BrickAggregationFactory.hpp"
+#include "MueLu_ClassicalMapFactory.hpp"
+#include "MueLu_ClassicalPFactory.hpp"
 #include "MueLu_CoalesceDropFactory.hpp"
 #include "MueLu_CoarseMapFactory.hpp"
 #include "MueLu_ConstraintFactory.hpp"
@@ -553,7 +555,7 @@ namespace MueLu {
         Exceptions::RuntimeError, "Unknown \"reuse: type\" value: \"" << reuseType << "\". Please consult User's Guide.");
 
     MUELU_SET_VAR_2LIST(paramList, defaultList, "multigrid algorithm", std::string, multigridAlgo);
-    TEUCHOS_TEST_FOR_EXCEPTION(strings({"unsmoothed", "sa", "pg", "emin", "matlab", "pcoarsen"}).count(multigridAlgo) == 0,
+    TEUCHOS_TEST_FOR_EXCEPTION(strings({"unsmoothed", "sa", "pg", "emin", "matlab", "pcoarsen","classical"}).count(multigridAlgo) == 0,
         Exceptions::RuntimeError, "Unknown \"multigrid algorithm\" value: \"" << multigridAlgo << "\". Please consult User's Guide.");
 #ifndef HAVE_MUELU_MATLAB
     TEUCHOS_TEST_FOR_EXCEPTION(multigridAlgo == "matlab", Exceptions::RuntimeError,
@@ -615,6 +617,10 @@ namespace MueLu {
       // Unsmoothed aggregation
       manager.SetFactory("P", manager.GetFactory("Ptent"));
 
+    } else if (multigridAlgo == "classical") {
+      // Classical AMG
+      manager.SetFactory("P", manager.GetFactory("Ptent"));
+
     } else if (multigridAlgo == "sa") {
       // Smoothed aggregation
       UpdateFactoryManager_SA(paramList, defaultList, manager, levelID, keeps);
@@ -647,7 +653,7 @@ namespace MueLu {
 
     // == BlockNumber Transfer ==
     if(useBlockNumber_)
-      UpdateFactoryManager_LocalOrdinalTransfer("BlockNumber",paramList,defaultList,manager,levelID,keeps);
+      UpdateFactoryManager_LocalOrdinalTransfer("BlockNumber",multigridAlgo,paramList,defaultList,manager,levelID,keeps);
 
     // === Coordinates ===
     UpdateFactoryManager_Coordinates(paramList, defaultList, manager, levelID, keeps);
@@ -1003,6 +1009,7 @@ namespace MueLu {
        ParameterList dropParams;
        dropParams.set("lightweight wrap", true);
        MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: drop scheme",             std::string, dropParams);
+       MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: row sum drop tol",        double, dropParams);
        MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: block diagonal: interleaved blocksize", int, dropParams);
        MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: drop tol",                     double, dropParams);
        MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: Dirichlet threshold",          double, dropParams);
@@ -1020,7 +1027,7 @@ namespace MueLu {
 
     // Aggregation scheme
     MUELU_SET_VAR_2LIST(paramList, defaultList, "aggregation: type", std::string, aggType);
-    TEUCHOS_TEST_FOR_EXCEPTION(!strings({"uncoupled", "coupled", "brick", "matlab","notay"}).count(aggType),
+    TEUCHOS_TEST_FOR_EXCEPTION(!strings({"uncoupled", "coupled", "brick", "matlab","notay","classical"}).count(aggType),
         Exceptions::RuntimeError, "Unknown aggregation algorithm: \"" << aggType << "\". Please consult User's Guide.");
     #ifndef HAVE_MUELU_MATLAB
     if (aggType == "matlab")
@@ -1078,6 +1085,40 @@ namespace MueLu {
         aggFactory->SetFactory("Coordinates", this->GetFactoryManager(levelID-1)->GetFactory("Coordinates"));
       }
     }
+    else if (aggType == "classical") {
+      // Map and coloring
+      RCP<Factory> mapFact = rcp(new ClassicalMapFactory());
+      ParameterList mapParams;
+      MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: deterministic",             bool, mapParams);
+      MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: coloring algorithm", std::string, mapParams);
+      mapFact->SetParameterList(mapParams);
+      manager.SetFactory("FC Splitting", mapFact);
+      manager.SetFactory("CoarseMap", mapFact);
+
+      aggFactory = rcp(new ClassicalPFactory());      
+      ParameterList aggParams;
+      MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: classical scheme", std::string, aggParams);
+      MUELU_TEST_AND_SET_PARAM_2LIST(paramList, defaultList, "aggregation: drop scheme", std::string, aggParams);
+      aggFactory->SetParameterList(aggParams);
+      aggFactory->SetFactory("FC Splitting",manager.GetFactory("FC Splitting"));
+      aggFactory->SetFactory("CoarseMap",manager.GetFactory("CoarseMap"));
+      aggFactory->SetFactory("DofsPerNode", manager.GetFactory("Graph"));
+      aggFactory->SetFactory("Graph", manager.GetFactory("Graph"));
+      std::string drop_algo = aggParams.get<std::string>("aggregation: drop scheme");
+      if (drop_algo.find("block diagonal") != std::string::npos) 
+        aggFactory->SetFactory("BlockNumber", manager.GetFactory("BlockNumber"));
+      
+      // Now we short-circuit, because we neither need nor want TentativePFactory here      
+      manager.SetFactory("Ptent",     aggFactory);
+      manager.SetFactory("P Graph",     aggFactory);
+
+      
+      if (reuseType == "tP" && levelID) {
+        //        keeps.push_back(keep_pair("Nullspace", Ptent.get()));
+        keeps.push_back(keep_pair("Ptent",aggFactory.get()));
+      }
+      return;
+    }
 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
     else if (aggType == "notay") {
       aggFactory = rcp(new NotayAggregationFactory());
@@ -1101,6 +1142,7 @@ namespace MueLu {
 #endif
 
 
+
     manager.SetFactory("Aggregates", aggFactory);
 
     // Coarse map
@@ -1307,12 +1349,15 @@ namespace MueLu {
   // =====================================================================================================
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void ParameterListInterpreter<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-  UpdateFactoryManager_LocalOrdinalTransfer(const std::string VarName, ParameterList& paramList, const ParameterList& /* defaultList */,
+  UpdateFactoryManager_LocalOrdinalTransfer(const std::string & VarName, const std::string &multigridAlgo,ParameterList& paramList, const ParameterList& /* defaultList */,
                                             FactoryManager& manager, int levelID, std::vector<keep_pair>& /* keeps */) const
   {    
     if(levelID >= 1){
-      RCP<Factory> fact = rcp(new LocalOrdinalTransferFactory(VarName));
-      fact->SetFactory("Aggregates", manager.GetFactory("Aggregates"));
+      RCP<Factory> fact = rcp(new LocalOrdinalTransferFactory(VarName,multigridAlgo));
+      if(multigridAlgo == "classical") 
+        fact->SetFactory("P Graph", manager.GetFactory("P Graph"));
+      else 
+        fact->SetFactory("Aggregates", manager.GetFactory("Aggregates"));
       fact->SetFactory("CoarseMap",  manager.GetFactory("CoarseMap"));
       fact->SetFactory(VarName, this->GetFactoryManager(levelID-1)->GetFactory(VarName));
 
diff --git a/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_decl.hpp b/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_decl.hpp
index 5f3316c7415a..2b1521434046 100644
--- a/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_decl.hpp
+++ b/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_decl.hpp
@@ -50,7 +50,7 @@
 #include "MueLu_TwoLevelFactoryBase.hpp"
 #include "Xpetra_MultiVector_fwd.hpp"
 #include "Xpetra_MultiVectorFactory_fwd.hpp"
-#include "Xpetra_Matrix.hpp"
+#include "Xpetra_CrsGraph_fwd.hpp"
 
 #include "MueLu_CoarseMapFactory_fwd.hpp"
 #include "MueLu_LocalOrdinalTransferFactory_fwd.hpp"
@@ -92,6 +92,9 @@ namespace MueLu {
   ----------|--------------|------------
   | TransferVec | LocalOrdinalTransferFactory   | coarse level transfervec
 */
+
+
+
   template<class LocalOrdinal = DefaultLocalOrdinal,
            class GlobalOrdinal = DefaultGlobalOrdinal,
            class Node = DefaultNode>
@@ -99,7 +102,7 @@ namespace MueLu {
 #undef MUELU_LOCALORDINALTRANSFERFACTORY_SHORT
 #include "MueLu_UseShortNamesOrdinal.hpp"
 
-  public:
+  public:    
     //! @name Constructors/Destructors.
     //@{
 
@@ -114,7 +117,10 @@ namespace MueLu {
        The operator associated with <tt>projectionName</tt> will be applied to the MultiVector associated with
        <tt>vectorName</tt>.
     */
-    LocalOrdinalTransferFactory(const std::string TransferVecName): TransferVecName_(TransferVecName) { }
+    LocalOrdinalTransferFactory(const std::string & TransferVecName, const std::string & mode)
+      : useAggregatesMode_(mode != "classical"), // only "classical" selects FC ("P Graph") mode; any other mode uses aggregates
+        TransferVecName_(TransferVecName)
+    { }
 
     //! Destructor.
     virtual ~LocalOrdinalTransferFactory() { }
@@ -144,8 +150,13 @@ namespace MueLu {
     //@}
 
   private:
-    
 
+    void BuildAggregates(Level & fineLevel, Level &coarseLevel) const;
+
+    void BuildFC(Level & fineLevel, Level &coarseLevel) const;
+    
+    //! Use aggregates mode (as opposed to FC mode)
+    bool useAggregatesMode_;
 
     //! The name for the vector to be transfered.  This allows us to have multiple factories for different variables
     std::string TransferVecName_;
diff --git a/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_def.hpp b/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_def.hpp
index 5d6474820ba5..74ce2d67ed7e 100644
--- a/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_def.hpp
+++ b/packages/muelu/src/Misc/MueLu_LocalOrdinalTransferFactory_def.hpp
@@ -49,6 +49,8 @@
 #include "Xpetra_ImportFactory.hpp"
 #include "Xpetra_VectorFactory.hpp"
 #include "Xpetra_MapFactory.hpp"
+#include "Xpetra_CrsGraph.hpp"
+
 #include "Xpetra_IO.hpp"
 
 #include "MueLu_CoarseMapFactory.hpp"
@@ -64,7 +66,8 @@ namespace MueLu {
   RCP<const ParameterList> LocalOrdinalTransferFactory<LocalOrdinal, GlobalOrdinal, Node>::GetValidParameterList() const {
     RCP<ParameterList> validParamList = rcp(new ParameterList());
 
-    validParamList->set<RCP<const FactoryBase> >(TransferVecName_,              Teuchos::null, "Factory for TransferVec generation");
+    validParamList->set<RCP<const FactoryBase> >(TransferVecName_,               Teuchos::null, "Factory for TransferVec generation");
+    validParamList->set<RCP<const FactoryBase> >("P Graph",                      Teuchos::null, "Factory for P generation");
     validParamList->set<RCP<const FactoryBase> >("Aggregates",                   Teuchos::null, "Factory for aggregates generation");
     validParamList->set<RCP<const FactoryBase> >("CoarseMap",                    Teuchos::null, "Generating factory of the coarse map");
 
@@ -78,8 +81,13 @@ namespace MueLu {
       isAvailableXfer = coarseLevel.IsAvailable(TransferVecName_, this);
       if (isAvailableXfer == false) {
         Input(fineLevel, TransferVecName_);
-        Input(fineLevel, "Aggregates");
         Input(fineLevel, "CoarseMap");
+
+        if(useAggregatesMode_) 
+          Input(fineLevel, "Aggregates");
+        else {
+          Input(coarseLevel, "P Graph");
+        }
       }
     }
 
@@ -87,6 +95,73 @@ namespace MueLu {
 
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
   void LocalOrdinalTransferFactory<LocalOrdinal, GlobalOrdinal, Node>::Build(Level & fineLevel, Level &coarseLevel) const {
+    if(useAggregatesMode_) BuildAggregates(fineLevel,coarseLevel);  // aggregates mode
+    else BuildFC(fineLevel,coarseLevel);                            // F/C mode: uses the coarse level's "P Graph"
+  }
+
+  // F/C ("classical") mode: push each fine row's value to every coarse column that row touches in "P Graph".
+  // Throws std::runtime_error when two fine rows disagree on the value of the same coarse entry.
+  template <class LocalOrdinal, class GlobalOrdinal, class Node>
+  void LocalOrdinalTransferFactory<LocalOrdinal, GlobalOrdinal, Node>::BuildFC(Level & fineLevel, Level &coarseLevel) const {
+    FactoryMonitor m(*this, "Build", coarseLevel);
+
+    GetOStream(Runtime0) << "Transferring " <<TransferVecName_ << std::endl;
+    const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
+
+    if (coarseLevel.IsAvailable(TransferVecName_, this)) {
+      GetOStream(Runtime0) << "Reusing "<<TransferVecName_ << std::endl;
+      return;
+    }
+
+    // Get everything we need
+    RCP<const CrsGraph> P                = Get< RCP<const CrsGraph> >(coarseLevel,"P Graph");
+    RCP<LocalOrdinalVector> fineTV       = Get< RCP<LocalOrdinalVector> >(fineLevel, TransferVecName_);
+    RCP<const Map>      coarseMap  = Get< RCP<const Map> >  (fineLevel, "CoarseMap");
+    ArrayRCP<const LO> fineData    = fineTV->getData(0);
+
+    // FIXME: Handle MPI parallel
+    // Sanity checks: serial only, and one fine value per row of P (fineData is indexed by row below)
+    TEUCHOS_TEST_FOR_EXCEPTION(P->getRowMap()->getComm()->getSize() != 1,Exceptions::RuntimeError,"BuildFC: Only currently supports 1 MPI rank.");
+    TEUCHOS_TEST_FOR_EXCEPTION((size_t)fineData.size() < (size_t)P->getNodeNumRows(),Exceptions::RuntimeError,"BuildFC: "<<TransferVecName_<<" has fewer local entries than P Graph has rows.");
+
+    // Allocate new LO Vector
+    RCP<LocalOrdinalVector> coarseTV   = LocalOrdinalVectorFactory::Build(coarseMap,1);
+    ArrayRCP<LO>     coarseData = coarseTV->getDataNonConst(0);
+
+    // Invalidate everything first, to check for errors
+    for(LO i=0; i<coarseData.size(); i++)
+      coarseData[i] = LO_INVALID;
+
+    // Fill in coarse TV
+    size_t error_count = 0;
+    for (LO row=0; row<(LO)P->getNodeNumRows(); row++) {
+      LO fineNumber = fineData[row];
+      ArrayView<const LO> indices;
+      P->getLocalRowView(row,indices);
+
+      // FIXME: MPI parallel (column indices are used directly as coarse local ids)
+      for(LO j=0; j<(LO)indices.size(); j++) {
+        if(coarseData[indices[j]] == LO_INVALID)
+          coarseData[indices[j]] = fineNumber;
+        else if (coarseData[indices[j]] != fineNumber)
+          error_count++;
+      }
+    }
+
+    // Error checking:  every fine row touching a coarse dof must carry the same value
+    if(error_count > 0) {
+      std::ostringstream ofs;
+      ofs << "LocalOrdinalTransferFactory("<<TransferVecName_<<"): ERROR:  Each coarse dof must have a unique LO value.  We had "<<error_count<<" unknowns that did not match.";
+      throw std::runtime_error(ofs.str());
+    }
+
+    Set<RCP<LocalOrdinalVector> >(coarseLevel, TransferVecName_, coarseTV);
+  }
+  
+
+
+  template <class LocalOrdinal, class GlobalOrdinal, class Node>
+  void LocalOrdinalTransferFactory<LocalOrdinal, GlobalOrdinal, Node>::BuildAggregates(Level & fineLevel, Level &coarseLevel) const {
     FactoryMonitor m(*this, "Build", coarseLevel);
 
     GetOStream(Runtime0) << "Transferring " <<TransferVecName_ << std::endl;
diff --git a/packages/muelu/src/MueCentral/MueLu_FactoryManager_decl.hpp b/packages/muelu/src/MueCentral/MueLu_FactoryManager_decl.hpp
index 2d9b93d6f14c..d83787c59ce9 100644
--- a/packages/muelu/src/MueCentral/MueLu_FactoryManager_decl.hpp
+++ b/packages/muelu/src/MueCentral/MueLu_FactoryManager_decl.hpp
@@ -56,6 +56,7 @@
 #include "MueLu_CoarseMapFactory_fwd.hpp"
 #include "MueLu_ConstraintFactory_fwd.hpp"
 #include "MueLu_DirectSolver_fwd.hpp"
+#include "MueLu_InitialBlockNumberFactory_fwd.hpp"
 #include "MueLu_LineDetectionFactory_fwd.hpp"
 #include "MueLu_NullspaceFactory_fwd.hpp"
 #include "MueLu_PatternFactory_fwd.hpp"
@@ -65,6 +66,7 @@
 #include "MueLu_SaPFactory_fwd.hpp"
 #include "MueLu_ScaledNullspaceFactory_fwd.hpp"
 #include "MueLu_SmootherFactory_fwd.hpp"
+#include "MueLu_StructuredAggregationFactory_fwd.hpp"
 #include "MueLu_TentativePFactory_fwd.hpp"
 #include "MueLu_TransPFactory_fwd.hpp"
 #include "MueLu_TrilinosSmoother_fwd.hpp"
diff --git a/packages/muelu/src/MueCentral/MueLu_FactoryManager_def.hpp b/packages/muelu/src/MueCentral/MueLu_FactoryManager_def.hpp
index be81f48cf51d..9b46b6f8469a 100644
--- a/packages/muelu/src/MueCentral/MueLu_FactoryManager_def.hpp
+++ b/packages/muelu/src/MueCentral/MueLu_FactoryManager_def.hpp
@@ -72,6 +72,7 @@
 #include "MueLu_TransPFactory.hpp"
 #include "MueLu_TrilinosSmoother.hpp"
 #include "MueLu_UncoupledAggregationFactory.hpp"
+#include "MueLu_StructuredAggregationFactory.hpp"
 #include "MueLu_HybridAggregationFactory.hpp"
 #include "MueLu_ZoltanInterface.hpp"
 #include "MueLu_InterfaceMappingTransferFactory.hpp"
@@ -194,7 +195,7 @@ namespace MueLu {
       if (varName == "Graph")                           return MUELU_KOKKOS_FACTORY(varName, CoalesceDropFactory, CoalesceDropFactory_kokkos);
       if (varName == "UnAmalgamationInfo")              return MUELU_KOKKOS_FACTORY(varName, AmalgamationFactory, AmalgamationFactory_kokkos);
       if (varName == "Aggregates")                      return MUELU_KOKKOS_FACTORY(varName, UncoupledAggregationFactory, UncoupledAggregationFactory_kokkos);
-      if (varName == "AggregateQualities")       return SetAndReturnDefaultFactory(varName, rcp(new AggregateQualityEstimateFactory()));
+      if (varName == "AggregateQualities")              return SetAndReturnDefaultFactory(varName, rcp(new AggregateQualityEstimateFactory()));
       if (varName == "CoarseMap")                       return MUELU_KOKKOS_FACTORY(varName, CoarseMapFactory, CoarseMapFactory_kokkos);
       if (varName == "DofsPerNode")                     return GetFactory("Graph");
       if (varName == "Filtering")                       return GetFactory("Graph");
@@ -202,6 +203,9 @@ namespace MueLu {
       if (varName == "LineDetection_VertLineIds")       return SetAndReturnDefaultFactory(varName, rcp(new LineDetectionFactory()));
       if (varName == "LineDetection_Layers")            return GetFactory("LineDetection_VertLineIds");
       if (varName == "CoarseNumZLayers")                return GetFactory("LineDetection_VertLineIds");
+      
+      // Structured
+      if (varName == "structuredInterpolationOrder")    return SetAndReturnDefaultFactory(varName, rcp(new StructuredAggregationFactory()));
 
       // Non-Galerkin
       if (varName == "K")                               return GetFactory("A");
@@ -233,7 +237,7 @@ namespace MueLu {
       if (varName == "CoarseDualNodeID2PrimalNodeID")   return SetAndReturnDefaultFactory(varName, rcp(new InterfaceAggregationFactory()));
 #ifdef HAVE_MUELU_INTREPID2
       // If we're asking for it, find who made P
-      if (varName == "pcoarsen: element to node map")                      return GetFactory("P");
+      if (varName == "pcoarsen: element to node map")   return GetFactory("P");
 #endif
 
       TEUCHOS_TEST_FOR_EXCEPTION(true, MueLu::Exceptions::RuntimeError, "MueLu::FactoryManager::GetDefaultFactory(): No default factory available for building '" + varName + "'.");
diff --git a/packages/muelu/src/MueCentral/MueLu_MasterList.cpp b/packages/muelu/src/MueCentral/MueLu_MasterList.cpp
index 82a0c37fc825..f1bdcba59603 100644
--- a/packages/muelu/src/MueCentral/MueLu_MasterList.cpp
+++ b/packages/muelu/src/MueCentral/MueLu_MasterList.cpp
@@ -202,6 +202,8 @@ namespace MueLu {
   "<Parameter name=\"aggregation: mode\" type=\"string\" value=\"uncoupled\"/>"
   "<Parameter name=\"aggregation: ordering\" type=\"string\" value=\"natural\"/>"
   "<Parameter name=\"aggregation: drop scheme\" type=\"string\" value=\"classical\"/>"
+  "<Parameter name=\"aggregation: classical scheme\" type=\"string\" value=\"direct\"/>"
+  "<Parameter name=\"aggregation: row sum drop tol\" type=\"double\" value=\"-1.0\"/>"
   "<Parameter name=\"aggregation: block diagonal: interleaved blocksize\" type=\"int\" value=\"3\"/>"
   "<Parameter name=\"aggregation: number of random vectors\" type=\"int\" value=\"10\"/>"
   "<Parameter name=\"aggregation: number of times to pre or post smooth\" type=\"int\" value=\"10\"/>"
@@ -284,7 +286,6 @@ namespace MueLu {
   "<Parameter name=\"sa: max eigenvalue\" type=\"double\" value=\"-1.0\"/>"
   "<Parameter name=\"sa: rowsumabs diagonal replacement tolerance\" type=\"double\" value=\"-1.0\"/>"
   "<Parameter name=\"sa: rowsumabs diagonal replacement value\" type=\"double\" value=\"0.0\"/>"
-  "<Parameter name=\"interp: interpolation order\" type=\"int\" value=\"1\"/>"
   "<Parameter name=\"interp: build coarse coordinates\" type=\"bool\" value=\"true\"/>"
   "<ParameterList name=\"transfer: params\"/>"
   "<Parameter name=\"pcoarsen: element\" type=\"string\" value=\"\"/>"
@@ -605,6 +606,10 @@ namespace MueLu {
       
          ("aggregation: drop scheme","aggregation: drop scheme")
       
+         ("aggregation: classical scheme","aggregation: classical scheme")
+      
+         ("aggregation: row sum drop tol","aggregation: row sum drop tol")
+      
          ("aggregation: block diagonal: interleaved blocksize","aggregation: block diagonal: interleaved blocksize")
       
          ("aggregation: number of random vectors","aggregation: number of random vectors")
@@ -769,8 +774,6 @@ namespace MueLu {
       
          ("not supported by ML","sa: rowsumabs diagonal replacement value")
       
-         ("interp: interpolation order","interp: interpolation order")
-      
          ("interp: build coarse coordinates","interp: build coarse coordinates")
       
          ("transfer: params","transfer: params")
diff --git a/packages/muelu/src/Transfers/Classical/MueLu_ClassicalMapFactory_decl.hpp b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalMapFactory_decl.hpp
new file mode 100644
index 000000000000..dd6a34b9a227
--- /dev/null
+++ b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalMapFactory_decl.hpp
@@ -0,0 +1,147 @@
+// @HEADER
+//
+// ***********************************************************************
+//
+//        MueLu: A package for multigrid based preconditioning
+//                  Copyright 2012 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact
+//                    Jonathan Hu       (jhu@sandia.gov)
+//                    Andrey Prokopenko (aprokop@sandia.gov)
+//                    Ray Tuminaro      (rstumin@sandia.gov)
+//
+// ***********************************************************************
+//
+// @HEADER
+
+#ifndef MUELU_CLASSICALMAPFACTORY_DECL_HPP_
+#define MUELU_CLASSICALMAPFACTORY_DECL_HPP_
+
+#include "Xpetra_StridedMapFactory_fwd.hpp"
+#include "Xpetra_Import_fwd.hpp"
+#include "Xpetra_Vector_fwd.hpp"
+#include "Xpetra_VectorFactory_fwd.hpp"
+
+#include "MueLu_ConfigDefs.hpp"
+#include "MueLu_SingleLevelFactoryBase.hpp"
+#include "MueLu_ClassicalMapFactory_fwd.hpp"
+#include "MueLu_GraphBase_fwd.hpp"
+#include "MueLu_Level_fwd.hpp"
+#include "MueLu_Exceptions.hpp"
+#include "MueLu_Graph_fwd.hpp"
+#include "MueLu_LWGraph_fwd.hpp"
+#ifdef HAVE_MUELU_KOKKOSCORE
+#include "MueLu_LWGraph_kokkos_fwd.hpp"
+#endif
+
+namespace MueLu {
+
+  /*!
+    @class ClassicalMapFactory
+    @brief Factory for generating F/C-splitting and a coarse level map. Used by ClassicalPFactory.
+
+    @ingroup MueLuTransferClasses
+
+    ## Input/output ##
+
+    ### User parameters of this factory ###
+    Parameter | type | default | master.xml | validated | requested | description
+    ----------|------|---------|:----------:|:---------:|:---------:|------------
+    Graph | Factory | null | | * | * | Generating factory for graph.
+    The * in the @c master.xml column denotes that the parameter is defined in the @c master.xml file.<br>
+    The * in the @c validated column means that the parameter is declared in the list of valid input parameters (see @c GetValidParameterList() ).<br>
+    The * in the @c requested column states that the data is requested as input with all dependencies (see @c DeclareInput() ).
+
+
+    ### Variables provided by this factory ###
+
+    After @c Build() the following data is available (if requested)
+
+    Parameter | generated by | description
+    ----------|--------------|------------
+    | FC Splitting | ClassicalMapFactory | LocalOrdinalVector holding the point type (F_PT, C_PT or DIRICHLET_PT) of each row
+    | CoarseMap | ClassicalMapFactory | Map with one local entry per C-point, used as domain map in the classical prolongator
+
+  */
+
+  template <class Scalar = DefaultScalar,
+          class LocalOrdinal = DefaultLocalOrdinal,
+          class GlobalOrdinal = DefaultGlobalOrdinal,
+          class Node = DefaultNode>
+  class ClassicalMapFactory : public SingleLevelFactoryBase {
+#undef MUELU_CLASSICALMAPFACTORY_SHORT
+#include "MueLu_UseShortNames.hpp"
+
+  public:
+    //! F/C/Dirichlet point type. Build()'s "file" mode reads these numeric values from disk, so they must not be renumbered.
+    typedef enum {F_PT=-1, UNASSIGNED=0, C_PT=1, DIRICHLET_PT=2} point_type;
+
+    //! @name Input
+    //@{
+    //! Parameters and generating factories accepted by this factory.
+    RCP<const ParameterList> GetValidParameterList() const override;
+
+    /*!
+      @brief Specifies the data that this class needs, and the factories that generate that data.
+
+      If the Build method of this class requires some data, but the generating factory is not specified in DeclareInput,
+      then this class will fall back to the settings in FactoryManager.
+    */
+    void DeclareInput(Level &currentLevel) const override;
+
+    //@}
+
+    //! @name Build methods.
+    //@{
+
+    //! Compute the F/C splitting and the coarse map and store them on currentLevel as "FC Splitting" and "CoarseMap".
+    void Build(Level &currentLevel) const override;
+
+    //@}
+
+  protected:
+    //! Build coarseMap with num_c_points local entries (stride 1), using fineMap's library, index base and communicator.
+    virtual void GenerateCoarseMap(const Map & fineMap, LO num_c_points, Teuchos::RCP<const Map> & coarseMap) const;
+    //! Distance-1 graph coloring through KokkosKernels; the visible part of the body is guarded by HAVE_MUELU_KOKKOSCORE.
+    virtual void DoGraphColoring(const GraphBase & graph, Teuchos::ArrayRCP<LO> & myColors, LO & numColors) const;    
+    //! Used by Build() for the "MIS" path; Build() rejects that path on more than one MPI rank.
+    virtual void DoMISNaive(const GraphBase & graph, Teuchos::ArrayRCP<LO> & myColors, LO & numColors) const;    
+    //! Used by Build() for the Zoltan2 path on Tpetra graphs (only called when HAVE_MUELU_ZOLTAN2 is defined).
+    virtual void DoDistributedGraphColoring(RCP<const GraphBase> & graph, Teuchos::ArrayRCP<LO> & myColors, LO & numColors) const;    
+
+  }; //class ClassicalMapFactory
+
+} //namespace MueLu
+
+#define MUELU_CLASSICALMAPFACTORY_SHORT
+#endif /* MUELU_CLASSICALMAPFACTORY_DECL_HPP_ */
diff --git a/packages/muelu/src/Transfers/Classical/MueLu_ClassicalMapFactory_def.hpp b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalMapFactory_def.hpp
new file mode 100644
index 000000000000..38ef97ae11fa
--- /dev/null
+++ b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalMapFactory_def.hpp
@@ -0,0 +1,498 @@
+// @HEADER
+//
+// ***********************************************************************
+//
+//        MueLu: A package for multigrid based preconditioning
+//                  Copyright 2012 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact
+//                    Jonathan Hu       (jhu@sandia.gov)
+//                    Andrey Prokopenko (aprokop@sandia.gov)
+//                    Ray Tuminaro      (rstumin@sandia.gov)
+//
+// ***********************************************************************
+//
+// @HEADER
+
+#ifndef MUELU_CLASSICALMAPFACTORY_DEF_HPP_
+#define MUELU_CLASSICALMAPFACTORY_DEF_HPP_
+
+
+
+#include <Teuchos_Array.hpp>
+#include <Teuchos_ArrayRCP.hpp>
+
+
+#ifdef HAVE_MPI
+#include <Teuchos_DefaultMpiComm.hpp>
+#endif
+
+#include <Xpetra_Vector.hpp>
+#include <Xpetra_StridedMapFactory.hpp>
+#include <Xpetra_VectorFactory.hpp>
+#include <Xpetra_Import.hpp>
+#include <Xpetra_IO.hpp>
+
+#include "MueLu_ClassicalMapFactory_decl.hpp"
+#include "MueLu_Level.hpp"
+#include "MueLu_GraphBase.hpp"
+#include "MueLu_MasterList.hpp"
+#include "MueLu_Monitor.hpp"
+#include "MueLu_GraphBase.hpp"
+#include "MueLu_Graph.hpp"
+#include "MueLu_LWGraph.hpp"
+
+#ifdef HAVE_MUELU_ZOLTAN2
+#include "MueLu_Zoltan2GraphAdapter.hpp"
+#include <Zoltan2_XpetraCrsGraphAdapter.hpp>
+#include <Zoltan2_ColoringProblem.hpp>
+#include <Zoltan2_ColoringSolution.hpp>
+
+#endif
+
+// NOTE: We should be checking for KokkosKernels here, but
+// MueLu doesn't have a macro for that
+#ifdef HAVE_MUELU_KOKKOSCORE
+#include "MueLu_LWGraph_kokkos.hpp"
+#include <KokkosGraph_Distance1ColorHandle.hpp>
+#include <KokkosGraph_Distance1Color.hpp>
+#endif
+
+namespace MueLu {
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  RCP<const ParameterList> ClassicalMapFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::GetValidParameterList() const
+  {
+    // Master-list options copied into this factory's list, followed by the generating factories it accepts.
+    typedef RCP<const FactoryBase> factory_ptr;
+    RCP<ParameterList> params = rcp(new ParameterList());
+
+    params->setEntry("aggregation: deterministic",      MasterList::getEntry("aggregation: deterministic"));
+    params->setEntry("aggregation: coloring algorithm", MasterList::getEntry("aggregation: coloring algorithm"));
+    params->set<factory_ptr>("A",                  Teuchos::null, "Generating factory of the matrix A");
+    params->set<factory_ptr>("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
+    params->set<factory_ptr>("Graph",              Teuchos::null, "Generating factory of the graph");
+    params->set<factory_ptr>("DofsPerNode",        Teuchos::null, "Generating factory for variable \'DofsPerNode\', usually the same as for \'Graph\'");
+    return params;
+  }
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  void ClassicalMapFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::DeclareInput(Level &currentLevel) const
+  {
+    Input(currentLevel, "A");                   // Build() takes the row map and communicator from A
+    Input(currentLevel, "UnAmalgamationInfo");  // requested here, but Build() as written never retrieves it
+    Input(currentLevel, "Graph");               // graph that is colored and queried for boundary nodes
+  }
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  void ClassicalMapFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level &currentLevel) const
+  {
+    // Produce an F/C splitting for "Graph" and publish it, with the matching coarse map, on currentLevel.
+    FactoryMonitor m(*this, "Build", currentLevel);
+    RCP<const GraphBase> graph = Get<RCP<GraphBase> >(currentLevel,"Graph");
+    RCP<const Matrix> A = Get<RCP<Matrix> >(currentLevel,"A");
+    const ParameterList& pL = GetParameterList();
+    /* ============================================================= */
+    /* Phase 1 : Compute an initial MIS                              */
+    /* ============================================================= */
+    ArrayRCP<LO> myColors;
+    LO numColors=0;
+    RCP<LocalOrdinalVector> fc_splitting;
+    std::string coloringAlgo = pL.get<std::string>("aggregation: coloring algorithm");
+
+    // Switch to Zoltan2 if we're parallel (more than one rank) and Tpetra (and not file)
+#ifdef HAVE_MUELU_ZOLTAN2
+    int numProcs = A->getRowMap()->getComm()->getSize();
+    if(coloringAlgo!="file" && numProcs>1 && graph->GetDomainMap()->lib() == Xpetra::UseTpetra)
+      coloringAlgo="Zoltan2";
+#endif
+
+    // Switch to MIS if we're in Epetra (and not file)
+    if(coloringAlgo!="file" && graph->GetDomainMap()->lib() == Xpetra::UseEpetra)
+      coloringAlgo="MIS";
+
+    // NOTE(review): the "MIS" branch below also accepts every Tpetra graph and Epetra was just forced to "MIS", so the final else branches look unreachable - confirm.
+    if(coloringAlgo == "file") {
+      // Read the CF splitting from disk
+      // NOTE: For interoperability reasons, this is dependent on the point_type enum not changing
+      std::string map_file   = std::string("map_fcsplitting_") + std::to_string(currentLevel.GetLevelID()) + std::string(".m");
+      std::string color_file = std::string("fcsplitting_")     + std::to_string(currentLevel.GetLevelID()) + std::string(".m");
+      // The map file is optional: fopen() is only used to probe for its existence
+      FILE * mapfile = fopen(map_file.c_str(),"r");
+      using real_type = typename Teuchos::ScalarTraits<SC>::magnitudeType;
+      using RealValuedMultiVector = typename Xpetra::MultiVector<real_type,LO,GO,NO>;
+      RCP<RealValuedMultiVector> mv;
+
+
+      if(mapfile) {
+        fclose(mapfile);
+        RCP<const Map> colorMap = Xpetra::IO<Scalar, LocalOrdinal, GlobalOrdinal, Node>::ReadMap(map_file, A->getRowMap()->lib(), A->getRowMap()->getComm());
+        TEUCHOS_TEST_FOR_EXCEPTION(!colorMap->isCompatible(*A->getRowMap()),std::invalid_argument,"Coloring on disk has incompatible map with A");
+
+        mv = Xpetra::IO<real_type, LocalOrdinal, GlobalOrdinal, Node>::ReadMultiVector(color_file,colorMap);
+      }
+      else {
+        // Use A's rowmap and hope it matches
+        mv = Xpetra::IO<real_type, LocalOrdinal, GlobalOrdinal, Node>::ReadMultiVector(color_file,A->getRowMap());
+      }
+      TEUCHOS_TEST_FOR_EXCEPTION(mv.is_null(),std::invalid_argument,"Coloring on disk cannot be read");
+      fc_splitting = LocalOrdinalVectorFactory::Build(A->getRowMap());
+      TEUCHOS_TEST_FOR_EXCEPTION(mv->getLocalLength() != fc_splitting->getLocalLength(),std::invalid_argument,"Coloring map mismatch");
+
+      // Overlay the Dirichlet Points (and copy out the rest)
+      auto boundaryNodes = graph->GetBoundaryNodeMap();
+      ArrayRCP<const real_type> mv_data= mv->getData(0);
+      ArrayRCP<LO> fc_data= fc_splitting->getDataNonConst(0);
+      for(LO i=0; i<(LO)fc_data.size(); i++) {
+        if(boundaryNodes[i])
+          fc_data[i] = DIRICHLET_PT;
+        else
+          fc_data[i] = Teuchos::as<LO>(mv_data[i]);
+      }
+    }
+#ifdef HAVE_MUELU_ZOLTAN2
+    else if(coloringAlgo.find("Zoltan2")!=std::string::npos && graph->GetDomainMap()->lib() == Xpetra::UseTpetra) {
+      SubFactoryMonitor sfm(*this,"DistributedGraphColoring",currentLevel);
+      DoDistributedGraphColoring(graph,myColors,numColors);
+    }
+#endif
+    else if(coloringAlgo == "MIS" || graph->GetDomainMap()->lib() == Xpetra::UseTpetra) {
+      SubFactoryMonitor sfm(*this,"MIS",currentLevel);
+      TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getComm()->getSize() != 1, std::invalid_argument,"MIS on more than 1 MPI rank is not supported");
+      DoMISNaive(*graph,myColors,numColors);
+    }
+#ifdef HAVE_MUELU_KOKKOSCORE
+    else {
+      SubFactoryMonitor sfm(*this,"GraphColoring",currentLevel);
+      TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getComm()->getSize() != 1, std::invalid_argument,"KokkosKernels graph coloring on more than 1 MPI rank is not supported");
+      DoGraphColoring(*graph,myColors,numColors);
+    }
+#else
+    else {
+      TEUCHOS_TEST_FOR_EXCEPTION(true,std::invalid_argument,"Unrecognized distance 1 coloring algorithm");
+    }
+#endif
+
+
+    /* ============================================================= */
+    /* Phase 2 : Mark the C-Points                                   */
+    /* ============================================================= */
+    LO num_c_points = 0, num_d_points=0, num_f_points = 0;
+    if(fc_splitting.is_null()) {
+      // We just have a coloring, so we need to generate a splitting
+      auto boundaryNodes = graph->GetBoundaryNodeMap();
+      fc_splitting = LocalOrdinalVectorFactory::Build(A->getRowMap());
+      ArrayRCP<LO> myPointType = fc_splitting->getDataNonConst(0);
+      for(LO i=0; i<(LO)myColors.size(); i++) {
+        if(boundaryNodes[i]) {
+          myPointType[i] = DIRICHLET_PT;
+          num_d_points++;
+        }
+        else if ((LO)myColors[i] == 1) {
+          myPointType[i] = C_PT;
+          num_c_points++;
+        }
+        else
+          myPointType[i] = F_PT;
+      }
+      num_f_points = (LO)myColors.size() - num_d_points - num_c_points;
+    }
+    else {
+      // If we read the splitting off disk, we just need to count
+      ArrayRCP<LO> myPointType = fc_splitting->getDataNonConst(0);
+
+      for(LO i=0; i<(LO)myPointType.size(); i++) {
+        if(myPointType[i] == DIRICHLET_PT)
+          num_d_points++;
+        else if (myPointType[i] == C_PT)
+          num_c_points++;
+      }
+      num_f_points = (LO)myPointType.size() - num_d_points - num_c_points;
+    }
+
+    /* Output statistics on c/f/d points */
+    if (GetVerbLevel() & Statistics1) {
+      // NOTE: We batch the communication here
+      GO l_counts[] = {(GO)num_c_points, (GO) num_f_points, (GO) num_d_points};
+      GO g_counts[3];
+
+      RCP<const Teuchos::Comm<int> > comm = A->getRowMap()->getComm();
+      Teuchos::reduceAll(*comm, Teuchos::REDUCE_SUM, 3, l_counts, g_counts);
+      GetOStream(Statistics1) << "ClassicalMapFactory: C/F/D = "<<g_counts[0]<<"/"<<g_counts[1]<<"/"<<g_counts[2]<<std::endl;
+    }
+
+
+    /* Generate the Coarse map */
+    RCP<const Map> coarseMap;
+    {
+      SubFactoryMonitor sfm(*this,"Coarse Map",currentLevel);
+      GenerateCoarseMap(*A->getRowMap(),num_c_points,coarseMap);
+    }
+
+    Set(currentLevel, "FC Splitting",fc_splitting);
+    Set(currentLevel, "CoarseMap", coarseMap);
+
+  }
+
+/* ************************************************************************* */
+template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+void ClassicalMapFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+GenerateCoarseMap(const Map & fineMap, LO num_c_points, RCP<const Map> & coarseMap) const {
+  // Coarse map: num_c_points local entries, stride 1, same library/index base/communicator as fineMap.
+  // FIXME: Assumes scalar PDE
+  std::vector<size_t> unitStride(1, 1);
+  GO gidOffset = 0;
+  Xpetra::global_size_t invalidGlobalSize = Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid();
+
+  coarseMap = StridedMapFactory::Build(fineMap.lib(),
+                                       invalidGlobalSize,
+                                       num_c_points,
+                                       fineMap.getIndexBase(),
+                                       unitStride,
+                                       fineMap.getComm(),
+                                       gidOffset);
+}
+
+
+
+/* ************************************************************************* */
+// Colors the local graph with the KokkosKernels distance-1 graph coloring.
+//   graph        - must be an LWGraph_kokkos, LWGraph or Graph (std::invalid_argument otherwise)
+//   myColors_out - resized to the length of the handle's vertex-color view, then filled on host
+//   numColors    - set to the number of colors reported by the coloring handle
+// Throws Exceptions::RuntimeError when built without HAVE_MUELU_KOKKOSCORE.
+template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+void ClassicalMapFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+DoGraphColoring(const GraphBase & graph, ArrayRCP<LO> & myColors_out, LO & numColors) const {
+#ifdef HAVE_MUELU_KOKKOSCORE
+  const ParameterList& pL = GetParameterList();
+  using graph_t = typename LWGraph_kokkos::local_graph_type;
+  using KernelHandle = KokkosKernels::Experimental::
+    KokkosKernelsHandle<typename graph_t::row_map_type::value_type,
+                        typename graph_t::entries_type::value_type,
+                        typename graph_t::entries_type::value_type,
+                        typename graph_t::device_type::execution_space,
+                        typename graph_t::device_type::memory_space,
+                        typename graph_t::device_type::memory_space>;
+  KernelHandle kh;
+
+  // Leave gc algorithm choice as the default, then get the distance-1 graph coloring handle
+  kh.create_graph_coloring_handle();
+  auto coloringHandle = kh.get_graph_coloring_handle();
+  // Set the distance-1 coloring algorithm to use
+  if(pL.get<bool>("aggregation: deterministic") == true) {
+    coloringHandle->set_algorithm( KokkosGraph::COLORING_SERIAL );
+    if(IsPrint(Statistics1)) GetOStream(Statistics1) << "  algorithm: serial" << std::endl;
+  } else if(pL.get<std::string>("aggregation: coloring algorithm") == "serial") {
+    coloringHandle->set_algorithm( KokkosGraph::COLORING_SERIAL );
+    if(IsPrint(Statistics1)) GetOStream(Statistics1) << "  algorithm: serial" << std::endl;
+  } else if(pL.get<std::string>("aggregation: coloring algorithm") == "vertex based") {
+    coloringHandle->set_algorithm( KokkosGraph::COLORING_VB );
+    if(IsPrint(Statistics1)) GetOStream(Statistics1) << "  algorithm: vertex based" << std::endl;
+  } else if(pL.get<std::string>("aggregation: coloring algorithm") == "vertex based bit array") {
+    coloringHandle->set_algorithm( KokkosGraph::COLORING_VBBIT );
+    if(IsPrint(Statistics1)) GetOStream(Statistics1) << "  algorithm: vertex based bit array" << std::endl;
+  } else if(pL.get<std::string>("aggregation: coloring algorithm") == "vertex based color set") {
+    coloringHandle->set_algorithm( KokkosGraph::COLORING_VBCS );
+    if(IsPrint(Statistics1)) GetOStream(Statistics1) << "  algorithm: vertex based color set" << std::endl;
+  } else if(pL.get<std::string>("aggregation: coloring algorithm") == "vertex based deterministic") {
+    coloringHandle->set_algorithm( KokkosGraph::COLORING_VBD );
+    if(IsPrint(Statistics1)) GetOStream(Statistics1) << "  algorithm: vertex based deterministic" << std::endl;
+  } else if(pL.get<std::string>("aggregation: coloring algorithm") == "vertex based deterministic bit array") {
+    coloringHandle->set_algorithm( KokkosGraph::COLORING_VBDBIT );
+    if(IsPrint(Statistics1)) GetOStream(Statistics1) << "  algorithm: vertex based deterministic bit array" << std::endl;
+  } else if(pL.get<std::string>("aggregation: coloring algorithm") == "edge based") {
+    coloringHandle->set_algorithm( KokkosGraph::COLORING_EB );
+    if(IsPrint(Statistics1)) GetOStream(Statistics1) << "  algorithm: edge based" << std::endl;
+  } else {
+    TEUCHOS_TEST_FOR_EXCEPTION(true,std::invalid_argument,"Unrecognized distance 1 coloring algorithm");
+  }
+
+  // Create device views for graph rowptrs/colinds
+  size_t numRows = graph.GetNodeNumVertices();
+  auto graphLWK = dynamic_cast<const LWGraph_kokkos*>(&graph);
+  auto graphLW  = dynamic_cast<const LWGraph*>(&graph);
+  auto graphG   = dynamic_cast<const Graph*>(&graph);
+  TEUCHOS_TEST_FOR_EXCEPTION(!graphLW && !graphLWK && !graphG,std::invalid_argument,"Graph is not a LWGraph or LWGraph_kokkos object");
+
+  // Run d1 graph coloring; assume that the graph is symmetric so row map/entries and col map/entries are the same
+  if(graphLWK) {
+    KokkosGraph::Experimental::graph_color(&kh,
+                                           numRows,
+                                           numRows, // FIXME: This should be the number of columns
+                                           graphLWK->getRowPtrs(),
+                                           graphLWK->getEntries(),
+                                           true);
+  }
+  else if(graphLW) {
+    auto rowptrs = graphLW->getRowPtrs();
+    auto entries = graphLW->getEntries();
+    // Copy rowptrs to a size_t, because kokkos-kernels doesn't like rowptrs as LO's
+    Teuchos::Array<size_t> rowptrs_s(rowptrs.size());
+    std::copy(rowptrs.begin(),rowptrs.end(),rowptrs_s.begin());
+    Kokkos::View<const size_t*,Kokkos::LayoutLeft,Kokkos::HostSpace> rowptrs_v(rowptrs_s.data(),(size_t)rowptrs.size());
+    Kokkos::View<const LO*,Kokkos::LayoutLeft,Kokkos::HostSpace> entries_v(entries.getRawPtr(),(size_t)entries.size());
+    KokkosGraph::Experimental::graph_color(&kh,
+                                           numRows,
+                                           numRows, // FIXME: This should be the number of columns
+                                           rowptrs_v,
+                                           entries_v,
+                                           true);
+  }
+  else if(graphG) {
+    // FIXME:  This is a terrible, terrible hack, based on 0-based local indexing.
+    RCP<const CrsGraph> graphC = graphG->GetGraph();
+    size_t numEntries = graphC->getNodeNumEntries();
+    ArrayView<const LO> indices;
+    graphC->getLocalRowView(0,indices);
+    Kokkos::View<size_t*,Kokkos::LayoutLeft,Kokkos::HostSpace> rowptrs_v("rowptrs_v",graphC->getNodeNumRows()+1);
+    rowptrs_v[0]=0;
+    // One step per row: rowptrs_v has getNodeNumRows()+1 slots, so the last write is rowptrs_v[numRows]
+    for(LO i=0; i<(LO)graphC->getNodeNumRows(); i++)
+      rowptrs_v[i+1] = rowptrs_v[i] + graphC->getNumEntriesInLocalRow(i);
+    Kokkos::View<const LO*,Kokkos::LayoutLeft,Kokkos::HostSpace> entries_v(&indices[0],numEntries);
+    KokkosGraph::Experimental::graph_color(&kh,
+                                           numRows,
+                                           numRows, // FIXME: This should be the number of columns
+                                           rowptrs_v,
+                                           entries_v,
+                                           true);
+  }
+
+  // Extract the colors and store them in the aggregates
+  auto myColors_d = coloringHandle->get_vertex_colors();
+  numColors = static_cast<LO>(coloringHandle->get_num_colors());
+
+  // Copy back to host: create_mirror_view() only allocates, so the device data must be copied into it
+  auto myColors_h = Kokkos::create_mirror_view(myColors_d);
+  Kokkos::deep_copy(myColors_h,myColors_d);
+  myColors_out.resize(myColors_h.size());
+  Kokkos::View<LO*,Kokkos::LayoutLeft,Kokkos::HostSpace> myColors_v(myColors_out.getRawPtr(),myColors_h.size());
+  Kokkos::deep_copy(myColors_v,myColors_h);
+  //clean up coloring handle
+  kh.destroy_graph_coloring_handle();
+#else
+  TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError,"ClassicalMapFactory: Requires KokkosKernels");
+#endif
+}// end DoGraphColoring
+    
+
+/* ************************************************************************* */
+// Greedy maximal-independent-set fallback: rows chosen for the set get color 1, every other entry
+// of myColors stays at OrdinalTraits<LO>::invalid(), and numColors is always set to 1.
+template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+void ClassicalMapFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+DoMISNaive(const GraphBase & graph, ArrayRCP<LO> & myColors, LO & numColors) const {
+  // This is a fall-back routine for when we don't have Kokkos or when it isn't initialized
+  // We just do greedy MIS because this is easy to write.
+  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
+  LO MIS = Teuchos::ScalarTraits<LO>::one();
+
+  //FIXME: Not efficient
+  myColors.resize(0);
+  myColors.resize(graph.GetNodeNumVertices(),LO_INVALID);
+  auto boundaryNodes = graph.GetBoundaryNodeMap();
+  LO Nrows = (LO)graph.GetNodeNumVertices();
+
+  for(LO row=0; row < Nrows; row++) {
+    if(boundaryNodes[row])
+      continue;
+    ArrayView<const LO> indices = graph.getNeighborVertices(row);
+    bool has_colored_neighbor=false;
+    for(LO j=0; !has_colored_neighbor && j<(LO)indices.size(); j++) {
+      // FIXME: ghosting is still not handled; skip neighbors that are not local rows (no entry in myColors)
+      if(indices[j] < Nrows && myColors[indices[j]] == MIS)
+        has_colored_neighbor=true;
+    }
+    if(!has_colored_neighbor)
+      myColors[row] = MIS;
+  }
+  numColors=1;
+}
+
+
+/* ************************************************************************* */
+// Distance-1 colors the graph with Zoltan2's ColoringProblem ("FirstFit" choice, "D1" method),
+// run on the communicator of the graph's domain map.
+//
+//   graph        - graph wrapped in a MueLuGraphBaseAdapter for Zoltan2
+//   myColors_out - resized to the length of Zoltan2's color array and filled with a copy of it
+//   numColors    - set to the number of colors in the Zoltan2 solution
+//
+// Throws Exceptions::RuntimeError when built without HAVE_MUELU_ZOLTAN2.
+template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+void ClassicalMapFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+DoDistributedGraphColoring(RCP<const GraphBase> & graph, ArrayRCP<LO> & myColors_out, LO & numColors) const {
+#ifdef HAVE_MUELU_ZOLTAN2
+  //  const ParameterList& pL = GetParameterList();
+  // Only color_choice and color_method are set explicitly
+  Teuchos::ParameterList params;
+  params.set("color_choice","FirstFit");
+  params.set("color_method","D1");
+  //  params.set("color_choice", colorMethod);
+  //  params.set("color_method", colorAlg);
+  //  params.set("verbose", verbose);
+  //  params.set("serial_threshold",serialThreshold);
+  //params.set("recolor_degrees",recolorDegrees);
+
+  // Do the coloring via Zoltan2
+  using GraphAdapter = MueLuGraphBaseAdapter<GraphBase>;
+  GraphAdapter z_adapter(graph);
+
+  // We need to provide the MPI Comm, or else we wind up using the default (eep!)
+  Zoltan2::ColoringProblem<GraphAdapter> problem(&z_adapter,&params,graph->GetDomainMap()->getComm());
+  problem.solve();
+  Zoltan2::ColoringSolution<GraphAdapter> * soln = problem.getSolution();
+  ArrayRCP<int> colors = soln->getColorsRCP();
+  numColors = (LO)soln->getNumColors();
+
+  // Always copy entry by entry: Zoltan2 hands back ArrayRCP<int>, and assigning that directly to
+  // ArrayRCP<LO> only compiles when LO is int (a run-time "if" still instantiates both branches).
+  myColors_out.resize(colors.size());
+  for(LO i=0; i<(LO)myColors_out.size(); i++)
+    myColors_out[i] = (LO) colors[i];
+
+#else
+  // Fail loudly instead of returning with the outputs untouched
+  (void)graph;
+  (void)myColors_out;
+  (void)numColors;
+  TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError,"ClassicalMapFactory: Requires Zoltan2");
+#endif
+}
+
+
+
+} //namespace MueLu
+
+#endif /* MUELU_CLASSICALMAPFACTORY_DEF_HPP_ */
diff --git a/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_decl.hpp b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_decl.hpp
new file mode 100644
index 000000000000..deb124e479dc
--- /dev/null
+++ b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_decl.hpp
@@ -0,0 +1,127 @@
+// @HEADER
+//
+// ***********************************************************************
+//
+//        MueLu: A package for multigrid based preconditioning
+//                  Copyright 2012 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact
+//                    Jonathan Hu       (jhu@sandia.gov)
+//                    Andrey Prokopenko (aprokop@sandia.gov)
+//                    Ray Tuminaro      (rstumin@sandia.gov)
+//
+// ***********************************************************************
+//
+// @HEADER
+#ifndef MUELU_CLASSICALPFACTORY_DECL_HPP
+#define MUELU_CLASSICALPFACTORY_DECL_HPP
+
+#include <Xpetra_MultiVectorFactory_fwd.hpp>
+#include <Xpetra_VectorFactory_fwd.hpp>
+#include <Xpetra_CrsGraphFactory_fwd.hpp>
+#include <Xpetra_Map_fwd.hpp>
+#include <Xpetra_MapFactory_fwd.hpp>
+#include <Xpetra_Import_fwd.hpp>
+#include <Xpetra_Vector_fwd.hpp>
+
+#include "MueLu_ConfigDefs.hpp"
+#include "MueLu_PerfUtils_fwd.hpp"
+#include "MueLu_PFactory.hpp"
+#include "MueLu_ClassicalPFactory_fwd.hpp"
+#include "MueLu_ClassicalMapFactory_fwd.hpp"
+#include "MueLu_Utilities_fwd.hpp"
+#include "MueLu_CoarseMapFactory_fwd.hpp"
+#include "MueLu_AmalgamationInfo_fwd.hpp"
+#include "MueLu_GraphBase_fwd.hpp"
+#include "MueLu_Level_fwd.hpp"
+
+namespace MueLu {
+
+  //! Factory that builds a prolongator P from a fine-level matrix, graph and F/C splitting.
+  template <class Scalar = DefaultScalar,
+            class LocalOrdinal = DefaultLocalOrdinal,
+            class GlobalOrdinal = DefaultGlobalOrdinal,
+            class Node = DefaultNode>
+  class ClassicalPFactory : public PFactory {
+#undef MUELU_CLASSICALPFACTORY_SHORT
+#include "MueLu_UseShortNames.hpp"
+
+  public:
+    //! Point classification type taken from ClassicalMapFactory (needs the short names above)
+    typedef typename ClassicalMapFactory::point_type point_type;
+
+    //! @name Constructors/Destructors.
+    //@{
+    ClassicalPFactory() { }
+    virtual ~ClassicalPFactory() { }
+    //@}
+
+    //! Parameters and generating factories accepted by this factory
+    RCP<const ParameterList> GetValidParameterList() const;
+
+    //! @name Input
+    //@{
+    void DeclareInput(Level& fineLevel, Level& coarseLevel) const;
+    //@}
+
+    //! @name Build methods.
+    //@{
+    void Build (Level& fineLevel, Level& coarseLevel) const;
+    void BuildP(Level& fineLevel, Level& coarseLevel) const;
+    //@}
+
+  private:
+    // Utility algorithms
+    void GenerateStrengthFlags(const Matrix & A,const GraphBase & graph,
+                               Teuchos::Array<size_t> & eis_rowptr, Teuchos::Array<bool> & edgeIsStrong) const;
+
+    // Ghosting Algorithms
+    void GhostCoarseMap(const Matrix &A,const Import & Importer, const ArrayRCP<const LO> myPointType,
+                        const RCP<const Map> & coarseMap, RCP<const Map> & coarseColMap) const;
+
+    // Coarsening algorithms (one per value of "aggregation: classical scheme")
+    void Coarsen_ClassicalModified(const Matrix & A,const RCP<const Matrix> & Aghost, const GraphBase & graph, RCP<const Map> & coarseColMap, RCP<const Map> & coarseDomainMap,
+                                   LO num_c_points, LO num_f_points, const Teuchos::ArrayView<const LO> & myPointType, const Teuchos::ArrayView<const LO> & myPointType_ghost, const Teuchos::Array<LO> & cpoint2pcol, const Teuchos::Array<LO> & pcol2cpoint,
+                                   Teuchos::Array<size_t> & eis_rowptr, Teuchos::Array<bool> & edgeIsStrong, RCP<LocalOrdinalVector> & BlockNumber, RCP<const Import> remoteOnlyImporter, RCP<Matrix> & P) const;
+    void Coarsen_Direct(const Matrix & A,const RCP<const Matrix> & Aghost, const GraphBase & graph, RCP<const Map> & coarseColMap, RCP<const Map> & coarseDomainMap,
+                        LO num_c_points, LO num_f_points, const Teuchos::ArrayView<const LO> & myPointType, const Teuchos::ArrayView<const LO> & myPointType_ghost, const Teuchos::Array<LO> & cpoint2pcol, const Teuchos::Array<LO> & pcol2cpoint,
+                        Teuchos::Array<size_t> & eis_rowptr, Teuchos::Array<bool> & edgeIsStrong, RCP<LocalOrdinalVector> & BlockNumber, RCP<Matrix> & P) const;
+    void Coarsen_Ext_Plus_I(const Matrix & A,const RCP<const Matrix> & Aghost, const GraphBase & graph, RCP<const Map> & coarseColMap, RCP<const Map> & coarseDomainMap,
+                            LO num_c_points, LO num_f_points, const Teuchos::ArrayView<const LO> & myPointType, const Teuchos::ArrayView<const LO> & myPointType_ghost, const Teuchos::Array<LO> & cpoint2pcol, const Teuchos::Array<LO> & pcol2cpoint,
+                            Teuchos::Array<size_t> & eis_rowptr, Teuchos::Array<bool> & edgeIsStrong, RCP<LocalOrdinalVector> & BlockNumber, RCP<Matrix> & P) const;
+  }; //class ClassicalPFactory
+
+} //namespace MueLu
+
+#define MUELU_CLASSICALPFACTORY_SHORT
+#endif // MUELU_CLASSICALPFACTORY_DECL_HPP
diff --git a/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_def.hpp b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_def.hpp
new file mode 100644
index 000000000000..4e4860af590e
--- /dev/null
+++ b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_def.hpp
@@ -0,0 +1,818 @@
+// @HEADER
+//
+// ***********************************************************************
+//
+//        MueLu: A package for multigrid based preconditioning
+//                  Copyright 2012 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact
+//                    Jonathan Hu       (jhu@sandia.gov)
+//                    Andrey Prokopenko (aprokop@sandia.gov)
+//                    Ray Tuminaro      (rstumin@sandia.gov)
+//
+// ***********************************************************************
+//
+// @HEADER
+#ifndef MUELU_CLASSICALPFACTORY_DEF_HPP
+#define MUELU_CLASSICALPFACTORY_DEF_HPP
+
+#include <Xpetra_MultiVectorFactory.hpp>
+#include <Xpetra_VectorFactory.hpp>
+#include <Xpetra_CrsGraphFactory.hpp>
+#include <Xpetra_Matrix.hpp>
+#include <Xpetra_Map.hpp>
+#include <Xpetra_Map.hpp>
+#include <Xpetra_MapFactory.hpp>
+#include <Xpetra_Vector.hpp>
+#include <Xpetra_Import.hpp>
+#include <Xpetra_ImportUtils.hpp>
+#include <Xpetra_IO.hpp>
+#include <Xpetra_StridedMapFactory.hpp>
+
+#include <Teuchos_OrdinalTraits.hpp>
+
+#include "MueLu_MasterList.hpp"
+#include "MueLu_Monitor.hpp"
+#include "MueLu_PerfUtils.hpp"
+#include "MueLu_ClassicalPFactory_decl.hpp"
+#include "MueLu_ClassicalMapFactory.hpp"
+#include "MueLu_Utilities.hpp"
+#include "MueLu_AmalgamationInfo.hpp"
+#include "MueLu_GraphBase.hpp"
+
+
+//#define CMS_DEBUG
+//#define CMS_DUMP
+
+namespace { 
+
+// Sign of the real part of val: 1 if it is positive, -1 if it is negative, 0 otherwise
+template<class SC>
+int Sign(SC val) {
+  typedef Teuchos::ScalarTraits<SC> traits_t;
+  typedef typename traits_t::magnitudeType mag_t;
+  const mag_t realPart = traits_t::real(val), zero = Teuchos::ScalarTraits<mag_t>::zero();
+  return (realPart > zero) ? 1 : ((realPart < zero) ? -1 : 0);
+}
+
+}// anonymous namespace
+
+namespace MueLu {
+
+
+
+
+  // Valid parameters: the aggregation options copied from the MasterList plus the generating
+  // factories of the inputs of this factory.
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  RCP<const ParameterList> ClassicalPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::GetValidParameterList() const {
+    RCP<ParameterList> validParamList = rcp(new ParameterList());
+
+    // Options taken verbatim from the MasterList ("drop scheme" tells us if we need BlockNumber)
+    const char* masterListEntries[] = {"aggregation: deterministic", "aggregation: coloring algorithm",
+                                       "aggregation: classical scheme", "aggregation: drop scheme"};
+    for(const char* entryName : masterListEntries)
+      validParamList->setEntry(entryName, MasterList::getEntry(entryName));
+
+    // Restrict the classical scheme to the three accepted names
+    typedef Teuchos::StringToIntegralParameterEntryValidator<int> validatorType;
+    RCP<validatorType> schemeValidator = rcp(new validatorType(Teuchos::tuple<std::string>("direct","ext+i","classical modified"), "aggregation: classical scheme"));
+    validParamList->getEntry("aggregation: classical scheme").setValidator(schemeValidator);
+
+    // Generating factories of the inputs
+    validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
+    validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
+    validParamList->set< RCP<const FactoryBase> >("Graph", null, "Generating factory of the graph");
+    validParamList->set< RCP<const FactoryBase> >("DofsPerNode", null, "Generating factory for variable \'DofsPerNode\', usually the same as for \'Graph\'");
+    validParamList->set< RCP<const FactoryBase> >("CoarseMap", Teuchos::null, "Generating factory of the CoarseMap");
+    validParamList->set< RCP<const FactoryBase> >("FC Splitting", Teuchos::null, "Generating factory of the FC Splitting");
+    validParamList->set< RCP<const FactoryBase> >("BlockNumber", Teuchos::null, "Generating factory for Block Number");
+
+    return validParamList;
+  }
+
+  // Requests the fine-level data of this factory; "BlockNumber" is requested only when the
+  // "aggregation: drop scheme" name contains "block diagonal".
+  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+  void ClassicalPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::DeclareInput(Level& fineLevel, Level& /* coarseLevel */) const {
+    Input(fineLevel, "A");
+    Input(fineLevel, "Graph");
+    Input(fineLevel, "DofsPerNode");
+    Input(fineLevel, "UnAmalgamationInfo");
+    Input(fineLevel, "CoarseMap");
+    Input(fineLevel, "FC Splitting");
+
+    const ParameterList& pL = GetParameterList();
+    std::string drop_algo = pL.get<std::string>("aggregation: drop scheme");
+    if (drop_algo.find("block diagonal") != std::string::npos) {
+      Input(fineLevel, "BlockNumber");
+    }
+  }
+
+  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+  void ClassicalPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& fineLevel, Level& coarseLevel) const {
+    BuildP(fineLevel, coarseLevel); // Build() simply forwards to BuildP()
+  }
+
+  // Builds the classical AMG prolongator P and stores "P" and "P Graph" on the coarse level.
+  //
+  // \param fineLevel   level providing the inputs listed below
+  // \param coarseLevel level receiving the outputs
+  //
+  // Fine-level inputs: "A", "Graph", "CoarseMap", "FC Splitting", "UnAmalgamationInfo" and, when the
+  // drop scheme name contains "block diagonal", "BlockNumber".
+  //
+  // Steps:
+  //   1. ghost the F/C splitting onto the column map of A (when A has an importer)
+  //   2. for "ext+i" and "classical modified", import the remote rows of A (Aghost)
+  //   3. ghost the coarse map and, if needed, the block numbers
+  //   4. build the C-point <-> P-column index translation arrays
+  //   5. flag strong edges and call the Coarsen_* routine picked by "aggregation: classical scheme"
+  //
+  // Throws Exceptions::RuntimeError if A has a fixed block size other than 1 or if its row map
+  // differs from its domain map.  On the ghost-row path the cast of A to CrsMatrixWrap is done
+  // with throw_on_fail, so a matrix of another type raises instead of dereferencing null.
+  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+  void ClassicalPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::BuildP(Level& fineLevel, Level& coarseLevel) const {
+    FactoryMonitor m(*this, "Build", coarseLevel);
+
+    // We start by assuming that someone did a reasonable strength of connection
+    // algorithm before we start to get our Graph, DofsPerNode and UnAmalgamationInfo
+
+    // We begin by getting a MIS (from a graph coloring) and then at that point we need
+    // to start generating entries for the prolongator.
+    RCP<const Matrix>      A        = Get< RCP<Matrix> >(fineLevel, "A");
+    RCP<const Map> ownedCoarseMap   = Get<RCP<const Map> >(fineLevel,"CoarseMap");
+    RCP<const LocalOrdinalVector> owned_fc_splitting = Get<RCP<LocalOrdinalVector> >(fineLevel,"FC Splitting");
+    RCP<const GraphBase> graph      = Get< RCP<GraphBase> >(fineLevel, "Graph");
+    //    LO nDofsPerNode                 = Get<LO>(fineLevel, "DofsPerNode");
+    // amalgInfo is fetched here but not referenced again in this function
+    RCP<AmalgamationInfo> amalgInfo = Get< RCP<AmalgamationInfo> >     (fineLevel, "UnAmalgamationInfo");
+    RCP<const Import>    Importer   = A->getCrsGraph()->getImporter();
+    Xpetra::UnderlyingLib lib = ownedCoarseMap->lib();
+
+    //    RCP<MultiVector> fineNullspace = Get< RCP<MultiVector> > (fineLevel, "Nullspace");
+    RCP<Matrix> P;
+    //    SC SC_ZERO = STS::zero();
+    LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
+    const point_type C_PT = ClassicalMapFactory::C_PT;
+    const point_type F_PT = ClassicalMapFactory::F_PT;
+    const ParameterList& pL = GetParameterList();
+
+    // FIXME: This guy doesn't work right now for NumPDEs != 1
+    TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() != 1, Exceptions::RuntimeError,"ClassicalPFactory: Multiple PDEs per node not supported yet");
+
+    // FIXME: This does not work in parallel yet
+//    TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getComm()->getSize() !=  1,Exceptions::RuntimeError,"ClassicalPFactory: MPI Ranks > 1 not supported yet");
+
+    // NOTE: Let's hope we never need to deal with this case
+    TEUCHOS_TEST_FOR_EXCEPTION(!A->getRowMap()->isSameAs(*A->getDomainMap()),Exceptions::RuntimeError,"ClassicalPFactory: Row map and domain map of A must be the same");
+
+
+    // Do we need ghosts rows of A and myPointType?
+    std::string scheme = pL.get<std::string>("aggregation: classical scheme");
+    bool need_ghost_rows =false;
+    if(scheme == "ext+i")
+      need_ghost_rows=true;
+    else if(scheme == "direct")
+      need_ghost_rows=false;
+    else if(scheme == "classical modified")
+      need_ghost_rows=true;
+    // NOTE: ParameterList validator will check this guy so we don't really need an "else" here
+
+
+    // Ghost the FC splitting and grab the data (if needed)
+    RCP<const LocalOrdinalVector> fc_splitting;
+    ArrayRCP<const LO> myPointType;
+    if(Importer.is_null()) {
+      fc_splitting = owned_fc_splitting;
+    }
+    else {
+      RCP<LocalOrdinalVector> fc_splitting_nonconst = LocalOrdinalVectorFactory::Build(A->getCrsGraph()->getColMap());
+      fc_splitting_nonconst->doImport(*owned_fc_splitting,*Importer,Xpetra::INSERT);
+      fc_splitting = fc_splitting_nonconst;
+    }
+    myPointType = fc_splitting->getData(0);
+
+
+    /* Ghost A (if needed) */
+    RCP<const Matrix> Aghost;
+    RCP<const LocalOrdinalVector> fc_splitting_ghost;
+    ArrayRCP<const LO> myPointType_ghost;
+    RCP<const Import> remoteOnlyImporter;
+    if(need_ghost_rows && !Importer.is_null()){
+      ArrayView<const LO> remoteLIDs = Importer->getRemoteLIDs();
+      size_t numRemote = Importer->getNumRemoteIDs();
+      Array<GO> remoteRows(numRemote);
+      for (size_t i = 0; i < numRemote; i++)
+        remoteRows[i] = Importer->getTargetMap()->getGlobalElement(remoteLIDs[i]);
+
+      RCP<const Map> remoteRowMap = MapFactory::Build(lib,Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), remoteRows(),
+                                                      A->getDomainMap()->getIndexBase(), A->getDomainMap()->getComm());
+
+      remoteOnlyImporter = Importer->createRemoteOnlyImport(remoteRowMap);
+      // throw_on_fail=true: raise if A is not a CrsMatrixWrap instead of dereferencing a null RCP
+      RCP<const CrsMatrix> Acrs = rcp_dynamic_cast<const CrsMatrixWrap>(A, true)->getCrsMatrix();
+      RCP<CrsMatrix> Aghost_crs = CrsMatrixFactory::Build(Acrs,*remoteOnlyImporter,A->getDomainMap(),remoteOnlyImporter->getTargetMap());
+      Aghost = rcp(new CrsMatrixWrap(Aghost_crs));
+      // We also may need to ghost myPointType for Aghost
+      // NOTE(review): the ghosted splitting is only built when Aghost has *no* importer, and it reuses
+      // the importer of A rather than Importer2.  This looks inverted -- confirm the intent before
+      // relying on myPointType_ghost in parallel runs.
+      RCP<const Import> Importer2 = Aghost->getCrsGraph()->getImporter();
+      if(Importer2.is_null()) {
+        RCP<LocalOrdinalVector> fc_splitting_ghost_nonconst = LocalOrdinalVectorFactory::Build(Aghost->getColMap());
+        fc_splitting_ghost_nonconst->doImport(*owned_fc_splitting,*Importer,Xpetra::INSERT);
+        fc_splitting_ghost = fc_splitting_ghost_nonconst;
+        myPointType_ghost  = fc_splitting_ghost->getData(0);
+      }
+    }
+
+
+
+    /* Generate the ghosted Coarse map using the "Tuminaro maneuver" (if needed)*/
+    RCP<const Map> coarseMap;
+    if(Importer.is_null())
+      coarseMap = ownedCoarseMap;
+    else {
+      // Generate a domain vector with the coarse ID's as entries for C points
+      GhostCoarseMap(*A,*Importer,myPointType,ownedCoarseMap,coarseMap);
+    }
+
+
+    // Get the block number, if we need it (and ghost it)
+    RCP<LocalOrdinalVector>  BlockNumber;
+    std::string drop_algo = pL.get<std::string>("aggregation: drop scheme");
+    if (drop_algo.find("block diagonal") != std::string::npos) {
+      RCP<LocalOrdinalVector> OwnedBlockNumber;
+      OwnedBlockNumber = Get<RCP<LocalOrdinalVector> >(fineLevel, "BlockNumber");
+      if(Importer.is_null())
+        BlockNumber = OwnedBlockNumber;
+      else{
+        // The import target must live on the column map (the importer's target), as for fc_splitting
+        BlockNumber = LocalOrdinalVectorFactory::Build(A->getCrsGraph()->getColMap());
+        BlockNumber->doImport(*OwnedBlockNumber,*Importer,Xpetra::INSERT);
+      }
+    }
+
+#if defined(CMS_DEBUG) || defined(CMS_DUMP)
+    {
+      std::ofstream ofs(std::string("dropped_graph_") + std::to_string(fineLevel.GetLevelID()) + std::string(".dat"),std::ofstream::out);
+      RCP<Teuchos::FancyOStream> fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(ofs));
+      graph->print(*fancy,Debug);
+      std::string out_fc = std::string("fc_splitting_") + std::to_string(fineLevel.GetLevelID()) + std::string(".dat");
+
+      // We don't support writing LO vectors in Xpetra (boo!) so....
+      using real_type = typename Teuchos::ScalarTraits<SC>::magnitudeType;
+      using RealValuedMultiVector = typename Xpetra::MultiVector<real_type,LO,GO,NO>;
+      typedef Xpetra::MultiVectorFactory<real_type,LO,GO,NO> RealValuedMultiVectorFactory;
+
+      RCP<RealValuedMultiVector> mv = RealValuedMultiVectorFactory::Build(fc_splitting->getMap(),1);
+      ArrayRCP<real_type> mv_data= mv->getDataNonConst(0);
+      ArrayRCP<const LO> fc_data= fc_splitting->getData(0);
+
+      for(LO i=0; i<(LO)fc_data.size(); i++)
+        mv_data[i] = Teuchos::as<real_type>(fc_data[i]);
+      Xpetra::IO<real_type,LO,GO,NO>::Write(out_fc,*mv);
+
+
+    }
+#endif
+
+
+    /* Generate reindexing arrays */
+    // Note: cpoint2pcol is ghosted if myPointType is
+    // NOTE: Since the ghosted coarse column map follows the ordering of
+    // the fine column map, this *should* work, because it is in local indices.
+    // FIXME:  Add a check for this in debug mode.
+    Array<LO> cpoint2pcol(myPointType.size(),LO_INVALID);
+    Array<LO> pcol2cpoint(coarseMap->getNodeNumElements(),LO_INVALID);
+    LO num_c_points = 0;
+    LO num_f_points =0;
+    // Number the C points consecutively in local-index order and count the F points
+    for(LO i=0; i<(LO) myPointType.size(); i++) {
+      if(myPointType[i] == C_PT) {
+        cpoint2pcol[i] = num_c_points;
+        num_c_points++;
+      }
+      else if (myPointType[i] == F_PT)
+        num_f_points++;
+    }
+    // Build the inverse lookup for every point that received a number above
+    for(LO i=0; i<(LO)cpoint2pcol.size(); i++) {
+      if(cpoint2pcol[i] != LO_INVALID)
+        pcol2cpoint[cpoint2pcol[i]] =i;
+    }
+
+    // Generate edge strength flags (this will make everything easier later)
+    // These do *not* need to be ghosted (unlike A)
+    Teuchos::Array<size_t> eis_rowptr;
+    Teuchos::Array<bool> edgeIsStrong;
+    {
+      SubFactoryMonitor sfm(*this,"Strength Flags",coarseLevel);
+      GenerateStrengthFlags(*A,*graph,eis_rowptr,edgeIsStrong);
+    }
+
+    // Phase 3: Generate the P matrix
+    RCP<const Map> coarseColMap = coarseMap;
+    RCP<const Map> coarseDomainMap = ownedCoarseMap;
+    if(scheme == "ext+i") {
+      SubFactoryMonitor sfm(*this,"Ext+i Interpolation",coarseLevel);
+      Coarsen_Ext_Plus_I(*A,Aghost,*graph,coarseColMap,coarseDomainMap,num_c_points,num_f_points,
+                         myPointType(),myPointType_ghost(),cpoint2pcol,pcol2cpoint,
+                         eis_rowptr,edgeIsStrong,BlockNumber,P);
+    }
+    else if(scheme == "direct") {
+      SubFactoryMonitor sfm(*this,"Direct Interpolation",coarseLevel);
+      Coarsen_Direct(*A,Aghost,*graph,coarseColMap,coarseDomainMap,num_c_points,num_f_points,
+                     myPointType(),myPointType_ghost(),cpoint2pcol,pcol2cpoint,
+                     eis_rowptr,edgeIsStrong,BlockNumber,P);
+    }
+    else if(scheme == "classical modified") {
+      SubFactoryMonitor sfm(*this,"Classical Modified Interpolation",coarseLevel);
+      Coarsen_ClassicalModified(*A,Aghost,*graph,coarseColMap,coarseDomainMap,num_c_points,num_f_points,
+                                myPointType(),myPointType_ghost(),cpoint2pcol,pcol2cpoint,
+                                eis_rowptr,edgeIsStrong,BlockNumber,remoteOnlyImporter,P);
+    }
+    // NOTE: ParameterList validator will check this guy so we don't really need an "else" here
+
+#ifdef CMS_DEBUG
+    Xpetra::IO<SC,LO,GO,NO>::Write("classical_p.mat", *P);
+#endif
+
+    // Save output
+    Set(coarseLevel,"P",P);
+    RCP<const CrsGraph> pg = P->getCrsGraph();
+    Set(coarseLevel,"P Graph",pg);
+
+    //RCP<MultiVector> coarseNullspace = MultiVectorFactory::Build(coarseMap, fineNullspace->getNumVectors());
+    //    P->apply(*fineNullspace, *coarseNullspace, Teuchos::TRANS, Teuchos::ScalarTraits<SC>::one(), Teuchos::ScalarTraits<SC>::zero());
+    //    Set(coarseLevel, "Nullspace", coarseNullspace);
+
+    if (IsPrint(Statistics1)) {
+      RCP<ParameterList> params = rcp(new ParameterList());
+      params->set("printLoadBalancingInfo", true);
+      GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*P, "P", params);
+    }
+  }
+/* ************************************************************************* */
+template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+void ClassicalPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+Coarsen_ClassicalModified(const Matrix & A,
+                          const RCP<const Matrix> & Aghost,
+                          const GraphBase & graph,
+                          RCP<const Map> & coarseColMap,
+                          RCP<const Map> & coarseDomainMap,
+                          LO num_c_points,
+                          LO num_f_points,
+                          const Teuchos::ArrayView<const LO> & myPointType,
+                          const Teuchos::ArrayView<const LO> & myPointType_ghost,
+                          const Teuchos::Array<LO> & cpoint2pcol,
+                          const Teuchos::Array<LO> & pcol2cpoint,
+                          Teuchos::Array<size_t> & eis_rowptr,
+                          Teuchos::Array<bool> & edgeIsStrong,
+                          RCP<LocalOrdinalVector> & BlockNumber,
+                          RCP<const Import> remoteOnlyImporter,
+                          RCP<Matrix> & P) const {
+    // Placeholder for the "classical modified" interpolation scheme.
+    // None of the arguments are read and P is left untouched: the routine
+    // unconditionally throws std::runtime_error.
+    //
+    // Phase 3 : Classical Modified Interpolation
+    //   De Sterck, Falgout, Nolting and Yang, "Distance-two interpolation
+    //   for parallel algebraic multigrid", NLAA 2008, 15:115-139
+    //
+    // Notation:
+    //   F      = F-points
+    //   C      = C-points
+    //   N_i    = non-zero neighbors of node i
+    //   S_i    = {j \in N_i | j strongly influences i}   [strong neighbors of i]
+    //   F_i^s  = F \cap S_i                              [strong F-neighbors of i]
+    //   C_i^s  = C \cap S_i                              [strong C-neighbors of i]
+    //   N_i^w  = N_i \ (F_i^s \cup C_i^s)                [weak neighbors of i]
+    //            (The paper prints \cap here; \cup is what is meant.)
+    //            This set may hold both F-points and C-points; they are
+    //            simply weak neighbors of i.  By construction,
+    //            N_i^w \cup F_i^s \cup C_i^s = N_i.
+    //
+    //   \bar{a}_ij = {    0, if sign(a_ij) == sign(a_ii)
+    //                { a_ij, otherwise
+    //
+    //   F_i^s\star = {k \in N_i | C_i^s \cap C_k^s = \emptyset}
+    //                [set of F-neighbors of i that do not share a strong
+    //                 C-neighbor with i]
+    //
+    // Rewritten Equation (9) on p. 120:
+    //   \tilde{a}_ii = a_ii + \sum_{k \in N_i^w \cup F_i^s\star} a_ik
+    //   f_ij = \sum_{k \in F_i^s \setminus F_i^s\star}
+    //              \frac{a_ik \bar{a}_kj}{\sum_{m \in C_i^s} \bar{a}_km}
+    //   w_ij = \frac{1}{\tilde{a}_ii} (a_ij + f_ij)   for all j in C_i^s
+    //   NOTE(review): classical weights normally carry a leading minus sign
+    //   (compare the ext+i formula in this file) -- confirm against the paper
+    //   before implementing.
+
+    TEUCHOS_TEST_FOR_EXCEPTION(1,std::runtime_error,"ClassicalPFactory: ClassicalModified not implemented");
+
+}
+
+
+/* ************************************************************************* */
+// Builds the prolongator P with "direct" classical interpolation.
+//
+// Row handling, based on myPointType[row]:
+//   - DIRICHLET_PT rows are left empty,
+//   - C_PT rows receive a single entry of one in column cpoint2pcol[row],
+//   - every other row (F-point) interpolates from its strong C-neighbors.
+//
+// Arguments read by this scheme:
+//   A                - supplies the local rows/coefficients; P uses A's row map
+//   graph            - only used for the initial nonzero estimate
+//   coarseColMap     - column map of P
+//   coarseDomainMap  - domain map of P
+//   num_c_points,
+//   num_f_points     - only used for the initial nonzero estimate
+//   myPointType      - point type, indexed by local row and local column indices of A
+//   cpoint2pcol      - local index of a C-point -> column of P
+//   pcol2cpoint      - column of P -> local column index of A
+//   eis_rowptr,
+//   edgeIsStrong     - edgeIsStrong[eis_rowptr[i]+j] tells whether the j-th stored
+//                      entry of row i of A is a strong connection
+//   P                - output: the assembled, fill-completed prolongator
+// Aghost, myPointType_ghost and BlockNumber are not referenced here.
+template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+void ClassicalPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+Coarsen_Direct(const Matrix & A,const RCP<const Matrix> & Aghost, const GraphBase & graph,  RCP<const Map> & coarseColMap, RCP<const Map> & coarseDomainMap, LO num_c_points, LO num_f_points, const Teuchos::ArrayView<const LO> & myPointType, const Teuchos::ArrayView<const LO> & myPointType_ghost, const Teuchos::Array<LO> & cpoint2pcol, const Teuchos::Array<LO> & pcol2cpoint, Teuchos::Array<size_t> & eis_rowptr, Teuchos::Array<bool> & edgeIsStrong, RCP<LocalOrdinalVector> & BlockNumber, RCP<Matrix> & P) const {
+    /* ============================================================= */
+    /* Phase 3 : Direct Interpolation                                */
+    /* We do not use De Sterck, Falgout, Nolting and Yang (2008)     */
+    /* here.  Instead we follow:                                     */
+    /* Trottenberg, Oosterlee and Schueller, Multigrid, 2001.        */
+    /* with some modifications inspired by PyAMG                     */
+    /* ============================================================= */
+    /* Definitions:                                                        */
+    /* F = F-points                                                        */
+    /* C = C-points                                                        */
+    /* N_i = non-zero neighbors of node i                                  */
+    /* S_i = {j\in N_i | j strongly influences i } [strong neighbors of i] */
+    /* F_i^s = F \cap S_i [strong F-neighbors of i]                        */
+    /* C_i^s = C \cap S_i [strong C-neighbors of i]                        */
+    /* P_i = Set of interpolatory variables for row i [here = C_i^s]       */
+
+    /* (A.2.17) from p. 426                                                */
+    /* a_ij^- = {  a_ij,  if a_ij < 0                                      */
+    /*          {     0,  otherwise                                        */
+    /* a_ij^+ = {  a_ij,  if a_ij > 0                                      */
+    /*          {     0,  otherwise                                        */
+    /* P_i^- =  P_i \cap {k | a_ik^- != 0 and a_ik^- = a_ik}               */
+    /*          [strong C-neighbors with negative edges]                   */
+    /* P_i^+ =  P_i \cap {k | a_ik^+ != 0 and a_ik^+ = a_ik}               */
+    /*          [strong C-neighbors with positive edges]                   */
+
+    /* de Sterck et al., gives us this (not used below):                                     */
+    /* Rewritten Equation (6) on p. 119                                                      */
+    /* w_ij = - a_ij / a_ii \frac{\sum_{k\in N_i} a_ik}{\sum_{k\in C_i^s} a_ik},  j\in C_i^s */
+
+    /* Trottenberg et al. (A.7.6) and (A.7.7) on p. 479 gives this:                          */
+    /* alpha_i = \frac{ \sum_{j\in N_i} a_ij^- }{ \sum_{k\in P_i} a_ik^- }                   */
+    /* beta_i  = \frac{ \sum_{j\in N_i} a_ij^+ }{ \sum_{k\in P_i} a_ik^+ }                   */
+    /* w_ik    = { - alpha_i (a_ik / a_ii),   if k\in P_i^-                                  */
+    /*           { -  beta_i (a_ik / a_ii),   if k\in P_i^+                                  */
+    /* NOTE: The text says to modify, if  P_i^+ is zero but it isn't entirely clear how that */
+    /* works.  We'll follow the PyAMG implementation in a few important ways.                */
+
+    const point_type C_PT = ClassicalMapFactory::C_PT;
+    const point_type DIRICHLET_PT = ClassicalMapFactory::DIRICHLET_PT;
+
+    using STS = typename Teuchos::ScalarTraits<SC>;
+    using MT  = typename STS::magnitudeType;
+    using MTS = typename Teuchos::ScalarTraits<MT>;
+    const SC SC_ZERO = STS::zero();
+    const MT MT_ZERO = MTS::zero();
+    const size_t Nrows = A.getNodeNumRows();
+
+    // Initial (estimated) allocation
+    // NOTE: If we only used Tpetra, then we could use these guys as is, but because Epetra, we can't, so there
+    // needs to be a copy below.
+    //
+    // Both ratios are guarded: with no points or no graph vertices the plain
+    // quotient would be NaN/inf, and converting that to size_t is undefined.
+    const double num_points   = (double)num_c_points + (double)num_f_points;
+    const double num_vertices = (double)graph.GetNodeNumVertices();
+    const double c_point_density = (num_points > 0.0) ? ((double)num_c_points / num_points) : 0.0;
+    const double mean_strong_neighbors_per_row = (num_vertices > 0.0) ? ((double)graph.GetNodeNumEdges() / num_vertices) : 0.0;
+    const double nnz_per_row_est = c_point_density*mean_strong_neighbors_per_row;
+
+    // Clamp the estimate to [Nrows, nnz(A)]; it is only a starting size, the
+    // count loop below grows the buffer when needed.
+    const size_t nnz_est = std::max(Nrows,std::min((size_t)A.getNodeNumEntries(),(size_t)(nnz_per_row_est*Nrows)));
+    Array<size_t> tmp_rowptr(Nrows+1);
+    Array<LO> tmp_colind(nnz_est);
+    tmp_rowptr[0] = 0;
+
+    // Algorithm (count+realloc)
+    // For each row i, collect the P columns of that row:
+    // - nothing for Dirichlet points,
+    // - the row's own coarse column for C-points,
+    // - the coarse columns of the strong C-neighbors of i for F-points.
+    size_t ct=0;
+    for(LO row=0; row < (LO) Nrows; row++) {
+      const size_t row_start = eis_rowptr[row];
+      // std::set keeps the columns of each row unique and sorted
+      std::set<LO> C_hat;
+      if(myPointType[row] == DIRICHLET_PT) {
+        // Dirichlet points get ignored completely
+      }
+      else if(myPointType[row] == C_PT) {
+        // C-Points get a single 1 in their row
+        C_hat.insert(cpoint2pcol[row]);
+      }
+      else {
+        // F-Points have a more complicated interpolation
+        ArrayView<const LO> indices;
+        ArrayView<const SC> vals;
+
+        // Strong C-neighbors of row
+        A.getLocalRowView(row, indices, vals);
+        for(LO j=0; j<(LO)indices.size(); j++)
+          if(myPointType[indices[j]] == C_PT && edgeIsStrong[row_start + j])
+            C_hat.insert(cpoint2pcol[indices[j]]);
+      }// end else
+
+      // Realloc if needed (at least double, so the number of reallocations stays small)
+      if(ct + (size_t)C_hat.size() > (size_t)tmp_colind.size()) {
+        tmp_colind.resize(std::max(ct+(size_t)C_hat.size(),(size_t)2*tmp_colind.size()));
+      }
+
+      // Copy
+      std::copy(C_hat.begin(), C_hat.end(),tmp_colind.begin()+ct);
+      ct+=C_hat.size();
+      tmp_rowptr[row+1] = tmp_rowptr[row] + C_hat.size();
+    }
+    // Resize down
+    tmp_colind.resize(tmp_rowptr[Nrows]);
+
+    // Allocate memory & copy
+    P = rcp(new CrsMatrixWrap(A.getRowMap(), coarseColMap, 0));
+    RCP<CrsMatrix> PCrs   = rcp_dynamic_cast<CrsMatrixWrap>(P)->getCrsMatrix();
+    ArrayRCP<size_t>  P_rowptr;
+    ArrayRCP<LO>      P_colind;
+    ArrayRCP<SC>      P_values;
+
+#ifdef CMS_DEBUG
+    printf("CMS: Allocating P w/ %d nonzeros\n",(int)tmp_rowptr[Nrows]);
+#endif
+    PCrs->allocateAllValues(tmp_rowptr[Nrows], P_rowptr, P_colind, P_values);
+    TEUCHOS_TEST_FOR_EXCEPTION(tmp_rowptr.size() !=P_rowptr.size(), Exceptions::RuntimeError,"ClassicalPFactory: Allocation size error (rowptr)");
+    TEUCHOS_TEST_FOR_EXCEPTION(tmp_colind.size() !=P_colind.size(), Exceptions::RuntimeError,"ClassicalPFactory: Allocation size error (colind)");
+    // FIXME:  This can be short-circuited for Tpetra, if we decide we care
+    for(size_t i=0; i<Nrows+1; i++)
+      P_rowptr[i] = tmp_rowptr[i];
+    for(size_t i=0; i<tmp_rowptr[Nrows]; i++)
+      P_colind[i] = tmp_colind[i];
+
+
+    // Algorithm (numeric)
+    for(LO i=0; i < (LO)Nrows; i++) {
+      if(myPointType[i] == DIRICHLET_PT) {
+        // Dirichlet points get ignored completely
+#ifdef CMS_DEBUG
+        // DEBUG
+        printf("** A(%d,:) is a Dirichlet-Point.\n",i);
+#endif
+      }
+      else if (myPointType[i] == C_PT) {
+        // C Points get a single 1 in their row
+        P_values[P_rowptr[i]] = STS::one();
+#ifdef CMS_DEBUG
+        // DEBUG
+        printf("** A(%d,:) is a C-Point.\n",i);
+#endif
+      }
+      else {
+        /* Trottenberg et al. (A.7.6) and (A.7.7) on p. 479 gives this:                          */
+        /* alpha_i = \frac{ \sum_{j\in N_i} a_ij^- }{ \sum_{k\in P_i} a_ik^- }                   */
+        /* beta_i  = \frac{ \sum_{j\in N_i} a_ij^+ }{ \sum_{k\in P_i} a_ik^+ }                   */
+        /* w_ik    = { - alpha_i (a_ik / a_ii),   if k\in P_i^-                                  */
+        /*           { -  beta_i (a_ik / a_ii),   if k\in P_i^+                                  */
+        ArrayView<const LO> A_indices_i;
+        ArrayView<const SC> A_vals_i;
+        A.getLocalRowView(i, A_indices_i, A_vals_i);
+        const size_t row_start = eis_rowptr[i];
+
+        // Views onto the slice of P that belongs to row i
+        ArrayView<LO> P_indices_i  = P_colind.view(P_rowptr[i],P_rowptr[i+1] - P_rowptr[i]);
+        ArrayView<SC> P_vals_i     = P_values.view(P_rowptr[i],P_rowptr[i+1] - P_rowptr[i]);
+
+#ifdef CMS_DEBUG
+        // DEBUG
+        {
+          char mylabel[5]="FUCD";
+          char sw[3]="ws";
+          printf("** A(%d,:) = ",i);
+          for(LO j=0; j<(LO)A_indices_i.size(); j++){
+            printf("%6.4e(%d-%c%c) ",A_vals_i[j],A_indices_i[j],mylabel[1+myPointType[A_indices_i[j]]],sw[(int)edgeIsStrong[row_start+j]]);
+          }
+          printf("\n");
+        }
+#endif
+
+        SC a_ii            = SC_ZERO;
+        SC pos_numerator   = SC_ZERO, neg_numerator   = SC_ZERO;
+        SC pos_denominator = SC_ZERO, neg_denominator = SC_ZERO;
+        // Find the diagonal and compute the sum ratio
+        for(LO j=0; j<(LO)A_indices_i.size(); j++) {
+          const SC a_ik = A_vals_i[j];
+          const LO k = A_indices_i[j];
+
+          // Diagonal
+          if(i == k) {
+            a_ii = a_ik;
+          }
+          // Only strong C-neighbors are in the denominator
+          if(myPointType[k] == C_PT && edgeIsStrong[row_start + j]) {
+            if(STS::real(a_ik) > MT_ZERO) pos_denominator += a_ik;
+            else neg_denominator += a_ik;
+          }
+
+          // All neighbors are in the numerator
+          // NOTE: As per PyAMG, this does not include the diagonal
+          if(i != k) {
+            if(STS::real(a_ik) > MT_ZERO) pos_numerator += a_ik;
+            else neg_numerator += a_ik;
+          }
+        }
+        // A vanishing denominator means there is nothing to interpolate from
+        // on that side, so the corresponding scaling is set to zero.
+        SC alpha = (neg_denominator == SC_ZERO) ? SC_ZERO : (neg_numerator / neg_denominator);
+        SC beta  = (pos_denominator == SC_ZERO) ? SC_ZERO : (pos_numerator / pos_denominator);
+        // Fold the -1/a_ii factor of w_ik into alpha and beta once per row.
+        // NOTE: a_ii is not checked against zero here.
+        alpha /= -a_ii;
+        beta  /= -a_ii;
+
+        // Loop over the entries
+        for(LO p_j=0; p_j<(LO)P_indices_i.size(); p_j++){
+          // Local column of A that corresponds to this column of P
+          const LO P_col = pcol2cpoint[P_indices_i[p_j]];
+          SC a_ij = SC_ZERO;
+
+          // Find A_ij (if it is there)
+          // FIXME: We can optimize this if we assume sorting
+          for(LO a_j =0; a_j<(LO)A_indices_i.size(); a_j++) {
+            if(A_indices_i[a_j] == P_col) {
+              a_ij = A_vals_i[a_j];
+              break;
+            }
+          }
+          const SC w_ij = (STS::real(a_ij) < MT_ZERO) ? (alpha * a_ij) : (beta * a_ij);
+#ifdef CMS_DEBUG
+          SC alpha_or_beta = (STS::real(a_ij) < MT_ZERO) ? alpha : beta;
+          printf("P(%d,%d/%d) =  - %6.4e  * %6.4e  = %6.4e\n",i,P_indices_i[p_j],pcol2cpoint[P_indices_i[p_j]],alpha_or_beta,a_ij,w_ij);
+#endif
+          P_vals_i[p_j] = w_ij;
+        }//end for P_indices_i
+      }//end else C_PT
+    }//end for Numrows
+
+    // Finish up
+    PCrs->setAllValues(P_rowptr, P_colind, P_values);
+    PCrs->expertStaticFillComplete(/*domain*/coarseDomainMap, /*range*/A.getDomainMap());
+}
+
+
+/* ************************************************************************* */
+template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+void ClassicalPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+Coarsen_Ext_Plus_I(const Matrix & A,
+                   const RCP<const Matrix> & Aghost,
+                   const GraphBase & graph,
+                   RCP<const Map> & coarseColMap,
+                   RCP<const Map> & coarseDomainMap,
+                   LO num_c_points,
+                   LO num_f_points,
+                   const Teuchos::ArrayView<const LO> & myPointType,
+                   const Teuchos::ArrayView<const LO> & myPointType_ghost,
+                   const Teuchos::Array<LO> & cpoint2pcol,
+                   const Teuchos::Array<LO> & pcol2cpoint,
+                   Teuchos::Array<size_t> & eis_rowptr,
+                   Teuchos::Array<bool> & edgeIsStrong,
+                   RCP<LocalOrdinalVector> & BlockNumber,
+                   RCP<Matrix> & P) const {
+
+    // Placeholder for the "ext+i" interpolation scheme.
+    // None of the arguments are read and P is left untouched: the routine
+    // unconditionally throws std::runtime_error.
+    //
+    // Phase 3 : Extended+i Interpolation
+    //   De Sterck, Falgout, Nolting and Yang, "Distance-two interpolation
+    //   for parallel algebraic multigrid", NLAA 2008, 15:115-139
+    //
+    // Notation:
+    //   F      = F-points
+    //   C      = C-points
+    //   N_i    = non-zero neighbors of node i
+    //   S_i    = {j \in N_i | j strongly influences i}   [strong neighbors of i]
+    //   F_i^s  = F \cap S_i                              [strong F-neighbors of i]
+    //   C_i^s  = C \cap S_i                              [strong C-neighbors of i]
+    //   N_i^w  = N_i \ (F_i^s \cup C_i^s)                [weak neighbors of i]
+    //            (The paper prints \cap here; \cup is what is meant.)
+    //            This set may hold both F-points and C-points; they are
+    //            simply weak neighbors of i.  By construction,
+    //            N_i^w \cup F_i^s \cup C_i^s = N_i.
+    //
+    //   \hat{C}_i = C_i \cup (\bigcup_{j \in F_i^s} C_j)
+    //               [C-neighbors and C-neighbors of strong F-neighbors of i]
+    //
+    //   \bar{a}_ij = {    0, if sign(a_ij) == sign(a_ii)
+    //                { a_ij, otherwise
+    //
+    // Rewritten Equation (19) on p. 123:
+    //   f_ik = \frac{\bar{a}_kj}{\sum_{l \in \hat{C}_i \cup {i}} \bar{a}_kl}
+    //   w_ij = -\tilde{a}_ii^{-1} (a_ij + \sum_{k \in F_i^s} a_ik f_ik)
+    //          for j in \hat{C}_i
+    //
+    // Rewritten Equation (20) on p. 124 [for the lumped diagonal]:
+    //   g_ik = \frac{\bar{a}_ki}{\sum_{l \in \hat{C}_i \cup {i}} \bar{a}_kl}
+    //   \tilde{a}_ii = a_ii + \sum_{n \in N_i^w \setminus \hat{C}_i} a_in
+    //                       + \sum_{k \in F_i^s} a_ik g_ik
+    TEUCHOS_TEST_FOR_EXCEPTION(1,std::runtime_error,"ClassicalPFactory: Ext+i not implemented");
+
+}
+
+
+
+
+/* ************************************************************************* */
+// Builds a per-entry "is this connection strong?" flag array for A.
+//
+//   A            - matrix whose local rows are scanned
+//   graph        - strength graph; an entry (i,col) of A is flagged strong
+//                  exactly when col is listed in graph.getNeighborVertices(i)
+//   eis_rowptr   - output, resized to Nrows+1: offset of each row of A in
+//                  edgeIsStrong (eis_rowptr[i+1]-eis_rowptr[i] = entries in row i)
+//   edgeIsStrong - output, one flag per stored entry of A, in row order
+//
+// Throws Exceptions::RuntimeError if a row of A or of the graph is not sorted
+// by column index, or if the graph lists a column that row i of A does not
+// contain.
+template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+void ClassicalPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+GenerateStrengthFlags(const Matrix & A,const GraphBase & graph, Teuchos::Array<size_t> & eis_rowptr,Teuchos::Array<bool> & edgeIsStrong) const {
+  // To make this easier, we'll create a bool array equal to the nnz in the matrix
+  // so we know whether each edge is strong or not.  This will save us a bunch of
+  // trying to match the graph and matrix later
+  const size_t Nrows = A.getNodeNumRows();
+  eis_rowptr.resize(Nrows+1);
+
+  // assign() both sizes the array and clears any stale flags from a previous use
+  edgeIsStrong.assign(A.getNodeNumEntries(),false);
+
+  eis_rowptr[0] = 0;
+  for (LO i=0; i<(LO)Nrows; i++) {
+    const size_t rowstart = eis_rowptr[i];
+    ArrayView<const LO> A_indices;
+    ArrayView<const SC> A_values;
+    A.getLocalRowView(i, A_indices, A_values);
+    const LO A_size = (LO) A_indices.size();
+
+    ArrayView<const LO> G_indices = graph.getNeighborVertices(i);
+    const LO G_size = (LO) G_indices.size();
+
+    // Both of these guys should be in the same (sorted) order, but let's check
+    bool is_ok=true;
+    for(LO j=0; j<A_size-1; j++)
+      if (A_indices[j] > A_indices[j+1]) { is_ok=false; break;}
+    for(LO j=0; j<G_size-1; j++)
+      if (G_indices[j] > G_indices[j+1]) { is_ok=false; break;}
+    TEUCHOS_TEST_FOR_EXCEPTION(!is_ok, Exceptions::RuntimeError,"ClassicalPFactory: Exected A and Graph to be sorted");
+
+    // Now cycle through and set the flags - if the edge is in G it is strong.
+    // Because both rows are sorted, a_idx only ever moves forward.
+    LO a_idx = 0;
+    for(LO g_idx=0; g_idx < G_size; g_idx++) {
+      const LO col = G_indices[g_idx];
+      // Test the bound first so A_indices is never read past its end
+      while (a_idx < A_size && A_indices[a_idx] != col) a_idx++;
+      // A graph edge without a matching matrix entry means graph and A are inconsistent
+      TEUCHOS_TEST_FOR_EXCEPTION(a_idx == A_size, Exceptions::RuntimeError,"ClassicalPFactory: Graph edge not found in A");
+      edgeIsStrong[rowstart+a_idx] = true;
+    }
+
+    eis_rowptr[i+1] = eis_rowptr[i] + A_size;
+  }
+}
+   
+
+/* ************************************************************************* */
+// Builds the ghosted coarse column map.
+//
+//   A            - supplies the row map and column map used for the two
+//                  temporary global-ordinal vectors
+//   Importer     - import object used to move coarse ids from the row-map
+//                  vector to the column-map vector
+//   myPointType  - point type per local row; only C_PT entries receive a coarse id
+//   coarseMap    - source of the coarse global ids, its library, index base
+//                  and communicator
+//   coarseColMap - output: strided map holding the coarse ids that are valid
+//                  after the import, in column-map order
+template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+void ClassicalPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+GhostCoarseMap(const Matrix &A, const Import & Importer, const ArrayRCP<const LO> myPointType, const RCP<const Map> & coarseMap, RCP<const Map> & coarseColMap) const {
+  const point_type C_PT = ClassicalMapFactory::C_PT;
+  const GO GO_INVALID = Teuchos::OrdinalTraits<GO>::invalid();
+
+  // Step 1: on the row map, give every C-point the next global id of
+  // coarseMap and mark every other row as invalid.
+  RCP<GlobalOrdinalVector> rowCoarseIds = GlobalOrdinalVectorFactory::Build(A.getRowMap());
+  ArrayRCP<GO> rowIds = rowCoarseIds->getDataNonConst(0);
+  const LO numRowIds = (LO) rowIds.size();
+  LO nextCoarseLID = 0;
+  for(LO row=0; row<numRowIds; ++row) {
+    if(myPointType[row] != C_PT) {
+      rowIds[row] = GO_INVALID;
+      continue;
+    }
+    rowIds[row] = coarseMap->getGlobalElement(nextCoarseLID);
+    ++nextCoarseLID;
+  }
+
+  // Step 2: ghost the ids onto the column map of A.
+  RCP<GlobalOrdinalVector> colCoarseIds = GlobalOrdinalVectorFactory::Build(A.getColMap());
+  colCoarseIds->doImport(*rowCoarseIds,Importer,Xpetra::INSERT);
+
+  // Step 3: compress out the invalid entries.
+  // If we assume that A is in Aztec ordering, then any subset of A's unknowns
+  // is in Aztec ordering as well, so keeping the surviving ids in place order
+  // is enough.  The buffer is over-allocated and only a leading view is used.
+  ArrayRCP<GO> colIds = colCoarseIds->getDataNonConst(0);
+  const LO numColIds = (LO) colIds.size();
+  Array<GO> keptGids(colIds.size());
+  LO numKept = 0;
+  for(LO col=0; col<numColIds; ++col) {
+    if(colIds[col] == GO_INVALID)
+      continue;
+    keptGids[numKept] = colIds[col];
+    ++numKept;
+  }
+
+  // Step 4: wrap the surviving ids in a strided map.
+  // FIXME: Assumes scalar PDE
+  std::vector<size_t> stridingInfo_(1, 1);
+  GO domainGIDOffset = 0;
+
+  // NOTE(review): confirm that the trailing argument lands on the intended
+  // parameter of StridedMapFactory::Build (block id vs. GID offset).
+  coarseColMap = StridedMapFactory::Build(coarseMap->lib(),
+                                          Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
+                                          keptGids.view(0,numKept),
+                                          coarseMap->getIndexBase(),
+                                          stridingInfo_,
+                                          coarseMap->getComm(),
+                                          domainGIDOffset);
+
+}
+
+
+} //namespace MueLu
+
+
+
+#define MUELU_CLASSICALPFACTORY_SHORT
+#endif // MUELU_CLASSICALPFACTORY_DEF_HPP
+
+ 
diff --git a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_def.hpp b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_def.hpp
index e714b5b5fb4f..f35adec7a0bd 100644
--- a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_def.hpp
+++ b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_def.hpp
@@ -63,12 +63,11 @@ namespace MueLu {
     RCP<ParameterList> validParamList = rcp(new ParameterList());
 
 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
-    SET_VALID_ENTRY("interp: interpolation order");
     SET_VALID_ENTRY("interp: build coarse coordinates");
 #undef  SET_VALID_ENTRY
 
     // general variables needed in GeometricInterpolationPFactory
-    validParamList->set<RCP<const FactoryBase> >("A",                       Teuchos::null,
+    validParamList->set<RCP<const FactoryBase> >("A",                       	 Teuchos::null,
                                                  "Generating factory of the matrix A");
     validParamList->set<RCP<const FactoryBase> >("Aggregates",                   Teuchos::null,
                                                  "Aggregates generated by StructuredAggregationFactory used to construct a piece-constant prolongator.");
@@ -85,7 +84,9 @@ namespace MueLu {
     validParamList->set<RCP<const FactoryBase> >("numDimensions",                Teuchos::null,
                                                  "Number of spacial dimensions in the problem.");
     validParamList->set<RCP<const FactoryBase> >("lCoarseNodesPerDim",           Teuchos::null,
-                                                 "Number of nodes per spatial dimension on the coarse grid.");
+                                                 "Number of nodes per spatial dimension on the coarse grid.");                              
+    validParamList->set<RCP<const FactoryBase> >("structuredInterpolationOrder", Teuchos::null,
+    						 "Interpolation order for constructing the prolongator.");
     validParamList->set<bool>                   ("keep coarse coords",           false, "Flag to keep coordinates for special coarse grid solve");
     validParamList->set<bool>                   ("interp: remove small entries", true, "Remove small interpolation coeficient from prolongator to reduce fill-in on coarse level");
 
@@ -102,9 +103,10 @@ namespace MueLu {
     Input(fineLevel, "numDimensions");
     Input(fineLevel, "prolongatorGraph");
     Input(fineLevel, "lCoarseNodesPerDim");
+    Input(fineLevel, "structuredInterpolationOrder");
 
     if( pL.get<bool>("interp: build coarse coordinates") ||
-        (pL.get<int>("interp: interpolation order") == 1) ) {
+	Get<int>(fineLevel, "structuredInterpolationOrder") == 1) {
       Input(fineLevel, "Coordinates");
       Input(fineLevel, "coarseCoordinatesFineMap");
       Input(fineLevel, "coarseCoordinatesMap");
@@ -138,7 +140,7 @@ namespace MueLu {
     const ParameterList& pL = GetParameterList();
     const bool removeSmallEntries     = pL.get<bool>("interp: remove small entries");
     const bool buildCoarseCoordinates = pL.get<bool>("interp: build coarse coordinates");
-    const int interpolationOrder      = pL.get<int> ("interp: interpolation order");
+    const int interpolationOrder      = Get<int>(fineLevel, "structuredInterpolationOrder");
     const int numDimensions           = Get<int>(fineLevel, "numDimensions");
 
     // Declared main input/outputs to be retrieved and placed on the fine resp. coarse level
diff --git a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp
index 5c6e62280938..4730b9ee619a 100644
--- a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp
+++ b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp
@@ -63,7 +63,6 @@ namespace MueLu {
     RCP<ParameterList> validParamList = rcp(new ParameterList());
 
 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
-    SET_VALID_ENTRY("interp: interpolation order");
     SET_VALID_ENTRY("interp: build coarse coordinates");
 #undef  SET_VALID_ENTRY
 
@@ -82,6 +81,8 @@ namespace MueLu {
                                                  "Number of nodes per spatial dimension on the coarse grid.");
     validParamList->set<RCP<const FactoryBase> >("indexManager",                 Teuchos::null,
                                                  "The index manager associated with the local mesh.");
+    validParamList->set<RCP<const FactoryBase> >("structuredInterpolationOrder", Teuchos::null,
+    						 "Interpolation order for constructing the prolongator.");
 
     return validParamList;
   }
@@ -96,9 +97,10 @@ namespace MueLu {
     Input(fineLevel, "numDimensions");
     Input(fineLevel, "prolongatorGraph");
     Input(fineLevel, "lCoarseNodesPerDim");
+    Input(fineLevel, "structuredInterpolationOrder");
 
     if( pL.get<bool>("interp: build coarse coordinates") ||
-        (pL.get<int>("interp: interpolation order") == 1) ) {
+        Get<int>(fineLevel, "structuredInterpolationOrder") == 1) {
       Input(fineLevel, "Coordinates");
       Input(fineLevel, "indexManager");
     }
@@ -130,7 +132,7 @@ namespace MueLu {
     // Get inputs from the parameter list
     const ParameterList& pL = GetParameterList();
     const bool buildCoarseCoordinates = pL.get<bool>("interp: build coarse coordinates");
-    const int interpolationOrder      = pL.get<int> ("interp: interpolation order");
+    const int interpolationOrder      = Get<int>(fineLevel, "structuredInterpolationOrder");
     const int numDimensions           = Get<int>(fineLevel, "numDimensions");
 
     // Declared main input/outputs to be retrieved and placed on the fine resp. coarse level
diff --git a/packages/muelu/src/Transfers/PCoarsen/MueLu_IntrepidPCoarsenFactory_def.hpp b/packages/muelu/src/Transfers/PCoarsen/MueLu_IntrepidPCoarsenFactory_def.hpp
index 06c548504b52..1154089e7347 100644
--- a/packages/muelu/src/Transfers/PCoarsen/MueLu_IntrepidPCoarsenFactory_def.hpp
+++ b/packages/muelu/src/Transfers/PCoarsen/MueLu_IntrepidPCoarsenFactory_def.hpp
@@ -104,7 +104,7 @@ namespace MueLu {
 namespace MueLuIntrepid {
 inline std::string tolower(const std::string & str) {
   std::string data(str);
-  std::transform(data.begin(), data.end(), data.begin(), [](unsigned char c) { return std::tolower(c); });
+  std::transform(data.begin(), data.end(), data.begin(), ::tolower);
   return data;
 }
 
diff --git a/packages/muelu/src/Utils/ClassList/SC-LO-GO-NO.classList b/packages/muelu/src/Utils/ClassList/SC-LO-GO-NO.classList
index f0ab4c0887b3..826d4c8934b5 100644
--- a/packages/muelu/src/Utils/ClassList/SC-LO-GO-NO.classList
+++ b/packages/muelu/src/Utils/ClassList/SC-LO-GO-NO.classList
@@ -17,6 +17,8 @@ BlockedRAPFactory
 BrickAggregationFactory
 BraessSarazinSmoother
 CGSolver
+ClassicalMapFactory
+ClassicalPFactory
 CloneRepartitionInterface
 CoalesceDropFactory
 CoalesceDropFactory_kokkos - #if defined(HAVE_MUELU_KOKKOS_REFACTOR)
diff --git a/packages/muelu/src/Utils/ExplicitInstantiation/ETI_SC_LO_GO_NO_classes.cmake b/packages/muelu/src/Utils/ExplicitInstantiation/ETI_SC_LO_GO_NO_classes.cmake
index 859a82d2f05a..aaca3f4f6077 100644
--- a/packages/muelu/src/Utils/ExplicitInstantiation/ETI_SC_LO_GO_NO_classes.cmake
+++ b/packages/muelu/src/Utils/ExplicitInstantiation/ETI_SC_LO_GO_NO_classes.cmake
@@ -18,6 +18,8 @@ APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::BlockedRAPFactory )
 APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::BrickAggregationFactory )
 APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::BraessSarazinSmoother )
 APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::CGSolver )
+APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::ClassicalMapFactory )
+APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::ClassicalPFactory )
 APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::CloneRepartitionInterface )
 APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::CoalesceDropFactory )
 APPEND_SET(MUELU_SC_LO_GO_NO_ETI_CLASSES MueLu::CoalesceDropFactory_kokkos-.?if.defined[HAVE_MUELU_KOKKOS_REFACTOR] )
diff --git a/packages/muelu/src/Utils/ForwardDeclaration/MueLu_ClassicalMapFactory_fwd.hpp b/packages/muelu/src/Utils/ForwardDeclaration/MueLu_ClassicalMapFactory_fwd.hpp
new file mode 100644
index 000000000000..7e675f6a44bb
--- /dev/null
+++ b/packages/muelu/src/Utils/ForwardDeclaration/MueLu_ClassicalMapFactory_fwd.hpp
@@ -0,0 +1,63 @@
+// @HEADER
+//
+// ***********************************************************************
+//
+//        MueLu: A package for multigrid based preconditioning
+//                  Copyright 2012 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact
+//                    Jonathan Hu       (jhu@sandia.gov)
+//                    Andrey Prokopenko (aprokop@sandia.gov)
+//                    Ray Tuminaro      (rstumin@sandia.gov)
+//
+// ***********************************************************************
+//
+// @HEADER
+#ifndef MUELU_CLASSICALMAPFACTORY_FWD_HPP
+#define MUELU_CLASSICALMAPFACTORY_FWD_HPP
+
+
+
+// Forward declaration of the MueLu::ClassicalMapFactory class template (declaration only, no definition).
+namespace MueLu {
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  class ClassicalMapFactory;
+}
+
+#ifndef MUELU_CLASSICALMAPFACTORY_SHORT
+#define MUELU_CLASSICALMAPFACTORY_SHORT
+#endif
+
+
+
+#endif // MUELU_CLASSICALMAPFACTORY_FWD_HPP
diff --git a/packages/muelu/src/Utils/ForwardDeclaration/MueLu_ClassicalPFactory_fwd.hpp b/packages/muelu/src/Utils/ForwardDeclaration/MueLu_ClassicalPFactory_fwd.hpp
new file mode 100644
index 000000000000..8e3c7f7a5dcd
--- /dev/null
+++ b/packages/muelu/src/Utils/ForwardDeclaration/MueLu_ClassicalPFactory_fwd.hpp
@@ -0,0 +1,63 @@
+// @HEADER
+//
+// ***********************************************************************
+//
+//        MueLu: A package for multigrid based preconditioning
+//                  Copyright 2012 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact
+//                    Jonathan Hu       (jhu@sandia.gov)
+//                    Andrey Prokopenko (aprokop@sandia.gov)
+//                    Ray Tuminaro      (rstumin@sandia.gov)
+//
+// ***********************************************************************
+//
+// @HEADER
+#ifndef MUELU_CLASSICALPFACTORY_FWD_HPP
+#define MUELU_CLASSICALPFACTORY_FWD_HPP
+
+
+
+// Forward declaration of the MueLu::ClassicalPFactory class template (declaration only, no definition).
+namespace MueLu {
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  class ClassicalPFactory;
+}
+
+#ifndef MUELU_CLASSICALPFACTORY_SHORT
+#define MUELU_CLASSICALPFACTORY_SHORT
+#endif
+
+
+
+#endif // MUELU_CLASSICALPFACTORY_FWD_HPP
diff --git a/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp b/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp
index 826f2b10428f..45c497b8fd85 100644
--- a/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp
+++ b/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp
@@ -261,6 +261,32 @@ tol = 0.;
       return diag;
     }
 
+    /*! @brief For each local row i, return max(0, max_{k\not=i}(-real(a_ik))): the largest negated off-diagonal entry, floored at zero.
+     *
+     * @param[in] A input matrix; an entry counts as off-diagonal when its local column index differs from the local row index
+     * @return array with one entry per local row of A (length taken from the row map)
+    */
+
+    static Teuchos::ArrayRCP<Magnitude> GetMatrixMaxMinusOffDiagonal(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A) { 
+      size_t numRows = A.getRowMap()->getNodeNumElements();
+      Magnitude ZERO = Teuchos::ScalarTraits<Magnitude>::zero();
+      Teuchos::ArrayRCP<Magnitude> maxvec(numRows);
+      Teuchos::ArrayView<const LocalOrdinal> cols;
+      Teuchos::ArrayView<const Scalar> vals;
+      for (size_t i = 0; i < numRows; ++i) {
+        A.getLocalRowView(i, cols, vals);
+        Magnitude mymax = ZERO;  // starting at zero means a row without negative off-diagonals yields 0
+        for (LocalOrdinal j=0; j < cols.size(); ++j) {
+          if (Teuchos::as<size_t>(cols[j]) != i) {  // skip the diagonal; NOTE(review): assumes local row and column indices coincide on the diagonal - confirm
+            mymax = std::max(mymax,-Teuchos::ScalarTraits<Scalar>::real(vals[j]));  // only the real part is considered
+          }
+        }          
+        maxvec[i] = mymax;
+      }
+      return maxvec;
+    }
+
+
     /*! @brief Return vector containing inverse of input vector
      *
      * @param[in] v: input vector
@@ -721,6 +747,38 @@ tol = 0.;
       return boundaryNodes;
     }
 
+   /*! @brief Apply Rowsum Criterion
+
+        Flags a local row i as dirichlet if:
+
+        real(sum_j A_ij) > |A_ii| * rowSumTol     (the sum runs over every stored entry of row i)
+
+        @param[in] A matrix
+        @param[in] rowSumTol tolerance multiplying the diagonal magnitude in the criterion above
+        @param[in,out] dirichletRows boolean array.  The ith entry is set to true if the above criterion is satisfied; entries are never reset to false
+        NOTE(review): the diagonal entry is included in the row sum - confirm that this is the intended criterion.
+    */
+    static void                                                                  ApplyRowSumCriterion(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A, const Magnitude rowSumTol, Teuchos::ArrayRCP<bool>& dirichletRows) {
+      typedef Teuchos::ScalarTraits<Scalar> STS;
+      RCP<const Xpetra::Map<LocalOrdinal,GlobalOrdinal,Node>> rowmap = A.getRowMap();
+      for (LocalOrdinal row = 0; row < Teuchos::as<LocalOrdinal>(rowmap->getNodeNumElements()); ++row) {
+        size_t nnz = A.getNumEntriesInLocalRow(row);
+        ArrayView<const LocalOrdinal> indices;
+        ArrayView<const Scalar> vals;
+        A.getLocalRowView(row, indices, vals);
+        
+        Scalar rowsum = STS::zero();
+        Scalar diagval = STS::zero();  // stays zero when the row stores no diagonal entry
+        for (LocalOrdinal colID = 0; colID < Teuchos::as<LocalOrdinal>(nnz); colID++) {
+          LocalOrdinal col = indices[colID];
+          if (row == col)  // diagonal detected by comparing local row and local column index
+            diagval = vals[colID];
+          rowsum += vals[colID];  // accumulated for every entry, the diagonal included
+        }
+        if (STS::real(rowsum) > STS::magnitude(diagval) * rowSumTol)
+          dirichletRows[row] = true;
+      }
+    }
 
     /*! @brief Detect Dirichlet columns based on Dirichlet rows
 
diff --git a/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp b/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp
index 2e43ade55857..8f52cadf9bee 100644
--- a/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp
+++ b/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp
@@ -211,6 +211,8 @@ namespace MueLu {
     static Teuchos::RCP<Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > GetLumpedMatrixDiagonal(Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> const &A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero(), const bool replaceSingleEntryRowWithZero = false)
     { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetLumpedMatrixDiagonal(A, doReciprocal, tol, tolReplacement, replaceSingleEntryRowWithZero); }
     static RCP<Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> >          GetMatrixOverlappedDiagonal(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetMatrixOverlappedDiagonal(A); }
+    static Teuchos::ArrayRCP<Magnitude>                                          GetMatrixMaxMinusOffDiagonal(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetMatrixMaxMinusOffDiagonal(A); } // forwards unchanged to UtilitiesBase
+
     static Teuchos::RCP<Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > GetInverse(Teuchos::RCP<const Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > v, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero()) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetInverse(v,tol,tolReplacement); }
     static Teuchos::Array<Magnitude>                                             ResidualNorm(const Xpetra::Operator<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Op, const Xpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& X, const Xpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& RHS) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::ResidualNorm(Op,X,RHS); }
     static Teuchos::Array<Magnitude>                                             ResidualNorm(const Xpetra::Operator<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Op, const Xpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& X, const Xpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& RHS, Xpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Resid) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::ResidualNorm(Op,X,RHS,Resid); }
@@ -222,6 +224,7 @@ namespace MueLu {
     static typename Teuchos::ScalarTraits<Scalar>::magnitudeType                 Distance2(const Teuchos::ArrayView<double> & weight,const Teuchos::Array<Teuchos::ArrayRCP<const Scalar>>& v, LocalOrdinal i0, LocalOrdinal i1) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::Distance2(weight,v,i0,i1); }
     static Teuchos::ArrayRCP<const bool>                                         DetectDirichletRows(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A, const Magnitude& tol = Teuchos::ScalarTraits<Scalar>::magnitude(0.), const bool count_twos_as_dirichlet=false) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::DetectDirichletRows(A,tol,count_twos_as_dirichlet); }
     static Teuchos::ArrayRCP<const bool>                                         DetectDirichletRowsExt(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A, bool & bHasZeroDiagonal, const Magnitude& tol = Teuchos::ScalarTraits<Scalar>::zero()) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::DetectDirichletRowsExt(A,bHasZeroDiagonal,tol); }
+    static void                                                                  ApplyRowSumCriterion(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A, const Magnitude rowSumTol, Teuchos::ArrayRCP<bool>& dirichletRows) {return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::ApplyRowSumCriterion(A,rowSumTol,dirichletRows); } // forwards unchanged to UtilitiesBase; dirichletRows is updated in place
 
     static void                                                                  SetRandomSeed(const Teuchos::Comm<int> &comm) { MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::SetRandomSeed(comm); }
 
@@ -587,6 +590,7 @@ namespace MueLu {
     static Teuchos::RCP<Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > GetLumpedMatrixDiagonal(Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> const &A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero(), const bool replaceSingleEntryRowWithZero = false)
     { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetLumpedMatrixDiagonal(A, doReciprocal, tol, tolReplacement, replaceSingleEntryRowWithZero); }
     static RCP<Vector>                                                           GetMatrixOverlappedDiagonal(const Matrix& A) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetMatrixOverlappedDiagonal(A); }
+    static Teuchos::ArrayRCP<Magnitude>                                          GetMatrixMaxMinusOffDiagonal(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetMatrixMaxMinusOffDiagonal(A); } // forwards unchanged to UtilitiesBase
     static RCP<Vector>                                                           GetInverse(Teuchos::RCP<const Vector> v, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero()) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetInverse(v,tol,tolReplacement); }
     static Teuchos::Array<Magnitude>                                             ResidualNorm(const Xpetra::Operator<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Op, const Xpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& X, const Xpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& RHS) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::ResidualNorm(Op,X,RHS); }
     static Teuchos::Array<Magnitude>                                             ResidualNorm(const Xpetra::Operator<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Op, const Xpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& X, const Xpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& RHS, Xpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Resid) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::ResidualNorm(Op,X,RHS,Resid); }
@@ -598,6 +602,7 @@ namespace MueLu {
     static Teuchos::ScalarTraits<Scalar>::magnitudeType                 Distance2(const Teuchos::ArrayView<double> &weight, const Teuchos::Array<Teuchos::ArrayRCP<const Scalar>>& v, LocalOrdinal i0, LocalOrdinal i1) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::Distance2(weight,v,i0,i1); }
     static Teuchos::ArrayRCP<const bool>                                         DetectDirichletRows(const Matrix& A, const Magnitude& tol = Teuchos::ScalarTraits<Scalar>::zero(), const bool count_twos_as_dirichlet=false) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::DetectDirichletRows(A,tol,count_twos_as_dirichlet); }
     static Teuchos::ArrayRCP<const bool>                                         DetectDirichletRowsExt(const Matrix& A, bool & bHasZeroDiagonal, const Magnitude& tol = Teuchos::ScalarTraits<Scalar>::zero()) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::DetectDirichletRowsExt(A,bHasZeroDiagonal,tol); }
+    static void                                                                  ApplyRowSumCriterion(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A, const Magnitude rowSumTol, Teuchos::ArrayRCP<bool>& dirichletRows) {return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::ApplyRowSumCriterion(A,rowSumTol,dirichletRows); } // forwards unchanged to UtilitiesBase; dirichletRows is updated in place
     static void                                                                  SetRandomSeed(const Teuchos::Comm<int> &comm) { MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::SetRandomSeed(comm); }
 
     static Scalar PowerMethod(const Matrix& A, bool scaleByDiag = true,
diff --git a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_decl.hpp b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_decl.hpp
index b63abfcdf147..9885e1b9c99e 100644
--- a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_decl.hpp
+++ b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_decl.hpp
@@ -271,6 +271,10 @@ namespace MueLu {
     static void ZeroDirichletRows(RCP<MultiVector>& X, const Kokkos::View<const bool*, typename NO::device_type>& dirichletRows, SC replaceWith=Teuchos::ScalarTraits<SC>::zero());
 
     static void ZeroDirichletCols(RCP<Matrix>& A, const Kokkos::View<const bool*, typename NO::device_type>& dirichletCols, SC replaceWith=Teuchos::ScalarTraits<SC>::zero());
+    //! Set dirichletRows(i) to true for each local row i of A with real(sum_j A_ij) > |A_ii| * rowSumTol; other entries are left as they are.
+    static void ApplyRowSumCriterion(const Matrix& A,
+                                     const typename Teuchos::ScalarTraits<Scalar>::magnitudeType rowSumTol,
+                                     Kokkos::View<bool*, typename NO::device_type> & dirichletRows);
 
     static RCP<MultiVector> RealValuedToScalarMultiVector(RCP<RealValuedMultiVector> X);
 
@@ -420,6 +424,10 @@ namespace MueLu {
 
     static void ZeroDirichletCols(RCP<Matrix>& A, const Kokkos::View<const bool*, typename Node::device_type>& dirichletCols, SC replaceWith=Teuchos::ScalarTraits<SC>::zero());
 
+    //! Set dirichletRows(i) to true for each local row i of A with real(sum_j A_ij) > |A_ii| * rowSumTol; other entries are left as they are.
+    static void ApplyRowSumCriterion(const Matrix& A, const typename Teuchos::ScalarTraits<Scalar>::magnitudeType rowSumTol,
+                                     Kokkos::View<bool*, typename Node::device_type> & dirichletRows);
+
     static RCP<MultiVector> RealValuedToScalarMultiVector(RCP<RealValuedMultiVector> X);
 
     static Scalar PowerMethod(const Matrix& A, bool scaleByDiag = true, LO niters = 10, Magnitude tolerance = 1e-2, bool verbose = false, unsigned int seed = 123) {
diff --git a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp
index 9014eb44683b..cc7b3e880312 100644
--- a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp
+++ b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp
@@ -620,6 +620,71 @@ namespace MueLu {
     return MueLu::ZeroDirichletCols<double,int,int,Node>(A, dirichletCols, replaceWith);
   }
 
+  /*! @brief Apply the row-sum criterion (host loop over the local rows of A).
+   *
+   *  Flags local row i as Dirichlet when real(sum_j A_ij) > |A_ii| * rowSumTol, where the sum runs
+   *  over every stored entry of the row (diagonal included).
+   *
+   *  @param[in]     A             matrix whose local rows are inspected
+   *  @param[in]     rowSumTol     tolerance multiplying the diagonal magnitude
+   *  @param[in,out] dirichletRows flags; rows meeting the criterion are set to true, all others are left untouched
+   */
+  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  void ApplyRowSumCriterion(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A,
+                            const typename Teuchos::ScalarTraits<Scalar>::magnitudeType rowSumTol,
+                            Kokkos::View<bool*, typename Node::device_type> & dirichletRows)
+  {
+    typedef Teuchos::ScalarTraits<Scalar> STS;
+    RCP<const Xpetra::Map<LocalOrdinal,GlobalOrdinal,Node>> rowmap = A.getRowMap();
+
+    // This routine runs on the host while dirichletRows is allocated in Node's device memory space,
+    // which need not be host-accessible. Update a host mirror and copy it back afterwards.
+    auto dirichletRowsHost = Kokkos::create_mirror_view(dirichletRows);
+    Kokkos::deep_copy(dirichletRowsHost, dirichletRows);
+
+    for (LocalOrdinal row = 0; row < Teuchos::as<LocalOrdinal>(rowmap->getNodeNumElements()); ++row) {
+      size_t nnz = A.getNumEntriesInLocalRow(row);
+      ArrayView<const LocalOrdinal> indices;
+      ArrayView<const Scalar> vals;
+      A.getLocalRowView(row, indices, vals);
+
+      Scalar rowsum = STS::zero();
+      Scalar diagval = STS::zero();
+      for (LocalOrdinal colID = 0; colID < Teuchos::as<LocalOrdinal>(nnz); colID++) {
+        LocalOrdinal col = indices[colID];
+        if (row == col)
+          diagval = vals[colID];
+        rowsum += vals[colID];  // the diagonal entry is part of the sum
+      }
+      if (STS::real(rowsum) > STS::magnitude(diagval) * rowSumTol)
+        dirichletRowsHost(row) = true;
+    }
+
+    // Publish the updated flags to the caller's view.
+    Kokkos::deep_copy(dirichletRows, dirichletRowsHost);
+  }
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  void
+  Utilities_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  ApplyRowSumCriterion(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A,
+                       const typename Teuchos::ScalarTraits<Scalar>::magnitudeType rowSumTol,
+                       Kokkos::View<bool*, typename Node::device_type> & dirichletRows)
+  { // Thin wrapper: all work is done by the free function MueLu::ApplyRowSumCriterion.
+    MueLu::ApplyRowSumCriterion<Scalar, LocalOrdinal, GlobalOrdinal, Node>(A,rowSumTol,dirichletRows);
+  }
+
+  // Member of the <double,int,int,Node> specialization: forwards to the free function MueLu::ApplyRowSumCriterion.
+  template <class Node>
+  void
+  Utilities_kokkos<double,int,int,Node>::
+  ApplyRowSumCriterion(const Xpetra::Matrix<double,int,int,Node>& A,
+                       const typename Teuchos::ScalarTraits<double>::magnitudeType rowSumTol,
+                       Kokkos::View<bool*, typename Node::device_type> & dirichletRows)
+  {
+    MueLu::ApplyRowSumCriterion<double, int, int, Node>(A,rowSumTol,dirichletRows);
+  }
+
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   RCP<Xpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> >
diff --git a/packages/muelu/test/scaling/CMakeLists.txt b/packages/muelu/test/scaling/CMakeLists.txt
index c12f096e6241..a91a7771e2d3 100644
--- a/packages/muelu/test/scaling/CMakeLists.txt
+++ b/packages/muelu/test/scaling/CMakeLists.txt
@@ -69,13 +69,13 @@ IF (${PACKAGE_NAME}_HAVE_TPETRA_SOLVER_STACK OR ${PACKAGE_NAME}_HAVE_EPETRA_SOLV
 
  TRIBITS_ADD_EXECUTABLE(
    ImportPerformance
-   SOURCES ImportPerformance
+   SOURCES ImportPerformance.cpp
    COMM mpi
    )
 
  TRIBITS_ADD_EXECUTABLE(
    TAFCPerformance
-   SOURCES TAFCPerformance
+   SOURCES TAFCPerformance.cpp
    COMM mpi
    )
 
diff --git a/packages/muelu/test/structured/structured_1dof.xml b/packages/muelu/test/structured/structured_1dof.xml
index 5623b7cecbca..f052b0d282ee 100644
--- a/packages/muelu/test/structured/structured_1dof.xml
+++ b/packages/muelu/test/structured/structured_1dof.xml
@@ -37,7 +37,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="1"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/test/structured/structured_1dof_kokkos.xml b/packages/muelu/test/structured/structured_1dof_kokkos.xml
index 1939c9f66aa7..1bdb6908514b 100644
--- a/packages/muelu/test/structured/structured_1dof_kokkos.xml
+++ b/packages/muelu/test/structured/structured_1dof_kokkos.xml
@@ -32,7 +32,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="0"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/test/structured/structured_1dof_shift.xml b/packages/muelu/test/structured/structured_1dof_shift.xml
index 96340b3e7189..b40b5e855985 100644
--- a/packages/muelu/test/structured/structured_1dof_shift.xml
+++ b/packages/muelu/test/structured/structured_1dof_shift.xml
@@ -37,7 +37,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="1"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/test/structured/structured_2dof.xml b/packages/muelu/test/structured/structured_2dof.xml
index 416bb4e46098..d7ca998d7c83 100644
--- a/packages/muelu/test/structured/structured_2dof.xml
+++ b/packages/muelu/test/structured/structured_2dof.xml
@@ -37,8 +37,8 @@
     <!-- Note that ParameterLists must be defined prior to being used -->
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
-      <Parameter name="interp: build coarse coordinates"       type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"            type="int"    value="1"/>
+      <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/test/structured/structured_3dof.xml b/packages/muelu/test/structured/structured_3dof.xml
index b7b5d7a9398d..0f25311713f7 100644
--- a/packages/muelu/test/structured/structured_3dof.xml
+++ b/packages/muelu/test/structured/structured_3dof.xml
@@ -38,7 +38,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="1"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/test/structured/structured_interp_kokkos.xml b/packages/muelu/test/structured/structured_interp_kokkos.xml
index 94617de97252..04376dbafb5c 100644
--- a/packages/muelu/test/structured/structured_interp_kokkos.xml
+++ b/packages/muelu/test/structured/structured_interp_kokkos.xml
@@ -38,7 +38,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory_kokkos"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="0"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="indexManager"                        type="string" value="myAggregationFact"/>
     </ParameterList>
diff --git a/packages/muelu/test/structured/structured_interp_sa_kokkos.xml b/packages/muelu/test/structured/structured_interp_sa_kokkos.xml
index 6f606856826f..5998a42599fa 100644
--- a/packages/muelu/test/structured/structured_interp_sa_kokkos.xml
+++ b/packages/muelu/test/structured/structured_interp_sa_kokkos.xml
@@ -38,7 +38,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory_kokkos"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="0"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="indexManager"                        type="string" value="myAggregationFact"/>
     </ParameterList>
diff --git a/packages/muelu/test/structured/structured_scp_1dof.xml b/packages/muelu/test/structured/structured_scp_1dof.xml
index fe66d3227288..4a111b9ff6c5 100644
--- a/packages/muelu/test/structured/structured_scp_1dof.xml
+++ b/packages/muelu/test/structured/structured_scp_1dof.xml
@@ -38,7 +38,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="1"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/test/structured/structured_sparc_1dof.xml b/packages/muelu/test/structured/structured_sparc_1dof.xml
index 175bf5998345..0d7c126aaa2b 100644
--- a/packages/muelu/test/structured/structured_sparc_1dof.xml
+++ b/packages/muelu/test/structured/structured_sparc_1dof.xml
@@ -38,7 +38,7 @@
     <ParameterList name="myProlongatorFact">
       <Parameter name="factory"                             type="string" value="GeometricInterpolationPFactory"/>
       <Parameter name="interp: build coarse coordinates"    type="bool"   value="true"/>
-      <Parameter name="interp: interpolation order"         type="int"    value="1"/>
+      <Parameter name="structuredInterpolationOrder"        type="string" value="myAggregationFact"/>
       <Parameter name="prolongatorGraph"                    type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesFineMap"            type="string" value="myAggregationFact"/>
       <Parameter name="coarseCoordinatesMap"                type="string" value="myAggregationFact"/>
diff --git a/packages/muelu/test/unit_tests/CMakeLists.txt b/packages/muelu/test/unit_tests/CMakeLists.txt
index 53d15dfcfd45..ecd053ff220e 100644
--- a/packages/muelu/test/unit_tests/CMakeLists.txt
+++ b/packages/muelu/test/unit_tests/CMakeLists.txt
@@ -29,6 +29,7 @@ APPEND_SET(SOURCES
   BlackBoxPFactory.cpp
   CoalesceDropFactory.cpp
   CoarseMapFactory.cpp
+  ClassicalPFactory.cpp
 # CoupledAggregationFactory.cpp
   FineLevelInputDataFactory.cpp
   GeneralGeometricPFactory.cpp
@@ -57,7 +58,7 @@ APPEND_SET(SOURCES
   TransPFactory.cpp
   UnsmooshFactory.cpp
   UserData/CreateXpetraPreconditioner.cpp
-  Utilities
+  Utilities.cpp
   VariableContainer.cpp
   VariableDofLaplacianFactory.cpp
 )
diff --git a/packages/muelu/test/unit_tests/ClassicalPFactory.cpp b/packages/muelu/test/unit_tests/ClassicalPFactory.cpp
new file mode 100644
index 000000000000..8e92698451e0
--- /dev/null
+++ b/packages/muelu/test/unit_tests/ClassicalPFactory.cpp
@@ -0,0 +1,298 @@
+// @HEADER
+//
+// ***********************************************************************
+//
+//        MueLu: A package for multigrid based preconditioning
+//                  Copyright 2012 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact
+//                    Jonathan Hu       (jhu@sandia.gov)
+//                    Andrey Prokopenko (aprokop@sandia.gov)
+//                    Ray Tuminaro      (rstumin@sandia.gov)
+//
+// ***********************************************************************
+//
+// @HEADER
+#include <Teuchos_UnitTestHarness.hpp>
+#include <Teuchos_DefaultComm.hpp>
+#include <Teuchos_ScalarTraits.hpp>
+
+#include "MueLu_TestHelpers.hpp"
+#include "MueLu_Version.hpp"
+
+#include <Xpetra_MultiVectorFactory.hpp>
+#include <Xpetra_VectorFactory.hpp>
+#include <Xpetra_Vector.hpp>
+#include <Xpetra_MatrixMatrix.hpp>
+#include <Xpetra_IO.hpp>
+
+#include "MueLu_CoalesceDropFactory.hpp"
+#include "MueLu_AmalgamationFactory.hpp"
+#include "MueLu_ClassicalMapFactory.hpp"
+#include "MueLu_ClassicalPFactory.hpp"
+#include "MueLu_Utilities.hpp"
+
+
+namespace MueLuTests {
+
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ClassicalPFactory, Constructor, Scalar, LocalOrdinal, GlobalOrdinal, Node)
+  {
+#   include "MueLu_UseShortNames.hpp"
+    MUELU_TESTING_SET_OSTREAM;
+    MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node);
+
+    out << "version: " << MueLu::Version() << std::endl;
+
+    RCP<ClassicalPFactory> PFact = rcp(new ClassicalPFactory);
+    TEST_EQUALITY(PFact != Teuchos::null, true);
+
+  } //Constructor
+
+
+
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ClassicalPFactory, BuildP_Direct, Scalar, LocalOrdinal, GlobalOrdinal, Node)
+  {
+#   include "MueLu_UseShortNames.hpp"
+    MUELU_TESTING_SET_OSTREAM;
+    MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node);
+
+    using TST                   = Teuchos::ScalarTraits<SC>;
+    using magnitude_type        = typename TST::magnitudeType;
+    using TMT                   = Teuchos::ScalarTraits<magnitude_type>;
+    using real                  = typename TST::coordinateType;
+    using RealValuedMultiVector = Xpetra::MultiVector<real,LO,GO,NO>;
+    using test_factory          = TestHelpers::TestFactory<SC,LO,GO,NO>;
+
+    out << "version: " << MueLu::Version() << std::endl;
+
+    Level fineLevel, coarseLevel;
+    test_factory::createTwoLevelHierarchy(fineLevel, coarseLevel);
+    fineLevel.SetFactoryManager(Teuchos::null);  // factory manager is not used on this test
+    coarseLevel.SetFactoryManager(Teuchos::null);
+
+    GO nx = 29;
+    RCP<Matrix> A = test_factory::Build1DPoisson(nx);
+    A->SetFixedBlockSize(1);
+    fineLevel.Set("A", A);
+   
+    // This test only works in parallel if we have Zoltan2 & Tpetra (skip on every rank, not just ranks > 1)
+#ifndef HAVE_MUELU_ZOLTAN2
+    if(A->getRowMap()->getComm()->getSize() > 1)
+      return;
+#else
+    if(A->getRowMap()->lib() == Xpetra::UseEpetra)
+      return;
+#endif
+
+    Teuchos::ParameterList galeriList;
+    galeriList.set("nx", nx);
+    RCP<RealValuedMultiVector> coordinates
+      = Galeri::Xpetra::Utils::CreateCartesianCoordinates<real,LO,GO,Map,RealValuedMultiVector>("1D", A->getRowMap(), galeriList);
+    fineLevel.Set("Coordinates", coordinates);
+
+    LocalOrdinal NSdim = 2;
+    RCP<MultiVector> nullSpace = MultiVectorFactory::Build(A->getRowMap(),NSdim);
+    nullSpace->randomize();
+    fineLevel.Set("Nullspace", nullSpace);
+
+    RCP<AmalgamationFactory> amalgFact = rcp(new AmalgamationFactory());
+    RCP<CoalesceDropFactory> dropFact = rcp(new CoalesceDropFactory());
+    dropFact->SetFactory("UnAmalgamationInfo", amalgFact);
+
+    RCP<ClassicalMapFactory> cmFact = rcp(new ClassicalMapFactory());
+    cmFact->SetFactory("Graph", dropFact);
+    cmFact->SetFactory("UnAmalgamationInfo", amalgFact);
+
+    Teuchos::ParameterList cp_params;
+    cp_params.set("aggregation: classical scheme","direct");
+    RCP<ClassicalPFactory> PFact = rcp(new ClassicalPFactory());
+    PFact->SetParameterList(cp_params);
+    PFact->SetFactory("UnAmalgamationInfo", amalgFact);
+    PFact->SetFactory("Graph", dropFact);
+    PFact->SetFactory("DofsPerNode", dropFact);
+    PFact->SetFactory("FC Splitting", cmFact);
+    PFact->SetFactory("CoarseMap", cmFact);
+
+    coarseLevel.Request("P",PFact.get());         // request P from the ClassicalPFactory
+    coarseLevel.Request(*PFact);
+    PFact->Build(fineLevel,coarseLevel);
+
+    RCP<Matrix> P;
+    coarseLevel.Get("P",P,PFact.get());
+
+  } //BuildP
+
+
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ClassicalPFactory, BuildP_ClassicalModified, Scalar, LocalOrdinal, GlobalOrdinal, Node)
+  {
+#   include "MueLu_UseShortNames.hpp"
+    MUELU_TESTING_SET_OSTREAM;
+    MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node);
+
+    using TST                   = Teuchos::ScalarTraits<SC>;
+    using magnitude_type        = typename TST::magnitudeType;
+    using TMT                   = Teuchos::ScalarTraits<magnitude_type>;
+    using real                  = typename TST::coordinateType;
+    using RealValuedMultiVector = Xpetra::MultiVector<real,LO,GO,NO>;
+    using test_factory          = TestHelpers::TestFactory<SC,LO,GO,NO>;
+
+    out << "version: " << MueLu::Version() << std::endl;
+
+    Level fineLevel, coarseLevel;
+    test_factory::createTwoLevelHierarchy(fineLevel, coarseLevel);
+    fineLevel.SetFactoryManager(Teuchos::null);  // factory manager is not used on this test
+    coarseLevel.SetFactoryManager(Teuchos::null);
+
+    GO nx = 29;
+    RCP<Matrix> A = test_factory::Build1DPoisson(nx);
+    A->SetFixedBlockSize(1);
+    fineLevel.Set("A", A);
+
+    Teuchos::ParameterList galeriList;
+    galeriList.set("nx", nx);
+    RCP<RealValuedMultiVector> coordinates
+      = Galeri::Xpetra::Utils::CreateCartesianCoordinates<real,LO,GO,Map,RealValuedMultiVector>("1D", A->getRowMap(), galeriList);
+    fineLevel.Set("Coordinates", coordinates);
+
+    LocalOrdinal NSdim = 2;
+    RCP<MultiVector> nullSpace = MultiVectorFactory::Build(A->getRowMap(),NSdim);
+    nullSpace->randomize();
+    fineLevel.Set("Nullspace", nullSpace);
+
+    RCP<AmalgamationFactory> amalgFact = rcp(new AmalgamationFactory());
+    RCP<CoalesceDropFactory> dropFact = rcp(new CoalesceDropFactory());
+    dropFact->SetFactory("UnAmalgamationInfo", amalgFact);
+
+    RCP<ClassicalMapFactory> cmFact = rcp(new ClassicalMapFactory());
+    cmFact->SetFactory("Graph", dropFact);
+    cmFact->SetFactory("UnAmalgamationInfo", amalgFact);
+
+
+    Teuchos::ParameterList cp_params;
+    cp_params.set("aggregation: classical scheme","classical modified");
+    RCP<ClassicalPFactory> PFact = rcp(new ClassicalPFactory());
+    PFact->SetParameterList(cp_params);
+    PFact->SetFactory("UnAmalgamationInfo", amalgFact);
+    PFact->SetFactory("Graph", dropFact);
+    PFact->SetFactory("DofsPerNode", dropFact);
+    PFact->SetFactory("FC Splitting", cmFact);
+    PFact->SetFactory("CoarseMap", cmFact);
+
+    coarseLevel.Request("P",PFact.get());         // request P from the ClassicalPFactory
+    coarseLevel.Request(*PFact);
+    PFact->Build(fineLevel,coarseLevel);
+
+    RCP<Matrix> P;
+    coarseLevel.Get("P",P,PFact.get());
+
+  } //BuildP
+
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ClassicalPFactory, BuildP_Ext, Scalar, LocalOrdinal, GlobalOrdinal, Node)
+  {
+#   include "MueLu_UseShortNames.hpp"
+    MUELU_TESTING_SET_OSTREAM;
+    MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node);
+
+    using TST                   = Teuchos::ScalarTraits<SC>;
+    using magnitude_type        = typename TST::magnitudeType;
+    using TMT                   = Teuchos::ScalarTraits<magnitude_type>;
+    using real                  = typename TST::coordinateType;
+    using RealValuedMultiVector = Xpetra::MultiVector<real,LO,GO,NO>;
+    using test_factory          = TestHelpers::TestFactory<SC,LO,GO,NO>;
+
+    out << "version: " << MueLu::Version() << std::endl;
+
+    Level fineLevel, coarseLevel;
+    test_factory::createTwoLevelHierarchy(fineLevel, coarseLevel);
+    fineLevel.SetFactoryManager(Teuchos::null);  // factory manager is not used on this test
+    coarseLevel.SetFactoryManager(Teuchos::null);
+
+    GO nx = 29;
+    RCP<Matrix> A = test_factory::Build1DPoisson(nx);
+    A->SetFixedBlockSize(1);
+    fineLevel.Set("A", A);
+
+    Teuchos::ParameterList galeriList;
+    galeriList.set("nx", nx);
+    RCP<RealValuedMultiVector> coordinates
+      = Galeri::Xpetra::Utils::CreateCartesianCoordinates<real,LO,GO,Map,RealValuedMultiVector>("1D", A->getRowMap(), galeriList);
+    fineLevel.Set("Coordinates", coordinates);
+
+    LocalOrdinal NSdim = 2;
+    RCP<MultiVector> nullSpace = MultiVectorFactory::Build(A->getRowMap(),NSdim);
+    nullSpace->randomize();
+    fineLevel.Set("Nullspace", nullSpace);
+
+    RCP<AmalgamationFactory> amalgFact = rcp(new AmalgamationFactory());
+    RCP<CoalesceDropFactory> dropFact = rcp(new CoalesceDropFactory());
+    dropFact->SetFactory("UnAmalgamationInfo", amalgFact);
+
+    RCP<ClassicalMapFactory> cmFact = rcp(new ClassicalMapFactory());
+    cmFact->SetFactory("Graph", dropFact);
+    cmFact->SetFactory("UnAmalgamationInfo", amalgFact);
+
+
+    Teuchos::ParameterList cp_params;
+    cp_params.set("aggregation: classical scheme","ext+i");
+    RCP<ClassicalPFactory> PFact = rcp(new ClassicalPFactory());
+    PFact->SetParameterList(cp_params);
+    PFact->SetFactory("UnAmalgamationInfo", amalgFact);
+    PFact->SetFactory("Graph", dropFact);
+    PFact->SetFactory("DofsPerNode", dropFact);
+    PFact->SetFactory("FC Splitting", cmFact);
+    PFact->SetFactory("CoarseMap", cmFact);
+
+    coarseLevel.Request("P",PFact.get());         // request P from the ClassicalPFactory
+    coarseLevel.Request(*PFact);
+    PFact->Build(fineLevel,coarseLevel);
+
+    RCP<Matrix> P;
+    coarseLevel.Get("P",P,PFact.get());
+
+  } //BuildP
+
+
+#  define MUELU_ETI_GROUP(Scalar, LO, GO, Node) \
+      TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ClassicalPFactory,Constructor,Scalar,LO,GO,Node) \
+      TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ClassicalPFactory,BuildP_Direct,Scalar,LO,GO,Node) 
+
+  // Disabled until we actually have code to run these
+#if 0
+      TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ClassicalPFactory,BuildP_ClassicalModified,Scalar,LO,GO,Node) \
+      TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ClassicalPFactory,BuildP_Ext,Scalar,LO,GO,Node)
+#endif
+
+#include <MueLu_ETI_4arg.hpp>
+
+
+} // namespace MueLuTests
diff --git a/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.cpp b/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.cpp
index 4668b3cfc97d..69fd22ead2eb 100644
--- a/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.cpp
+++ b/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.cpp
@@ -56,11 +56,14 @@
 #include "LOCA_Thyra_Group.H"
 #include "NOX_TpetraTypedefs.hpp"
 #include "NOX_Thyra_MultiVector.H"
+#include "Thyra_TpetraVectorSpace.hpp"
 #include "Thyra_TpetraMultiVector.hpp"
 #include "Thyra_TpetraLinearOp.hpp"
+#include "Thyra_DefaultLinearOpSource.hpp"
 #include "LOCA_Tpetra_LowRankUpdateRowMatrix.hpp"
 
 #include "Teuchos_ParameterList.hpp"
+#include "Teuchos_StandardParameterEntryValidators.hpp"
 #include "LOCA_BorderedSolver_LowerTriangularBlockElimination.H"
 #include "LOCA_BorderedSolver_UpperTriangularBlockElimination.H"
 #include "LOCA_Abstract_TransposeSolveGroup.H"
@@ -71,6 +74,16 @@
 // To suppress unreachable return warnings on cuda
 #include "Teuchos_CompilerCodeTweakMacros.hpp"
 
+// For debugging output
+#include <fstream>
+
+// Forward declaration needed for ParameterList validation
+namespace LOCA {
+  namespace MultiContinuation {
+    class ConstraintModelEvaluator;
+  }
+}
+
 // Utility for extracting tpetra vector from nox vector
 using ST = NOX::Scalar;
 using LO = NOX::LocalOrdinal;
@@ -155,9 +168,23 @@ TpetraHouseholder(const Teuchos::RCP<LOCA::GlobalData>& global_data,
   isComplex(false),
   omega(0.0)
 {
-  scale_rows = solverParams->get("Scale Augmented Rows", true);
+  Teuchos::ParameterList validParams;
+  validParams.set("Bordered Solver Method", "Householder");
+  validParams.set("Constraint Object",Teuchos::RCP<LOCA::MultiContinuation::ConstraintModelEvaluator>(Teuchos::null));
+  validParams.set("Constraint Parameter Names",Teuchos::RCP<std::vector<std::string>>(Teuchos::null));
+  validParams.set("Scale Augmented Rows", true);
+  Teuchos::setStringToIntegralParameter<int>("Preconditioner Method",
+                                             "Jacobian",
+                                             "Matrix to use for Preconditioning",
+                                             Teuchos::tuple<std::string> ("Jacobian","SMW"), // names must match the prec_method checks
+                                             &validParams);
+  validParams.set("Include UV In Preconditioner", false);
+  validParams.set("Use P For Preconditioner", false);
+  solverParams->validateParametersAndSetDefaults(validParams);
+
+  scale_rows = solverParams->get<bool>("Scale Augmented Rows");
   std::string prec_method =
-    solverParams->get("Preconditioner Method", "Jacobian");
+    solverParams->get<std::string>("Preconditioner Method");
   if (prec_method == "Jacobian")
     precMethod = JACOBIAN;
   else if (prec_method == "SMW")
@@ -166,10 +193,11 @@ TpetraHouseholder(const Teuchos::RCP<LOCA::GlobalData>& global_data,
     globalData->locaErrorCheck->throwError(
         "LOCA::BorderedSolver::TpetraHouseholder::TpetraHouseholder()",
         "Unknown preconditioner method!  Choices are Jacobian, SMW");
+
   includeUV =
-    solverParams->get("Include UV In Preconditioner", false);
+    solverParams->get<bool>("Include UV In Preconditioner");
   use_P_For_Prec =
-    solverParams->get("Use P For Preconditioner", false);
+    solverParams->get<bool>("Use P For Preconditioner");
 }
 
 LOCA::BorderedSolver::TpetraHouseholder::~TpetraHouseholder()
@@ -725,14 +753,61 @@ LOCA::BorderedSolver::TpetraHouseholder::solve(
     //                                                        false));
   }
 
-  // Overwrite J with J + U*V^T if it's a CRS matrix and we aren't
-  // using P for the preconditioner
-  Teuchos::RCP<NOX::TCrsMatrix> jac_crs;
+  // Allocate a separate matrix for the preconditioner. Don't want to
+  // corrupt J with U*V^T if not using P for Prec (we can't for
+  // tpetra).  Copy J values and add in U*V^T if it's a CRS matrix
+  // and we aren't using P for the preconditioner
   if (includeUV && !use_P_For_Prec) {
-    jac_crs = Teuchos::rcp_dynamic_cast<NOX::TCrsMatrix>(tpetraOp);
-    if (jac_crs != Teuchos::null) {
-      updateJacobianForPreconditioner(*U, *V, *jac_crs);
+    auto jac_crs = Teuchos::rcp_dynamic_cast<NOX::TCrsMatrix>(tpetraOp,true);
+    if (tpetraPrecMatrix.is_null()) {
+      tpetraPrecMatrix = Teuchos::rcp(new NOX::TCrsMatrix(*jac_crs,Teuchos::Copy));
+      Teuchos::RCP<::Thyra::VectorSpaceBase<double>> domain = ::Thyra::tpetraVectorSpace<double>(tpetraPrecMatrix->getDomainMap());
+      Teuchos::RCP<::Thyra::VectorSpaceBase<double>> range = ::Thyra::tpetraVectorSpace<double>(tpetraPrecMatrix->getRangeMap());
+      auto prec_thyra_op = Teuchos::rcp(new ::Thyra::TpetraLinearOp<ST,LO,GO,NT>);
+      prec_thyra_op->initialize(range,domain,tpetraPrecMatrix);
+      Teuchos::RCP<::Thyra::LinearOpBase<double>> tmp_for_thyra_ambiguity = prec_thyra_op;
+      prec_losb = Teuchos::rcp(new ::Thyra::DefaultLinearOpSource<double>(tmp_for_thyra_ambiguity));
+    }
+
+    // Copy J values into preconditioner matrix
+    //*tpetraPrecMatrix = *jac_crs;
+    {
+      tpetraPrecMatrix->resumeFill();
+      auto jac_view = jac_crs->getLocalMatrix().values;
+      auto prec_view = tpetraPrecMatrix->getLocalMatrix().values;
+      Kokkos::deep_copy(prec_view,jac_view);
+      tpetraPrecMatrix->fillComplete();
+    }
+
+    bool print_debug = false;
+    if (print_debug) {
+      std::fstream fsj("jac_matrix_before.out",std::fstream::out|std::fstream::trunc);
+      Teuchos::FancyOStream tfsj(Teuchos::rcpFromRef(fsj));
+      jac_crs->describe(tfsj,Teuchos::VERB_EXTREME);
+      std::fstream fsp("prec_matrix_before.out",std::fstream::out|std::fstream::trunc);
+      Teuchos::FancyOStream tfsp(Teuchos::rcpFromRef(fsp));
+      tpetraPrecMatrix->describe(tfsp,Teuchos::VERB_EXTREME);
+      std::fstream fsu("u_vector.out",std::fstream::out|std::fstream::trunc);
+      Teuchos::FancyOStream tfsu(Teuchos::rcpFromRef(fsu));
+      tpetra_U->describe(tfsu,Teuchos::VERB_EXTREME);
+      std::fstream fsv("v_vector.out",std::fstream::out|std::fstream::trunc);
+      Teuchos::FancyOStream tfsv(Teuchos::rcpFromRef(fsv));
+      tpetra_V->describe(tfsv,Teuchos::VERB_EXTREME);
+    }
+
+    // Update locally owned non-zero values for U*V^T
+    updateCrsMatrixForPreconditioner(*U, *V, *tpetraPrecMatrix);
+
+    if (print_debug) {
+      std::fstream fsj("jac_matrix_after.out",std::fstream::out|std::fstream::trunc);
+      Teuchos::FancyOStream tfsj(Teuchos::rcpFromRef(fsj));
+      jac_crs->describe(tfsj,Teuchos::VERB_EXTREME);
+      std::fstream fsp("prec_matrix_after.out",std::fstream::out|std::fstream::trunc);
+      Teuchos::FancyOStream tfsp(Teuchos::rcpFromRef(fsp));
+      tpetraPrecMatrix->describe(tfsp,Teuchos::VERB_EXTREME);
     }
+
+    grp->setPreconditionerMatrix(prec_losb);
   }
 
   // Set operator in solver to compute preconditioner
@@ -1032,13 +1107,58 @@ LOCA::BorderedSolver::TpetraHouseholder::computeUV(
 }
 
 void
-LOCA::BorderedSolver::TpetraHouseholder::updateJacobianForPreconditioner(
-              const NOX::Abstract::MultiVector& UU,
-              const NOX::Abstract::MultiVector& VV,
-              NOX::TCrsMatrix& jac) const
+LOCA::BorderedSolver::TpetraHouseholder::
+updateCrsMatrixForPreconditioner(const NOX::Abstract::MultiVector& UU,
+                                 const NOX::Abstract::MultiVector& VV,
+                                 NOX::TCrsMatrix& matrix) const
 {
-  TEUCHOS_TEST_FOR_EXCEPTION(true,std::runtime_error,
-                             "ERROR: LOCA::BorderedSolver::TpetraHouseholder::updateJacobianForPreconditioner - NOT IMPLEMENTED YET!");
+  matrix.resumeFill();
+
+  auto& UU_tpetra = NOX::Tpetra::getTpetraMultiVector(UU);
+  auto& VV_tpetra = NOX::Tpetra::getTpetraMultiVector(VV);
+  const_cast<NOX::TMultiVector&>(UU_tpetra).sync_device();
+  const_cast<NOX::TMultiVector&>(VV_tpetra).sync_device();
+  const auto uu = UU_tpetra.getLocalViewDevice();
+  const auto vv = VV_tpetra.getLocalViewDevice();
+
+  const auto numRows = matrix.getNodeNumRows();
+  const auto rowMap = matrix.getRowMap()->getLocalMap();
+  const auto colMap = matrix.getColMap()->getLocalMap();
+  const auto uMap = UU_tpetra.getMap()->getLocalMap();
+  const auto vMap = VV_tpetra.getMap()->getLocalMap();
+  auto J_view = matrix.getLocalMatrix();
+  auto numConstraintsLocal = numConstraints; // for cuda lambda capture
+
+  TEUCHOS_ASSERT(static_cast<size_t>(matrix.getRowMap()->getNodeNumElements()) == uu.extent(0));
+  TEUCHOS_ASSERT(static_cast<size_t>(matrix.getRowMap()->getNodeNumElements()) == vv.extent(0));
+  TEUCHOS_ASSERT(numConstraintsLocal == static_cast<int>(uu.extent(1)));
+  TEUCHOS_ASSERT(numConstraintsLocal == static_cast<int>(vv.extent(1)));
+
+  Kokkos::parallel_for("Add UV^T to M",Kokkos::RangePolicy<NOX::DeviceSpace>(0,numRows),KOKKOS_LAMBDA (const int row) {
+    const GO row_gid = rowMap.getGlobalElement(row);
+    const LO u_row_lid = uMap.getLocalElement(row_gid);
+    auto rowView = J_view.row(row);
+
+    const auto numEntries = rowView.length;
+    for (int col=0; col<numEntries; ++col) {
+
+      // Only include contributions from U*V^T whose column GID is present in V's local map
+      const GO col_gid = colMap.getGlobalElement(rowView.colidx(col));
+      int v_row_lid = vMap.getLocalElement(col_gid);
+      if (v_row_lid != ::Tpetra::Details::OrdinalTraits<LO>::invalid()) {
+
+        // val = sum_{k=0}^m U(i,k)*V(j,k)
+        ST val = 0.0;
+        for (int k=0; k<numConstraintsLocal; ++k)
+          val += uu(u_row_lid,k) * vv(v_row_lid,k);
+
+        // replace J(row,col) with J(row,col) + U*V^T
+        rowView.value(col) += val;
+      }
+    }
+  });
+  Kokkos::fence();
+  matrix.fillComplete();
 
   /*
   // Get number of rows on this processor
diff --git a/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.hpp b/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.hpp
index 670ce05d248a..de6091261da3 100644
--- a/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.hpp
+++ b/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.hpp
@@ -57,6 +57,9 @@
 #include "Teuchos_BLAS.hpp"                        // class data element
 
 // forward declarations
+namespace Thyra {
+  template<typename T> class DefaultLinearOpSource;
+}
 namespace LOCA {
   class GlobalData;
   namespace Parameter {
@@ -421,14 +424,18 @@ namespace LOCA {
                 NOX::Abstract::MultiVector& V,
                 bool use_jac_transpose);
 
+    public:
       /*!
        * \brief Overwrites the Jacobian \f$J\f$ with \f$J + U V^T\f$
        * for computing the preconditioner of \f$P\f$.
+       *
+       * NOTE: This should be a protected method, but cuda lambda forces this to be public!
        */
-      void updateJacobianForPreconditioner(const NOX::Abstract::MultiVector& U,
-                                           const NOX::Abstract::MultiVector& V,
-                                           NOX::TCrsMatrix& jac) const;
+      void updateCrsMatrixForPreconditioner(const NOX::Abstract::MultiVector& U,
+                                            const NOX::Abstract::MultiVector& V,
+                                            NOX::TCrsMatrix& mat) const;
 
+    protected:
       Teuchos::RCP<NOX::Abstract::MultiVector>
       createBlockMV(const NOX::Abstract::MultiVector& v) const;
 
@@ -534,7 +541,10 @@ namespace LOCA {
       Teuchos::RCP<NOX::TOperator> tpetraOp;
 
       //! Pointer to Tpetra Preconditioner operator
-      Teuchos::RCP<NOX::TRowMatrix> tpetraPrecOp;
+      mutable Teuchos::RCP<NOX::TCrsMatrix> tpetraPrecMatrix;
+
+      //! Thyra wrapped preconditioner matrix (tpetraPrecMatrix) for when includeUV is true and use_P_for_Prec is false
+      mutable Teuchos::RCP<::Thyra::DefaultLinearOpSource<double>> prec_losb;
 
       //! Number of constraint equations
       int numConstraints;
@@ -583,7 +593,6 @@ namespace LOCA {
 
       //! Frequency for complex systems
       double omega;
-
     };
   } // namespace BorderedSolver
 } // namespace LOCA
diff --git a/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.cpp b/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.cpp
index fe7e9c861c59..a8ddbb534a48 100644
--- a/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.cpp
+++ b/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.cpp
@@ -102,6 +102,44 @@ namespace LOCA {
     bool LowRankUpdateRowMatrix::supportsRowViews() const
     {return J_rowMatrix->supportsRowViews();}
 
+    void
+    LowRankUpdateRowMatrix::getGlobalRowCopy(NOX::GlobalOrdinal GlobalRow,
+                                             NOX::TRowMatrix::nonconst_global_inds_host_view_type &Indices,
+                                             NOX::TRowMatrix::nonconst_values_host_view_type &Values,
+                                             size_t &NumEntries) const
+    {
+      TEUCHOS_TEST_FOR_EXCEPTION(true,std::runtime_error,
+                                 "ERROR - LOCA::LowRankRowMatrix::getGlobalRowCopy() - NOT implemented yet!");
+    }
+
+    void
+    LowRankUpdateRowMatrix::getLocalRowCopy (NOX::LocalOrdinal LocalRow,
+                                             NOX::TRowMatrix::nonconst_local_inds_host_view_type &Indices,
+                                             NOX::TRowMatrix::nonconst_values_host_view_type &Values,
+                                             size_t &NumEntries) const
+    {
+      TEUCHOS_TEST_FOR_EXCEPTION(true,std::runtime_error,
+                                 "ERROR - LOCA::LowRankRowMatrix::getLocalRowCopy() - NOT implemented yet!");
+    }
+
+    void
+    LowRankUpdateRowMatrix::getGlobalRowView (NOX::GlobalOrdinal GlobalRow,
+                                              NOX::TRowMatrix::global_inds_host_view_type &indices,
+                                              NOX::TRowMatrix::values_host_view_type &values) const
+    {
+      TEUCHOS_TEST_FOR_EXCEPTION(true,std::runtime_error,
+                                 "ERROR - LOCA::LowRankRowMatrix::getGlobalRowView() - NOT implemented yet!");
+    }
+
+    void
+    LowRankUpdateRowMatrix::getLocalRowView(NOX::LocalOrdinal LocalRow,
+                                            NOX::TRowMatrix::local_inds_host_view_type &indices,
+                                            NOX::TRowMatrix::values_host_view_type &values) const
+    {
+      TEUCHOS_TEST_FOR_EXCEPTION(true,std::runtime_error,
+                                 "ERROR - LOCA::LowRankRowMatrix::getLocalRowView() - NOT implemented yet!");
+    }
+
     void
     LowRankUpdateRowMatrix::getGlobalRowCopy(NOX::GlobalOrdinal GlobalRow,
                                              const Teuchos::ArrayView<NOX::GlobalOrdinal> &Indices,
diff --git a/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.hpp b/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.hpp
index 28b091979b66..1ccf3984a935 100644
--- a/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.hpp
+++ b/packages/nox/src-loca/src-tpetra/LOCA_Tpetra_LowRankUpdateRowMatrix.hpp
@@ -91,6 +91,24 @@ namespace LOCA {
       virtual bool isFillComplete() const override;
       virtual bool supportsRowViews() const override;
       virtual void
+      getGlobalRowCopy (NOX::GlobalOrdinal GlobalRow,
+                        NOX::TRowMatrix::nonconst_global_inds_host_view_type &Indices,
+                        NOX::TRowMatrix::nonconst_values_host_view_type &Values,
+                        size_t &NumEntries) const override;
+      virtual void
+      getLocalRowCopy (NOX::LocalOrdinal LocalRow,
+                        NOX::TRowMatrix::nonconst_local_inds_host_view_type &Indices,
+                        NOX::TRowMatrix::nonconst_values_host_view_type &Values,
+                       size_t &NumEntries) const override;
+      virtual void
+      getGlobalRowView (NOX::GlobalOrdinal GlobalRow,
+                        NOX::TRowMatrix::global_inds_host_view_type &Indices,
+                        NOX::TRowMatrix::values_host_view_type &Values) const override;
+      virtual void
+      getLocalRowView (NOX::LocalOrdinal LocalRow,
+                       NOX::TRowMatrix::local_inds_host_view_type &Indices,
+                       NOX::TRowMatrix::values_host_view_type &Values) const override;
+      virtual void
       getGlobalRowCopy (NOX::GlobalOrdinal GlobalRow,
                         const Teuchos::ArrayView<NOX::GlobalOrdinal> &Indices,
                         const Teuchos::ArrayView<NOX::Scalar> &Values,
@@ -133,13 +151,13 @@ namespace LOCA {
       //***************************************
       // Derived from Tpetra::Operator interface
       //***************************************
-      virtual Teuchos::RCP<const NOX::TMap> getDomainMap() const;
-      virtual Teuchos::RCP<const NOX::TMap> getRangeMap() const;
+      virtual Teuchos::RCP<const NOX::TMap> getDomainMap() const override;
+      virtual Teuchos::RCP<const NOX::TMap> getRangeMap() const override;
       virtual void apply(const NOX::TMultiVector &X,
                          NOX::TMultiVector &Y,
                          Teuchos::ETransp mode = Teuchos::NO_TRANS,
                          NOX::Scalar alpha = Teuchos::ScalarTraits<NOX::Scalar>::one(),
-                         NOX::Scalar beta = Teuchos::ScalarTraits<NOX::Scalar>::zero()) const;
+                         NOX::Scalar beta = Teuchos::ScalarTraits<NOX::Scalar>::zero()) const override;
 
     protected:
 
diff --git a/packages/nox/src-thyra/NOX_Thyra_Group.C b/packages/nox/src-thyra/NOX_Thyra_Group.C
index f8839f7a4a8e..83dedfe0d0d5 100644
--- a/packages/nox/src-thyra/NOX_Thyra_Group.C
+++ b/packages/nox/src-thyra/NOX_Thyra_Group.C
@@ -436,9 +436,9 @@ void NOX::Thyra::Group::setJacobianOperator(const Teuchos::RCP<::Thyra::LinearOp
   lop_ = op;
 }
 
-void NOX::Thyra::Group::setPreconditionerOperator(const Teuchos::RCP<::Thyra::PreconditionerBase<double>>& op)
+void NOX::Thyra::Group::setPreconditionerMatrix(const Teuchos::RCP<const ::Thyra::DefaultLinearOpSource<double>>& op)
 {
-  prec_ = op;
+  losb_ = op;
 }
 
 void NOX::Thyra::Group::setX(const NOX::Abstract::Vector& y)
diff --git a/packages/nox/src-thyra/NOX_Thyra_Group.H b/packages/nox/src-thyra/NOX_Thyra_Group.H
index 6d8738bd2487..72e047c2fa7c 100644
--- a/packages/nox/src-thyra/NOX_Thyra_Group.H
+++ b/packages/nox/src-thyra/NOX_Thyra_Group.H
@@ -191,8 +191,8 @@ namespace NOX {
       /// Dangerous power user function for LOCA Householder bordered algorithm.
       void setJacobianOperator(const Teuchos::RCP<::Thyra::LinearOpBase<double>>& op);
 
-      /// Dangerous power user function for LOCA Householder bordered algorithm.
-      void setPreconditionerOperator(const Teuchos::RCP<::Thyra::PreconditionerBase<double>>& op);
+      /// Dangerous power user function for LOCA Householder bordered algorithm. Sets the matrix M that is used to initialize a Stratimikos preconditioner. NOTE: this sets the losb_ object that is used to update prec_!
+      void setPreconditionerMatrix(const Teuchos::RCP<const ::Thyra::DefaultLinearOpSource<double>>& op);
 
       /** @name "Compute" functions. */
       //@{
diff --git a/packages/nox/test/tpetra/CMakeLists.txt b/packages/nox/test/tpetra/CMakeLists.txt
index 46dd0b11452f..efb2602b15f6 100644
--- a/packages/nox/test/tpetra/CMakeLists.txt
+++ b/packages/nox/test/tpetra/CMakeLists.txt
@@ -50,6 +50,11 @@ IF(NOX_ENABLE_ABSTRACT_IMPLEMENTATION_THYRA AND
       SOURCES  ${UNIT_TEST_DRIVER} ME_Tpetra_1DFEM.hpp ME_Tpetra_1DFEM_def.hpp tTpetra_HouseholderBorderedSolve.cpp
       )
 
+    TRIBITS_ADD_EXECUTABLE_AND_TEST(
+      Tpetra_HouseholderBorderedSolve_WithUVInPrec
+      SOURCES  ${UNIT_TEST_DRIVER} ME_Tpetra_1DFEM.hpp ME_Tpetra_1DFEM_def.hpp tTpetra_HouseholderBorderedSolve_WithUVInPrec.cpp
+      )
+
     TRIBITS_ADD_EXECUTABLE_AND_TEST(
       Tpetra_ConstraintModelEvaluator
       SOURCES  ${UNIT_TEST_DRIVER} ME_Tpetra_1DFEM.hpp ME_Tpetra_1DFEM_def.hpp tTpetra_ConstraintModelEvaluator.cpp
diff --git a/packages/nox/test/tpetra/tTpetra_HouseholderBorderedSolve_WithUVInPrec.cpp b/packages/nox/test/tpetra/tTpetra_HouseholderBorderedSolve_WithUVInPrec.cpp
new file mode 100644
index 000000000000..0710b2ce08d2
--- /dev/null
+++ b/packages/nox/test/tpetra/tTpetra_HouseholderBorderedSolve_WithUVInPrec.cpp
@@ -0,0 +1,310 @@
+//@HEADER
+// ************************************************************************
+//
+//            NOX: An Object-Oriented Nonlinear Solver Package
+//                 Copyright (2002) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Roger Pawlowski (rppawlo@sandia.gov) or
+// Eric Phipps (etphipp@sandia.gov), Sandia National Laboratories.
+// ************************************************************************
+//  CVS Information
+//  $Source$
+//  $Author$
+//  $Date$
+//  $Revision$
+// ************************************************************************
+//@HEADER
+#include "Teuchos_ConfigDefs.hpp"
+#include "Teuchos_UnitTestHarness.hpp"
+#include "Teuchos_StackedTimer.hpp"
+
+// NOX Objects
+#include "NOX.H"
+#include "NOX_Thyra.H"
+
+// Trilinos Objects
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_ParameterList.hpp"
+#include "Teuchos_RCP.hpp"
+#include "Teuchos_FancyOStream.hpp"
+#include "Teuchos_AbstractFactoryStd.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Vector.hpp"
+
+#include "BelosTypes.hpp"
+#include "Stratimikos_DefaultLinearSolverBuilder.hpp"
+#include "Thyra_LinearOpWithSolveFactoryHelpers.hpp"
+#include "Thyra_Ifpack2PreconditionerFactory.hpp"
+#include "ME_Tpetra_1DFEM.hpp"
+
+#include "NOX_Thyra_MatrixFreeJacobianOperator.hpp"
+#include "NOX_MatrixFree_ModelEvaluatorDecorator.hpp"
+#include "NOX_TpetraTypedefs.hpp"
+#include "LOCA_Tpetra_Factory.hpp"
+#include "LOCA_Thyra_Group.H"
+#include "LOCA_MultiContinuation_ConstrainedGroup.H"
+#include "LOCA_Tpetra_ConstraintModelEvaluator.hpp"
+#include "LOCA_Parameter_SublistParser.H"
+#include "NOX_SolverStats.hpp"
+
+// For solution io
+#include "Thyra_TpetraVector.hpp"
+#include <iostream>
+#include <fstream>
+
+TEUCHOS_UNIT_TEST(NOX_Tpetra_Householder, BasicSolve)
+{
+  Teuchos::RCP<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
+
+  // Use the Tpetra template types that NOX is configured with
+  using Scalar = NOX::Scalar;
+  using LO = NOX::LocalOrdinal;
+  using GO = NOX::GlobalOrdinal;
+  using Node = NOX::NodeType;
+
+  // Create the model evaluator object (100 global elements; x00 and x01 are forwarded to the factory)
+  Scalar x00 = 0.0;
+  Scalar x01 = 1.0;
+  const Tpetra::global_size_t numGlobalElements = 100;
+  Teuchos::RCP<EvaluatorTpetra1DFEM<Scalar,LO,GO,Node> > model =
+    evaluatorTpetra1DFEM<Scalar,LO,GO,Node>(comm, numGlobalElements, x00, x01);
+
+  // Build a Stratimikos Belos (Pseudo Block GMRES) solver with an Ifpack2 ILUT preconditioner and register it on the model evaluator
+  {
+    Stratimikos::DefaultLinearSolverBuilder builder;
+    typedef Thyra::PreconditionerFactoryBase<Scalar> Base;
+    typedef Thyra::Ifpack2PreconditionerFactory<Tpetra::CrsMatrix<Scalar, LO, GO, Node> > Impl;
+    builder.setPreconditioningStrategyFactory(Teuchos::abstractFactoryStd<Base, Impl>(), "Ifpack2");
+
+    Teuchos::RCP<Teuchos::ParameterList> p = Teuchos::parameterList();
+    p->set("Linear Solver Type", "Belos");
+    Teuchos::ParameterList& belosList = p->sublist("Linear Solver Types").sublist("Belos");
+    belosList.set("Solver Type", "Pseudo Block GMRES");
+    belosList.sublist("Solver Types").sublist("Pseudo Block GMRES").set<int>("Maximum Iterations", 200);
+    belosList.sublist("Solver Types").sublist("Pseudo Block GMRES").set<int>("Num Blocks", 200);
+    belosList.sublist("Solver Types").sublist("Pseudo Block GMRES").set("Verbosity", Belos::Errors+Belos::IterationDetails+Belos::FinalSummary);
+    belosList.sublist("Solver Types").sublist("Pseudo Block GMRES").set("Output Frequency", 5);
+    belosList.sublist("VerboseObject").set("Verbosity Level", "medium");
+    p->set("Preconditioner Type", "Ifpack2");
+    // Set "Preconditioner Type" to "None" instead to solve without a preconditioner.
+    Teuchos::ParameterList& ifpackList = p->sublist("Preconditioner Types").sublist("Ifpack2");
+    ifpackList.set("Prec Type", "ILUT");
+
+    builder.setParameterList(p);
+
+    Teuchos::RCP<Thyra::LinearOpWithSolveFactoryBase<Scalar> >
+      lowsFactory = builder.createLinearSolveStrategy("");
+
+    model->set_W_factory(lowsFactory);
+  }
+
+  // Create the initial guess: a clone of the nominal x with every entry set to one
+  Teuchos::RCP<Thyra::VectorBase<Scalar> >
+    initial_guess = model->getNominalValues().get_x()->clone_v();
+  Thyra::V_S(initial_guess.ptr(),Teuchos::ScalarTraits<Scalar>::one());
+
+  // Create top level nox/loca solver parameter list
+  Teuchos::RCP<Teuchos::ParameterList> pList = Teuchos::parameterList("Top Level");
+
+  // Create nox parameter list
+  auto& nl_params = pList->sublist("NOX");
+  nl_params.set("Nonlinear Solver", "Line Search Based");
+  nl_params.sublist("Direction").sublist("Newton").sublist("Linear Solver").set("Tolerance", 1.0e-8);
+  auto& ls_params = nl_params.sublist("Line Search");
+  ls_params.set("Method","Full Step");
+  auto& output_list = nl_params.sublist("Printing").sublist("Output Information");
+  output_list.set("Debug",true);
+  output_list.set("Warning",true);
+  output_list.set("Error",true);
+  output_list.set("Test Details",true);
+  output_list.set("Details",true);
+  output_list.set("Parameters",true);
+  output_list.set("Linear Solver Details",true);
+  output_list.set("Inner Iteration",true);
+  output_list.set("Outer Iteration",true);
+  output_list.set("Outer Iteration StatusTest",true);
+
+  // Create the LOCA Group:
+  // (NOX::Thyra::Group-->LOCA::Thyra::Group-->LOCA::MultiContinuation::ConstrainedGroup)
+  // For Tpetra Householder, we need to actively set the
+  // preconditioner and preconditioner factory so that it uses the
+  // precOp separate from the Jacobian operator. Householder replaces
+  // the Jacobian operator with a matrix-free version that has the
+  // uv^T tacked on.
+  auto explicit_jacobian = model->create_W_op();
+  auto prec_matrix = Teuchos::rcp(new Thyra::DefaultPreconditioner<NOX::Scalar>(Teuchos::null,explicit_jacobian));
+  TEST_ASSERT(nonnull(model->get_W_factory()->getPreconditionerFactory()));
+  Teuchos::RCP<NOX::Thyra::Group> nox_group =
+    Teuchos::rcp(new NOX::Thyra::Group(*initial_guess,
+                                       model,
+                                       explicit_jacobian,
+                                       model->get_W_factory(),
+                                       prec_matrix, // Reuse Jac for approx preconditioner
+                                       model->get_W_factory()->getPreconditionerFactory(),
+                                       Teuchos::null));
+
+  Teuchos::RCP<LOCA::Abstract::Factory> tpetra_factory = Teuchos::rcp(new LOCA::Tpetra::Factory);
+
+  Teuchos::RCP<LOCA::GlobalData> global_data = LOCA::createGlobalData(pList, tpetra_factory);
+
+  Teuchos::RCP<LOCA::ParameterVector> p_vec = Teuchos::rcp(new LOCA::ParameterVector);
+  p_vec->addParameter("k", 1.0); // Source term multiplier
+  p_vec->addParameter("T_left", 1.2); // NOTE(review): presumably the left boundary value (old comment was copied from "k") - confirm
+
+  std::vector<int> me_p_indices;
+  me_p_indices.push_back(2);
+  me_p_indices.push_back(4);
+  Teuchos::RCP<LOCA::Thyra::Group> loca_group = Teuchos::rcp(new LOCA::Thyra::Group(global_data,
+                                                                                    *nox_group,
+                                                                                    *p_vec,
+                                                                                    me_p_indices));
+
+  auto g_names = Teuchos::rcp(new std::vector<std::string>);
+  g_names->push_back("Constraint: T_right=2");
+  g_names->push_back("Constraint: 2*T_left=T_right");
+  auto x_thyra = ::Thyra::createMember(model->get_x_space(),"x");
+  NOX::Thyra::Vector x(x_thyra);
+  auto constraints = Teuchos::rcp(new LOCA::MultiContinuation::ConstraintModelEvaluator(model,*p_vec,*g_names,x));
+
+  // Set initial parameter values (the same values that were registered in p_vec above)
+  constraints->setX(x);
+  constraints->setParam(0,1.0);
+  constraints->setParam(1,1.2);
+
+  // Create the constraints list (Householder bordered solve with "Include UV In Preconditioner" enabled)
+  auto& locaParamsList = pList->sublist("LOCA");
+  auto& constraint_list = locaParamsList.sublist("Constraints");
+  constraint_list.set("Bordered Solver Method", "Householder");
+  constraint_list.set("Constraint Object", constraints);
+  constraint_list.set("Constraint Parameter Names", g_names);
+  constraint_list.set("Include UV In Preconditioner",true);
+
+  auto loca_parser = Teuchos::rcp(new LOCA::Parameter::SublistParser(global_data));
+  loca_parser->parseSublists(pList);
+
+  std::vector<int> param_ids(2);
+  param_ids[0] = 0;
+  param_ids[1] = 1;
+  auto constraint_list_ptr = Teuchos::rcpFromRef(constraint_list);
+  Teuchos::RCP<LOCA::MultiContinuation::ConstrainedGroup> loca_constrained_group =
+    Teuchos::rcp(new LOCA::MultiContinuation::ConstrainedGroup(global_data,
+                                                               loca_parser,
+                                                               constraint_list_ptr,
+                                                               loca_group,
+                                                               constraints,
+                                                               param_ids,
+                                                               false));
+
+  loca_constrained_group->computeF();
+
+  // Create the NOX status tests: converged = (NormF AND NormWRMS);
+  // the top-level combo is (FiniteValue OR converged OR MaxIters(10))
+  Teuchos::RCP<NOX::StatusTest::NormF> absresid =
+    Teuchos::rcp(new NOX::StatusTest::NormF(1.0e-8));
+  Teuchos::RCP<NOX::StatusTest::NormWRMS> wrms =
+    Teuchos::rcp(new NOX::StatusTest::NormWRMS(1.0e-2, 1.0e-8));
+  Teuchos::RCP<NOX::StatusTest::Combo> converged =
+    Teuchos::rcp(new NOX::StatusTest::Combo(NOX::StatusTest::Combo::AND));
+  converged->addStatusTest(absresid);
+  converged->addStatusTest(wrms);
+  Teuchos::RCP<NOX::StatusTest::MaxIters> maxiters =
+    Teuchos::rcp(new NOX::StatusTest::MaxIters(10));
+  Teuchos::RCP<NOX::StatusTest::FiniteValue> fv =
+    Teuchos::rcp(new NOX::StatusTest::FiniteValue);
+  Teuchos::RCP<NOX::StatusTest::Combo> combo =
+    Teuchos::rcp(new NOX::StatusTest::Combo(NOX::StatusTest::Combo::OR));
+  combo->addStatusTest(fv);
+  combo->addStatusTest(converged);
+  combo->addStatusTest(maxiters);
+
+  // Create the solver. It is driven by the constrained group
+  // (loca_constrained_group), not by the underlying nox_group or
+  // loca_group, and reads its settings from the "NOX" sublist of pList.
+  auto solver = NOX::Solver::buildSolver(loca_constrained_group, combo, Teuchos::rcpFromRef(pList->sublist("NOX")));
+
+  NOX::StatusTest::StatusType solvStatus = solver->solve();
+
+  // Output: stop the base stacked timer and report the timings to the test output stream
+  {
+    Teuchos::TimeMonitor::getStackedTimer()->stopBaseTimer();
+    Teuchos::StackedTimer::OutputOptions options;
+    options.output_fraction = true;
+    options.output_minmax = true;
+    Teuchos::TimeMonitor::getStackedTimer()->report(out,comm,options);
+  }
+
+  // Write solution to file: ranks take turns in rank order (rank 0 truncates the file, the others append)
+  const bool printSolution = true;
+  if (printSolution) {
+    for (int i=0; i < comm->getSize(); ++i) {
+      if (comm->getRank() == i) {
+        std::ofstream file;
+        if (comm->getRank() == 0)
+          file.open("householder_solution.txt",std::ios::trunc);
+        else
+          file.open("householder_solution.txt",std::ios::app);
+        
+        const auto& final_x = solver->getSolutionGroup().getX();
+        const auto& final_x_nox = *(dynamic_cast<const LOCA::MultiContinuation::ExtendedVector&>(final_x).getXVec());
+        const auto& final_x_thyra = dynamic_cast<const NOX::Thyra::Vector&>(final_x_nox).getThyraVector();
+        const auto& final_x_tpetra_const = *(dynamic_cast<const ::Thyra::TpetraVector<NOX::Scalar,NOX::LocalOrdinal,NOX::GlobalOrdinal,NOX::NodeType>&>(final_x_thyra).getConstTpetraVector());
+        auto& final_x_tpetra = const_cast<::Tpetra::Vector<NOX::Scalar,NOX::LocalOrdinal,NOX::GlobalOrdinal,NOX::NodeType>&>(final_x_tpetra_const);
+        final_x_tpetra.sync_host();
+        const auto& final_x_view = final_x_tpetra.getLocalViewHost();
+        for (size_t j=0; j < final_x_view.extent(0); ++j)
+          file << final_x_view(j,0) << std::endl;
+      }
+      comm->barrier();
+    }
+  }
+
+  TEST_ASSERT(solvStatus == NOX::StatusTest::Converged);
+  TEST_EQUALITY(solver->getSolverStatistics()->numNonlinearIterations,5);
+
+  // Check the final parameter values of the constrained group against reference values (tolerance 1e-3)
+  {
+    const auto& group = solver->getSolutionGroup();
+    const auto& c_group = dynamic_cast<const LOCA::MultiContinuation::ConstrainedGroup&>(group);
+
+    out << "\nFinal Parameter Value for \"k\" = " << std::setprecision(10) << c_group.getParam(0) << std::endl;
+    out << "Final Parameter Value for \"T_left\" = " << std::setprecision(10) << c_group.getParam(1) << std::endl;
+
+    const double tol = 1.0e-3;
+    TEST_FLOATING_EQUALITY(c_group.getParam(0),-0.5993277206,tol);
+    TEST_FLOATING_EQUALITY(c_group.getParam(1),1.0,tol);
+  }
+
+  // Breaks RCP cyclic dependency
+  LOCA::destroyGlobalData(global_data);
+}
diff --git a/packages/panzer/doc/Doxyfile b/packages/panzer/doc/Doxyfile
index 15f4409367e7..5b06862638a5 100644
--- a/packages/panzer/doc/Doxyfile
+++ b/packages/panzer/doc/Doxyfile
@@ -11,7 +11,7 @@ TAGFILES += \
   $(TRILINOS_HOME)/packages/common/tag_files/ifpack.tag=$(TRILINOS_HOME)/packages/ifpack/doc/html \
   $(TRILINOS_HOME)/packages/common/tag_files/ml.tag=$(TRILINOS_HOME)/packages/ml/doc/html \
   $(TRILINOS_HOME)/packages/common/tag_files/nox.tag=$(TRILINOS_HOME)/packages/nox/doc/html \
-  $(TRILINOS_HOME)/packages/common/tag_files/ml.tag=$(TRILINOS_HOME)/packages/phalanx/doc/html
+  $(TRILINOS_HOME)/packages/common/tag_files/phalanx.tag=$(TRILINOS_HOME)/packages/phalanx/doc/html
 #
 # Package options
 #
diff --git a/packages/percept/src/adapt/FixSideSets.cpp b/packages/percept/src/adapt/FixSideSets.cpp
index e1156fb68b40..d3d8a81b9656 100644
--- a/packages/percept/src/adapt/FixSideSets.cpp
+++ b/packages/percept/src/adapt/FixSideSets.cpp
@@ -881,6 +881,11 @@ namespace percept {
         reduced_mod_end = false;
       (void)reduced_mod_end;
 
+      bool skip_side_part_fixes = false;
+      if (m_eMesh.getProperty("Refiner_skip_side_part_fixes") == "true")
+        skip_side_part_fixes = true;
+
+
       // loop over all sides that are leaves (not parent or have no family tree),
       //   loop over their nodes and their associated elements,
       //     connect element and side if they share a face
@@ -909,6 +914,7 @@ namespace percept {
       fix_permutation(side_set);
 
       end_begin(msg+"moveSides");
-      move_sides_to_correct_surfaces();
+      if (!skip_side_part_fixes)
+        move_sides_to_correct_surfaces();
     }
 }
diff --git a/packages/percept/src/adapt/SerializeNodeRegistry.hpp b/packages/percept/src/adapt/SerializeNodeRegistry.hpp
index 43123ad13519..02e7b260030d 100644
--- a/packages/percept/src/adapt/SerializeNodeRegistry.hpp
+++ b/packages/percept/src/adapt/SerializeNodeRegistry.hpp
@@ -322,10 +322,10 @@
               {
                 for(YAML::const_iterator iter=doc.begin();iter!=doc.end();++iter)
                   {
-                    const YAML::Node& key = iter->first;
+                    const YAML::Node key = iter->first;
                     PartName part_name = key.as<PartName>();
 
-                    const YAML::Node& valSeq = iter->second;
+                    const YAML::Node valSeq = iter->second;
                     UInt rank_input;
                     TopologyName topo_name;
                     YAML::const_iterator itv=valSeq.begin();
@@ -334,7 +334,7 @@
                     topo_name = itv->as<TopologyName>();
                     ++itv;
                     stk::mesh::EntityRank rank = static_cast<stk::mesh::EntityRank>(rank_input);
-                    const YAML::Node& subsetSeq = *itv;
+                    const YAML::Node subsetSeq = *itv;
                     YAML::const_iterator iss;
                     PartSubsets subsets;
                     for (iss = subsetSeq.begin(); iss != subsetSeq.end(); ++iss)
@@ -503,10 +503,10 @@
               {
                 for(YAML::const_iterator iter=doc.begin();iter!=doc.end();++iter)
                   {
-                    const YAML::Node& key = iter->first;
+                    const YAML::Node key = iter->first;
                     stk::mesh::EntityId id = key.as<stk::mesh::EntityId>();
                     NodeMapValue procs;
-                    const YAML::Node& val = iter->second;
+                    const YAML::Node val = iter->second;
                     procs = val.as<NodeMapValue>();
                     //std::cout << "readNodeMap id= " << id << " procs= " << procs << std::endl;
                     if (is_local && procs.size() != 1)
@@ -1645,7 +1645,7 @@
 
                   //if (DEBUG_YAML) std::cout << "it.first().Type() = " << it.first().Type() << " it.first().Tag()= " << it.first().Tag() << std::endl;
                   //if (DEBUG_YAML) std::cout << "it.second().Type() = " << it.second().Type() << " it.second().Tag()= " << it.second().Tag() << std::endl;
-                  const YAML::Node& keySeq = it->first;
+                  const YAML::Node keySeq = it->first;
                   for(YAML::const_iterator itk=keySeq.begin();itk!=keySeq.end();++itk) {
                     key_quantum = itk->as<SDCEntityType_ID>();
                     if (DEBUG_YAML) std::cout << "s_r key_quantum= " << key_quantum << std::endl;
@@ -1667,7 +1667,7 @@
                   }
 
                   int iseq=0;
-                  const YAML::Node& valSeq = it->second;
+                  const YAML::Node valSeq = it->second;
                   stk::mesh::EntityRank rank = stk::topology::INVALID_RANK;
                   size_t id;
                   for(YAML::const_iterator itv=valSeq.begin();itv!=valSeq.end();++itv,++iseq) {
diff --git a/packages/percept/src/adapt/TransitionElementAdapter.hpp b/packages/percept/src/adapt/TransitionElementAdapter.hpp
index 4ed706754a81..bc397071a7ff 100644
--- a/packages/percept/src/adapt/TransitionElementAdapter.hpp
+++ b/packages/percept/src/adapt/TransitionElementAdapter.hpp
@@ -26,7 +26,7 @@
 #include "/usr/netpub/valgrind-3.8.1/include/valgrind/callgrind.h"
 #endif
 
-#define DO_ALT_TIMER 1
+#define DO_ALT_TIMER 0
 
 #define TIMING(code) code
 #define TIMER(name) stk::diag::Timer timer ## name ( #name, Base::rootTimer());  stk::diag::TimeBlock tbTimer ## name (timer ## name)
diff --git a/packages/percept/src/adapt/UniformRefinerPattern.cpp b/packages/percept/src/adapt/UniformRefinerPattern.cpp
index c6c9f079d804..808e82278751 100644
--- a/packages/percept/src/adapt/UniformRefinerPattern.cpp
+++ b/packages/percept/src/adapt/UniformRefinerPattern.cpp
@@ -865,6 +865,9 @@
       if (eMesh.get_spatial_dim() == 2)
         return;
 
+      if (eMesh.getProperty("Refiner_skip_side_part_fixes") == "true")
+        return;
+
       std::vector<const stk::mesh::Part*> surfaces = eMesh.get_fem_meta_data()->get_surfaces_in_surface_to_block_map();
       for (unsigned isu = 0; isu < surfaces.size(); ++isu)
         {
diff --git a/packages/percept/src/adapt/main/MeshAdapt.cpp b/packages/percept/src/adapt/main/MeshAdapt.cpp
index d7eabdaa3d9b..e782d24d20b7 100644
--- a/packages/percept/src/adapt/main/MeshAdapt.cpp
+++ b/packages/percept/src/adapt/main/MeshAdapt.cpp
@@ -2352,7 +2352,7 @@ void MeshAdapt::initialize_m2g_geometry(std::string input_geometry)
       bool toDeclare = true;
 
       int lowestRank = std::numeric_limits<int>::max();
-      std::vector<stk::mesh::EntityKey> keysToCheck;
+      std::vector<stk::mesh::Entity> entitiesToCheck;
      
       procsSharedTo.clear(); //std::vector<int> procsSharedTo;
 
@@ -2361,11 +2361,10 @@ void MeshAdapt::initialize_m2g_geometry(std::string input_geometry)
 
         cur_node = bd->get_entity(stk::topology::NODE_RANK, shellNodeIDs[j]);
 
-        stk::mesh::EntityKey key = bd->entity_key(cur_node);
-        keysToCheck.push_back(key);
+        entitiesToCheck.push_back(cur_node);
       }
 
-      bd->shared_procs_intersection(keysToCheck, procsSharedTo);
+      bd->shared_procs_intersection(entitiesToCheck, procsSharedTo);
       procsSharedTo.push_back(THIS_PROC_NUM);//find all processes that either own or have these nodes shared to them
       for (size_t iii = 0; iii < procsSharedTo.size(); iii++) {
         if (procsSharedTo[iii] < lowestRank)
diff --git a/packages/percept/src/percept/PerceptMesh.cpp b/packages/percept/src/percept/PerceptMesh.cpp
index ed9b33802bd6..b1af440fd263 100644
--- a/packages/percept/src/percept/PerceptMesh.cpp
+++ b/packages/percept/src/percept/PerceptMesh.cpp
@@ -2142,9 +2142,12 @@
     void PerceptMesh::
     createEntities(stk::mesh::EntityRank entityRank, int count, std::vector<stk::mesh::Entity>& requested_entities)
     {
-      std::vector<size_t> requests(  m_metaData->entity_rank_count() , 0 );
-      requests[entityRank] = count;
-      get_bulk_data()->generate_new_entities( requests, requested_entities );
+      std::vector<stk::mesh::EntityId> requestedIds;
+      get_bulk_data()->generate_new_ids(entityRank, count, requestedIds);
+      stk::mesh::PartVector addParts;
+      requested_entities.clear();
+      get_bulk_data()->declare_entities(entityRank, requestedIds, addParts, requested_entities);
+
       if (entityRank == node_rank())
         {
           stk::mesh::Part& nodePart = get_fem_meta_data()->get_topology_root_part(stk::topology::NODE);
@@ -7206,8 +7209,8 @@
         {
           std::string K, V;
           for (YAML::const_iterator i = node.begin(); i != node.end(); ++i) {
-            const YAML::Node & key   = i->first;
-            const YAML::Node & value = i->second;
+            const YAML::Node key   = i->first;
+            const YAML::Node value = i->second;
             K = key.as<std::string>();
             V = value.as<std::string>();
             setProperty(K, V);
diff --git a/packages/percept/src/percept/ShardsInterfaceTable.hpp b/packages/percept/src/percept/ShardsInterfaceTable.hpp
index 0ff90232842a..08369accd227 100644
--- a/packages/percept/src/percept/ShardsInterfaceTable.hpp
+++ b/packages/percept/src/percept/ShardsInterfaceTable.hpp
@@ -25,7 +25,6 @@
 #include <stk_mesh/base/Field.hpp>
 
 #include <stk_mesh/base/CoordinateSystems.hpp>
-#include <stk_mesh/base/Stencils.hpp>
 #include <stk_mesh/base/TopologyDimensions.hpp>
 
 
diff --git a/packages/percept/src/percept/YamlUtils.hpp b/packages/percept/src/percept/YamlUtils.hpp
index 8ae8c6e16f0b..6d0f60baec2a 100644
--- a/packages/percept/src/percept/YamlUtils.hpp
+++ b/packages/percept/src/percept/YamlUtils.hpp
@@ -72,8 +72,8 @@
           case YAML::NodeType::Map:
             emout << YAML::BeginMap ;
             for (YAML::const_iterator i = node.begin(); i != node.end(); ++i) {
-              const YAML::Node & key   = i->first;
-              const YAML::Node & value = i->second;
+              const YAML::Node key   = i->first;
+              const YAML::Node value = i->second;
               out = key.as<std::string>();
               emout << YAML::Key << out;
               emout << YAML::Value;
diff --git a/packages/percept/src/percept/fixtures/BeamFixture.hpp b/packages/percept/src/percept/fixtures/BeamFixture.hpp
index 26b213a1a617..97ece636ed53 100644
--- a/packages/percept/src/percept/fixtures/BeamFixture.hpp
+++ b/packages/percept/src/percept/fixtures/BeamFixture.hpp
@@ -21,7 +21,6 @@
 #include <stk_mesh/base/Field.hpp>
 
 #include <stk_mesh/base/CoordinateSystems.hpp>
-#include <stk_mesh/base/Stencils.hpp>
 #include <stk_mesh/base/TopologyDimensions.hpp>
 #include <stk_mesh/base/MetaData.hpp>
 #include <stk_mesh/base/FEMHelpers.hpp>
diff --git a/packages/percept/src/percept/fixtures/HeterogeneousFixture.hpp b/packages/percept/src/percept/fixtures/HeterogeneousFixture.hpp
index 27b5136849dd..f4b4b75b8cbb 100644
--- a/packages/percept/src/percept/fixtures/HeterogeneousFixture.hpp
+++ b/packages/percept/src/percept/fixtures/HeterogeneousFixture.hpp
@@ -21,7 +21,6 @@
 #include <stk_mesh/base/Field.hpp>
 
 #include <stk_mesh/base/CoordinateSystems.hpp>
-#include <stk_mesh/base/Stencils.hpp>
 #include <stk_mesh/base/TopologyDimensions.hpp>
 
 #include <stk_mesh/base/Part.hpp>
diff --git a/packages/percept/src/percept/fixtures/PyramidFixture.hpp b/packages/percept/src/percept/fixtures/PyramidFixture.hpp
index dbc5bdda5a4c..73d78e54a313 100644
--- a/packages/percept/src/percept/fixtures/PyramidFixture.hpp
+++ b/packages/percept/src/percept/fixtures/PyramidFixture.hpp
@@ -21,7 +21,6 @@
 #include <stk_mesh/base/Field.hpp>
 
 #include <stk_mesh/base/CoordinateSystems.hpp>
-#include <stk_mesh/base/Stencils.hpp>
 #include <stk_mesh/base/TopologyDimensions.hpp>
 
 #include <stk_mesh/base/Part.hpp>
diff --git a/packages/percept/src/percept/fixtures/QuadFixture.hpp b/packages/percept/src/percept/fixtures/QuadFixture.hpp
index c048c32f88fb..11583bb60e16 100644
--- a/packages/percept/src/percept/fixtures/QuadFixture.hpp
+++ b/packages/percept/src/percept/fixtures/QuadFixture.hpp
@@ -32,7 +32,6 @@
 
 #include <stk_mesh/base/BulkModification.hpp>
 
-#include <stk_mesh/base/Stencils.hpp>
 #include <stk_mesh/base/BoundaryAnalysis.hpp>
 #include <stk_io/IossBridge.hpp>
 
diff --git a/packages/percept/src/percept/fixtures/SingleTetFixture.hpp b/packages/percept/src/percept/fixtures/SingleTetFixture.hpp
index 56ed34a69c7a..554e9ccf58ad 100644
--- a/packages/percept/src/percept/fixtures/SingleTetFixture.hpp
+++ b/packages/percept/src/percept/fixtures/SingleTetFixture.hpp
@@ -21,7 +21,6 @@
 #include <stk_mesh/base/Field.hpp>
 
 #include <stk_mesh/base/CoordinateSystems.hpp>
-#include <stk_mesh/base/Stencils.hpp>
 #include <stk_mesh/base/TopologyDimensions.hpp>
 
 #include <stk_mesh/base/Part.hpp>
diff --git a/packages/percept/src/percept/fixtures/TetWedgeFixture.hpp b/packages/percept/src/percept/fixtures/TetWedgeFixture.hpp
index e6b2c7f8c926..f4e07767519a 100644
--- a/packages/percept/src/percept/fixtures/TetWedgeFixture.hpp
+++ b/packages/percept/src/percept/fixtures/TetWedgeFixture.hpp
@@ -21,7 +21,6 @@
 #include <stk_mesh/base/Field.hpp>
 
 #include <stk_mesh/base/CoordinateSystems.hpp>
-#include <stk_mesh/base/Stencils.hpp>
 #include <stk_mesh/base/TopologyDimensions.hpp>
 
 #include <stk_mesh/base/Part.hpp>
diff --git a/packages/percept/src/percept/fixtures/TriQuadSurfaceMesh3D.hpp b/packages/percept/src/percept/fixtures/TriQuadSurfaceMesh3D.hpp
index c9df7c41fd89..960d4891258c 100644
--- a/packages/percept/src/percept/fixtures/TriQuadSurfaceMesh3D.hpp
+++ b/packages/percept/src/percept/fixtures/TriQuadSurfaceMesh3D.hpp
@@ -21,7 +21,6 @@
 #include <stk_mesh/base/Field.hpp>
 
 #include <stk_mesh/base/CoordinateSystems.hpp>
-#include <stk_mesh/base/Stencils.hpp>
 #include <stk_mesh/base/TopologyDimensions.hpp>
 
 #include <stk_mesh/base/Part.hpp>
diff --git a/packages/percept/src/percept/fixtures/WedgeFixture.hpp b/packages/percept/src/percept/fixtures/WedgeFixture.hpp
index 333f5b5addd6..722f742c2f07 100644
--- a/packages/percept/src/percept/fixtures/WedgeFixture.hpp
+++ b/packages/percept/src/percept/fixtures/WedgeFixture.hpp
@@ -28,7 +28,6 @@
 
 #include <stk_mesh/base/BulkModification.hpp>
 
-#include <stk_mesh/base/Stencils.hpp>
 #include <stk_mesh/base/BoundaryAnalysis.hpp>
 #include <stk_io/IossBridge.hpp>
 
diff --git a/packages/percept/src/percept/function/MDArray.hpp b/packages/percept/src/percept/function/MDArray.hpp
index 2309f429e05b..b17779c3ee68 100644
--- a/packages/percept/src/percept/function/MDArray.hpp
+++ b/packages/percept/src/percept/function/MDArray.hpp
@@ -18,13 +18,6 @@
   namespace percept
   {
 
-    //     class MDArray : public FieldContainer<double>
-    //     {
-    //       public:
-    //       typedef FieldContainer<double> base;
-    //       MDArray(std::vector<int> dimensions) : FieldContainer<double>( Teuchos::Array<int>(dimensions.begin(), dimensions.end()) ) {}
-    //     };
-
     typedef Intrepid::FieldContainer<double> MDArray;
     typedef Intrepid::FieldContainer<int> MDArrayInt;
     typedef Intrepid::FieldContainer<unsigned> MDArrayUInt;
@@ -73,7 +66,6 @@
     }
 
 
-    //typedef Intrepid::FieldContainer<std::string> MDArrayString;
     class MDArrayString
     {
       typedef std::vector<std::string > VecOfString;
diff --git a/packages/percept/src/percept/mesh/gen/SweepMesher.cpp b/packages/percept/src/percept/mesh/gen/SweepMesher.cpp
index 2a103b03895d..c5a855ce0fc7 100644
--- a/packages/percept/src/percept/mesh/gen/SweepMesher.cpp
+++ b/packages/percept/src/percept/mesh/gen/SweepMesher.cpp
@@ -27,7 +27,6 @@
 #include <stk_mesh/base/GetEntities.hpp>
 
 #include <stk_mesh/base/FEMHelpers.hpp>
-#include <stk_mesh/base/Stencils.hpp>
 
 #include <stk_io/StkMeshIoBroker.hpp>
 #include <stk_io/IossBridge.hpp>
diff --git a/packages/percept/src/percept/mesh/gen/SweepMesher.hpp b/packages/percept/src/percept/mesh/gen/SweepMesher.hpp
index f3de5c9405af..0f8c308c6fdc 100644
--- a/packages/percept/src/percept/mesh/gen/SweepMesher.hpp
+++ b/packages/percept/src/percept/mesh/gen/SweepMesher.hpp
@@ -28,7 +28,6 @@
 
 #include <stk_mesh/base/CoordinateSystems.hpp>
 #include <stk_mesh/base/MetaData.hpp>
-#include <stk_mesh/base/Stencils.hpp>
 #include <stk_mesh/base/TopologyDimensions.hpp>
 
 #include <percept/ShardsInterfaceTable.hpp>
diff --git a/packages/percept/src/percept/mesh/geometry/stk_geom/3D/FitGregoryPatches.cpp b/packages/percept/src/percept/mesh/geometry/stk_geom/3D/FitGregoryPatches.cpp
index 76144f9e2a3b..b3ec57c70eb1 100644
--- a/packages/percept/src/percept/mesh/geometry/stk_geom/3D/FitGregoryPatches.cpp
+++ b/packages/percept/src/percept/mesh/geometry/stk_geom/3D/FitGregoryPatches.cpp
@@ -1916,8 +1916,8 @@ namespace percept {
               VERIFY_OP_ON(y_surface_set.Type(), ==, YAML::NodeType::Map, "bad surface_set data");
               for (YAML::const_iterator i = y_surface_set.begin(); i != y_surface_set.end(); ++i)
                 {
-                  const YAML::Node & key   = i->first;
-                  const YAML::Node & value = i->second;
+                  const YAML::Node key   = i->first;
+                  const YAML::Node value = i->second;
                   std::string v_key;
                   v_key = key.as<std::string>();
                   VERIFY_OP_ON(value.Type(), ==, YAML::NodeType::Sequence, "bad surface_set value data in [surfaceSetName: [s1,s2...]]");
@@ -1938,8 +1938,8 @@ namespace percept {
           VERIFY_OP_ON(y_angle_map.Type(), ==, YAML::NodeType::Map, "bad angle_map data in yaml file");
           for (YAML::const_iterator i = y_angle_map.begin(); i != y_angle_map.end(); ++i)
             {
-              const YAML::Node & key   = i->first;
-              const YAML::Node & value = i->second;
+              const YAML::Node key   = i->first;
+              const YAML::Node value = i->second;
               std::string v_key = key.as<std::string>();
               double v_value = value.as<double>();
               m_angleMap[v_key] = v_value;
diff --git a/packages/percept/src/percept/mesh/mod/smoother/ReferenceMeshSmootherBase.cpp b/packages/percept/src/percept/mesh/mod/smoother/ReferenceMeshSmootherBase.cpp
index 4d5353ad1c0c..1b002c752cf6 100644
--- a/packages/percept/src/percept/mesh/mod/smoother/ReferenceMeshSmootherBase.cpp
+++ b/packages/percept/src/percept/mesh/mod/smoother/ReferenceMeshSmootherBase.cpp
@@ -84,10 +84,7 @@
   template <typename MeshType>
     ReferenceMeshSmootherBaseImpl<MeshType>::
 	~ReferenceMeshSmootherBaseImpl()
-  {
-	  if(Base::m_eMesh->get_rank() == 0)
-		  myFile.close();
-  }
+  {}
 
   template<>
     void ReferenceMeshSmootherBaseImpl<STKMesh>::sync_fields(int iter)
diff --git a/packages/percept/src/percept/verifier/mesh/Verifier.hpp b/packages/percept/src/percept/verifier/mesh/Verifier.hpp
index 72051339524c..aab26ff527e4 100644
--- a/packages/percept/src/percept/verifier/mesh/Verifier.hpp
+++ b/packages/percept/src/percept/verifier/mesh/Verifier.hpp
@@ -30,7 +30,6 @@
 #include <stk_mesh/base/FieldParallel.hpp>
 #include <stk_mesh/base/Comm.hpp>
 
-#include <stk_mesh/base/Stencils.hpp>
 #include <stk_mesh/base/TopologyDimensions.hpp>
 
 #include <percept/TopologyVerifier.hpp>
diff --git a/packages/percept/src/percept/xfer/LinInterp.hpp b/packages/percept/src/percept/xfer/LinInterp.hpp
index c661c4b2b84e..663e4149901d 100644
--- a/packages/percept/src/percept/xfer/LinInterp.hpp
+++ b/packages/percept/src/percept/xfer/LinInterp.hpp
@@ -163,7 +163,7 @@ LinInterp<FROM,TO>::filter_to_nearest (
       if (topo.getKey()==shards::Particle::key) {
         dist = 0.0;
         for ( unsigned j = 0; j < nDim; ++j ) {
-          dist += std::pow(cellWorkset(0,0,j) - inputPhysicalPoints(j), 2);
+          dist += std::pow(cellWorkset(0,0,j) - inputPhysicalPoints(0,j), 2);
         }
         dist = std::sqrt(dist);
       }
@@ -177,7 +177,7 @@ LinInterp<FROM,TO>::filter_to_nearest (
                                                          topo,
                                                          cellOrd);
         
-        dist = parametricDistanceToEntity(&outputParametricPoints(0), topo);
+        dist = parametricDistanceToEntity(&outputParametricPoints(0,0), topo);
       }
 
       if ( dist < (1.0 + parametric_tolerance) && dist < best_dist ) {
@@ -185,7 +185,7 @@ LinInterp<FROM,TO>::filter_to_nearest (
         best_dist = dist;
 
 	for ( unsigned j = 0; j < nDim; ++j ) {
-	  isoParCoords[j] = outputParametricPoints(j);
+	  isoParCoords[j] = outputParametricPoints(0,j);
 	}
 
         ToPoints.TransferInfo_[thePt] = isoParCoords;
@@ -343,7 +343,6 @@ LinInterp<FROM,TO>::apply_from_nodal_field (
     }
   }
 
-  Intrepid::FieldContainer<double> outVals(1, 1);
   Intrepid::FieldContainer<double> inputParametricPoints(1, nDim);
 
   inputParametricPoints.setValues(&isoParCoords[0], nDim);
diff --git a/packages/seacas/cmake/FortranSettings.cmake b/packages/seacas/cmake/FortranSettings.cmake
index c3447d57fe9e..7a73ce5f3128 100644
--- a/packages/seacas/cmake/FortranSettings.cmake
+++ b/packages/seacas/cmake/FortranSettings.cmake
@@ -8,6 +8,8 @@ IF ("${CMAKE_Fortran_COMPILER_ID}" MATCHES "GNU")
   SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fcray-pointer -fdefault-real-8 -fdefault-integer-8 -fno-range-check")
 ELSEIF ("${CMAKE_Fortran_COMPILER_ID}" MATCHES "XL")
   SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -WF,-D__XLF__ -qintsize=8 -qrealsize=8 -qfixed")
+ELSEIF ("${CMAKE_Fortran_COMPILER_ID}" MATCHES "Cray")
+  SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -sdefault64")
 ELSE()
   SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -r8 -i8")
 ENDIF()
diff --git a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp
index d1c4317d9284..a8a8719341e7 100644
--- a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_decl.hpp
@@ -45,6 +45,8 @@
 //#include <Xpetra_Operator_fwd.hpp>
 #include <Xpetra_MapFactory_fwd.hpp>
 
+#include<KokkosKernels_Utils.hpp>
+
 #include <FROSch_Tools_def.hpp>
 
 
@@ -108,7 +110,7 @@ namespace FROSch {
         int buildGlobalBasisMatrix(ConstXMapPtr rowMap,
                                    ConstXMapPtr rangeMap,
                                    ConstXMapPtr repeatedMap,
-                                   SC treshold);
+                                   SC tresholdDropping);
 
         int clearCoarseSpace();
 
diff --git a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp
index 3e7b05c5da51..ed391fab6a10 100644
--- a/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/CoarseSpaces/FROSch_CoarseSpace_def.hpp
@@ -198,12 +198,12 @@ namespace FROSch {
     int CoarseSpace<SC,LO,GO,NO>::buildGlobalBasisMatrix(ConstXMapPtr rowMap,
                                                          ConstXMapPtr rangeMap,
                                                          ConstXMapPtr repeatedMap,
-                                                         SC treshold)
+                                                         SC tresholdDropping)
     {
         FROSCH_ASSERT(!AssembledBasisMap_.is_null(),"FROSch::CoarseSpace: AssembledBasisMap_.is_null().");
         FROSCH_ASSERT(!AssembledBasis_.is_null(),"FROSch::CoarseSpace: AssembledBasis_.is_null().");
 
-        #if defined(HAVE_XPETRA_KOKKOS_REFACTOR) && defined(HAVE_XPETRA_TPETRA)
+#if defined(HAVE_XPETRA_KOKKOS_REFACTOR) && defined(HAVE_XPETRA_TPETRA)
         if (rowMap->lib() == UseTpetra) {
             UN numRows = AssembledBasis_->getLocalLength();
             UN numCols = AssembledBasis_->getNumVectors();
@@ -234,7 +234,7 @@ namespace FROSch {
                     if (lo != -1) {
                         for (UN j=0; j<numCols; j++) {
                             SC valueTmp=AssembledBasisView(i, j);
-                            if (fabs(valueTmp) > treshold) {
+                            if (fabs(valueTmp) > tresholdDropping) {
                                 Rowptr[lo+1] ++;
                             }
                         }
@@ -260,7 +260,7 @@ namespace FROSch {
                         UN nnz_i = Rowptr[lo];
                         for (UN j=0; j<numCols; j++) {
                             SC valueTmp=AssembledBasisView(i, j);
-                            if (fabs(valueTmp) > treshold) {
+                            if (fabs(valueTmp) > tresholdDropping) {
                                 Values[nnz_i] = valueTmp;
                                 Indices[nnz_i] = j;
 
@@ -283,7 +283,7 @@ namespace FROSch {
                                                                    AssembledBasisMapUnique_, rangeMap,
                                                                    params);
         } else
-        #endif
+#endif
         {
             if (rowMap->lib()==UseEpetra) {
                 GlobalBasisMatrix_ = MatrixFactory<SC,LO,GO,NO>::Build(rowMap,AssembledBasisMap_->getNodeNumElements()); // Nonzeroes abhängig von dim/dofs!!!
@@ -294,7 +294,7 @@ namespace FROSch {
                     SCVec values;
                     for (UN j=0; j<AssembledBasis_->getNumVectors(); j++) {
                         valueTmp=AssembledBasis_->getData(j)[i];
-                        if (fabs(valueTmp)>treshold) {
+                        if (fabs(valueTmp)>tresholdDropping) {
                             indices.push_back(AssembledBasisMap_->getGlobalElement(j));
                             values.push_back(valueTmp);
                         }
@@ -314,7 +314,7 @@ namespace FROSch {
                     SCVec values;
                     for (UN j=0; j<AssembledBasis_->getNumVectors(); j++) {
                         valueTmp=AssembledBasis_->getData(j)[i];
-                        if (fabs(valueTmp)>treshold) {
+                        if (fabs(valueTmp)>tresholdDropping) {
                             indices.push_back(j);
                             values.push_back(valueTmp);
                         }
diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_AlgebraicOverlappingOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_AlgebraicOverlappingOperator_def.hpp
index b8fb721c5c76..2a9df6fbb913 100644
--- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_AlgebraicOverlappingOperator_def.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_AlgebraicOverlappingOperator_def.hpp
@@ -160,8 +160,8 @@ namespace FROSch {
         this->OverlappingMap_ = repeatedMap;
         this->OverlappingMatrix_ = this->K_;
 
-        GO global;
-        LO local,sum,minVal,maxVal;
+        GO global,sum;
+        LO local,minVal,maxVal;
         SC avg;
         if (verbosity==All) {
             FROSCH_DETAILTIMER_START_LEVELID(printStatisticsTime,"print statistics");
@@ -172,7 +172,7 @@ namespace FROSch {
             }
 
             local = (LO) max((LO) this->OverlappingMap_->getNodeNumElements(),(LO) 0);
-            reduceAll(*this->MpiComm_,REDUCE_SUM,local,ptr(&sum));
+            reduceAll(*this->MpiComm_,REDUCE_SUM,GO(local),ptr(&sum));
             avg = max(sum/double(this->MpiComm_->getSize()),0.0);
             reduceAll(*this->MpiComm_,REDUCE_MIN,local,ptr(&minVal));
             reduceAll(*this->MpiComm_,REDUCE_MAX,local,ptr(&maxVal));
@@ -231,7 +231,7 @@ namespace FROSch {
             if (verbosity==All) {
                 FROSCH_DETAILTIMER_START_LEVELID(printStatisticsTime,"print statistics");
                 local = (LO) max((LO) this->OverlappingMap_->getNodeNumElements(),(LO) 0);
-                reduceAll(*this->MpiComm_,REDUCE_SUM,local,ptr(&sum));
+                reduceAll(*this->MpiComm_,REDUCE_SUM,GO(local),ptr(&sum));
                 avg = max(sum/double(this->MpiComm_->getSize()),0.0);
                 reduceAll(*this->MpiComm_,REDUCE_MIN,local,ptr(&minVal));
                 reduceAll(*this->MpiComm_,REDUCE_MAX,local,ptr(&maxVal));
diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp
index 243d8e90b1d9..4a0b5c5323a4 100644
--- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_CoarseOperator_def.hpp
@@ -89,7 +89,7 @@ namespace FROSch {
             if (CoarseSpace_->hasUnassembledMaps()) { // If there is no unassembled basis, the current Phi_ should already be correct
                 CoarseSpace_->assembleCoarseSpace();
                 FROSCH_ASSERT(CoarseSpace_->hasAssembledBasis(),"FROSch::CoarseOperator : !CoarseSpace_->hasAssembledBasis()");
-                CoarseSpace_->buildGlobalBasisMatrix(this->K_->getRowMap(),this->K_->getRangeMap(),subdomainMap,this->ParameterList_->get("Threshold Phi",1.e-8));
+                CoarseSpace_->buildGlobalBasisMatrix(this->K_->getRowMap(),this->K_->getRangeMap(),subdomainMap,this->ParameterList_->get("Phi: Dropping Threshold",1.e-8));
                 FROSCH_ASSERT(CoarseSpace_->hasGlobalBasisMatrix(),"FROSch::CoarseOperator : !CoarseSpace_->hasGlobalBasisMatrix()");
                 Phi_ = CoarseSpace_->getGlobalBasisMatrix();
             }
@@ -106,6 +106,13 @@ namespace FROSch {
             this->ParameterList_->set("RCP(Phi)", Phi_);
         }
 
+        // Store current Coarse Matrix in ParameterList_
+        if ( this->ParameterList_->get("Store Coarse Matrix",false) ) {
+            FROSCH_NOTIFICATION("FROSch::CoarseOperator",this->Verbose_,"Storing current Coarse Matrix in Parameterlist.");
+            this->ParameterList_->set("RCP(Coarse Matrix)", CoarseMatrix_);
+            this->ParameterList_->set("bool(CoarseSolveComm)", OnCoarseSolveComm_);
+        }
+
         return 0;
     }
 
@@ -605,8 +612,11 @@ namespace FROSch {
 #endif
 
             LO numProcsGatheringStep = this->MpiComm_->getSize();
-            GO numGlobalIndices = coarseMapUnique->getMaxAllGlobalIndex()+1;
-            int numMyRows;
+            GO numGlobalIndices = coarseMapUnique->getMaxAllGlobalIndex();
+            if (coarseMapUnique->lib()==UseEpetra || coarseMapUnique->getGlobalNumElements()>0) {
+                numGlobalIndices += 1;
+            }
+            LO numMyRows;
             double gatheringFactor = pow(double(this->MpiComm_->getSize())/double(NumProcsCoarseSolve_),1.0/double(gatheringSteps));
 
             for (int i=0; i<gatheringSteps-1; i++) {
@@ -711,24 +721,19 @@ namespace FROSch {
             GatheringMaps_.resize(gatheringSteps);
             CoarseSolveExporters_.resize(gatheringSteps);
 
-            double gatheringFactor = pow(double(this->MpiComm_->getSize())/double(NumProcsCoarseSolve_),1.0/double(gatheringSteps));
-            LO numProcsGatheringStep = this->MpiComm_->getSize();
-            GO numGlobalIndices = CoarseMap_->getMaxAllGlobalIndex();
-            GO numMyRows;
-            numMyRows = 0;
-
             if (this->MpiComm_->getRank()%(this->MpiComm_->getSize()/NumProcsCoarseSolve_) == 0 && this->MpiComm_->getRank()/(this->MpiComm_->getSize()/NumProcsCoarseSolve_) < NumProcsCoarseSolve_) {
-                if (this->MpiComm_->getRank()==0) {
-                    numMyRows = numGlobalIndices - (numGlobalIndices/NumProcsCoarseSolve_)*(NumProcsCoarseSolve_-1);
-                } else {
-                    numMyRows = numGlobalIndices/NumProcsCoarseSolve_;
-                }
-            }
-
-            XMapPtr tmpCoarseMap = Xpetra::MapFactory<LO,GO,NO>::Build(CoarseMap_->lib(),-1,numMyRows,0,this->MpiComm_);
-            if (tmpCoarseMap->getNodeNumElements()>0) {
+                // if (this->MpiComm_->getRank()==0) {
+                //     numMyRows = numGlobalIndices - (numGlobalIndices/NumProcsCoarseSolve_)*(NumProcsCoarseSolve_-1);
+                // } else {
+                //     numMyRows = numGlobalIndices/NumProcsCoarseSolve_;
+                // }
                 OnCoarseSolveComm_=true;
             }
+
+            // // XMapPtr tmpCoarseMap = Xpetra::MapFactory<LO,GO,NO>::Build(CoarseMap_->lib(),-1,numMyRows,0,this->MpiComm_);
+            // if (tmpCoarseMap->getNodeNumElements()>0) {
+            //     OnCoarseSolveComm_=true;
+            // }
             CoarseSolveComm_ = this->MpiComm_->split(!OnCoarseSolveComm_,this->MpiComm_->getRank());
 
             //Gathering Steps for RepeatedMap#################################################
@@ -742,6 +747,7 @@ namespace FROSch {
             GO MLnumGlobalIndices = SubdomainConnectGraph_->getRowMap()->getMaxAllGlobalIndex()+1;
             GO MLnumMyRows;
 
+            LO numProcsGatheringStep = this->MpiComm_->getSize();
             MLGatheringMaps_[0] =  Xpetra::MapFactory<LO,GO,NO>::Build(this->K_->getMap()->lib(),-1,1,0,this->K_->getMap()->getComm());
             for (int i=1; i<MLgatheringSteps-1; i++) {
                 MLnumMyRows = 0;
@@ -754,7 +760,6 @@ namespace FROSch {
                     }
                 }
                 MLGatheringMaps_[i] = Xpetra::MapFactory<LO,GO,NO>::Build(CoarseMap_->lib(),-1,MLnumMyRows,0,this->MpiComm_);
-
             }
 
             MLnumMyRows = 0;
@@ -800,7 +805,11 @@ namespace FROSch {
             if (OnCoarseSolveComm_) {
                 //Coarse DofsMaps so far only one Block will work
                 ConstXMapPtrVecPtr2D CoarseDofsMaps(1);
-                FROSch::BuildRepMapZoltan(SubdomainConnectGraph_,ElementNodeList_, DistributionList_,MLCoarseMap_->getComm(),CoarseSolveRepeatedMap_);
+#ifdef HAVE_SHYLU_DDFROSCH_ZOLTAN2
+                BuildRepMapZoltan(SubdomainConnectGraph_,ElementNodeList_, DistributionList_,MLCoarseMap_->getComm(),CoarseSolveRepeatedMap_);
+#else
+                ThrowErrorMissingPackage("FROSch::CoarseOperator","Zoltan2");
+#endif
                 ConstRepMap = CoarseSolveRepeatedMap_;
                 ConstXMapPtrVecPtr NodesMapVector(1);
                 //MapVector for next Level
@@ -838,7 +847,26 @@ namespace FROSch {
                 sublist(sublist(sublist(this->ParameterList_,"CoarseSolver"),"FROSchPreconditioner"),"TwoLevelPreconditioner")->set("Nodes Map Vector",NodesMapVector);
             }
 
-            Teuchos::RCP<Xpetra::Map<LO,GO,NO> > tmpMap = Xpetra::MapFactory<LO,GO,NO>::Build(CoarseMap_->lib(),-1,uniEle,0,this->MpiComm_);
+            GatheringMaps_[gatheringSteps-1] = Xpetra::MapFactory<LO,GO,NO>::Build(CoarseMap_->lib(),-1,uniEle,0,this->MpiComm_);
+
+            GO numGlobalIndices = coarseMapUnique->getMaxAllGlobalIndex();
+            if (coarseMapUnique->lib()==UseEpetra || coarseMapUnique->getGlobalNumElements()>0) {
+                numGlobalIndices += 1;
+            }
+            LO numMyRows;
+            double gatheringFactor = pow(double(this->MpiComm_->getSize())/double(NumProcsCoarseSolve_),1.0/double(gatheringSteps));
+
+            //
+            // double gatheringFactor = pow(double(this->MpiComm_->getSize())/double(NumProcsCoarseSolve_),1.0/double(gatheringSteps));
+            // LO numProcsGatheringStep = this->MpiComm_->getSize();
+            // GO numGlobalIndices = CoarseMap_->getMaxAllGlobalIndex();
+            //
+            // LO numProcsGatheringStep = this->MpiComm_->getSize();
+            // GO numGlobalIndices = coarseMapUnique->getMaxAllGlobalIndex();
+            // if (coarseMapUnique->lib()==UseEpetra || coarseMapUnique->getGlobalNumElements()>0) {
+            //     numGlobalIndices += 1;
+            // }
+            // GO numMyRows;
 
             for (int i=0; i<gatheringSteps-1; i++) {
                 numMyRows = 0;
@@ -852,8 +880,9 @@ namespace FROSch {
                 }
                 GatheringMaps_[i] = Xpetra::MapFactory<LO,GO,NO>::Build(CoarseMap_->lib(),-1,numMyRows,0,this->MpiComm_);
             }
-            GatheringMaps_[gatheringSteps-1] = tmpMap;
-            CoarseSolveMap_ = Xpetra::MapFactory<LO,GO,NO>::Build(CoarseMap_->lib(),-1,tmpMap->getNodeElementList(),0,CoarseSolveComm_);
+
+            CoarseSolveMap_ = Xpetra::MapFactory<LO,GO,NO>::Build(CoarseMap_->lib(),-1,GatheringMaps_[gatheringSteps-1]->getNodeElementList(),0,CoarseSolveComm_);
+
         } else if (!DistributionList_->get("Type","linear").compare("Zoltan2")) {
 #ifdef HAVE_SHYLU_DDFROSCH_ZOLTAN2
             GatheringMaps_.resize(1);
@@ -868,6 +897,88 @@ namespace FROSch {
             FROSCH_ASSERT(false,"FROSch::CoarseOperator: Distribution type unknown.");
         }
 
+        // Output information about the Gathering Steps
+        GO global,sum,numRanks;
+        LO local,minVal,maxVal;
+        SC avg;
+
+        global = coarseMapUnique->getMaxAllGlobalIndex();
+        if (coarseMapUnique->lib()==UseEpetra || coarseMapUnique->getGlobalNumElements()>0) {
+            global += 1;
+        }
+
+        local = (LO) max((LO) coarseMapUnique->getNodeNumElements(),(LO) 0);
+        reduceAll(*this->MpiComm_,REDUCE_SUM,GO(local),ptr(&sum));
+        avg = max(sum/SC(this->MpiComm_->getSize()),0.0);
+        reduceAll(*this->MpiComm_,REDUCE_MIN,local,ptr(&minVal));
+        reduceAll(*this->MpiComm_,REDUCE_MAX,local,ptr(&maxVal));
+
+        if (this->Verbose_) {
+            cout
+            << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
+            << setw(89) << "-----------------------------------------------------------------------------------------"
+            << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
+            << "| "
+            << left << setw(74) << "> Gathering Steps Statistics " << right << setw(8) << "(Level " << setw(2) << this->LevelID_ << ")"
+            << " |"
+            << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
+            << setw(89) << "========================================================================================="
+            << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
+            << "| " << left << setw(7) << " " << right
+            << " | " << setw(10) << "ranks"
+            << " | " << setw(10) << "total"
+            << " | " << setw(10) << "avg"
+            << " | " << setw(10) << "min"
+            << " | " << setw(10) << "max"
+            << " | " << setw(10) << "global sum"
+            << " |"
+            << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
+            << setw(89) << "-----------------------------------------------------------------------------------------"
+            << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
+            << "| " << left << setw(4) << "Map " << setw(3) << "0" << right
+            << " | " << setw(10) << this->MpiComm_->getSize()
+            << " | " << setw(10) << global
+            << " | " << setw(10) << setprecision(5) << avg
+            << " | " << setw(10) << minVal
+            << " | " << setw(10) << maxVal
+            << " | " << setw(10) << sum
+            << " |";
+        }
+
+        for (int i=0; i<GatheringMaps_.size(); i++) {
+            global = GatheringMaps_[i]->getMaxAllGlobalIndex();
+            if (GatheringMaps_[i]->lib()==UseEpetra || GatheringMaps_[i]->getGlobalNumElements()>0) {
+                global += 1;
+            }
+
+            local = (LO) max((LO) GatheringMaps_[i]->getNodeNumElements(),(LO) 0);
+            reduceAll(*this->MpiComm_,REDUCE_SUM,GO(local),ptr(&sum));
+            reduceAll(*this->MpiComm_,REDUCE_SUM,GO(GatheringMaps_[i]->getNodeNumElements()>0),ptr(&numRanks));
+            avg = max(sum/SC(numRanks),0.0);
+            reduceAll(*this->MpiComm_,REDUCE_MIN,(GatheringMaps_[i]->getNodeNumElements()>0 ? local : numeric_limits<LO>::max()),ptr(&minVal));
+            reduceAll(*this->MpiComm_,REDUCE_MAX,local,ptr(&maxVal));
+
+            if (this->Verbose_) {
+                cout
+                << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
+                << "| " << setw(4) << left << "Map " << setw(3) << i+1 << right
+                << " | " << setw(10) << numRanks
+                << " | " << setw(10) << global
+                << " | " << setw(10) << setprecision(3) << avg
+                << " | " << setw(10) << minVal
+                << " | " << setw(10) << maxVal
+                << " | " << setw(10) << sum
+                << " |";
+            }
+        }
+
+        if (this->Verbose_) {
+            cout
+            << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
+            << setw(89) << "-----------------------------------------------------------------------------------------"
+            << endl;
+        }
+
         return 0;
     }
 
diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_GDSWCoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_GDSWCoarseOperator_def.hpp
index c0bb5c01d11c..427bdceeb4aa 100644
--- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_GDSWCoarseOperator_def.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_GDSWCoarseOperator_def.hpp
@@ -548,39 +548,39 @@ namespace FROSch {
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
                     << setw(89) << "========================================================================================="
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                    << "| " << left << setw(20) << "Vertices " << " | " << setw(19) << " Translations" << right
+                    << "| " << left << setw(19) << "Vertices " << " | " << setw(19) << "Translations " << right
                     << " | " << setw(41) << boolalpha << useVertexTranslations << noboolalpha
                     << " |"
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                    << "| " << left << setw(20) << "ShortEdges " << " | " << setw(19) << " Translations" << right
+                    << "| " << left << setw(19) << "ShortEdges " << " | " << setw(19) << "Translations " << right
                     << " | " << setw(41) << boolalpha << useShortEdgeTranslations << noboolalpha
                     << " |"
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                    << "| " << left << setw(20) << "ShortEdges " << " | " << setw(19) << " Rotations" << right
+                    << "| " << left << setw(19) << "ShortEdges " << " | " << setw(19) << "Rotations " << right
                     << " | " << setw(41) << boolalpha << useShortEdgeRotations << noboolalpha
                     << " |"
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                    << "| " << left << setw(20) << "StraightEdges " << " | " << setw(19) << " Translations" << right
+                    << "| " << left << setw(19) << "StraightEdges " << " | " << setw(19) << "Translations " << right
                     << " | " << setw(41) << boolalpha << useStraightEdgeTranslations << noboolalpha
                     << " |"
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                    << "| " << left << setw(20) << "StraightEdges " << " | " << setw(19) << " Rotations" << right
+                    << "| " << left << setw(19) << "StraightEdges " << " | " << setw(19) << "Rotations " << right
                     << " | " << setw(41) << boolalpha << useStraightEdgeRotations << noboolalpha
                     << " |"
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                    << "| " << left << setw(20) << "Edges " << " | " << setw(19) << " Translations" << right
+                    << "| " << left << setw(19) << "Edges " << " | " << setw(19) << "Translations " << right
                     << " | " << setw(41) << boolalpha << useEdgeTranslations << noboolalpha
                     << " |"
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                    << "| " << left << setw(20) << "Edges " << " | " << setw(19) << " Rotations" << right
+                    << "| " << left << setw(19) << "Edges " << " | " << setw(19) << "Rotations " << right
                     << " | " << setw(41) << boolalpha << useEdgeRotations << noboolalpha
                     << " |"
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                    << "| " << left << setw(20) << "Faces " << " | " << setw(19) << " Translations" << right
+                    << "| " << left << setw(19) << "Faces " << " | " << setw(19) << "Translations " << right
                     << " | " << setw(41) << boolalpha << useFaceTranslations << noboolalpha
                     << " |"
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                    << "| " << left << setw(20) << "Faces " << " | " << setw(19) << " Rotations" << right
+                    << "| " << left << setw(19) << "Faces " << " | " << setw(19) << "Rotations " << right
                     << " | " << setw(41) << boolalpha << useFaceRotations << noboolalpha
                     << " |"
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp
index 5d833762ca7d..0b1255b647c0 100644
--- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_decl.hpp
@@ -179,7 +179,8 @@ namespace FROSch {
                                                         ConstXMapPtr rowMap,
                                                         ConstXMapPtr rangeMap,
                                                         ConstXMapPtr repeatedMap,
-                                                        SC treshold);
+                                                        SC tresholdDropping,
+                                                        SC tresholdOrthogonalization);
 
         virtual XMultiVectorPtr computeExtensions(ConstXMapPtr localMap,
                                                   GOVecView indicesGammaDofsAll,
diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp
index 6f38352159e0..87ddaddbe5d6 100644
--- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_HarmonicCoarseOperator_def.hpp
@@ -102,7 +102,7 @@ namespace FROSch {
 
         //Detect linear dependencies
         if (!this->ParameterList_->get("Skip DetectLinearDependencies",false)) {
-            LOVecPtr linearDependentVectors = detectLinearDependencies(indicesGammaDofsAll(),this->K_->getRowMap(),this->K_->getRangeMap(),repeatedMap,this->ParameterList_->get("Threshold Phi",1.e-8));
+            LOVecPtr linearDependentVectors = detectLinearDependencies(indicesGammaDofsAll(),this->K_->getRowMap(),this->K_->getRangeMap(),repeatedMap,this->ParameterList_->get("Phi: Dropping Threshold",1.e-8),this->ParameterList_->get("Phi: Orthogonalization Threshold",1.e-12));
             // cout << this->MpiComm_->getRank() << " " << linearDependentVectors.size() << endl;
             AssembledInterfaceCoarseSpace_->zeroOutBasisVectors(linearDependentVectors());
         }
@@ -263,11 +263,11 @@ namespace FROSch {
                 << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
                 << setw(89) << "========================================================================================="
                 << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                << "| " << left << setw(20) << "Volumes " << " | " << setw(19) << " Translations" << right
+                << "| " << left << setw(19) << "Volumes " << " | " << setw(19) << "Translations " << right
                 << " | " << setw(41) << boolalpha << useForCoarseSpace << noboolalpha
                 << " |"
                 << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                << "| " << left << setw(20) << "Volumes " << " | " << setw(19) << " Rotations" << right
+                << "| " << left << setw(19) << "Volumes " << " | " << setw(19) << "Rotations " << right
                 << " | " << setw(41) << boolalpha << useRotations << noboolalpha
                 << " |"
                 << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
@@ -491,7 +491,8 @@ namespace FROSch {
                                                                                                                          ConstXMapPtr rowMap,
                                                                                                                          ConstXMapPtr rangeMap,
                                                                                                                          ConstXMapPtr repeatedMap,
-                                                                                                                         SC treshold)
+                                                                                                                         SC tresholdDropping,
+                                                                                                                         SC tresholdOrthogonalization)
     {
         FROSCH_DETAILTIMER_START_LEVELID(detectLinearDependenciesTime,"HarmonicCoarseOperator::detectLinearDependencies");
         LOVecPtr linearDependentVectors(AssembledInterfaceCoarseSpace_->getBasisMap()->getNodeNumElements()); //if (this->Verbose_) cout << AssembledInterfaceCoarseSpace_->getAssembledBasis()->getNumVectors() << " " << AssembledInterfaceCoarseSpace_->getAssembledBasis()->getLocalLength() << " " << indicesGammaDofsAll.size() << endl;
@@ -499,6 +500,16 @@ namespace FROSch {
             //Construct matrix phiGamma
             XMatrixPtr phiGamma = MatrixFactory<SC,LO,GO,NO>::Build(rowMap,AssembledInterfaceCoarseSpace_->getBasisMap()->getNodeNumElements());
 
+            // Scaling factors for the columns of PhiGamma: scale[i] = 1/norm(PhiGamma(:,i)), Euclidean norm over the local length of the assembled basis
+            SCVec scale(AssembledInterfaceCoarseSpace_->getAssembledBasis()->getNumVectors(),0.0);
+            for (UN i = 0; i < AssembledInterfaceCoarseSpace_->getAssembledBasis()->getNumVectors(); i++) {
+                ConstSCVecPtr assembledInterfaceCoarseSpaceData = AssembledInterfaceCoarseSpace_->getAssembledBasis()->getData(i);
+                for (UN j = 0; j < AssembledInterfaceCoarseSpace_->getAssembledBasis()->getLocalLength(); j++) {
+                    scale[i] += assembledInterfaceCoarseSpaceData[j]*assembledInterfaceCoarseSpaceData[j];
+                }
+                scale[i] = 1.0/sqrt(scale[i]);
+            }
+
             LO iD;
             SC valueTmp;
             for (UN i=0; i<AssembledInterfaceCoarseSpace_->getAssembledBasis()->getLocalLength(); i++) {
@@ -506,9 +517,9 @@ namespace FROSch {
                 SCVec values;
                 for (UN j=0; j<AssembledInterfaceCoarseSpace_->getAssembledBasis()->getNumVectors(); j++) {
                     valueTmp=AssembledInterfaceCoarseSpace_->getAssembledBasis()->getData(j)[i];
-                    if (fabs(valueTmp)>treshold) {
+                    if (fabs(valueTmp)>tresholdDropping) {
                         indices.push_back(AssembledInterfaceCoarseSpace_->getBasisMap()->getGlobalElement(j));
-                        values.push_back(valueTmp);
+                        values.push_back(valueTmp*scale[j]);
                     }
                 }
                 iD = repeatedMap->getGlobalElement(indicesGammaDofsAll[i]);
@@ -560,11 +571,16 @@ namespace FROSch {
             TSerialDenseMatrixPtr r = qRSolver->getR();
             LO tmp = 0;
             for (LO i=0; i<r->numRows(); i++) {
-                SC normRow = 0.0;
-                for (LO j=0; j<r->numCols(); j++) {
-                    normRow += (*r)(i,j)*(*r)(i,j);
-                }
-                if (sqrt(normRow)<treshold) {
+                // SC normRow = 0.0;
+                // for (LO j=0; j<r->numCols(); j++) {
+                //     normRow += (*r)(i,j)*(*r)(i,j);
+                // }
+                // if (sqrt(normRow)<treshold) {
+                //     //cout << this->MpiComm_->getRank() << " " << i << " " << AssembledInterfaceCoarseSpace_->getBasisMap()->getGlobalElement(i) << " " << sqrt(normRow) << std::endl;
+                //     linearDependentVectors[tmp] = i;
+                //     tmp++;
+                // }
+                if (fabs((*r)(i,i))<tresholdOrthogonalization) {
                     //cout << this->MpiComm_->getRank() << " " << i << " " << AssembledInterfaceCoarseSpace_->getBasisMap()->getGlobalElement(i) << " " << sqrt(normRow) << std::endl;
                     linearDependentVectors[tmp] = i;
                     tmp++;
diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_RGDSWCoarseOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_RGDSWCoarseOperator_def.hpp
index 6752e5eb2dea..a0456f38d271 100644
--- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_RGDSWCoarseOperator_def.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_RGDSWCoarseOperator_def.hpp
@@ -227,11 +227,11 @@ namespace FROSch {
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
                     << setw(89) << "========================================================================================="
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                    << "| " << left << setw(20) << "Coarse nodes " << " | " << setw(19) << " Translations" << right
+                    << "| " << left << setw(19) << "Coarse nodes " << " | " << setw(19) << "Translations " << right
                     << " | " << setw(41) << boolalpha << useForCoarseSpace << noboolalpha
                     << " |"
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
-                    << "| " << left << setw(20) << "Coarse nodes " << " | " << setw(19) << " Rotations" << right
+                    << "| " << left << setw(19) << "Coarse nodes " << " | " << setw(19) << "Rotations " << right
                     << " | " << setw(41) << boolalpha << useRotations << noboolalpha
                     << " |"
                     << "\n" << setw(FROSCH_OUTPUT_INDENT) << " "
diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_decl.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_decl.hpp
index dc953563acec..05be7eb645fa 100644
--- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_decl.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_decl.hpp
@@ -49,6 +49,7 @@
 #include <Teuchos_DefaultSerialComm.hpp>
 
 #include <Teuchos_SerialQRDenseSolver.hpp>
+#include <Teuchos_TwoDArray.hpp>
 
 #include <ShyLU_DDFROSch_config.h>
 
diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_def.hpp
index ec275a55609f..05bda9f7f47c 100644
--- a/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_def.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/SchwarzOperators/FROSch_SchwarzOperator_def.hpp
@@ -55,7 +55,7 @@ namespace FROSch {
     MpiComm_ (comm),
     Verbose_ (comm->getRank()==0)
     {
-        
+
     }
 
     template<class SC,class LO,class GO,class NO>
@@ -122,10 +122,11 @@ namespace FROSch {
     template<class SC,class LO,class GO,class NO>
     void SchwarzOperator<SC,LO,GO,NO>::residual(const XMultiVector & X,
                                                 const XMultiVector & B,
-                                                XMultiVector& R) const {
-      SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one;
-      apply(X,R);
-      R.update(one,B,negone);
+                                                XMultiVector& R) const
+    { // Fills R from X and B: R is first written by apply(X,R) and then combined with B via update()
+        SC one = ScalarTraits<SC>::one(), negone = -one;
+        apply(X,R); // R receives the result of the two-argument apply() for X
+        R.update(one,B,negone); // presumably R <- 1*B + (-1)*R, i.e. B minus the applied X (Xpetra update convention) -- confirm
     }
 }
 
diff --git a/packages/shylu/shylu_dd/frosch/src/SchwarzPreconditioners/FROSch_TwoLevelBlockPreconditioner_def.hpp b/packages/shylu/shylu_dd/frosch/src/SchwarzPreconditioners/FROSch_TwoLevelBlockPreconditioner_def.hpp
index 24ddaa7681a6..40c130de7106 100644
--- a/packages/shylu/shylu_dd/frosch/src/SchwarzPreconditioners/FROSch_TwoLevelBlockPreconditioner_def.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/SchwarzPreconditioners/FROSch_TwoLevelBlockPreconditioner_def.hpp
@@ -61,17 +61,17 @@ namespace FROSch {
             // Set the LevelID in the sublist
             parameterList->sublist("IPOUHarmonicCoarseOperator").set("Level ID",this->LevelID_);
             //                FROSCH_ASSERT(false,"not implemented for block.");
-            this->ParameterList_->sublist("IPOUHarmonicCoarseOperator").sublist("InterfacePartitionOfUnity").set("Test Unconnected Interface",false);
+            this->ParameterList_->sublist("IPOUHarmonicCoarseOperator").sublist("InterfacePartitionOfUnity").set("Test Unconnected Interface",true);
             CoarseOperator_ = IPOUHarmonicCoarseOperatorPtr(new IPOUHarmonicCoarseOperator<SC,LO,GO,NO>(k,sublist(parameterList,"IPOUHarmonicCoarseOperator")));
         } else if (!this->ParameterList_->get("CoarseOperator Type","IPOUHarmonicCoarseOperator").compare("GDSWCoarseOperator")) {
             // Set the LevelID in the sublist
             parameterList->sublist("GDSWCoarseOperator").set("Level ID",this->LevelID_);
-            this->ParameterList_->sublist("GDSWCoarseOperator").set("Test Unconnected Interface",false);
+            this->ParameterList_->sublist("GDSWCoarseOperator").set("Test Unconnected Interface",true);
             CoarseOperator_ = GDSWCoarseOperatorPtr(new GDSWCoarseOperator<SC,LO,GO,NO>(k,sublist(parameterList,"GDSWCoarseOperator")));
         } else if (!this->ParameterList_->get("CoarseOperator Type","IPOUHarmonicCoarseOperator").compare("RGDSWCoarseOperator")) {
             // Set the LevelID in the sublist
             parameterList->sublist("RGDSWCoarseOperator").set("Level ID",this->LevelID_);
-            this->ParameterList_->sublist("RGDSWCoarseOperator").set("Test Unconnected Interface",false);
+            this->ParameterList_->sublist("RGDSWCoarseOperator").set("Test Unconnected Interface",true);
             CoarseOperator_ = RGDSWCoarseOperatorPtr(new RGDSWCoarseOperator<SC,LO,GO,NO>(k,sublist(parameterList,"RGDSWCoarseOperator")));
         } else {
             FROSCH_ASSERT(false,"CoarseOperator Type unkown.");
diff --git a/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraPreconditioner_def.hpp b/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraPreconditioner_def.hpp
index ea897ab9361f..a5e28b1c7723 100644
--- a/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraPreconditioner_def.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraPreconditioner_def.hpp
@@ -92,6 +92,11 @@ namespace FROSch {
         }
 
         ThyraPreconditioner_->getUnspecifiedPrecOp()->apply(tMode,*xThyra,yThyra.ptr(),alpha,beta);
+
+        // It seems that we have to convert the Thyra vector back to Xpetra. Is there a cheaper/more elegant way?
+        // Same for ThyraSolver
+        XMultiVectorPtr yXpetra = ThyraUtils<SC,LO,GO,NO>::toXpetra(yThyra,y.getMap()->getComm());
+        y = *yXpetra;
     }
 
     template<class SC,class LO,class GO,class NO>
diff --git a/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraSolver_def.hpp b/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraSolver_def.hpp
index 060a6b35f76e..ef94aba9d78a 100644
--- a/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraSolver_def.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/SolverInterfaces/FROSch_ThyraSolver_def.hpp
@@ -93,6 +93,12 @@ namespace FROSch {
         }
 
         SolveStatus<double> status = solve<double>(*ThyraSolver_,tMode,*xThyra,YT_.ptr());
+
+        // It seems that we have to convert the Thyra vector back to Xpetra. Is there a cheaper/more elegant way?
+        // Same for ThyraPreconditioner
+        XMultiVectorPtr yXpetra = ThyraUtils<SC,LO,GO,NO>::toXpetra(YT_,y.getMap()->getComm());
+        y = *yXpetra;
+
         y.update(alpha,*YX_,beta);
     }
 
diff --git a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Output.h b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Output.h
index f5f61805487d..f9c8d97f44c7 100644
--- a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Output.h
+++ b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Output.h
@@ -52,22 +52,25 @@
 #ifndef FROSCH_ASSERT
+    // FROSCH_ASSERT(COND,MSG): throws std::logic_error if COND evaluates to false. The message
+    // contains file and line, the Teuchos throw number, the text of COND, and MSG (MSG is
+    // streamed into the message, so it may be any expression that can follow <<).
     #define FROSCH_ASSERT(COND,MSG) \
     { \
-      const bool throw_exception = !(COND); \
-      if(throw_exception) { \
-        Teuchos::TestForException_incrThrowNumber(); \
-        std::ostringstream omsg; \
-        omsg \
-          << std::setw(FROSCH_OUTPUT_INDENT) << " " << __FILE__ << ":" << __LINE__ << ":\n\n" \
-          << "Throw number = " << Teuchos::TestForException_getThrowNumber() \
-          << "\n\n" \
-          << std::setw(FROSCH_OUTPUT_INDENT) << " " << "Throw test that evaluated to true: "#COND \
-          << "\n\n" \
-          << std::setw(FROSCH_OUTPUT_INDENT) << " " << "[ERROR] " << MSG; \
-        const std::string &omsgstr = omsg.str(); \
-        TEUCHOS_STORE_STACKTRACE(); \
-        Teuchos::TestForException_break(omsgstr); \
-        throw std::logic_error(omsgstr); \
-      } \
+        const bool throw_exception = !(COND); \
+        if(throw_exception) { \
+            Teuchos::TestForException_incrThrowNumber(); \
+            std::ostringstream omsg; \
+            omsg \
+                << std::setw(FROSCH_OUTPUT_INDENT) << " " << __FILE__ << ":" << __LINE__ << ":\n\n" \
+                << "Throw number = " << Teuchos::TestForException_getThrowNumber() \
+                << "\n\n" \
+                << std::setw(FROSCH_OUTPUT_INDENT) << " " << "Assertion that evaluated to false: "#COND \
+                << "\n\n" \
+                << std::setw(FROSCH_OUTPUT_INDENT) << " " << "[ERROR] " << MSG; \
+            const std::string &omsgstr = omsg.str(); \
+            TEUCHOS_STORE_STACKTRACE(); \
+            Teuchos::TestForException_break(omsgstr); \
+            throw std::logic_error(omsgstr); \
+        } \
     }
 #endif
 
diff --git a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_decl.hpp b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_decl.hpp
index dc147819f3bb..b5a0095c3526 100644
--- a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_decl.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_decl.hpp
@@ -195,6 +195,13 @@ namespace FROSch {
     template <class SC, class LO, class GO, class NO>
     void readMM(std::string fileName, Teuchos::RCP<Xpetra::Matrix<SC,LO,GO,NO> > &matrix_,RCP<const Comm<int> > &comm);
 
+    template <class SC,class LO,class GO,class NO>
+    RCP<Map<LO,GO,NO> > BuildRepeatedMapGaleriStruct2D(RCP<const Matrix<SC,LO,GO,NO> > matrix,int M,int Dim);
+
+    // Note: the first argument of the 3D variant is a map (RCP<const Map>), although it is named `matrix`.
+    template <class SC,class LO,class GO,class NO>
+    RCP<Map<LO,GO,NO> > BuildRepeatedMapGaleriStruct3D(RCP<const Map<LO,GO,NO> > matrix,int M,int Dim);
+
     template <class LO,class GO,class NO>
     RCP<const Map<LO,GO,NO> > BuildUniqueMap(const RCP<const Map<LO,GO,NO> > map,
                                              bool useCreateOneToOneMap = true,
diff --git a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_def.hpp b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_def.hpp
index 6d81c41d0903..52360fc17016 100644
--- a/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_def.hpp
+++ b/packages/shylu/shylu_dd/frosch/src/Tools/FROSch_Tools_def.hpp
@@ -243,6 +243,116 @@ namespace FROSch {
         matrix_ = rcp_dynamic_cast<Matrix<SC,LO,GO,NO> >(tmpMatrix);
     }
 
+    // Build the repeated (overlapping) map for a structured 2D Galeri problem.
+    //
+    // The index list starts with the global indices that the map of `matrix` assigns to the
+    // calling rank. Unless the rank is the last one in its row of subdomains, indices next to
+    // the right edge are appended; unless the rank lies in the last row of subdomains, a run
+    // of consecutive indices offset by one row of subdomains is appended as well.
+    //
+    // matrix: matrix whose map (matrix->getMap()) provides owned indices, communicator and library
+    // M, Dim: sizes used in the index arithmetic; presumably the number of nodes per subdomain
+    //         edge and the number of degrees of freedom per node -- TODO confirm with the callers
+    //
+    // NOTE(review): the number of subdomains per row is the truncated square root of the number
+    // of ranks, so the layout presumably requires a square number of ranks -- confirm.
+    template <class SC,class LO,class GO,class NO>
+    RCP<Map<LO,GO,NO> > BuildRepeatedMapGaleriStruct2D(RCP<const Matrix<SC,LO,GO,NO> > matrix,int M,int Dim)
+    {
+        Teuchos::ArrayView<const GO> eleList = matrix->getMap()->getNodeElementList();
+        Teuchos::RCP<const Teuchos::Comm<int> > Comm = matrix->getMap()->getComm();
+
+        const int size = Comm->getSize();
+        const int rank = Comm->getRank();
+
+        const int numSubPerRow = sqrt(size);
+        const GO nodesInRow = M*Dim*numSubPerRow;
+        const bool lastInRow = (rank%numSubPerRow == numSubPerRow-1);
+
+        // Start from the indices owned by this rank
+        Teuchos::Array<GO> newEle(eleList);
+        newEle.reserve(eleList.size()+M*Dim+(M+1)*Dim);
+
+        // Indices following the right edge of this subdomain
+        if (!lastInRow) {
+            for (int j=0; j<M; j++) {
+                for (int i=0; i<Dim; i++) {
+                    newEle.push_back(eleList[Dim*M*(j+1)-1]+(i+1));
+                }
+            }
+        }
+
+        // Indices offset by one row of subdomains
+        if (rank<size-numSubPerRow) {
+            // NOTE(review): the two extra indices probably ought to be Dim (identical for Dim == 2) -- confirm
+            const int numHorz = lastInRow ? Dim*M : Dim*M+2;
+            for (int j=0; j<numHorz; j++) {
+                newEle.push_back(eleList[eleList.size()-Dim*M]+nodesInRow+j);
+            }
+        }
+
+        // The new map overlaps between ranks, so the global size of the map of `matrix` is not
+        // its global size; pass -1 so that the map computes the size from the index lists.
+        return Xpetra::MapFactory<LO,GO,NO>::Build(matrix->getMap()->lib(),-1,newEle(),0,Comm);
+    }
+
+    // Build a repeated (overlapping) map for a structured 3D Galeri problem; judging by the
+    // division by Dim and the variable names, its entries are node indices.
+    //
+    // Despite its name, `matrix` is a map. Its first locally owned index divided by Dim gives the
+    // starting index; from there a box of indices is enumerated with strides 1, nodesInRow and
+    // nodesInLev. The box has M+1 entries per direction, reduced to M in each direction in which
+    // the rank is the last subdomain (right, top, back).
+    //
+    // matrix: map providing the owned indices, the communicator and the underlying library
+    // M, Dim: presumably the number of nodes per subdomain edge and the number of degrees of
+    //         freedom per node -- TODO confirm with the callers
+    // NOTE(review): the arithmetic presumably requires a cubic number of ranks -- confirm.
+    template <class SC,class LO,class GO,class NO>
+    RCP<Map<LO,GO,NO> > BuildRepeatedMapGaleriStruct3D(RCP<const Map<LO,GO,NO> > matrix,int M,int Dim)
+    {
+        FROSCH_DETAILTIMER_START(Galeri3DMap,"BuildGeometricMap3D");
+
+        Teuchos::ArrayView<const GO> eleList = matrix->getNodeElementList();
+        Teuchos::RCP<const Teuchos::Comm<int> > Comm = matrix->getComm();
+
+        const int size = Comm->getSize();
+        const int rank = Comm->getRank();
+
+        // Adding 0.7 before truncation presumably protects against a cube root that is rounded down
+        const int numSubPerRow = std::pow(size,1/3.)+0.7;
+        const int subInLev = numSubPerRow*numSubPerRow;
+        const GO nodesInRow = M*numSubPerRow;
+        const GO nodesInLev = nodesInRow*nodesInRow;
+        const int subLevel = rank/subInLev;
+        const GO startval = eleList[0]/Dim;
+
+        // Position of this subdomain within the grid of subdomains
+        const bool notBack = (rank<size-subInLev);
+        const bool notRight = (rank%numSubPerRow != numSubPerRow-1);
+        // subLevel*subInLev <= rank always holds for the integer division above, so only the upper bound is tested
+        const bool notTop = (rank<(subLevel+1)*subInLev-numSubPerRow);
+
+        // Number of box entries in each direction
+        const int numI = notRight ? M+1 : M;
+        const int numJ = notTop ? M+1 : M;
+        const int numK = notBack ? M+1 : M;
+
+        Teuchos::Array<GO> newEle;
+        newEle.reserve(numI*numJ*numK);
+        for (int k=0; k<numK; k++) {
+            for (int j=0; j<numJ; j++) {
+                for (int i=0; i<numI; i++) {
+                    newEle.push_back(startval+i+j*nodesInRow+nodesInLev*k);
+                }
+            }
+        }
+
+        // The new map overlaps between ranks, so the global size of `matrix` is not its global
+        // size; pass -1 so that the map computes the size from the index lists.
+        return Xpetra::MapFactory<LO,GO,NO>::Build(matrix->lib(),-1,newEle(),0,Comm);
+    }
+
     template <class LO,class GO,class NO>
     RCP<const Map<LO,GO,NO> > BuildUniqueMap(const RCP<const Map<LO,GO,NO> > map,
                                              bool useCreateOneToOneMap,
@@ -1032,8 +1142,8 @@ namespace FROSch {
 
     template <class LO,class GO,class NO>
     ArrayRCP<RCP<const Map<LO,GO,NO> > > BuildNodeMapsFromDofMaps(ArrayRCP<ArrayRCP<RCP<const Map<LO,GO,NO> > > > dofsMapsVecVec,
-                                                            ArrayRCP<unsigned> dofsPerNodeVec,
-                                                            ArrayRCP<DofOrdering> dofOrderingVec)
+                                                                  ArrayRCP<unsigned> dofsPerNodeVec,
+                                                                  ArrayRCP<DofOrdering> dofOrderingVec)
     {
 
         typedef Map<LO,GO,NO> Map;
@@ -1107,8 +1217,7 @@ namespace FROSch {
 
                 }
                 nodeMapsVec[block] = MapFactory<LO,GO,NO>::Build( dofsMapsVecVec[block][0]->lib(), -1,globalIndicesNode(), 0, dofsMapsVecVec[block][0]->getComm() );
-            }
-            else{ //DimensionWise
+            } else { //DimensionWise
                 GO minGID = dofsMapsVecVec[block][0]->getMinAllGlobalIndex();
                 ArrayView< const GO > globalIndices = dofsMapsVecVec[block][0]->getNodeElementList();
                 Array<GO> globalIndicesNode( globalIndices );
diff --git a/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Elasticity/main.cpp b/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Elasticity/main.cpp
index 10bd24bfd97f..8a834b5178c5 100644
--- a/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Elasticity/main.cpp
+++ b/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Elasticity/main.cpp
@@ -125,7 +125,8 @@ int main(int argc, char *argv[])
     My_CLP.setOption("PLIST",&xmlFile,"File name of the parameter list.");
     bool useepetra = false;
     My_CLP.setOption("USEEPETRA","USETPETRA",&useepetra,"Use Epetra infrastructure for the linear algebra.");
-
+    bool useGeoMap = false;
+    My_CLP.setOption("useGeoMap","useAlgMap",&useGeoMap,"Use Geometric Map");
     My_CLP.recogniseAllOptions(true);
     My_CLP.throwExceptions(false);
     CommandLineProcessor::EParseCommandLineReturn parseReturn = My_CLP.parse(argc,argv);
@@ -200,7 +201,25 @@ int main(int argc, char *argv[])
             RCP<Galeri::Xpetra::Problem<Map<LO,GO,NO>,CrsMatrixWrap<SC,LO,GO,NO>,MultiVector<SC,LO,GO,NO> > > Problem = Galeri::Xpetra::BuildProblem<SC,LO,GO,Map<LO,GO,NO>,CrsMatrixWrap<SC,LO,GO,NO>,MultiVector<SC,LO,GO,NO> >("Elasticity3D",UniqueMap,GaleriList);
             K = Problem->BuildMatrix();
         }
-        RCP<Map<LO,GO,NO> > RepeatedMap = BuildRepeatedMapNonConst<LO,GO,NO>(K->getCrsGraph());
+
+
+        RCP<Map<LO,GO,NO> > FullRepeatedMap;
+        RCP<Map<LO,GO,NO> > RepeatedMap;
+        RCP<const Map<LO,GO,NO> > FullRepeatedMapNode;
+        if (useGeoMap) {
+            if (Dimension == 2) {
+                FullRepeatedMap = BuildRepeatedMapGaleriStruct2D<SC,LO,GO,NO>(K,M,Dimension);
+                RepeatedMap = FullRepeatedMap;
+            } else if (Dimension == 3) {
+                FullRepeatedMapNode = BuildRepeatedMapGaleriStruct3D<SC,LO,GO,NO>(K->getMap(),M,Dimension);
+                FullRepeatedMap = BuildMapFromNodeMap(FullRepeatedMapNode,Dimension,NodeWise);
+                //FullRepeatedMapNode->describe(*fancy,Teuchos::VERB_EXTREME);
+                RepeatedMap = FullRepeatedMap;
+            }
+        } else {
+            RepeatedMap = BuildRepeatedMapNonConst<LO,GO,NO>(K->getCrsGraph());
+        }
+
 
         RCP<MultiVector<SC,LO,GO,NO> > xSolution = MultiVectorFactory<SC,LO,GO,NO>::Build(UniqueMap,1);
         RCP<MultiVector<SC,LO,GO,NO> > xRightHandSide = MultiVectorFactory<SC,LO,GO,NO>::Build(UniqueMap,1);
@@ -214,7 +233,7 @@ int main(int argc, char *argv[])
         RCP<const MultiVectorBase<SC> >thyraB = ThyraUtils<SC,LO,GO,NO>::toThyraMultiVector(xRightHandSide);
 
         //-----------Set Coordinates and RepMap in ParameterList--------------------------
-        RCP<ParameterList> plList =  sublist(parameterList,"Preconditioner Types");
+        RCP<ParameterList> plList = sublist(parameterList,"Preconditioner Types");
         sublist(plList,"FROSch")->set("Dimension",Dimension);
         sublist(plList,"FROSch")->set("Overlap",Overlap);
         sublist(plList,"FROSch")->set("DofOrdering","NodeWise");
diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp
index ef87eaf00942..9730acaf0525 100644
--- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp
+++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp
@@ -524,7 +524,7 @@ namespace Tacho {
       track_free(_factorize_mode.span()*sizeof(ordinal_type));
       track_free(_solve_mode.span()*sizeof(ordinal_type));
       track_free(_level_sids.span()*sizeof(ordinal_type));
-      if (verbose || true) {
+      if (verbose) {
         printf("Summary: LevelSetTools-Variant-%d (Release)\n", variant);
         printf("============================================\n");
         print_stat_memory();
@@ -616,7 +616,7 @@ namespace Tacho {
       for (ordinal_type i=0;i<_nstreams;++i) {
         ExecSpaceFactory<exec_space>::createInstance(_cuda_streams[i], _exec_instances[i]);
       }
-      if (verbose || true) {
+      if (verbose) {
         printf("Summary: CreateStream : %3d\n", _nstreams);
         printf("===========================\n");          
       }
@@ -1726,7 +1726,18 @@ namespace Tacho {
         const ordinal_type half_level = _nlevel/2;
         //const ordinal_type team_size_factor[2] = { 64, 16 }, vector_size_factor[2] = { 8, 8};
         //const ordinal_type team_size_factor[2] = { 16, 16 }, vector_size_factor[2] = { 32, 32};
+#if defined (CUDA_VERSION)
+#if (11000 > CUDA_VERSION)
+        /// CUDA older than 11.0 (CUDA_VERSION < 11000)
+        const ordinal_type team_size_factor[2] = { 32, 64 }, vector_size_factor[2] = { 8, 4};        
+#else 
+        /// CUDA 11.0 and newer (CUDA_VERSION >= 11000)
+        const ordinal_type team_size_factor[2] = { 64, 64 }, vector_size_factor[2] = { 8, 4};
+#endif
+#else
+        /// not cuda ... whatever..
         const ordinal_type team_size_factor[2] = { 64, 64 }, vector_size_factor[2] = { 8, 4};
+#endif
         const ordinal_type team_size_update[2] = { 16, 8 }, vector_size_update[2] = { 32, 32};
         {
           typedef TeamFunctor_FactorizeLDL<supernode_info_type> functor_type;
@@ -1848,7 +1859,18 @@ namespace Tacho {
 #endif
         // this should be considered with average problem sizes in levels
         const ordinal_type half_level = _nlevel/2;
+#if defined (CUDA_VERSION)
+#if (11000 > CUDA_VERSION)
+        /// CUDA older than 11.0 (CUDA_VERSION < 11000)
+        const ordinal_type team_size_solve[2] = { 32, 16 }, vector_size_solve[2] = { 8, 8};
+#else
+        /// CUDA 11.0 and newer (CUDA_VERSION >= 11000)
+        const ordinal_type team_size_solve[2] = { 32, 16 }, vector_size_solve[2] = { 8, 8};
+#endif
+#else
+        /// not cuda whatever...
         const ordinal_type team_size_solve[2] = { 64, 16 }, vector_size_solve[2] = { 8, 8};
+#endif
         const ordinal_type team_size_update[2] = { 128, 32}, vector_size_update[2] = { 1, 1};
         {
           typedef TeamFunctor_SolveLowerLDL<supernode_info_type> functor_type;
diff --git a/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixMPVectorUnitTest.hpp b/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixMPVectorUnitTest.hpp
index a6d2b21b2b61..b3f44635b715 100644
--- a/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixMPVectorUnitTest.hpp
+++ b/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixMPVectorUnitTest.hpp
@@ -55,6 +55,7 @@
 #include "Tpetra_Vector.hpp"
 #include "Tpetra_CrsGraph.hpp"
 #include "Tpetra_CrsMatrix.hpp"
+#include "Tpetra_Details_WrappedDualView.hpp"
 #include "Stokhos_Tpetra_CG.hpp"
 
 // Belos solver
@@ -1001,6 +1002,46 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(
   }
 }
 
+//
+// Test interaction between Tpetra WrappedDualView and MP::Vector
+//
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(
+  Tpetra_CrsMatrix_MP, WrappedDualView, Storage, LocalOrdinal, GlobalOrdinal, Node )
+{
+  // BMK 6-2021: This test is required because a View of MP::Vector has
+  // slightly different behavior than a typical Kokkos::View.
+  // If you construct a Kokkos::View with a label and 0 extent, it gets a
+  // non-null allocation. But for View<MP::Vector>, the same constructor
+  // produces a null data pointer but an active reference counting node
+  // (use_count() > 0).
+  // This test makes sure that Tpetra WrappedDualView works correctly with a
+  // View where data() == nullptr but use_count() > 0.
+
+  typedef Sacado::MP::Vector<Storage> Scalar;
+
+  using DualViewType = Kokkos::DualView<Scalar*, typename Node::device_type>;
+  using WDV = Tpetra::Details::WrappedDualView<DualViewType>;
+  using values_view = typename DualViewType::t_dev;
+
+  // Ensure device is initialized
+  if ( !Kokkos::is_initialized() )
+    Kokkos::initialize();
+
+  // Wrap a labeled, zero-extent device view; the local handle (myView) is
+  // released at the end of the inner scope, leaving only wdv's references.
+  WDV wdv;
+  {
+    values_view myView("emptyTestView", 0);
+    wdv = WDV(myView);
+  }
+
+  size_t use_h = wdv.getHostView(Tpetra::Access::ReadOnly).use_count();
+  size_t use_d = wdv.getDeviceView(Tpetra::Access::ReadOnly).use_count();
+  //The WrappedDualView is now the only object holding references to the host and device views,
+  //so they should have identical use counts.
+  TEST_EQUALITY(use_h, use_d);
+}
+
 //
 // Test simple CG solve without preconditioning for a 1-D Laplacian matrix
 //
@@ -2448,6 +2489,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(
   TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, MultiVectorDotSub, S, LO, GO, N ) \
   TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, MatrixVectorMultiply, S, LO, GO, N ) \
   TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, MatrixMultiVectorMultiply, S, LO, GO, N ) \
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, WrappedDualView, S, LO, GO, N ) \
   TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, Flatten, S, LO, GO, N ) \
   TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, SimpleCG, S, LO, GO, N ) \
   TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Tpetra_CrsMatrix_MP, SimplePCG_Muelu, S, LO, GO, N ) \
diff --git a/packages/tempus/src/Tempus_Integrator.hpp b/packages/tempus/src/Tempus_Integrator.hpp
index 05e6e5d80d4f..4dbcd4a2f79d 100644
--- a/packages/tempus/src/Tempus_Integrator.hpp
+++ b/packages/tempus/src/Tempus_Integrator.hpp
@@ -82,6 +82,8 @@ class Integrator
     virtual void setTempusParameterList(Teuchos::RCP<Teuchos::ParameterList> pl) = 0;
     /// Returns the SolutionHistory for this Integrator
     virtual Teuchos::RCP<const SolutionHistory<Scalar> > getSolutionHistory() const = 0;
+    /// Returns the non-const SolutionHistory for this Integrator
+    virtual Teuchos::RCP<SolutionHistory<Scalar> > getNonConstSolutionHistory() = 0;
     /// Returns the TimeStepControl for this Integrator
     virtual Teuchos::RCP<const TimeStepControl<Scalar> > getTimeStepControl() const = 0;
     virtual Teuchos::RCP<TimeStepControl<Scalar> > getNonConstTimeStepControl() = 0;
diff --git a/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_decl.hpp b/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_decl.hpp
index b8d3479e7ac6..7780cb750c7f 100644
--- a/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_decl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_decl.hpp
@@ -108,6 +108,8 @@ class IntegratorAdjointSensitivity :
   virtual void setTempusParameterList(Teuchos::RCP<Teuchos::ParameterList> pl) override;
   /// Get the SolutionHistory
   virtual Teuchos::RCP<const SolutionHistory<Scalar> > getSolutionHistory() const override;
+  /// Get the non-const SolutionHistory
+  virtual Teuchos::RCP<SolutionHistory<Scalar> > getNonConstSolutionHistory() override;
    /// Get the TimeStepControl
   virtual Teuchos::RCP<const TimeStepControl<Scalar> > getTimeStepControl() const override;
   virtual Teuchos::RCP<TimeStepControl<Scalar> > getNonConstTimeStepControl() override;
diff --git a/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_impl.hpp b/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_impl.hpp
index 4339983292cd..b33e26d13289 100644
--- a/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_impl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorAdjointSensitivity_impl.hpp
@@ -280,6 +280,14 @@ getSolutionHistory() const
   return solutionHistory_;
 }
 
+template<class Scalar>
+Teuchos::RCP<SolutionHistory<Scalar> >
+IntegratorAdjointSensitivity<Scalar>::
+getNonConstSolutionHistory()
+{
+  return solutionHistory_;
+}
+
 template<class Scalar>
 Teuchos::RCP<const TimeStepControl<Scalar> >
 IntegratorAdjointSensitivity<Scalar>::
@@ -376,12 +384,13 @@ describe(
   Teuchos::FancyOStream          &out,
   const Teuchos::EVerbosityLevel verbLevel) const
 {
-
   auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
   l_out->setOutputToRootOnly(0);
+
   *l_out << description() << "::describe" << std::endl;
-  state_integrator_->describe(out, verbLevel);
-  adjoint_integrator_->describe(out, verbLevel);
+  state_integrator_->describe(*l_out, verbLevel);
+  adjoint_integrator_->describe(*l_out, verbLevel);
 }
 
 template<class Scalar>
diff --git a/packages/tempus/src/Tempus_IntegratorBasic.cpp b/packages/tempus/src/Tempus_IntegratorBasic.cpp
index 7607f8f8bde2..bbfc6b59bf73 100644
--- a/packages/tempus/src/Tempus_IntegratorBasic.cpp
+++ b/packages/tempus/src/Tempus_IntegratorBasic.cpp
@@ -16,6 +16,10 @@ namespace Tempus {
 
   TEMPUS_INSTANTIATE_TEMPLATE_CLASS(IntegratorBasic)
 
+  // Nonmember ctor (ParameterList-only overload), instantiated for double
+  template Teuchos::RCP<IntegratorBasic<double> > createIntegratorBasic(
+    Teuchos::RCP<Teuchos::ParameterList>        parameterList);
+
   // Nonmember ctor
   template Teuchos::RCP<IntegratorBasic<double> > createIntegratorBasic(
     Teuchos::RCP<Teuchos::ParameterList>        parameterList,
diff --git a/packages/tempus/src/Tempus_IntegratorBasicOld_decl.hpp b/packages/tempus/src/Tempus_IntegratorBasicOld_decl.hpp
index 5f1d13677a05..b28288b2fae4 100644
--- a/packages/tempus/src/Tempus_IntegratorBasicOld_decl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorBasicOld_decl.hpp
@@ -111,6 +111,9 @@ class IntegratorBasicOld
     /// Get the SolutionHistory
     virtual Teuchos::RCP<const SolutionHistory<Scalar> > getSolutionHistory() const override
       {return solutionHistory_;}
+    /// Get the non-const SolutionHistory
+    virtual Teuchos::RCP<SolutionHistory<Scalar> > getNonConstSolutionHistory() override
+      {return solutionHistory_;}
     /// Set the SolutionHistory
     virtual void setSolutionHistory(
       Teuchos::RCP<SolutionHistory<Scalar> > sh = Teuchos::null);
diff --git a/packages/tempus/src/Tempus_IntegratorBasicOld_impl.hpp b/packages/tempus/src/Tempus_IntegratorBasicOld_impl.hpp
index 3c73274007b2..68c0379e70ec 100644
--- a/packages/tempus/src/Tempus_IntegratorBasicOld_impl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorBasicOld_impl.hpp
@@ -387,8 +387,8 @@ bool IntegratorBasicOld<Scalar>::advanceTime()
     startIntegrator();
     integratorObserver_->observeStartIntegrator(*this);
 
-    while (integratorStatus_ == WORKING and
-        timeStepControl_->timeInRange (solutionHistory_->getCurrentTime()) and
+    while (integratorStatus_ == WORKING &&
+        timeStepControl_->timeInRange (solutionHistory_->getCurrentTime()) &&
         timeStepControl_->indexInRange(solutionHistory_->getCurrentIndex())){
 
       stepperTimer_->reset();
@@ -483,11 +483,11 @@ void IntegratorBasicOld<Scalar>::checkTimeStep()
   }
 
   // Check Stepper failure.
-  if (ws->getSolutionStatus() == Status::FAILED or
+  if (ws->getSolutionStatus() == Status::FAILED ||
        // Constant time step failure
-       ((timeStepControl_->getStepType() == "Constant") and
-        (ws->getTimeStep() != timeStepControl_->getInitTimeStep()) and
-        (ws->getOutput() != true) and
+       ((timeStepControl_->getStepType() == "Constant") &&
+        (ws->getTimeStep() != timeStepControl_->getInitTimeStep()) &&
+        (ws->getOutput() != true) &&
         (ws->getTime() != timeStepControl_->getFinalTime())
        )
      )
@@ -502,7 +502,7 @@ void IntegratorBasicOld<Scalar>::checkTimeStep()
     if (ws->getSolutionStatus() == Status::FAILED) {
       *out << "Solution Status = " << toString(ws->getSolutionStatus())
            << std::endl;
-    } else if ((timeStepControl_->getStepType() == "Constant") and
+    } else if ((timeStepControl_->getStepType() == "Constant") &&
                (ws->getTimeStep() != timeStepControl_->getInitTimeStep())) {
       *out << "dt != Constant dt (="<<timeStepControl_->getInitTimeStep()<<")"
            << std::endl;
@@ -526,7 +526,7 @@ void IntegratorBasicOld<Scalar>::endIntegrator()
 {
   std::string exitStatus;
   if (solutionHistory_->getCurrentState()->getSolutionStatus() ==
-      Status::FAILED or integratorStatus_ == Status::FAILED) {
+      Status::FAILED || integratorStatus_ == Status::FAILED) {
     exitStatus = "Time integration FAILURE!";
   } else {
     integratorStatus_ = Status::PASSED;
diff --git a/packages/tempus/src/Tempus_IntegratorBasic_decl.hpp b/packages/tempus/src/Tempus_IntegratorBasic_decl.hpp
index a6bbfbd0e1ed..1170b5f65146 100644
--- a/packages/tempus/src/Tempus_IntegratorBasic_decl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorBasic_decl.hpp
@@ -100,6 +100,9 @@ class IntegratorBasic : virtual public Tempus::Integrator<Scalar>
     /// Get the SolutionHistory
     virtual Teuchos::RCP<const SolutionHistory<Scalar> > getSolutionHistory() const override
       {return solutionHistory_;}
+    /// Get the non-const SolutionHistory
+    virtual Teuchos::RCP<SolutionHistory<Scalar> > getNonConstSolutionHistory() override
+      {return solutionHistory_;}
     /// Set the SolutionHistory
     virtual void setSolutionHistory(
       Teuchos::RCP<SolutionHistory<Scalar> > sh = Teuchos::null);
@@ -208,6 +211,12 @@ class IntegratorBasic : virtual public Tempus::Integrator<Scalar>
 };
 
 
+/// Nonmember constructor from a ParameterList only (no model argument)
+template<class Scalar>
+Teuchos::RCP<IntegratorBasic<Scalar> > createIntegratorBasic(
+  Teuchos::RCP<Teuchos::ParameterList>                pList);
+
+
 /// Nonmember constructor
 template<class Scalar>
 Teuchos::RCP<IntegratorBasic<Scalar> > createIntegratorBasic(
diff --git a/packages/tempus/src/Tempus_IntegratorBasic_impl.hpp b/packages/tempus/src/Tempus_IntegratorBasic_impl.hpp
index b07d696e1379..a38565776c35 100644
--- a/packages/tempus/src/Tempus_IntegratorBasic_impl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorBasic_impl.hpp
@@ -250,25 +250,33 @@ std::string IntegratorBasic<Scalar>::description() const
 
 template<class Scalar>
 void IntegratorBasic<Scalar>::describe(
-  Teuchos::FancyOStream          &in_out,
+  Teuchos::FancyOStream          &out,
   const Teuchos::EVerbosityLevel verbLevel) const
 {
-  auto out = Teuchos::fancyOStream( in_out.getOStream() );
-  out->setOutputToRootOnly(0);
-  *out << description() << "::describe" << std::endl;
-  *out << "solutionHistory= " << solutionHistory_->description()<<std::endl;
-  *out << "timeStepControl= " << timeStepControl_->description()<<std::endl;
-  *out << "stepper        = " << stepper_        ->description()<<std::endl;
-
-  if (Teuchos::as<int>(verbLevel) >=
-              Teuchos::as<int>(Teuchos::VERB_HIGH)) {
-    *out << "solutionHistory= " << std::endl;
-    solutionHistory_->describe(in_out,verbLevel);
-    *out << "timeStepControl= " << std::endl;
-    timeStepControl_->describe(in_out,verbLevel);
-    *out << "stepper        = " << std::endl;
-    stepper_        ->describe(in_out,verbLevel);
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
+  // Print a banner, then describe each component, or print its null handle when unset.
+  *l_out << "\n--- " << this->description() << " ---" << std::endl;
+
+  if ( solutionHistory_ != Teuchos::null ) {
+    solutionHistory_->describe(*l_out,verbLevel);
+  } else {
+    *l_out << "solutionHistory = " << solutionHistory_ << std::endl;
   }
+  // Hand the same local stream (l_out) to every component, as for solutionHistory_.
+  if ( timeStepControl_ != Teuchos::null ) {
+    timeStepControl_->describe(*l_out,verbLevel);
+  } else {
+    *l_out << "timeStepControl = " << timeStepControl_ << std::endl;
+  }
+
+  if ( stepper_ != Teuchos::null ) {
+    stepper_->describe(*l_out,verbLevel);
+  } else {
+    *l_out << "stepper         = " << stepper_ << std::endl;
+  }
+  *l_out << std::string(this->description().length()+8, '-') <<std::endl;
 }
 
 
@@ -393,7 +401,7 @@ void IntegratorBasic<Scalar>::checkTimeStep()
   if (ws->getNFailures() >= timeStepControl_->getMaxFailures()) {
     RCP<Teuchos::FancyOStream> out = this->getOStream();
     out->setOutputToRootOnly(0);
-    Teuchos::OSTab ostab(out,2,"checkTimeStep");
+    Teuchos::OSTab ostab(out, 2, "checkTimeStep");
     *out << "Failure - Stepper has failed more than the maximum allowed.\n"
          << "  (nFailures = "<<ws->getNFailures()<< ") >= (nFailuresMax = "
          << timeStepControl_->getMaxFailures()<<")" << std::endl;
@@ -404,7 +412,7 @@ void IntegratorBasic<Scalar>::checkTimeStep()
       >= timeStepControl_->getMaxConsecFailures()){
     RCP<Teuchos::FancyOStream> out = this->getOStream();
     out->setOutputToRootOnly(0);
-    Teuchos::OSTab ostab(out,1,"checkTimeStep");
+    Teuchos::OSTab ostab(out, 1, "checkTimeStep");
     *out << "Failure - Stepper has failed more than the maximum "
          << "consecutive allowed.\n"
          << "  (nConsecutiveFailures = "<<ws->getNConsecutiveFailures()
@@ -427,7 +435,7 @@ void IntegratorBasic<Scalar>::checkTimeStep()
   {
     RCP<Teuchos::FancyOStream> out = this->getOStream();
     out->setOutputToRootOnly(0);
-    Teuchos::OSTab ostab(out,0,"checkTimeStep");
+    Teuchos::OSTab ostab(out, 0, "checkTimeStep");
     *out <<std::scientific
       <<std::setw( 6)<<std::setprecision(3)<<ws->getIndex()
       <<std::setw(11)<<std::setprecision(3)<<ws->getTime()
@@ -550,8 +558,7 @@ IntegratorBasic<Scalar>::getValidParameters() const
 // ------------------------------------------------------------------------
 template<class Scalar>
 Teuchos::RCP<IntegratorBasic<Scalar> > createIntegratorBasic(
-  Teuchos::RCP<Teuchos::ParameterList>                     tempusPL,
-  const Teuchos::RCP<Thyra::ModelEvaluator<Scalar> >&      model)
+  Teuchos::RCP<Teuchos::ParameterList>                     tempusPL)
 {
   auto integratorName = tempusPL->get<std::string>("Integrator Name");
   auto integratorPL = Teuchos::sublist(tempusPL, integratorName, true);
@@ -573,12 +580,11 @@ Teuchos::RCP<IntegratorBasic<Scalar> > createIntegratorBasic(
     auto stepperPL = Teuchos::sublist(tempusPL, stepperName, true);
     stepperPL->setName(stepperName);
     auto sf = Teuchos::rcp(new StepperFactory<Scalar>());
-    integrator->setStepper(sf->createStepper(stepperPL, model));
+    integrator->setStepper(sf->createStepper(stepperPL));
   } else {
     // Construct default Stepper
-    Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > constModel = model;
-    integrator->setStepper(
-      createStepperForwardEuler(constModel, Teuchos::null));
+    auto stepper = Teuchos::rcp(new StepperForwardEuler<Scalar>());
+    integrator->setStepper(stepper);
   }
 
   // Set TimeStepControl
@@ -591,23 +597,16 @@ Teuchos::RCP<IntegratorBasic<Scalar> > createIntegratorBasic(
     integrator->setTimeStepControl(rcp(new TimeStepControl<Scalar>()));
   }
 
-  // Construct default IC state from the application model and TimeStepControl
-  auto newState = createSolutionStateME(integrator->getStepper()->getModel(),
-    integrator->getStepper()->getDefaultStepperState());
-  newState->setTime    (integrator->getTimeStepControl()->getInitTime());
-  newState->setIndex   (integrator->getTimeStepControl()->getInitIndex());
-  newState->setTimeStep(integrator->getTimeStepControl()->getInitTimeStep());
-  newState->setTolRel  (integrator->getTimeStepControl()->getMaxRelError());
-  newState->setTolAbs  (integrator->getTimeStepControl()->getMaxAbsError());
-  newState->setOrder   (integrator->getStepper()->getOrder());
-  newState->setSolutionStatus(Status::PASSED);  // ICs are considered passing.
-
   // Set SolutionHistory
-  auto shPL = Teuchos::sublist(integratorPL, "Solution History", true);
-  auto sh   = createSolutionHistoryPL<Scalar>(shPL);
-  sh->addState(newState);
-  integrator->getStepper()->setInitialConditions(sh);
-  integrator->setSolutionHistory(sh);
+  if (integratorPL->isSublist("Solution History")) {
+    // Construct from Integrator ParameterList
+    auto shPL = Teuchos::sublist(integratorPL, "Solution History", true);
+    auto sh   = createSolutionHistoryPL<Scalar>(shPL);
+    integrator->setSolutionHistory(sh);
+  } else {
+    // Construct default SolutionHistory
+    integrator->setSolutionHistory(createSolutionHistory<Scalar>());
+  }
 
   // Set Observer to default.
   integrator->setObserver(Teuchos::null);
@@ -635,6 +634,39 @@ Teuchos::RCP<IntegratorBasic<Scalar> > createIntegratorBasic(
   auto vStepperPL   = Teuchos::sublist(validPL, vStepperName, true);
   stepperPL->validateParametersAndSetDefaults(*vStepperPL);
 
+  return integrator;  // integrator is not initialized (missing model and IC).
+}
+
+
+// Nonmember constructor
+// ------------------------------------------------------------------------
+template<class Scalar>
+Teuchos::RCP<IntegratorBasic<Scalar> > createIntegratorBasic(
+  Teuchos::RCP<Teuchos::ParameterList>                     tempusPL,
+  const Teuchos::RCP<Thyra::ModelEvaluator<Scalar> >&      model)
+{
+  auto integrator = createIntegratorBasic<Scalar>(tempusPL);
+  if ( model == Teuchos::null ) return integrator;
+
+  Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > constModel = model;
+  integrator->setModel(constModel);
+
+  // Construct default IC state from the application model and TimeStepControl
+  auto newState = createSolutionStateME(integrator->getStepper()->getModel(),
+    integrator->getStepper()->getDefaultStepperState());
+  newState->setTime    (integrator->getTimeStepControl()->getInitTime());
+  newState->setIndex   (integrator->getTimeStepControl()->getInitIndex());
+  newState->setTimeStep(integrator->getTimeStepControl()->getInitTimeStep());
+  newState->setTolRel  (integrator->getTimeStepControl()->getMaxRelError());
+  newState->setTolAbs  (integrator->getTimeStepControl()->getMaxAbsError());
+  newState->setOrder   (integrator->getStepper()->getOrder());
+  newState->setSolutionStatus(Status::PASSED);  // ICs are considered passing.
+
+  // Set SolutionHistory IC
+  auto sh = integrator->getNonConstSolutionHistory();
+  sh->addState(newState);
+  integrator->getStepper()->setInitialConditions(sh);
+
   integrator->initialize();
 
   return integrator;
diff --git a/packages/tempus/src/Tempus_IntegratorForwardSensitivity_decl.hpp b/packages/tempus/src/Tempus_IntegratorForwardSensitivity_decl.hpp
index 199c9a07c0ab..c556e796a6dd 100644
--- a/packages/tempus/src/Tempus_IntegratorForwardSensitivity_decl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorForwardSensitivity_decl.hpp
@@ -164,6 +164,9 @@ class IntegratorForwardSensitivity
   /// Get the SolutionHistory
   virtual Teuchos::RCP<const SolutionHistory<Scalar> > getSolutionHistory() const override
     { return integrator_->getSolutionHistory(); }
+  /// Get the non-const SolutionHistory (forwarded from integrator_)
+  virtual Teuchos::RCP<SolutionHistory<Scalar> > getNonConstSolutionHistory() override
+    { return integrator_->getNonConstSolutionHistory(); }
   /// Set the SolutionHistory
   virtual void setSolutionHistory(
     Teuchos::RCP<SolutionHistory<Scalar> > sh = Teuchos::null)
diff --git a/packages/tempus/src/Tempus_IntegratorForwardSensitivity_impl.hpp b/packages/tempus/src/Tempus_IntegratorForwardSensitivity_impl.hpp
index c71273a2910e..7b8cb372d7ea 100644
--- a/packages/tempus/src/Tempus_IntegratorForwardSensitivity_impl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorForwardSensitivity_impl.hpp
@@ -242,13 +242,15 @@ template<class Scalar>
 void
 IntegratorForwardSensitivity<Scalar>::
 describe(
-  Teuchos::FancyOStream          &in_out,
+  Teuchos::FancyOStream          &out,
   const Teuchos::EVerbosityLevel verbLevel) const
 {
-  auto out = Teuchos::fancyOStream( in_out.getOStream() );
-  out->setOutputToRootOnly(0);
-  *out << description() << "::describe" << std::endl;
-  integrator_->describe(in_out, verbLevel);
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
+
+  *l_out << description() << "::describe" << std::endl;
+  integrator_->describe(*l_out, verbLevel);
 }
 
 template<class Scalar>
diff --git a/packages/tempus/src/Tempus_IntegratorObserverBasic_impl.hpp b/packages/tempus/src/Tempus_IntegratorObserverBasic_impl.hpp
index 4472e3285e40..fc8e559d7e44 100644
--- a/packages/tempus/src/Tempus_IntegratorObserverBasic_impl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorObserverBasic_impl.hpp
@@ -76,7 +76,7 @@ observeEndTimeStep(const Integrator<Scalar>& integrator){
 
      const Teuchos::RCP<Teuchos::FancyOStream> out = integrator.getOStream();
      out->setOutputToRootOnly(0);
-     Teuchos::OSTab ostab(out,0,"ScreenOutput");
+     Teuchos::OSTab ostab(out, 0, "ScreenOutput");
      *out<<std::scientific
         <<std::setw( 6)<<std::setprecision(3)<<cs->getIndex()
         <<std::setw(11)<<std::setprecision(3)<<cs->getTime()
diff --git a/packages/tempus/src/Tempus_IntegratorObserverSubcycling_impl.hpp b/packages/tempus/src/Tempus_IntegratorObserverSubcycling_impl.hpp
index 287913a77fdf..87060ddac28b 100644
--- a/packages/tempus/src/Tempus_IntegratorObserverSubcycling_impl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorObserverSubcycling_impl.hpp
@@ -25,7 +25,7 @@ observeStartIntegrator(const Integrator<Scalar>& integrator){
 
   const Teuchos::RCP<Teuchos::FancyOStream> out = integrator.getOStream();
   out->setOutputToRootOnly(0);
-  Teuchos::OSTab ostab(out,0,"ScreenOutput");
+  Teuchos::OSTab ostab(out, 0, "ScreenOutput");
   *out << "\n    Begin Subcycling -------------------------------------------------------\n";
     // << "  Step       Time         dt  Abs Error  Rel Error  Order  nFail  dCompTime"
     // << std::endl;
@@ -68,7 +68,7 @@ observeEndTimeStep(const Integrator<Scalar>& integrator){
 
      const Teuchos::RCP<Teuchos::FancyOStream> out = integrator.getOStream();
      out->setOutputToRootOnly(0);
-     Teuchos::OSTab ostab(out,0,"ScreenOutput");
+     Teuchos::OSTab ostab(out, 0, "ScreenOutput");
      *out<<std::scientific
         <<std::setw( 6)<<std::setprecision(3)<<cs->getIndex()
         <<std::setw(11)<<std::setprecision(3)<<cs->getTime()
@@ -89,7 +89,7 @@ observeEndIntegrator(const Integrator<Scalar>& integrator){
 
   const Teuchos::RCP<Teuchos::FancyOStream> out = integrator.getOStream();
   out->setOutputToRootOnly(0);
-  Teuchos::OSTab ostab(out,0,"ScreenOutput");
+  Teuchos::OSTab ostab(out, 0, "ScreenOutput");
   *out << "    End Subcycling ---------------------------------------------------------\n\n";
 }
 
diff --git a/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_decl.hpp b/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_decl.hpp
index 187d37960096..ec2430a47cbf 100644
--- a/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_decl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_decl.hpp
@@ -109,6 +109,8 @@ class IntegratorPseudoTransientAdjointSensitivity
   virtual void setTempusParameterList(Teuchos::RCP<Teuchos::ParameterList> pl) override;
   /// Get the SolutionHistory
   virtual Teuchos::RCP<const SolutionHistory<Scalar> > getSolutionHistory() const override;
+  /// Get the non-const SolutionHistory
+  virtual Teuchos::RCP<SolutionHistory<Scalar> > getNonConstSolutionHistory() override;
    /// Get the TimeStepControl
   virtual Teuchos::RCP<const TimeStepControl<Scalar> > getTimeStepControl() const override;
   virtual Teuchos::RCP<TimeStepControl<Scalar> > getNonConstTimeStepControl() override;
diff --git a/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_impl.hpp b/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_impl.hpp
index ca347ab8981d..0867093373e6 100644
--- a/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_impl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorPseudoTransientAdjointSensitivity_impl.hpp
@@ -163,6 +163,14 @@ getSolutionHistory() const
   return solutionHistory_;
 }
 
+template<class Scalar>
+Teuchos::RCP<SolutionHistory<Scalar> >
+IntegratorPseudoTransientAdjointSensitivity<Scalar>::
+getNonConstSolutionHistory()
+{
+  return solutionHistory_;
+}
+
 template<class Scalar>
 Teuchos::RCP<const TimeStepControl<Scalar> >
 IntegratorPseudoTransientAdjointSensitivity<Scalar>::
@@ -271,13 +279,16 @@ template<class Scalar>
 void
 IntegratorPseudoTransientAdjointSensitivity<Scalar>::
 describe(
-  Teuchos::FancyOStream          &in_out,
+  Teuchos::FancyOStream          &out,
   const Teuchos::EVerbosityLevel verbLevel) const
 {
-  auto out = Teuchos::fancyOStream( in_out.getOStream() );
-  *out << description() << "::describe" << std::endl;
-  state_integrator_->describe(*out, verbLevel);
-  sens_integrator_->describe(*out, verbLevel);
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
+
+  *l_out << description() << "::describe" << std::endl;
+  state_integrator_->describe(*l_out, verbLevel);
+  sens_integrator_->describe(*l_out, verbLevel);
 }
 
 template<class Scalar>
diff --git a/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_decl.hpp b/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_decl.hpp
index 0bfeb3cff6e4..01dfd1996e73 100644
--- a/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_decl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_decl.hpp
@@ -123,6 +123,8 @@ class IntegratorPseudoTransientForwardSensitivity
   virtual void setTempusParameterList(Teuchos::RCP<Teuchos::ParameterList> pl) override;
   /// Get the SolutionHistory
   virtual Teuchos::RCP<const SolutionHistory<Scalar> > getSolutionHistory() const override;
+  /// Get the non-const SolutionHistory
+  virtual Teuchos::RCP<SolutionHistory<Scalar> > getNonConstSolutionHistory() override;
    /// Get the TimeStepControl
   virtual Teuchos::RCP<const TimeStepControl<Scalar> > getTimeStepControl() const override;
   virtual Teuchos::RCP<TimeStepControl<Scalar> > getNonConstTimeStepControl() override;
diff --git a/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_impl.hpp b/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_impl.hpp
index 0e7d8496befa..142c007f06ac 100644
--- a/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_impl.hpp
+++ b/packages/tempus/src/Tempus_IntegratorPseudoTransientForwardSensitivity_impl.hpp
@@ -176,6 +176,14 @@ getSolutionHistory() const
   return solutionHistory_;
 }
 
+template<class Scalar>
+Teuchos::RCP<SolutionHistory<Scalar> >
+IntegratorPseudoTransientForwardSensitivity<Scalar>::
+getNonConstSolutionHistory()
+{
+  return solutionHistory_;
+}
+
 template<class Scalar>
 Teuchos::RCP<const TimeStepControl<Scalar> >
 IntegratorPseudoTransientForwardSensitivity<Scalar>::
@@ -343,14 +351,16 @@ template<class Scalar>
 void
 IntegratorPseudoTransientForwardSensitivity<Scalar>::
 describe(
-  Teuchos::FancyOStream          &in_out,
+  Teuchos::FancyOStream          &out,
   const Teuchos::EVerbosityLevel verbLevel) const
 {
-  auto out = Teuchos::fancyOStream( in_out.getOStream() );
-  out->setOutputToRootOnly(0);
-  *out << description() << "::describe" << std::endl;
-  state_integrator_->describe(in_out, verbLevel);
-  sens_integrator_->describe(in_out, verbLevel);
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
+
+  *l_out << description() << "::describe" << std::endl;
+  state_integrator_->describe(*l_out, verbLevel);
+  sens_integrator_->describe(*l_out, verbLevel);
 }
 
 template<class Scalar>
diff --git a/packages/tempus/src/Tempus_InterpolatorLagrange_decl.hpp b/packages/tempus/src/Tempus_InterpolatorLagrange_decl.hpp
index 11ec213225f7..41bfec9f8223 100644
--- a/packages/tempus/src/Tempus_InterpolatorLagrange_decl.hpp
+++ b/packages/tempus/src/Tempus_InterpolatorLagrange_decl.hpp
@@ -48,7 +48,10 @@ class InterpolatorLagrange : virtual public Interpolator<Scalar>
   std::string description() const { return "Tempus::InterpolatorLagrange"; }
   void describe(Teuchos::FancyOStream &out,
                 const Teuchos::EVerbosityLevel /* verbLevel */) const
-  { out << description() << "::describe" << std::endl; }
+  {
+    out.setOutputToRootOnly(0);
+    out << description() << "::describe" << std::endl;
+  }
   //@}
 
   /// \name Overridden from Teuchos::ParameterListAcceptor
diff --git a/packages/tempus/src/Tempus_PhysicsState_impl.hpp b/packages/tempus/src/Tempus_PhysicsState_impl.hpp
index 19da288f686e..5a06fbd1b054 100644
--- a/packages/tempus/src/Tempus_PhysicsState_impl.hpp
+++ b/packages/tempus/src/Tempus_PhysicsState_impl.hpp
@@ -55,18 +55,19 @@ void PhysicsState<Scalar>::setName(std::string pN)
 template<class Scalar>
 std::string PhysicsState<Scalar>::description() const
 {
-  return physicsName_;
+  return "Tempus::PhysicsState - '" + physicsName_ + "'";
 }
 
 template<class Scalar>
 void PhysicsState<Scalar>::describe(
-  Teuchos::FancyOStream        & in_out,
+  Teuchos::FancyOStream        & out,
   const Teuchos::EVerbosityLevel /* verbLevel */) const
 {
-  auto out = Teuchos::fancyOStream( in_out.getOStream() );
-  out->setOutputToRootOnly(0);
-  *out << description() << "::describe" << std::endl
-      << "  physicsName   = " << physicsName_ << std::endl;
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
+
+  *l_out << "\n--- " << this->description() << " ---" << std::endl;
 }
 
 
diff --git a/packages/tempus/src/Tempus_RKButcherTableau.hpp b/packages/tempus/src/Tempus_RKButcherTableau.hpp
index e89578b822a5..c8dc901fcfc6 100644
--- a/packages/tempus/src/Tempus_RKButcherTableau.hpp
+++ b/packages/tempus/src/Tempus_RKButcherTableau.hpp
@@ -158,6 +158,8 @@ class RKButcherTableau :
       virtual void describe( Teuchos::FancyOStream &out,
                              const Teuchos::EVerbosityLevel verbLevel) const
       {
+        out.setOutputToRootOnly(0);
+
         if (verbLevel != Teuchos::VERB_NONE) {
           out << this->description() << std::endl;
           out << "number of Stages = " << this->numStages() << std::endl;
diff --git a/packages/tempus/src/Tempus_SolutionHistory.cpp b/packages/tempus/src/Tempus_SolutionHistory.cpp
index 47a554bc684f..7bb6e1e3b6ab 100644
--- a/packages/tempus/src/Tempus_SolutionHistory.cpp
+++ b/packages/tempus/src/Tempus_SolutionHistory.cpp
@@ -16,10 +16,13 @@ namespace Tempus {
 
   TEMPUS_INSTANTIATE_TEMPLATE_CLASS(SolutionHistory)
 
+  // Nonmember constructor (no arguments), instantiated for double
+  template Teuchos::RCP<SolutionHistory<double> >
+  createSolutionHistory();
+
   // Nonmember constructor from a ParameterList
   template Teuchos::RCP<SolutionHistory<double> >
-  createSolutionHistoryPL(
-    Teuchos::RCP<Teuchos::ParameterList> pList);
+  createSolutionHistoryPL(Teuchos::RCP<Teuchos::ParameterList> pList);
 
   // Nonmember contructor from a SolutionState.
   template Teuchos::RCP<SolutionHistory<double> >
diff --git a/packages/tempus/src/Tempus_SolutionHistory_decl.hpp b/packages/tempus/src/Tempus_SolutionHistory_decl.hpp
index b4fa8f88100f..bc911aa2cf0f 100644
--- a/packages/tempus/src/Tempus_SolutionHistory_decl.hpp
+++ b/packages/tempus/src/Tempus_SolutionHistory_decl.hpp
@@ -347,6 +347,11 @@ class SolutionHistory
 };
 
 
+/// Nonmember constructor
+template<class Scalar>
+Teuchos::RCP<SolutionHistory<Scalar> >
+createSolutionHistory();
+
 /// Nonmember constructor from a ParameterList
 template<class Scalar>
 Teuchos::RCP<SolutionHistory<Scalar> >
diff --git a/packages/tempus/src/Tempus_SolutionHistory_impl.hpp b/packages/tempus/src/Tempus_SolutionHistory_impl.hpp
index d6929af6ecb5..fcb3693627bf 100644
--- a/packages/tempus/src/Tempus_SolutionHistory_impl.hpp
+++ b/packages/tempus/src/Tempus_SolutionHistory_impl.hpp
@@ -158,10 +158,10 @@ void SolutionHistory<Scalar>::removeState(
       if (state->getTime() == (*state_it)->getTime()) break;
     }
 
-    TEUCHOS_TEST_FOR_EXCEPTION(state_it == history_->rend(), std::logic_error,
+    TEUCHOS_TEST_FOR_EXCEPTION(
+      state_it == history_->rend(), std::logic_error,
       "Error - removeState() Could not remove state = "
-      // << state_it->describe()
-      );
+       << (*state_it)->description());
 
     // Need to be careful when erasing a reverse iterator.
     history_->erase(std::next(state_it).base());
@@ -394,10 +394,10 @@ SolutionHistory<Scalar>::getStateTimeIndexN(bool warn) const
   const int m = history_->size();
   if ( m < 1 ) {
     if ( warn ) {
-       Teuchos::RCP<Teuchos::FancyOStream> out = this->getOStream();
-       Teuchos::OSTab ostab(out,1,"SolutionHistory::getStateTimeIndexN");
-       *out << "Warning - getStateTimeIndexN() No states in SolutionHistory!"
-            << std::endl;
+      Teuchos::RCP<Teuchos::FancyOStream> out = this->getOStream();
+      Teuchos::OSTab ostab(out,1,"SolutionHistory::getStateTimeIndexN");
+      *out << "Warning - getStateTimeIndexN() No states in SolutionHistory!"
+           << std::endl;
     }
   } else {
     state = (*history_)[m-1];
@@ -514,7 +514,7 @@ SolutionHistory<Scalar>::getStateTimeIndex(int index, bool warn) const
 template<class Scalar>
 std::string SolutionHistory<Scalar>::description() const
 {
-  return ("Tempus::SolutionHistory - name = '" + name_ + "'");
+  return ("Tempus::SolutionHistory - '" + name_ + "'");
 }
 
 
@@ -523,24 +523,30 @@ void SolutionHistory<Scalar>::describe(
   Teuchos::FancyOStream          &out,
   const Teuchos::EVerbosityLevel verbLevel) const
 {
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
+
+  *l_out << "\n--- " << this->description() << " ---" << std::endl;
+
   if ((Teuchos::as<int>(verbLevel)==Teuchos::as<int>(Teuchos::VERB_DEFAULT)) ||
       (Teuchos::as<int>(verbLevel)>=Teuchos::as<int>(Teuchos::VERB_LOW)    )  ){
-    out << description() << std::endl;
-    //out << "interpolator     = " << interpolator->description() << std::endl;
-    out << "storageLimit     = " << storageLimit_ << std::endl;
-    out << "storageType      = " << getStorageTypeString() << std::endl;
-    out << "number of states = " << history_->size() << std::endl;
-    out << "time range       = (" << history_->front()->getTime() << ", "
-                                  << history_->back()->getTime() << ")"
-                                  << std::endl;
+    //*l_out << "  interpolator     = " << interpolator->description() << std::endl;
+    *l_out << "  storageLimit     = " << storageLimit_ << std::endl;
+    *l_out << "  storageType      = " << getStorageTypeString() << std::endl;
+    *l_out << "  number of states = " << history_->size() << std::endl;
+    if ( history_->size() > 0 ) {
+      *l_out<<"  time range       = (" << history_->front()->getTime() << ", "
+                                       << history_->back()->getTime() << ")"
+                                       << std::endl;
+    }
   }
 
   if (Teuchos::as<int>(verbLevel) >= Teuchos::as<int>(Teuchos::VERB_MEDIUM)) {
-    for (int i=0; i<(int)history_->size() ; ++i) {
-      out << "SolutionState[" << i << "] -- ";
-      (*history_)[i]->describe(out, verbLevel);
-    }
+    for (int i=0; i<(int)history_->size() ; ++i)
+      (*history_)[i]->describe(*l_out, verbLevel);
   }
+  *l_out << std::string(this->description().length()+8, '-') << std::endl;
 }
 
 
@@ -682,11 +688,22 @@ void SolutionHistory<Scalar>::initialize() const
 // Nonmember constructors.
 // ------------------------------------------------------------------------
 
+template<class Scalar>
+Teuchos::RCP<SolutionHistory<Scalar> > createSolutionHistory()
+{
+  auto sh = rcp(new SolutionHistory<Scalar>());
+  sh->setName("From createSolutionHistory");
+
+  return sh;
+}
+
+
 template<class Scalar>
 Teuchos::RCP<SolutionHistory<Scalar> > createSolutionHistoryPL(
   Teuchos::RCP<Teuchos::ParameterList> pl)
 {
   auto sh = rcp(new SolutionHistory<Scalar>());
+  sh->setName("From createSolutionHistoryPL");
 
   if (pl == Teuchos::null) return sh;  // Return default SolutionHistory.
 
diff --git a/packages/tempus/src/Tempus_SolutionStateMetaData_impl.hpp b/packages/tempus/src/Tempus_SolutionStateMetaData_impl.hpp
index 71ee89134856..22c8b487ec62 100644
--- a/packages/tempus/src/Tempus_SolutionStateMetaData_impl.hpp
+++ b/packages/tempus/src/Tempus_SolutionStateMetaData_impl.hpp
@@ -181,32 +181,36 @@ void SolutionStateMetaData<Scalar>::describe(
    Teuchos::FancyOStream               &out,
    const Teuchos::EVerbosityLevel      verbLevel) const
 {
-  if (verbLevel == Teuchos::VERB_EXTREME) {
-    auto l_out = Teuchos::fancyOStream( out.getOStream() );
-    l_out->setOutputToRootOnly(0);
-    *l_out << description() << "::describe:" << std::endl
-           << "time           = " << time_ << std::endl
-           << "iStep          = " << iStep_ << std::endl
-           << "dt             = " << dt_ << std::endl
-           << "errorAbs       = " << errorAbs_ << std::endl
-           << "errorRel       = " << errorRel_ << std::endl
-           << "order          = " << order_ << std::endl
-           << "nFailures      = " << nFailures_ << std::endl
-           << "nRunningFailures = " << nRunningFailures_<< std::endl
-           << "nConsecutiveFailures = " << nConsecutiveFailures_ << std::endl
-           << "tolRel         = " << tolRel_ << std::endl
-           << "tolAbs         = " << tolAbs_ << std::endl
-           << "xNormL2        = " << xNormL2_ << std::endl
-           << "dxNormL2Rel    = " << dxNormL2Rel_ << std::endl
-           << "dxNormL2Abs    = " << dxNormL2Abs_ << std::endl
-           << "computeNorms   = " << computeNorms_ << std::endl
-           << "solutionStatus = " << toString(solutionStatus_) << std::endl
-           << "output         = " << output_ << std::endl
-           << "outputScreen   = " << outputScreen_ << std::endl
-           << "isSynced       = " << isSynced_ << std::endl
-           << "isInterpolated = " << isInterpolated_ << std::endl
-           << "accuracy       = " << accuracy_ << std::endl;
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
+
+  *l_out << "\n--- " << this->description() << " ---" <<std::endl;
+
+  if (verbLevel >= Teuchos::VERB_MEDIUM) {
+    *l_out << "  time           = " << time_ << std::endl
+           << "  iStep          = " << iStep_ << std::endl
+           << "  dt             = " << dt_ << std::endl
+           << "  errorAbs       = " << errorAbs_ << std::endl
+           << "  errorRel       = " << errorRel_ << std::endl
+           << "  order          = " << order_ << std::endl
+           << "  nFailures      = " << nFailures_ << std::endl
+           << "  nRunningFailures = " << nRunningFailures_<< std::endl
+           << "  nConsecutiveFailures = " << nConsecutiveFailures_ << std::endl
+           << "  tolRel         = " << tolRel_ << std::endl
+           << "  tolAbs         = " << tolAbs_ << std::endl
+           << "  xNormL2        = " << xNormL2_ << std::endl
+           << "  dxNormL2Rel    = " << dxNormL2Rel_ << std::endl
+           << "  dxNormL2Abs    = " << dxNormL2Abs_ << std::endl
+           << "  computeNorms   = " << computeNorms_ << std::endl
+           << "  solutionStatus = " << toString(solutionStatus_) << std::endl
+           << "  output         = " << output_ << std::endl
+           << "  outputScreen   = " << outputScreen_ << std::endl
+           << "  isSynced       = " << isSynced_ << std::endl
+           << "  isInterpolated = " << isInterpolated_ << std::endl
+           << "  accuracy       = " << accuracy_ << std::endl;
   }
+  *l_out << std::string(this->description().length()+8, '-') <<std::endl;
 }
 
 } // namespace Tempus
diff --git a/packages/tempus/src/Tempus_SolutionState_impl.hpp b/packages/tempus/src/Tempus_SolutionState_impl.hpp
index f1fc53cde6c5..d25a05e0120a 100644
--- a/packages/tempus/src/Tempus_SolutionState_impl.hpp
+++ b/packages/tempus/src/Tempus_SolutionState_impl.hpp
@@ -399,8 +399,13 @@ bool SolutionState<Scalar>::operator== (const Scalar& t) const
 template<class Scalar>
 std::string SolutionState<Scalar>::description() const
 {
-  std::string name = "Tempus::SolutionState";
-  return (name);
+  std::ostringstream out;
+  out << "SolutionState"
+      << " (index =" <<std::setw(6)<< this->getIndex()
+      << "; time =" <<std::setw(10)<<std::setprecision(3)<<this->getTime()
+      << "; dt ="   <<std::setw(10)<<std::setprecision(3)<<this->getTimeStep()
+      << ")";
+  return out.str();
 }
 
 template<class Scalar>
@@ -408,35 +413,33 @@ void SolutionState<Scalar>::describe(
    Teuchos::FancyOStream               &out,
    const Teuchos::EVerbosityLevel      verbLevel) const
 {
-  if (verbLevel == Teuchos::VERB_MEDIUM) {
-    out << "(index =" <<std::setw(6)<< this->getIndex()
-        << "; time =" <<std::setw(10)<<std::setprecision(3)<<this->getTime()
-        << "; dt   =" <<std::setw(10)<<std::setprecision(3)<<this->getTimeStep()
-        << ")" << std::endl;
-  }
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
+
+  *l_out << "\n--- " << this->description() << " ---" << std::endl;
+
+  if (Teuchos::as<int>(verbLevel) >= Teuchos::as<int>(Teuchos::VERB_EXTREME)) {
+
+    metaData_->describe(*l_out,verbLevel);
+    *l_out << "  x       = " << std::endl;
+    x_->describe(*l_out,verbLevel);
 
-  if (verbLevel == Teuchos::VERB_EXTREME) {
-    out << description() << "::describe:" << std::endl
-        << "metaData = " << std::endl;
-        metaData_->describe(out,verbLevel);
-    out << "x = " << std::endl;
-    x_->describe(out,verbLevel);
     if (xdot_ != Teuchos::null) {
-      out << "xdot_ = " << std::endl;
-      xdot_->describe(out,verbLevel);
+      *l_out << "  xdot_   = " << std::endl;
+      xdot_->describe(*l_out,verbLevel);
     }
     if (xdotdot_ != Teuchos::null) {
-      out << "xdotdot = " << std::endl;
-      xdotdot_->describe(out,verbLevel);
-    }
-    if (stepperState_ != Teuchos::null) {
-      out << "stepperState = " << std::endl;
-      stepperState_->describe(out,verbLevel);
-    }
-    if (physicsState_ != Teuchos::null) {
-      out << "physicsState = " << std::endl;
-      physicsState_->describe(out,verbLevel);
+      *l_out << "  xdotdot = " << std::endl;
+      xdotdot_->describe(*l_out,verbLevel);
     }
+
+    if (stepperState_ != Teuchos::null)
+      stepperState_->describe(*l_out,verbLevel);
+    if (physicsState_ != Teuchos::null)
+      physicsState_->describe(*l_out,verbLevel);
+
+    *l_out << std::string(this->description().length()+8, '-') <<std::endl;
   }
 }
 
diff --git a/packages/tempus/src/Tempus_StepperBDF2_decl.hpp b/packages/tempus/src/Tempus_StepperBDF2_decl.hpp
index c2e57accc11f..1a7018f6b177 100644
--- a/packages/tempus/src/Tempus_StepperBDF2_decl.hpp
+++ b/packages/tempus/src/Tempus_StepperBDF2_decl.hpp
@@ -63,7 +63,7 @@ namespace Tempus {
  *  The startup stepper allows BDF2 to use user-specified Stepper for the
  *  first timestep in order to populate the SolutionHistory with past states.
  *  A one-step startup stepper is perfect for this situation, e.g., Backward
- *  Euler or RK4.  The default startup stepper is 'IRK 1 Stage Theta Method',
+ *  Euler or RK4.  The default startup stepper is 'DIRK 1 Stage Theta Method',
  *  which is second order accurate and allows an overall second-order solution.
  *
  *  The First-Same-As-Last (FSAL) principle is not needed for BDF2.
diff --git a/packages/tempus/src/Tempus_StepperBDF2_impl.hpp b/packages/tempus/src/Tempus_StepperBDF2_impl.hpp
index 74884899469a..209e91cbcd9d 100644
--- a/packages/tempus/src/Tempus_StepperBDF2_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperBDF2_impl.hpp
@@ -66,6 +66,7 @@ void StepperBDF2<Scalar>::setModel(
   const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel)
 {
   StepperImplicit<Scalar>::setModel(appModel);
+  // If the startUpStepper's model is not set, set it to the stepper model.
   if (startUpStepper_->getModel() == Teuchos::null) {
     startUpStepper_->setModel(appModel);
     startUpStepper_->initialize();
@@ -275,6 +276,7 @@ void StepperBDF2<Scalar>::describe(
   const Teuchos::EVerbosityLevel      verbLevel ) const
 {
   auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
   l_out->setOutputToRootOnly(0);
   *l_out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
@@ -340,13 +342,13 @@ createStepperBDF2(
   auto stepper = Teuchos::rcp(new StepperBDF2<Scalar>());
   stepper->setStepperImplicitValues(pl);
 
+  std::string startUpStepperName = "DIRK 1 Stage Theta Method";
+  if (pl != Teuchos::null) startUpStepperName =
+    pl->get<std::string>("Start Up Stepper Type", startUpStepperName);
+  stepper->setStartUpStepper(startUpStepperName);
+
   if (model != Teuchos::null) {
     stepper->setModel(model);
-
-    std::string startUpStepperName = "DIRK 1 Stage Theta Method";
-    if (pl != Teuchos::null) startUpStepperName =
-      pl->get<std::string>("Start Up Stepper Type", startUpStepperName);
-    stepper->setStartUpStepper(startUpStepperName);
     stepper->initialize();
   }
 
diff --git a/packages/tempus/src/Tempus_StepperBackwardEuler_decl.hpp b/packages/tempus/src/Tempus_StepperBackwardEuler_decl.hpp
index ef8b500f3aec..fdb3cfa8e0f9 100644
--- a/packages/tempus/src/Tempus_StepperBackwardEuler_decl.hpp
+++ b/packages/tempus/src/Tempus_StepperBackwardEuler_decl.hpp
@@ -108,76 +108,81 @@ class StepperBackwardEuler :
     void setPredictor(std::string predictorType = "None");
     void setPredictor(Teuchos::RCP<Stepper<Scalar> > predictorStepper);
 
+    /// Set the model
+    virtual void setModel(
+      const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel) override;
+
     /// Set the initial conditions and make them consistent.
     virtual void setInitialConditions (
-      const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory);
+      const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory) override;
 
     /// Take the specified timestep, dt, and return true if successful.
     virtual void takeStep(
-      const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory);
+      const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory) override;
 
     /// Get a default (initial) StepperState
-    virtual Teuchos::RCP<Tempus::StepperState<Scalar> > getDefaultStepperState();
-    virtual Scalar getOrder() const {return 1.0;}
-    virtual Scalar getOrderMin() const {return 1.0;}
-    virtual Scalar getOrderMax() const {return 1.0;}
-
-    virtual bool isExplicit()         const {return false;}
-    virtual bool isImplicit()         const {return true;}
-    virtual bool isExplicitImplicit() const
+    virtual Teuchos::RCP<Tempus::StepperState<Scalar> > getDefaultStepperState() override;
+    virtual Scalar getOrder() const override {return 1.0;}
+    virtual Scalar getOrderMin() const override {return 1.0;}
+    virtual Scalar getOrderMax() const override {return 1.0;}
+
+    virtual bool isExplicit()         const override {return false;}
+    virtual bool isImplicit()         const override {return true;}
+    virtual bool isExplicitImplicit() const override
       {return isExplicit() && isImplicit();}
-    virtual bool isOneStepMethod()   const {return true;}
-    virtual bool isMultiStepMethod() const {return !isOneStepMethod();}
-    virtual OrderODE getOrderODE()   const {return FIRST_ORDER_ODE;}
+    virtual bool isOneStepMethod()   const override {return true;}
+    virtual bool isMultiStepMethod() const override {return !isOneStepMethod();}
+    virtual OrderODE getOrderODE()   const override {return FIRST_ORDER_ODE;}
   //@}
 
     /// Return alpha = d(xDot)/dx.
-  virtual Scalar getAlpha(const Scalar dt) const { return Scalar(1.0)/dt; }
+  virtual Scalar getAlpha(const Scalar dt) const override { return Scalar(1.0)/dt; }
   /// Return beta  = d(x)/dx.
-  virtual Scalar getBeta (const Scalar   ) const { return Scalar(1.0); }
+  virtual Scalar getBeta (const Scalar   ) const override { return Scalar(1.0); }
 
   /// Compute predictor given the supplied stepper
   virtual void computePredictor(
     const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory);
 
-  Teuchos::RCP<const Teuchos::ParameterList> getValidParameters() const;
+  /// Return a valid ParameterList with current settings.
+  Teuchos::RCP<const Teuchos::ParameterList> getValidParameters() const override;
 
   /// \name Overridden from Teuchos::Describable
   //@{
     virtual void describe(Teuchos::FancyOStream        & out,
-                          const Teuchos::EVerbosityLevel verbLevel) const;
+                          const Teuchos::EVerbosityLevel verbLevel) const override;
   //@}
 
-  virtual bool isValidSetup(Teuchos::FancyOStream & out) const;
+  virtual bool isValidSetup(Teuchos::FancyOStream & out) const override;
 
   /// \name Implementation of StepperOptimizationInterface
   //@{
-    virtual int stencilLength() const;
+    virtual int stencilLength() const override;
     virtual void computeStepResidual(
       Thyra::VectorBase<Scalar>& residual,
       const Teuchos::Array< Teuchos::RCP<const Thyra::VectorBase<Scalar> > >& x,
       const Teuchos::Array<Scalar>& t,
       const Thyra::VectorBase<Scalar>& p,
-      const int param_index) const;
+      const int param_index) const override;
     virtual void computeStepJacobian(
       Thyra::LinearOpBase<Scalar>& jacobian,
       const Teuchos::Array< Teuchos::RCP<const Thyra::VectorBase<Scalar> > >& x,
       const Teuchos::Array<Scalar>& t,
       const Thyra::VectorBase<Scalar>& p,
       const int param_index,
-      const int deriv_index) const;
+      const int deriv_index) const override;
     virtual void computeStepParamDeriv(
       Thyra::LinearOpBase<Scalar>& deriv,
       const Teuchos::Array< Teuchos::RCP<const Thyra::VectorBase<Scalar> > >& x,
       const Teuchos::Array<Scalar>& t,
       const Thyra::VectorBase<Scalar>& p,
-      const int param_index) const;
+      const int param_index) const override;
     virtual void computeStepSolver(
       Thyra::LinearOpWithSolveBase<Scalar>& jacobian_solver,
       const Teuchos::Array< Teuchos::RCP<const Thyra::VectorBase<Scalar> > >& x,
       const Teuchos::Array<Scalar>& t,
       const Thyra::VectorBase<Scalar>& p,
-      const int param_index) const;
+      const int param_index) const override;
   //@}
 
 private:
diff --git a/packages/tempus/src/Tempus_StepperBackwardEuler_impl.hpp b/packages/tempus/src/Tempus_StepperBackwardEuler_impl.hpp
index b1dd127a2fd0..a9f5f792bb2e 100644
--- a/packages/tempus/src/Tempus_StepperBackwardEuler_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperBackwardEuler_impl.hpp
@@ -120,6 +120,24 @@ void StepperBackwardEuler<Scalar>::setAppAction(
 }
 
 
+template<class Scalar>
+void StepperBackwardEuler<Scalar>::setModel(
+  const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel)
+{
+  StepperImplicit<Scalar>::setModel(appModel);
+
+  if (predictorStepper_ != Teuchos::null) {
+    // If predictor's model is not set, set it to the stepper model.
+    if (predictorStepper_->getModel() == Teuchos::null) {
+      predictorStepper_->setModel(appModel);
+      predictorStepper_->initialize();
+    }
+  }
+
+  this->isInitialized_ = false;
+}
+
+
 template<class Scalar>
 void StepperBackwardEuler<Scalar>::setInitialConditions(
   const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory)
@@ -247,6 +265,7 @@ void StepperBackwardEuler<Scalar>::describe(
   Teuchos::FancyOStream               &out,
   const Teuchos::EVerbosityLevel      verbLevel) const
 {
+  out.setOutputToRootOnly(0);
   out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
   StepperImplicit<Scalar>::describe(out, verbLevel);
@@ -269,6 +288,7 @@ void StepperBackwardEuler<Scalar>::describe(
 template<class Scalar>
 bool StepperBackwardEuler<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
@@ -438,14 +458,14 @@ createStepperBackwardEuler(
 
   stepper->setStepperImplicitValues(pl);
 
+  if (pl != Teuchos::null) {
+    std::string predictorName =
+      pl->get<std::string>("Predictor Stepper Type", "None");
+    stepper->setPredictor(predictorName);
+  }
+
   if (model != Teuchos::null) {
     stepper->setModel(model);
-
-    if (pl != Teuchos::null) {
-      std::string predictorName =
-        pl->get<std::string>("Predictor Stepper Type", "None");
-      stepper->setPredictor(predictorName);
-    }
     stepper->initialize();
   }
 
diff --git a/packages/tempus/src/Tempus_StepperDIRK_decl.hpp b/packages/tempus/src/Tempus_StepperDIRK_decl.hpp
index d3098d4a7d85..6658a66c789e 100644
--- a/packages/tempus/src/Tempus_StepperDIRK_decl.hpp
+++ b/packages/tempus/src/Tempus_StepperDIRK_decl.hpp
@@ -161,11 +161,15 @@ class StepperDIRK : virtual public Tempus::StepperImplicit<Scalar>,
   /// \name Basic stepper methods
   //@{
     /// Initialize after construction and changing input parameters.
-    virtual void initialize();
+    virtual void initialize() override;
+
+    /// Set the model
+    virtual void setModel(
+      const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel) override;
 
     /// Set the initial conditions and make them consistent.
     virtual void setInitialConditions (
-      const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory);
+      const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory) override;
 
     /// Set parameter so that the initial guess is reset at the beginning of each timestep.
     virtual void setResetInitialGuess(bool reset_guess)
@@ -175,26 +179,26 @@ class StepperDIRK : virtual public Tempus::StepperImplicit<Scalar>,
 
     /// Take the specified timestep, dt, and return true if successful.
     virtual void takeStep(
-      const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory);
+      const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory) override;
 
     /// Get a default (initial) StepperState
-    virtual Teuchos::RCP<Tempus::StepperState<Scalar> >getDefaultStepperState();
+    virtual Teuchos::RCP<Tempus::StepperState<Scalar> >getDefaultStepperState() override;
 
-    virtual bool isExplicit() const
+    virtual bool isExplicit() const override
     {
       const int numStages = this->tableau_->numStages();
       Teuchos::SerialDenseMatrix<int,Scalar> A = this->tableau_->A();
       bool isExplicit = false;
       for (int i=0; i<numStages; ++i) if (A(i,i) == 0.0) isExplicit = true;
-      return isExplicit;
+      return isExplicit && this->tableau_->isDIRK();
     }
-    virtual bool isImplicit()         const {return true;}
-    virtual bool isExplicitImplicit() const
+    virtual bool isImplicit()         const override {return true;}
+    virtual bool isExplicitImplicit() const override
       {return isExplicit() && isImplicit();}
-    virtual bool isOneStepMethod()   const {return true;}
-    virtual bool isMultiStepMethod() const {return !isOneStepMethod();}
+    virtual bool isOneStepMethod()   const override {return true;}
+    virtual bool isMultiStepMethod() const override {return !isOneStepMethod();}
 
-    virtual OrderODE getOrderODE()   const {return FIRST_ORDER_ODE;}
+    virtual OrderODE getOrderODE()   const override {return FIRST_ORDER_ODE;}
 
     virtual std::string getDescription() const = 0;
   //@}
@@ -203,25 +207,38 @@ class StepperDIRK : virtual public Tempus::StepperImplicit<Scalar>,
   Teuchos::RCP<Thyra::VectorBase<Scalar> >& getXTilde() {return xTilde_;}
 
   /// Return alpha = d(xDot)/dx.
-  virtual Scalar getAlpha(const Scalar dt) const
+  virtual Scalar getAlpha(const Scalar dt) const override
   {
+    const int numStages = this->tableau_->numStages();
     const Teuchos::SerialDenseMatrix<int,Scalar> & A=this->tableau_->A();
-    return Scalar(1.0)/(dt*A(0,0));  // Getting the first diagonal coeff!
+    Scalar aii = A(0,0);
+    for (int i=0; i<numStages; ++i) {
+      // Use the first nonzero diagonal coefficient; stop as soon as it is found.
+      if (A(i,i) != 0.0) { aii = A(i,i); break; }
+    }
+    return (aii == 0.0) ? std::numeric_limits<Scalar>::infinity() : Scalar(1.0)/(dt*aii);
   }
   /// Return beta  = d(x)/dx.
-  virtual Scalar getBeta (const Scalar   ) const { return Scalar(1.0); }
+  virtual Scalar getBeta (const Scalar   ) const override { return Scalar(1.0); }
+
+  /// Return alpha = d(xDot)/dx for stage i.
+  virtual Scalar getAlpha(const Scalar dt, int i) const
+  {
+    const Teuchos::SerialDenseMatrix<int,Scalar> & A=this->tableau_->A();
+    return (A(i,i) == 0.0) ? std::numeric_limits<Scalar>::infinity() : Scalar(1.0)/(dt*A(i,i));
+  }
 
-  virtual Teuchos::RCP<const Teuchos::ParameterList> getValidParameters() const;
+  virtual Teuchos::RCP<const Teuchos::ParameterList> getValidParameters() const override;
 
   Teuchos::RCP<Teuchos::ParameterList> getValidParametersBasicDIRK() const;
 
   /// \name Overridden from Teuchos::Describable
   //@{
     virtual void describe(Teuchos::FancyOStream        & out,
-                          const Teuchos::EVerbosityLevel verbLevel) const;
+                          const Teuchos::EVerbosityLevel verbLevel) const override;
   //@}
 
-  virtual bool isValidSetup(Teuchos::FancyOStream & out) const;
+  virtual bool isValidSetup(Teuchos::FancyOStream & out) const override;
 
   /// Set StepperDIRK member data from the ParameterList.
   virtual void setStepperDIRKValues(Teuchos::RCP<Teuchos::ParameterList> pl)
@@ -258,6 +275,9 @@ class StepperDIRK : virtual public Tempus::StepperImplicit<Scalar>,
 
   virtual void setupTableau() = 0;
 
+  virtual void setEmbeddedMemory() override;
+
+
   std::vector<Teuchos::RCP<Thyra::VectorBase<Scalar> > > stageXDot_;
   Teuchos::RCP<Thyra::VectorBase<Scalar> >               xTilde_;
 
diff --git a/packages/tempus/src/Tempus_StepperDIRK_impl.hpp b/packages/tempus/src/Tempus_StepperDIRK_impl.hpp
index b9265ce89aa6..24656daf38e6 100644
--- a/packages/tempus/src/Tempus_StepperDIRK_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperDIRK_impl.hpp
@@ -76,24 +76,59 @@ StepperDIRK<Scalar>::getValidParametersBasicDIRK() const
 template<class Scalar>
 void StepperDIRK<Scalar>::initialize()
 {
-  // Initialize the stage vectors
+  TEUCHOS_TEST_FOR_EXCEPTION(
+    this->tableau_ == Teuchos::null, std::logic_error,
+    "Error - Need to set the tableau, before calling "
+    "StepperDIRK::initialize()\n");
+
+  TEUCHOS_TEST_FOR_EXCEPTION(
+    this->wrapperModel_==Teuchos::null, std::logic_error,
+    "Error - Need to set the model, setModel(), before calling "
+    "StepperDIRK::initialize()\n");
+
+  StepperImplicit<Scalar>::initialize();
+}
+
+
+template<class Scalar>
+void StepperDIRK<Scalar>::setModel(
+  const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel)
+{
+  StepperImplicit<Scalar>::setModel(appModel);
+
+  // Set the stage vectors
   const int numStages = this->tableau_->numStages();
   stageXDot_.resize(numStages);
   for (int i=0; i<numStages; ++i) {
     stageXDot_[i] = Thyra::createMember(this->wrapperModel_->get_f_space());
     assign(stageXDot_[i].ptr(), Teuchos::ScalarTraits<Scalar>::zero());
   }
-  xTilde_    = Thyra::createMember(this->wrapperModel_->get_x_space());
-  assign(xTilde_.ptr(),    Teuchos::ScalarTraits<Scalar>::zero());
+  xTilde_ = Thyra::createMember(this->wrapperModel_->get_x_space());
+  assign(xTilde_.ptr(), Teuchos::ScalarTraits<Scalar>::zero());
+
+  this->setEmbeddedMemory();
+
+  this->isInitialized_ = false;
+}
+
+
+template<class Scalar>
+void StepperDIRK<Scalar>::setEmbeddedMemory()
+{
+  if (this->getModel() == Teuchos::null)
+    return;  // Embedded memory will be set when setModel() is called.
 
   if (this->tableau_->isEmbedded() && this->getUseEmbedded()) {
     this->ee_    = Thyra::createMember(this->wrapperModel_->get_f_space());
     this->abs_u0 = Thyra::createMember(this->wrapperModel_->get_f_space());
     this->abs_u  = Thyra::createMember(this->wrapperModel_->get_f_space());
     this->sc     = Thyra::createMember(this->wrapperModel_->get_f_space());
+  } else {
+    this->ee_    = Teuchos::null;
+    this->abs_u0 = Teuchos::null;
+    this->abs_u  = Teuchos::null;
+    this->sc     = Teuchos::null;
   }
-
-  StepperImplicit<Scalar>::initialize();
 }
 
 
@@ -302,6 +337,7 @@ void StepperDIRK<Scalar>::describe(
   Teuchos::FancyOStream               &out,
   const Teuchos::EVerbosityLevel      verbLevel) const
 {
+  out.setOutputToRootOnly(0);
   out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
   StepperImplicit<Scalar>::describe(out, verbLevel);
@@ -328,6 +364,7 @@ void StepperDIRK<Scalar>::describe(
 template<class Scalar>
 bool StepperDIRK<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
diff --git a/packages/tempus/src/Tempus_StepperExplicitRK_decl.hpp b/packages/tempus/src/Tempus_StepperExplicitRK_decl.hpp
index 8f664f351f89..60dceefd8cd0 100644
--- a/packages/tempus/src/Tempus_StepperExplicitRK_decl.hpp
+++ b/packages/tempus/src/Tempus_StepperExplicitRK_decl.hpp
@@ -108,6 +108,10 @@ class StepperExplicitRK : virtual public Tempus::StepperExplicit<Scalar>,
     /// Initialize during construction and after changing input parameters.
     virtual void initialize();
 
+    /// Set model
+    virtual void setModel(
+      const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel);
+
     /// Set the initial conditions and make them consistent.
     virtual void setInitialConditions (
       const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory);
@@ -162,6 +166,8 @@ class StepperExplicitRK : virtual public Tempus::StepperExplicit<Scalar>,
 
   virtual void setupTableau() = 0;
 
+  virtual void setEmbeddedMemory();
+
 
   std::vector<Teuchos::RCP<Thyra::VectorBase<Scalar> > > stageXDot_;
 
diff --git a/packages/tempus/src/Tempus_StepperExplicitRK_impl.hpp b/packages/tempus/src/Tempus_StepperExplicitRK_impl.hpp
index fcfd33c0d7b8..cb6afef5a5e6 100644
--- a/packages/tempus/src/Tempus_StepperExplicitRK_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperExplicitRK_impl.hpp
@@ -167,7 +167,17 @@ void StepperExplicitRK<Scalar>::initialize()
     "Error - Need to set the model, setModel(), before calling "
     "StepperExplicitRK::initialize()\n");
 
-  // Initialize the stage vectors
+  Stepper<Scalar>::initialize();
+}
+
+
+template<class Scalar>
+void StepperExplicitRK<Scalar>::setModel(
+  const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel)
+{
+  StepperExplicit<Scalar>::setModel(appModel);
+
+  // Set the stage vectors
   int numStages = this->tableau_->numStages();
   stageXDot_.resize(numStages);
   for (int i=0; i<numStages; ++i) {
@@ -175,14 +185,29 @@ void StepperExplicitRK<Scalar>::initialize()
     assign(stageXDot_[i].ptr(), Teuchos::ScalarTraits<Scalar>::zero());
   }
 
+  this->setEmbeddedMemory();
+
+  this->isInitialized_ = false;
+}
+
+
+template<class Scalar>
+void StepperExplicitRK<Scalar>::setEmbeddedMemory()
+{
+  if (this->getModel() == Teuchos::null)
+    return;  // Embedded memory will be set when setModel() is called.
+
   if ( this->tableau_->isEmbedded() && this->getUseEmbedded() ){
-     this->ee_ = Thyra::createMember(this->appModel_->get_f_space());
-     this->abs_u0 = Thyra::createMember(this->appModel_->get_f_space());
-     this->abs_u = Thyra::createMember(this->appModel_->get_f_space());
-     this->sc = Thyra::createMember(this->appModel_->get_f_space());
+    this->ee_    = Thyra::createMember(this->appModel_->get_f_space());
+    this->abs_u0 = Thyra::createMember(this->appModel_->get_f_space());
+    this->abs_u  = Thyra::createMember(this->appModel_->get_f_space());
+    this->sc     = Thyra::createMember(this->appModel_->get_f_space());
+  } else {
+    this->ee_    = Teuchos::null;
+    this->abs_u0 = Teuchos::null;
+    this->abs_u  = Teuchos::null;
+    this->sc     = Teuchos::null;
   }
-
-  Stepper<Scalar>::initialize();
 }
 
 
@@ -313,7 +338,7 @@ void StepperExplicitRK<Scalar>::takeStep(
       Teuchos::SerialDenseVector<int,Scalar> errWght = b ;
       errWght -= this->tableau_->bstar();
 
-      //compute local truncation error estimate: | u^{n+1} - \hat{u}^{n+1} |
+      // Compute local truncation error estimate: | u^{n+1} - \hat{u}^{n+1} |
       // Sum for solution: ee_n = Sum{ (b(i) - bstar(i)) * dt*f(i) }
       assign(this->ee_.ptr(), Teuchos::ScalarTraits<Scalar>::zero());
       for (int i=0; i < numStages; ++i) {
@@ -322,13 +347,13 @@ void StepperExplicitRK<Scalar>::takeStep(
          }
       }
 
-      // compute: Atol + max(|u^n|, |u^{n+1}| ) * Rtol
+      // Compute: Atol + max(|u^n|, |u^{n+1}| ) * Rtol
       Thyra::abs( *(currentState->getX()), this->abs_u0.ptr());
       Thyra::abs( *(workingState->getX()), this->abs_u.ptr());
       Thyra::pair_wise_max_update(tolRel, *this->abs_u0, this->abs_u.ptr());
       Thyra::add_scalar(tolAbs, this->abs_u.ptr());
 
-      //compute: || ee / sc ||
+      // Compute: || ee / sc ||
       assign(this->sc.ptr(), Teuchos::ScalarTraits<Scalar>::zero());
       Thyra::ele_wise_divide(Teuchos::as<Scalar>(1.0), *this->ee_, *this->abs_u,this->sc.ptr());
 
@@ -336,7 +361,7 @@ void StepperExplicitRK<Scalar>::takeStep(
       Scalar err = std::abs(Thyra::norm(*this->sc)) / space_dim ;
       workingState->setErrorRel(err);
 
-      // test if step should be rejected
+      // Test if step should be rejected
       if (std::isinf(err) || std::isnan(err) || err > Teuchos::as<Scalar>(1.0))
         workingState->setSolutionStatus(Status::FAILED);
     }
@@ -371,6 +396,7 @@ void StepperExplicitRK<Scalar>::describe(
   Teuchos::FancyOStream               &out,
   const Teuchos::EVerbosityLevel      verbLevel) const
 {
+  out.setOutputToRootOnly(0);
   out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
   StepperExplicit<Scalar>::describe(out, verbLevel);
@@ -396,6 +422,7 @@ void StepperExplicitRK<Scalar>::describe(
 template<class Scalar>
 bool StepperExplicitRK<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
diff --git a/packages/tempus/src/Tempus_StepperExplicit_decl.hpp b/packages/tempus/src/Tempus_StepperExplicit_decl.hpp
index a3a9a7efc76e..354b8a63b8a1 100644
--- a/packages/tempus/src/Tempus_StepperExplicit_decl.hpp
+++ b/packages/tempus/src/Tempus_StepperExplicit_decl.hpp
@@ -46,11 +46,13 @@ class StepperExplicit : virtual public Tempus::Stepper<Scalar>
 
   /// \name Basic explicit stepper methods
   //@{
+    /// Set the application ModelEvaluator.
     virtual void setModel(
       const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel);
 
+    /// Return the application ModelEvaluator.
     virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >
-      getModel(){return appModel_;}
+      getModel() const {return appModel_;}
 
     virtual Scalar getInitTimeStep(
         const Teuchos::RCP<SolutionHistory<Scalar> >& /* solutionHistory */) const
diff --git a/packages/tempus/src/Tempus_StepperExplicit_impl.hpp b/packages/tempus/src/Tempus_StepperExplicit_impl.hpp
index af7f2868a30c..fcf39e451320 100644
--- a/packages/tempus/src/Tempus_StepperExplicit_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperExplicit_impl.hpp
@@ -319,16 +319,21 @@ template<class Scalar>
 void StepperExplicit<Scalar>::describe(Teuchos::FancyOStream        & out,
                                const Teuchos::EVerbosityLevel verbLevel) const
 {
-  out << "--- StepperExplicit ---\n";
-  out << "  appModel_         = " << appModel_ << std::endl;
-  out << "  inArgs_           = " << inArgs_ << std::endl;
-  out << "  outArgs_          = " << outArgs_ << std::endl;
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
+
+  *l_out << "--- StepperExplicit ---\n"
+         << "  appModel_         = " << appModel_ << std::endl
+         << "  inArgs_           = " << inArgs_   << std::endl
+         << "  outArgs_          = " << outArgs_  << std::endl;
 }
 
 
 template<class Scalar>
 bool StepperExplicit<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if (appModel_ == Teuchos::null) {
diff --git a/packages/tempus/src/Tempus_StepperFactory_impl.hpp b/packages/tempus/src/Tempus_StepperFactory_impl.hpp
index 46568dd249c0..c6b29a299fa3 100644
--- a/packages/tempus/src/Tempus_StepperFactory_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperFactory_impl.hpp
@@ -198,6 +198,7 @@ createStepper(
   else {
     Teuchos::RCP<Teuchos::FancyOStream> out =
       Teuchos::VerboseObjectBase::getDefaultOStream();
+    out->setOutputToRootOnly(0);
     Teuchos::OSTab ostab(out,1,"StepperFactory::createStepper");
     *out
     << "Unknown Stepper Type!  ('"+stepperType+"').\n"
diff --git a/packages/tempus/src/Tempus_StepperForwardEuler_impl.hpp b/packages/tempus/src/Tempus_StepperForwardEuler_impl.hpp
index 917164a7a8ec..3ddefeb75b13 100644
--- a/packages/tempus/src/Tempus_StepperForwardEuler_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperForwardEuler_impl.hpp
@@ -185,18 +185,24 @@ void StepperForwardEuler<Scalar>::describe(
   Teuchos::FancyOStream               &out,
   const Teuchos::EVerbosityLevel      verbLevel) const
 {
-  out << std::endl;
-  Stepper<Scalar>::describe(out, verbLevel);
-  StepperExplicit<Scalar>::describe(out, verbLevel);
-  out << "  stepperFEAppAction_                = "
-      << stepperFEAppAction_ << std::endl;
-  out << "----------------------------" << std::endl;
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
+
+  *l_out << std::endl;
+  Stepper<Scalar>::describe(*l_out, verbLevel);
+  StepperExplicit<Scalar>::describe(*l_out, verbLevel);
+  *l_out << "  stepperFEAppAction_ = "
+         << stepperFEAppAction_ << std::endl
+         << "----------------------------" << std::endl;
 }
 
 
 template<class Scalar>
 bool StepperForwardEuler<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
+
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
diff --git a/packages/tempus/src/Tempus_StepperHHTAlpha_impl.hpp b/packages/tempus/src/Tempus_StepperHHTAlpha_impl.hpp
index ba1c317ada88..447e82e700ba 100644
--- a/packages/tempus/src/Tempus_StepperHHTAlpha_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperHHTAlpha_impl.hpp
@@ -488,17 +488,18 @@ void StepperHHTAlpha<Scalar>::describe(
    Teuchos::FancyOStream               &out,
    const Teuchos::EVerbosityLevel      verbLevel) const
 {
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
 
 #ifdef VERBOSE_DEBUG_OUTPUT
   *out_ << "DEBUG: " << __PRETTY_FUNCTION__ << "\n";
 #endif
 
-  out << std::endl;
-  Stepper<Scalar>::describe(out, verbLevel);
-  StepperImplicit<Scalar>::describe(out, verbLevel);
+  *l_out << std::endl;
+  Stepper<Scalar>::describe(*l_out, verbLevel);
+  StepperImplicit<Scalar>::describe(*l_out, verbLevel);
 
-  auto l_out = Teuchos::fancyOStream( out.getOStream() );
-  l_out->setOutputToRootOnly(0);
   *l_out << "--- StepperHHTAlpha ---\n";
   *l_out << "  schemeName_ = " << schemeName_ << std::endl;
   *l_out << "  beta_       = " << beta_       << std::endl;
@@ -512,6 +513,7 @@ void StepperHHTAlpha<Scalar>::describe(
 template<class Scalar>
 bool StepperHHTAlpha<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
diff --git a/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_decl.hpp b/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_decl.hpp
index 40dd3f51a884..81344b2cb50d 100644
--- a/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_decl.hpp
+++ b/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_decl.hpp
@@ -377,7 +377,7 @@ class StepperIMEX_RK_Partition : virtual public Tempus::StepperImplicit<Scalar>,
     virtual void setModel(
       const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel);
 
-    virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > getModel()
+    virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > getModel() const
      { return this->wrapperModel_; }
 
     virtual void setModelPair(
diff --git a/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_impl.hpp b/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_impl.hpp
index 361a32c6f1d4..0a8eae3d6f1d 100644
--- a/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperIMEX_RK_Partition_impl.hpp
@@ -788,6 +788,7 @@ void StepperIMEX_RK_Partition<Scalar>::describe(
    Teuchos::FancyOStream               &out,
    const Teuchos::EVerbosityLevel      verbLevel) const
 {
+  out.setOutputToRootOnly(0);
   out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
   StepperImplicit<Scalar>::describe(out, verbLevel);
@@ -817,6 +818,8 @@ void StepperIMEX_RK_Partition<Scalar>::describe(
 template<class Scalar>
 bool StepperIMEX_RK_Partition<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
+
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
diff --git a/packages/tempus/src/Tempus_StepperIMEX_RK_decl.hpp b/packages/tempus/src/Tempus_StepperIMEX_RK_decl.hpp
index 9d707971c285..b69b52c4b670 100644
--- a/packages/tempus/src/Tempus_StepperIMEX_RK_decl.hpp
+++ b/packages/tempus/src/Tempus_StepperIMEX_RK_decl.hpp
@@ -347,7 +347,7 @@ class StepperIMEX_RK : virtual public Tempus::StepperImplicit<Scalar>,
     virtual void setModel(
       const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel);
 
-    virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > getModel()
+    virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > getModel() const
      { return this->wrapperModel_; }
 
     virtual void setModelPair(
diff --git a/packages/tempus/src/Tempus_StepperIMEX_RK_impl.hpp b/packages/tempus/src/Tempus_StepperIMEX_RK_impl.hpp
index 1e88b3b7693f..1ac0de02f7c9 100644
--- a/packages/tempus/src/Tempus_StepperIMEX_RK_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperIMEX_RK_impl.hpp
@@ -859,6 +859,8 @@ void StepperIMEX_RK<Scalar>::describe(
    Teuchos::FancyOStream               &out,
    const Teuchos::EVerbosityLevel      verbLevel) const
 {
+  out.setOutputToRootOnly(0);
+
   out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
   StepperImplicit<Scalar>::describe(out, verbLevel);
@@ -888,6 +890,7 @@ void StepperIMEX_RK<Scalar>::describe(
 template<class Scalar>
 bool StepperIMEX_RK<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
diff --git a/packages/tempus/src/Tempus_StepperImplicit_decl.hpp b/packages/tempus/src/Tempus_StepperImplicit_decl.hpp
index 49ce7ef997ac..d95af0a98c98 100644
--- a/packages/tempus/src/Tempus_StepperImplicit_decl.hpp
+++ b/packages/tempus/src/Tempus_StepperImplicit_decl.hpp
@@ -231,10 +231,11 @@ class StepperImplicit : virtual public Tempus::Stepper<Scalar>
 
   /// \name Basic implicit stepper methods
   //@{
+    /// Set the model
     virtual void setModel(
-      const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel);
+      const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel) override;
 
-    virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > getModel()
+    virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > getModel() const override
     {
       Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > model;
       if (wrapperModel_ != Teuchos::null) model = wrapperModel_->getAppModel();
@@ -248,14 +249,14 @@ class StepperImplicit : virtual public Tempus::Stepper<Scalar>
 
     /// Set solver.
     virtual void setSolver(
-      Teuchos::RCP<Thyra::NonlinearSolverBase<Scalar> > solver);
+      Teuchos::RCP<Thyra::NonlinearSolverBase<Scalar> > solver) override;
 
-    virtual Teuchos::RCP<Thyra::NonlinearSolverBase<Scalar> > getSolver() const
+    virtual Teuchos::RCP<Thyra::NonlinearSolverBase<Scalar> > getSolver() const override
       { return solver_; }
 
     /// Set the initial conditions and make them consistent.
     virtual void setInitialConditions (
-      const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory);
+      const Teuchos::RCP<SolutionHistory<Scalar> >& solutionHistory) override;
 
     /// Return alpha = d(xDot)/dx.
     virtual Scalar getAlpha(const Scalar dt) const = 0;
@@ -284,7 +285,7 @@ class StepperImplicit : virtual public Tempus::Stepper<Scalar>
 
     /// Pass initial guess to Newton solver (only relevant for implicit solvers)
     virtual void setInitialGuess(
-      Teuchos::RCP<const Thyra::VectorBase<Scalar> > initialGuess)
+      Teuchos::RCP<const Thyra::VectorBase<Scalar> > initialGuess) override
     {
       initialGuess_ = initialGuess;
       this->isInitialized_ = false;
@@ -299,19 +300,19 @@ class StepperImplicit : virtual public Tempus::Stepper<Scalar>
     virtual bool getZeroInitialGuess() const { return zeroInitialGuess_; }
 
     virtual Scalar getInitTimeStep(
-      const Teuchos::RCP<SolutionHistory<Scalar> >& /* solutionHistory */) const
+      const Teuchos::RCP<SolutionHistory<Scalar> >& /* solutionHistory */) const override
     {return Scalar(1.0e+99);}
   //@}
 
   /// \name Overridden from Teuchos::Describable
   //@{
     virtual void describe(Teuchos::FancyOStream        & out,
-                          const Teuchos::EVerbosityLevel verbLevel) const;
+                          const Teuchos::EVerbosityLevel verbLevel) const override;
   //@}
 
-  virtual bool isValidSetup(Teuchos::FancyOStream & out) const;
+  virtual bool isValidSetup(Teuchos::FancyOStream & out) const override;
 
-  virtual Teuchos::RCP<const Teuchos::ParameterList> getValidParameters() const;
+  virtual Teuchos::RCP<const Teuchos::ParameterList> getValidParameters() const override;
 
   Teuchos::RCP<Teuchos::ParameterList> getValidParametersBasicImplicit() const;
 
diff --git a/packages/tempus/src/Tempus_StepperImplicit_impl.hpp b/packages/tempus/src/Tempus_StepperImplicit_impl.hpp
index d3efae855f47..3048735a90c0 100644
--- a/packages/tempus/src/Tempus_StepperImplicit_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperImplicit_impl.hpp
@@ -328,6 +328,7 @@ template<class Scalar>
 void StepperImplicit<Scalar>::describe(Teuchos::FancyOStream        & out,
                                const Teuchos::EVerbosityLevel verbLevel) const
 {
+  out.setOutputToRootOnly(0);
   out << "--- StepperImplicit ---\n";
   out << "  wrapperModel_     = " << wrapperModel_ << std::endl;
   out << "  solver_           = " << solver_ << std::endl;
@@ -340,6 +341,7 @@ void StepperImplicit<Scalar>::describe(Teuchos::FancyOStream        & out,
 template<class Scalar>
 bool StepperImplicit<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if (wrapperModel_->getAppModel() == Teuchos::null) {
diff --git a/packages/tempus/src/Tempus_StepperLeapfrog_impl.hpp b/packages/tempus/src/Tempus_StepperLeapfrog_impl.hpp
index 4e9cf5a19d65..3c4a8cf09e7b 100644
--- a/packages/tempus/src/Tempus_StepperLeapfrog_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperLeapfrog_impl.hpp
@@ -189,6 +189,7 @@ void StepperLeapfrog<Scalar>::describe(
   Teuchos::FancyOStream               &out,
   const Teuchos::EVerbosityLevel      verbLevel) const
 {
+  out.setOutputToRootOnly(0);
   out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
   StepperExplicit<Scalar>::describe(out, verbLevel);
@@ -203,6 +204,7 @@ void StepperLeapfrog<Scalar>::describe(
 template<class Scalar>
 bool StepperLeapfrog<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
diff --git a/packages/tempus/src/Tempus_StepperNewmarkExplicitAForm_impl.hpp b/packages/tempus/src/Tempus_StepperNewmarkExplicitAForm_impl.hpp
index 31ca61c79ea9..5ebe36dcdabc 100644
--- a/packages/tempus/src/Tempus_StepperNewmarkExplicitAForm_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperNewmarkExplicitAForm_impl.hpp
@@ -343,6 +343,7 @@ void StepperNewmarkExplicitAForm<Scalar>::describe(
    Teuchos::FancyOStream               &out,
    const Teuchos::EVerbosityLevel      verbLevel) const
 {
+  out.setOutputToRootOnly(0);
   out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
   StepperExplicit<Scalar>::describe(out, verbLevel);
@@ -356,6 +357,7 @@ void StepperNewmarkExplicitAForm<Scalar>::describe(
 template<class Scalar>
 bool StepperNewmarkExplicitAForm<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
diff --git a/packages/tempus/src/Tempus_StepperNewmarkImplicitAForm_impl.hpp b/packages/tempus/src/Tempus_StepperNewmarkImplicitAForm_impl.hpp
index 919314c459d4..8b076752e38d 100644
--- a/packages/tempus/src/Tempus_StepperNewmarkImplicitAForm_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperNewmarkImplicitAForm_impl.hpp
@@ -549,6 +549,7 @@ void StepperNewmarkImplicitAForm<Scalar>::describe(
 template<class Scalar>
 bool StepperNewmarkImplicitAForm<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  // Restrict output to the root rank once, before any diagnostics are written.
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
-  out.setOutputToRootOnly(0);
 
diff --git a/packages/tempus/src/Tempus_StepperNewmarkImplicitDForm_impl.hpp b/packages/tempus/src/Tempus_StepperNewmarkImplicitDForm_impl.hpp
index 96556c93af97..dc69d0fbdfa8 100644
--- a/packages/tempus/src/Tempus_StepperNewmarkImplicitDForm_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperNewmarkImplicitDForm_impl.hpp
@@ -455,6 +455,7 @@ StepperNewmarkImplicitDForm<Scalar>::describe(
   *out_ << "DEBUG: " << __PRETTY_FUNCTION__ << "\n";
 #endif
 
+  out.setOutputToRootOnly(0);
   out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
   StepperImplicit<Scalar>::describe(out, verbLevel);
@@ -470,6 +471,7 @@ StepperNewmarkImplicitDForm<Scalar>::describe(
 template<class Scalar>
 bool StepperNewmarkImplicitDForm<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
diff --git a/packages/tempus/src/Tempus_StepperOperatorSplit_decl.hpp b/packages/tempus/src/Tempus_StepperOperatorSplit_decl.hpp
index 5af4c130b3d9..e5f3c57c7e23 100644
--- a/packages/tempus/src/Tempus_StepperOperatorSplit_decl.hpp
+++ b/packages/tempus/src/Tempus_StepperOperatorSplit_decl.hpp
@@ -90,8 +90,7 @@ class StepperOperatorSplit : virtual public Tempus::Stepper<Scalar>
     virtual void setModel(
       const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel);
 
-    virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >
-      getModel();
+    virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > getModel() const;
 
     virtual void setSolver(
         Teuchos::RCP<Thyra::NonlinearSolverBase<Scalar> > solver);
diff --git a/packages/tempus/src/Tempus_StepperOperatorSplit_impl.hpp b/packages/tempus/src/Tempus_StepperOperatorSplit_impl.hpp
index ac3e68e711c9..cd6fef62fe57 100644
--- a/packages/tempus/src/Tempus_StepperOperatorSplit_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperOperatorSplit_impl.hpp
@@ -85,7 +85,7 @@ void StepperOperatorSplit<Scalar>::setModel(
 
 template<class Scalar>
 Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >
-StepperOperatorSplit<Scalar>::getModel()
+StepperOperatorSplit<Scalar>::getModel() const
 {
   Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > model;
   typename std::vector<Teuchos::RCP<Stepper<Scalar> > >::const_iterator
@@ -350,6 +350,7 @@ void StepperOperatorSplit<Scalar>::describe(
    Teuchos::FancyOStream               &out,
    const Teuchos::EVerbosityLevel      verbLevel) const
 {
+  out.setOutputToRootOnly(0);
   out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
 
@@ -375,6 +376,7 @@ void StepperOperatorSplit<Scalar>::describe(
 template<class Scalar>
 bool StepperOperatorSplit<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
diff --git a/packages/tempus/src/Tempus_StepperRKBase.hpp b/packages/tempus/src/Tempus_StepperRKBase.hpp
index 89c133b22af7..b393d608a20d 100644
--- a/packages/tempus/src/Tempus_StepperRKBase.hpp
+++ b/packages/tempus/src/Tempus_StepperRKBase.hpp
@@ -43,7 +43,13 @@ class StepperRKBase : virtual public Tempus::Stepper<Scalar>
   virtual int getStageNumber() const { return stageNumber_; }
   virtual void setStageNumber(int s) { stageNumber_ = s; }
 
-  virtual void setUseEmbedded(bool a) { useEmbedded_ = a; }
+  virtual void setUseEmbedded(bool a)
+  {
+    useEmbedded_ = a;
+    this->setEmbeddedMemory();
+    this->isInitialized_ = false;
+  }
+
   virtual bool getUseEmbedded() const { return useEmbedded_; }
 
   virtual void setAppAction(Teuchos::RCP<StepperRKAppAction<Scalar> > appAction)
@@ -181,6 +187,8 @@ class StepperRKBase : virtual public Tempus::Stepper<Scalar>
 
 protected:
 
+  virtual void setEmbeddedMemory() {}
+
   Teuchos::RCP<RKButcherTableau<Scalar> >   tableau_;
 
   // For Embedded RK
diff --git a/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_decl.hpp b/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_decl.hpp
index 20a86add290c..94432e9695ba 100644
--- a/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_decl.hpp
+++ b/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_decl.hpp
@@ -82,7 +82,7 @@ class StepperStaggeredForwardSensitivity :
   //@{
     virtual void setModel(
       const Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >& appModel);
-    virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > getModel();
+    virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > getModel() const;
 
     virtual void setSolver(
       Teuchos::RCP<Thyra::NonlinearSolverBase<Scalar> > solver = Teuchos::null);
diff --git a/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_impl.hpp b/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_impl.hpp
index e4f513d785bf..898b6e417804 100644
--- a/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperStaggeredForwardSensitivity_impl.hpp
@@ -89,7 +89,7 @@ setModel(
 template<class Scalar>
 Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >
 StepperStaggeredForwardSensitivity<Scalar>::
-getModel()
+getModel() const
 {
   return combined_fsa_model_;
 }
@@ -274,6 +274,7 @@ describe(
    Teuchos::FancyOStream               &out,
    const Teuchos::EVerbosityLevel      verbLevel) const
 {
+  out.setOutputToRootOnly(0);
   out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
 
@@ -296,6 +297,7 @@ describe(
 template<class Scalar>
 bool StepperStaggeredForwardSensitivity<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
diff --git a/packages/tempus/src/Tempus_StepperState.hpp b/packages/tempus/src/Tempus_StepperState.hpp
index 78b1ea236f0e..df2dc59ed091 100644
--- a/packages/tempus/src/Tempus_StepperState.hpp
+++ b/packages/tempus/src/Tempus_StepperState.hpp
@@ -59,13 +59,19 @@ class StepperState :
 
   /// \name Overridden from Teuchos::Describable
   //@{
-    virtual std::string description() const { return "Tempus::StepperState"; }
+    virtual std::string description() const
+    {
+      return "Tempus::StepperState - '" + stepperName_ + "'";
+    }
 
     virtual void describe(Teuchos::FancyOStream        & out,
                           const Teuchos::EVerbosityLevel /* verbLevel */) const
     {
-      out << description() << "::describe" << std::endl
-          << "  stepperName   = " << stepperName_ << std::endl;
+      auto l_out = Teuchos::fancyOStream( out.getOStream() );
+      Teuchos::OSTab ostab(*l_out,2, this->description());
+      l_out->setOutputToRootOnly(0);
+
+      *l_out << "\n--- " << this->description() << " ---" << std::endl;
     }
   //@}
 
diff --git a/packages/tempus/src/Tempus_StepperSubcycling_decl.hpp b/packages/tempus/src/Tempus_StepperSubcycling_decl.hpp
index e14f9722e6cf..4112bdafbae0 100644
--- a/packages/tempus/src/Tempus_StepperSubcycling_decl.hpp
+++ b/packages/tempus/src/Tempus_StepperSubcycling_decl.hpp
@@ -80,7 +80,7 @@ class StepperSubcycling : virtual public Tempus::Stepper<Scalar>
       const Teuchos::RCP<Thyra::ModelEvaluator<Scalar> >& appModel);
 
     virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> >
-      getModel(){return scIntegrator_->getStepper()->getModel();}
+      getModel() const {return scIntegrator_->getStepper()->getModel();}
 
     virtual void setAppAction(
       Teuchos::RCP<StepperSubcyclingAppAction<Scalar> > appAction = Teuchos::null);
diff --git a/packages/tempus/src/Tempus_StepperSubcycling_impl.hpp b/packages/tempus/src/Tempus_StepperSubcycling_impl.hpp
index 127bfe9d0310..500547139ba4 100644
--- a/packages/tempus/src/Tempus_StepperSubcycling_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperSubcycling_impl.hpp
@@ -508,6 +508,7 @@ void StepperSubcycling<Scalar>::describe(
   Teuchos::FancyOStream               &out,
   const Teuchos::EVerbosityLevel      verbLevel) const
 {
+  out.setOutputToRootOnly(0);
   out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
 
diff --git a/packages/tempus/src/Tempus_StepperTrapezoidal_impl.hpp b/packages/tempus/src/Tempus_StepperTrapezoidal_impl.hpp
index e876d52327c6..3e0b01aade0f 100644
--- a/packages/tempus/src/Tempus_StepperTrapezoidal_impl.hpp
+++ b/packages/tempus/src/Tempus_StepperTrapezoidal_impl.hpp
@@ -188,6 +188,7 @@ void StepperTrapezoidal<Scalar>::describe(
   Teuchos::FancyOStream               &out,
   const Teuchos::EVerbosityLevel      verbLevel) const
 {
+  out.setOutputToRootOnly(0);
   out << std::endl;
   Stepper<Scalar>::describe(out, verbLevel);
   StepperImplicit<Scalar>::describe(out, verbLevel);
@@ -201,6 +202,7 @@ void StepperTrapezoidal<Scalar>::describe(
 template<class Scalar>
 bool StepperTrapezoidal<Scalar>::isValidSetup(Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if ( !Stepper<Scalar>::isValidSetup(out) ) isValidSetup = false;
diff --git a/packages/tempus/src/Tempus_Stepper_decl.hpp b/packages/tempus/src/Tempus_Stepper_decl.hpp
index aced12426a84..6dbed0b6ce5f 100644
--- a/packages/tempus/src/Tempus_Stepper_decl.hpp
+++ b/packages/tempus/src/Tempus_Stepper_decl.hpp
@@ -70,7 +70,7 @@ class Stepper
       const Teuchos::RCP<Thyra::ModelEvaluator<Scalar> >& /* appModel */){}
 
 #endif
-    virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > getModel()
+    virtual Teuchos::RCP<const Thyra::ModelEvaluator<Scalar> > getModel() const
     { return Teuchos::null; }
 
     /// Set solver.
diff --git a/packages/tempus/src/Tempus_Stepper_impl.hpp b/packages/tempus/src/Tempus_Stepper_impl.hpp
index 0c6d6392125e..36814abcef9d 100644
--- a/packages/tempus/src/Tempus_Stepper_impl.hpp
+++ b/packages/tempus/src/Tempus_Stepper_impl.hpp
@@ -120,12 +120,14 @@ Stepper<Scalar>::getStepperXDotDot(Teuchos::RCP<SolutionState<Scalar> > state)
 
 
 template<class Scalar>
-void Stepper<Scalar>::describe(Teuchos::FancyOStream        & in_out,
+void Stepper<Scalar>::describe(Teuchos::FancyOStream        & out,
                                const Teuchos::EVerbosityLevel verbLevel) const
 {
-  auto out = Teuchos::fancyOStream( in_out.getOStream() );
-  out->setOutputToRootOnly(0);
-  *out << "--- Stepper ---\n"
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
+
+  *l_out << "--- Stepper ---\n"
        << "  isInitialized_      = " << Teuchos::toString(isInitialized_) << std::endl
        << "  stepperType_        = " << stepperType_ << std::endl
        << "  useFSAL_            = " << Teuchos::toString(useFSAL_) << std::endl
@@ -139,18 +141,19 @@ void Stepper<Scalar>::describe(Teuchos::FancyOStream        & in_out,
 
 template<class Scalar>
 bool Stepper<Scalar>::isValidSetup(
-  Teuchos::FancyOStream & in_out) const
+  Teuchos::FancyOStream & out) const
 {
+  out.setOutputToRootOnly(0);
   bool isValidSetup = true;
 
   if ( !(ICConsistency_ == "None" || ICConsistency_ == "Zero" ||
          ICConsistency_ == "App"  || ICConsistency_ == "Consistent") ) {
     isValidSetup = false;
-    auto out = Teuchos::fancyOStream( in_out.getOStream() );
-    out->setOutputToRootOnly(0);
-    *out << "The IC consistency does not have a valid value!\n"
-         << "('None', 'Zero', 'App' or 'Consistent')\n"
-         << "  ICConsistency  = " << ICConsistency_ << "\n";
+    auto l_out = Teuchos::fancyOStream( out.getOStream() );
+    l_out->setOutputToRootOnly(0);
+    *l_out << "The IC consistency does not have a valid value!\n"
+           << "('None', 'Zero', 'App' or 'Consistent')\n"
+           << "  ICConsistency  = " << ICConsistency_ << "\n";
   }
 
   return isValidSetup;
diff --git a/packages/tempus/src/Tempus_TimeEventBase.hpp b/packages/tempus/src/Tempus_TimeEventBase.hpp
index bfa8e00038ae..8d86918c77dd 100644
--- a/packages/tempus/src/Tempus_TimeEventBase.hpp
+++ b/packages/tempus/src/Tempus_TimeEventBase.hpp
@@ -89,6 +89,7 @@ class TimeEventBase
     {
       Teuchos::RCP<Teuchos::FancyOStream> out =
         Teuchos::VerboseObjectBase::getDefaultOStream();
+      out->setOutputToRootOnly(0);
       *out << "TimeEventBase name = " << getName() << std::endl;
     }
   //@}
diff --git a/packages/tempus/src/Tempus_TimeEventComposite.hpp b/packages/tempus/src/Tempus_TimeEventComposite.hpp
index 15236e24e022..5f9f6878b3e0 100644
--- a/packages/tempus/src/Tempus_TimeEventComposite.hpp
+++ b/packages/tempus/src/Tempus_TimeEventComposite.hpp
@@ -188,6 +188,7 @@ class TimeEventComposite : virtual public TimeEventBase<Scalar>
   {
     Teuchos::RCP<Teuchos::FancyOStream> out =
       Teuchos::VerboseObjectBase::getDefaultOStream();
+    out->setOutputToRootOnly(0);
     *out << "TimeEventComposite:" << "\n"
         << "name                 = " << this->getName() << "\n"
         << "Number of TimeEvents = " << timeEvents_.size() << std::endl;
diff --git a/packages/tempus/src/Tempus_TimeEventListIndex_impl.hpp b/packages/tempus/src/Tempus_TimeEventListIndex_impl.hpp
index a2166b9b01fa..6b82ad5def44 100644
--- a/packages/tempus/src/Tempus_TimeEventListIndex_impl.hpp
+++ b/packages/tempus/src/Tempus_TimeEventListIndex_impl.hpp
@@ -126,6 +126,7 @@ void TimeEventListIndex<Scalar>::describe() const
 {
   Teuchos::RCP<Teuchos::FancyOStream> out =
     Teuchos::VerboseObjectBase::getDefaultOStream();
+  out->setOutputToRootOnly(0);
   *out << "TimeEventListIndex:" << "\n"
        << "name       = " << this->getName() << "\n"
        << "IndexList_ = " << std::endl;
diff --git a/packages/tempus/src/Tempus_TimeEventList_impl.hpp b/packages/tempus/src/Tempus_TimeEventList_impl.hpp
index 797b28ae7ce3..5151f1306fe9 100644
--- a/packages/tempus/src/Tempus_TimeEventList_impl.hpp
+++ b/packages/tempus/src/Tempus_TimeEventList_impl.hpp
@@ -176,6 +176,7 @@ void TimeEventList<Scalar>::describe() const
 {
   Teuchos::RCP<Teuchos::FancyOStream> out =
     Teuchos::VerboseObjectBase::getDefaultOStream();
+  out->setOutputToRootOnly(0);
   *out << "TimeEventList:" << "\n"
        << "name           = " << this->getName() << "\n"
        << "timeScale_     = " << timeScale_     << "\n"
diff --git a/packages/tempus/src/Tempus_TimeEventRangeIndex_impl.hpp b/packages/tempus/src/Tempus_TimeEventRangeIndex_impl.hpp
index 0bb6ccb340c4..9dff4f43b2f0 100644
--- a/packages/tempus/src/Tempus_TimeEventRangeIndex_impl.hpp
+++ b/packages/tempus/src/Tempus_TimeEventRangeIndex_impl.hpp
@@ -138,6 +138,7 @@ void TimeEventRangeIndex<Scalar>::describe() const
 {
   Teuchos::RCP<Teuchos::FancyOStream> out =
     Teuchos::VerboseObjectBase::getDefaultOStream();
+  out->setOutputToRootOnly(0);
   *out << "TimeEventRange:" << "\n"
        << "name       = " << this->getName() << "\n"
        << "start_     = " << start_     << "\n"
diff --git a/packages/tempus/src/Tempus_TimeEventRange_impl.hpp b/packages/tempus/src/Tempus_TimeEventRange_impl.hpp
index ec4c97a10d34..c8c24ed5b2ad 100644
--- a/packages/tempus/src/Tempus_TimeEventRange_impl.hpp
+++ b/packages/tempus/src/Tempus_TimeEventRange_impl.hpp
@@ -205,6 +205,7 @@ void TimeEventRange<Scalar>::describe() const
 {
   Teuchos::RCP<Teuchos::FancyOStream> out =
     Teuchos::VerboseObjectBase::getDefaultOStream();
+  out->setOutputToRootOnly(0);
   *out << "TimeEventRange:" << "\n"
        << "name           = " << this->getName() << "\n"
        << "start_         = " << start_         << "\n"
diff --git a/packages/tempus/src/Tempus_TimeStepControlStrategyBasicVS.hpp b/packages/tempus/src/Tempus_TimeStepControlStrategyBasicVS.hpp
index 69bedd796255..fa9ac4bad1ae 100644
--- a/packages/tempus/src/Tempus_TimeStepControlStrategyBasicVS.hpp
+++ b/packages/tempus/src/Tempus_TimeStepControlStrategyBasicVS.hpp
@@ -212,13 +212,26 @@ class TimeStepControlStrategyBasicVS
     void describe(Teuchos::FancyOStream          &out,
                   const Teuchos::EVerbosityLevel verbLevel) const override
     {
-      Teuchos::OSTab ostab(out,2,"describe");
-      out << description() << "::describe:" << std::endl
-          << "StrategyType                      = " << this->getStrategyType()<< std::endl
-          << "Amplification Factor              = " << getAmplFactor()   << std::endl
-          << "Reduction Factor                  = " << getReductFactor() << std::endl
-          << "Minimum Value Monitoring Function = " << getMinEta()       << std::endl
-          << "Maximum Value Monitoring Function = " << getMaxEta()       << std::endl;
+      // Write through a separate FancyOStream built on the stream underlying
+      // 'out'; the root-only (rank 0) setting is applied to that wrapper.
+      auto l_out = Teuchos::fancyOStream( out.getOStream() );
+      Teuchos::OSTab ostab(*l_out, 2, this->description());
+      l_out->setOutputToRootOnly(0);
+
+      // The header line is written regardless of verbLevel.
+      *l_out << "\n--- " << this->description() << " ---" << std::endl;
+
+      // Parameter values and the closing rule need VERB_MEDIUM or higher.
+      if (Teuchos::as<int>(verbLevel) >= Teuchos::as<int>(Teuchos::VERB_MEDIUM)) {
+        *l_out << "  StrategyType                      = " << this->getStrategyType()<< std::endl
+               << "  Step Type                         = " << this->getStepType() << std::endl
+               << "  Amplification Factor              = " << getAmplFactor()   << std::endl
+               << "  Reduction Factor                  = " << getReductFactor() << std::endl
+               << "  Minimum Value Monitoring Function = " << getMinEta()       << std::endl
+               << "  Maximum Value Monitoring Function = " << getMaxEta()       << std::endl;
+        // Rule of dashes as wide as the header: "--- " + description + " ---".
+        *l_out << std::string(this->description().length()+8, '-') <<std::endl;
+      }
     }
   //@}
 
diff --git a/packages/tempus/src/Tempus_TimeStepControlStrategyComposite.hpp b/packages/tempus/src/Tempus_TimeStepControlStrategyComposite.hpp
index b61a9876a2be..23f126fe5875 100644
--- a/packages/tempus/src/Tempus_TimeStepControlStrategyComposite.hpp
+++ b/packages/tempus/src/Tempus_TimeStepControlStrategyComposite.hpp
@@ -69,25 +69,38 @@ class TimeStepControlStrategyComposite
     std::string description() const override
     { return "Tempus::TimeStepControlComposite"; }
 
-    void describe(Teuchos::FancyOStream          &in_out,
+    void describe(Teuchos::FancyOStream          &out,
                   const Teuchos::EVerbosityLevel verbLevel) const override
     {
-      auto out = Teuchos::fancyOStream( in_out.getOStream() );
-      out->setOutputToRootOnly(0);
-      Teuchos::OSTab ostab(*out,2,"describe");
-      *out << description() << "::describe:" << std::endl
-           << "Strategy Type = " << this->getStrategyType()<< std::endl
-           << "Step Type     = " << this->getStepType()<< std::endl;
-
-      std::stringstream sList;
-      for(std::size_t i = 0; i < strategies_.size(); ++i) {
-        sList << strategies_[i]->getStrategyType();
-        if (i < strategies_.size()-1) sList << ", ";
-      }
-      *out << "Strategy List = " << sList.str() << std::endl;
+      // Write through a separate FancyOStream built on the stream underlying
+      // 'out'; the root-only (rank 0) setting is applied to that wrapper.
+      auto l_out = Teuchos::fancyOStream( out.getOStream() );
+      Teuchos::OSTab ostab(*l_out, 2, this->description());
+      l_out->setOutputToRootOnly(0);
+
+      // The header line is written regardless of verbLevel.
+      *l_out << "\n--- " << this->description() << " ---" << std::endl;
 
-      for(auto& s : strategies_)
-        s->describe(*out, verbLevel);
+      // Everything below, including the closing rule, needs VERB_MEDIUM or higher.
+      if (Teuchos::as<int>(verbLevel) >= Teuchos::as<int>(Teuchos::VERB_MEDIUM)) {
+        *l_out << "  Strategy Type = " << this->getStrategyType()<< std::endl
+             << "  Step Type     = " << this->getStepType()<< std::endl;
+
+        // Comma-separated list of the strategy types held in strategies_.
+        std::stringstream sList;
+        for(std::size_t i = 0; i < strategies_.size(); ++i) {
+          sList << strategies_[i]->getStrategyType();
+          if (i < strategies_.size()-1) sList << ", ";
+        }
+        *l_out << "  Strategy List = " << sList.str() << std::endl;
+
+        // Each contained strategy describes itself at the same verbLevel.
+        for(auto& s : strategies_)
+          s->describe(*l_out, verbLevel);
+
+        // Rule of dashes as wide as the header: "--- " + description + " ---".
+        *l_out << std::string(this->description().length()+8, '-') <<std::endl;
+      }
     }
   //@}
 
diff --git a/packages/tempus/src/Tempus_TimeStepControlStrategyConstant.hpp b/packages/tempus/src/Tempus_TimeStepControlStrategyConstant.hpp
index c5a5518f32ac..5da629d5dfe5 100644
--- a/packages/tempus/src/Tempus_TimeStepControlStrategyConstant.hpp
+++ b/packages/tempus/src/Tempus_TimeStepControlStrategyConstant.hpp
@@ -67,6 +67,7 @@ class TimeStepControlStrategyConstant
 
     RCP<Teuchos::FancyOStream> out = tsc.getOStream();
     Teuchos::OSTab ostab(out,1,"setNextTimeStep");
+    out->setOutputToRootOnly(0);
 
 
     // Check constant time step
@@ -124,11 +125,24 @@ class TimeStepControlStrategyConstant
     void describe(Teuchos::FancyOStream          &out,
                   const Teuchos::EVerbosityLevel verbLevel) const override
     {
-      Teuchos::OSTab ostab(out,2,"describe");
-      out << description() << std::endl
-          << "Strategy Type = " << this->getStrategyType() << std::endl
-          << "Step Type     = " << this->getStepType() << std::endl
-          << "Time Step     = " << getConstantTimeStep() << std::endl;
+      // Write through a separate FancyOStream built on the stream underlying
+      // 'out'; the root-only (rank 0) setting is applied to that wrapper.
+      auto l_out = Teuchos::fancyOStream( out.getOStream() );
+      Teuchos::OSTab ostab(*l_out, 2, this->description());
+      l_out->setOutputToRootOnly(0);
+
+      // The header line is written regardless of verbLevel.
+      *l_out << "\n--- " << this->description() << " ---" << std::endl;
+
+      // Parameter values and the closing rule need VERB_MEDIUM or higher.
+      if (Teuchos::as<int>(verbLevel) >= Teuchos::as<int>(Teuchos::VERB_MEDIUM)) {
+        *l_out << "  Strategy Type = " << this->getStrategyType() << std::endl
+               << "  Step Type     = " << this->getStepType() << std::endl
+               << "  Time Step     = " << getConstantTimeStep() << std::endl;
+
+        // Rule of dashes as wide as the header: "--- " + description + " ---".
+        *l_out << std::string(this->description().length()+8, '-') <<std::endl;
+      }
     }
   //@}
 
diff --git a/packages/tempus/src/Tempus_TimeStepControlStrategyIntegralController.hpp b/packages/tempus/src/Tempus_TimeStepControlStrategyIntegralController.hpp
index 2c8128a9e3d2..fcc99f80b065 100644
--- a/packages/tempus/src/Tempus_TimeStepControlStrategyIntegralController.hpp
+++ b/packages/tempus/src/Tempus_TimeStepControlStrategyIntegralController.hpp
@@ -178,22 +178,34 @@ class TimeStepControlStrategyIntegralController
     void describe(Teuchos::FancyOStream          &out,
                   const Teuchos::EVerbosityLevel verbLevel) const override
     {
-      Teuchos::OSTab ostab(out,2,"describe");
-      out << description() << "::describe:" << std::endl
-          << "Strategy Type                      = " << this->getStrategyType() << std::endl
-          << "Step Type                          = " << this->getStepType()     << std::endl
-          << "Controller Type                    = " << getController()         << std::endl
-          << "KI                                 = " << getKI()                 << std::endl
-          << "KP                                 = " << getKP()                 << std::endl
-          << "KD                                 = " << getKD()                 << std::endl
-          << "errN_                              = " << errN_                   << std::endl
-          << "errNm1_                            = " << errNm1_                 << std::endl
-          << "errNm2_                            = " << errNm2_                 << std::endl
-          << "Safety Factor                      = " << getSafetyFactor()       << std::endl
-          << "Safety Factor After Step Rejection = " << getSafetyFactorAfterReject() << std::endl
-          << "Maximum Safety Factor (INPUT)      = " << facMaxINPUT_            << std::endl
-          << "Maximum Safety Factor              = " << getFacMax()             << std::endl
-          << "Minimum Safety Factor              = " << getFacMin()             << std::endl;
+      // Write through a separate FancyOStream built on the stream underlying
+      // 'out'; the root-only (rank 0) setting is applied to that wrapper.
+      auto l_out = Teuchos::fancyOStream( out.getOStream() );
+      Teuchos::OSTab ostab(*l_out, 2, this->description());
+      l_out->setOutputToRootOnly(0);
+
+      // The header line is written regardless of verbLevel.
+      *l_out << "\n--- " << this->description() << " ---" << std::endl;
+
+      // Parameter values and the closing rule need VERB_MEDIUM or higher.
+      if (Teuchos::as<int>(verbLevel) >= Teuchos::as<int>(Teuchos::VERB_MEDIUM)) {
+       *l_out << "  Strategy Type                      = " << this->getStrategyType() << std::endl
+            << "  Step Type                          = " << this->getStepType()     << std::endl
+            << "  Controller Type                    = " << getController()         << std::endl
+            << "  KI                                 = " << getKI()                 << std::endl
+            << "  KP                                 = " << getKP()                 << std::endl
+            << "  KD                                 = " << getKD()                 << std::endl
+            << "  errN_                              = " << errN_                   << std::endl
+            << "  errNm1_                            = " << errNm1_                 << std::endl
+            << "  errNm2_                            = " << errNm2_                 << std::endl
+            << "  Safety Factor                      = " << getSafetyFactor()       << std::endl
+            << "  Safety Factor After Step Rejection = " << getSafetyFactorAfterReject() << std::endl
+            << "  Maximum Safety Factor (INPUT)      = " << facMaxINPUT_            << std::endl
+            << "  Maximum Safety Factor              = " << getFacMax()             << std::endl
+            << "  Minimum Safety Factor              = " << getFacMin()             << std::endl;
+        // Rule of dashes as wide as the header: "--- " + description + " ---".
+        *l_out << std::string(this->description().length()+8, '-') <<std::endl;
+      }
     }
   //@}
 
diff --git a/packages/tempus/src/Tempus_TimeStepControl_impl.hpp b/packages/tempus/src/Tempus_TimeStepControl_impl.hpp
index 416c9c7b12bb..5b9d1fe44d8b 100644
--- a/packages/tempus/src/Tempus_TimeStepControl_impl.hpp
+++ b/packages/tempus/src/Tempus_TimeStepControl_impl.hpp
@@ -460,8 +460,13 @@ void TimeStepControl<Scalar>::describe(
    Teuchos::FancyOStream               &out,
    const Teuchos::EVerbosityLevel      verbLevel) const
 {
-  if (verbLevel == Teuchos::VERB_EXTREME) {
+  auto l_out = Teuchos::fancyOStream( out.getOStream() );
+  Teuchos::OSTab ostab(*l_out, 2, this->description());
+  l_out->setOutputToRootOnly(0);
 
+  *l_out << "\n--- " << this->description() << " ---" <<std::endl;
+
+  if (Teuchos::as<int>(verbLevel) >= Teuchos::as<int>(Teuchos::VERB_MEDIUM)) {
     std::vector<int> idx = getOutputIndices();
     std::ostringstream listIdx;
     if (!idx.empty()) {
@@ -472,37 +477,36 @@ void TimeStepControl<Scalar>::describe(
     std::vector<Scalar> times = getOutputTimes();
     std::ostringstream listTimes;
     if (!times.empty()) {
-      for(std::size_t i = 0; i < times.size()-1; ++i) listTimes << times[i] << ", ";
+      for(std::size_t i = 0; i < times.size()-1; ++i)
+        listTimes << times[i] << ", ";
       listTimes << times[times.size()-1];
     }
 
-    auto l_out = Teuchos::fancyOStream( out.getOStream() );
-    l_out->setOutputToRootOnly(0);
-    *l_out << description() << "::describe:" << std::endl
-           << "stepType           = " << getStepType()            << std::endl
-           << "initTime           = " << getInitTime()            << std::endl
-           << "finalTime          = " << getFinalTime()           << std::endl
-           << "minTimeStep        = " << getMinTimeStep()         << std::endl
-           << "initTimeStep       = " << getInitTimeStep()        << std::endl
-           << "maxTimeStep        = " << getMaxTimeStep()         << std::endl
-           << "initIndex          = " << getInitIndex()           << std::endl
-           << "finalIndex         = " << getFinalIndex()          << std::endl
-           << "maxAbsError        = " << getMaxAbsError()         << std::endl
-           << "maxRelError        = " << getMaxRelError()         << std::endl
-           << "maxFailures        = " << getMaxFailures()         << std::endl
-           << "maxConsecFailures  = " << getMaxConsecFailures()   << std::endl
-           << "numTimeSteps       = " << getNumTimeSteps()        << std::endl
-           << "printDtChanges     = " << getPrintDtChanges()      << std::endl
-           << "outputExactly      = " << getOutputExactly()       << std::endl
-           << "outputIndices      = " << listIdx.str()            << std::endl
-           << "outputTimes        = " << listTimes.str()          << std::endl
-           << "outputIndexInterval= " << getOutputIndexInterval() << std::endl
-           << "outputTimeInterval = " << getOutputTimeInterval()  << std::endl
-           << "outputAdjustedDt   = " << outputAdjustedDt_        << std::endl
-           << "dtAfterOutput      = " << dtAfterOutput_           << std::endl
-           << "stepControlSrategy = " << std::endl;
-           stepControlStrategy_->describe(out, verbLevel);
+    *l_out << "  stepType           = " << getStepType()            << std::endl
+           << "  initTime           = " << getInitTime()            << std::endl
+           << "  finalTime          = " << getFinalTime()           << std::endl
+           << "  minTimeStep        = " << getMinTimeStep()         << std::endl
+           << "  initTimeStep       = " << getInitTimeStep()        << std::endl
+           << "  maxTimeStep        = " << getMaxTimeStep()         << std::endl
+           << "  initIndex          = " << getInitIndex()           << std::endl
+           << "  finalIndex         = " << getFinalIndex()          << std::endl
+           << "  maxAbsError        = " << getMaxAbsError()         << std::endl
+           << "  maxRelError        = " << getMaxRelError()         << std::endl
+           << "  maxFailures        = " << getMaxFailures()         << std::endl
+           << "  maxConsecFailures  = " << getMaxConsecFailures()   << std::endl
+           << "  numTimeSteps       = " << getNumTimeSteps()        << std::endl
+           << "  printDtChanges     = " << getPrintDtChanges()      << std::endl
+           << "  outputExactly      = " << getOutputExactly()       << std::endl
+           << "  outputIndices      = " << listIdx.str()            << std::endl
+           << "  outputTimes        = " << listTimes.str()          << std::endl
+           << "  outputIndexInterval= " << getOutputIndexInterval() << std::endl
+           << "  outputTimeInterval = " << getOutputTimeInterval()  << std::endl
+           << "  outputAdjustedDt   = " << outputAdjustedDt_        << std::endl
+           << "  dtAfterOutput      = " << dtAfterOutput_           <<std::endl;
+           stepControlStrategy_->describe(*l_out, verbLevel);
   }
+  *l_out << std::string(this->description().length()+8, '-') <<std::endl;
+
 }
 
 
diff --git a/packages/tempus/test/Integrator/Tempus_IntegratorTest.cpp b/packages/tempus/test/Integrator/Tempus_IntegratorTest.cpp
index 044943608069..565823eaadee 100644
--- a/packages/tempus/test/Integrator/Tempus_IntegratorTest.cpp
+++ b/packages/tempus/test/Integrator/Tempus_IntegratorTest.cpp
@@ -27,6 +27,7 @@ using Tempus::IntegratorBasic;
 using Tempus::SolutionHistory;
 using Tempus::SolutionState;
 
+
 // Test Integrator construction from ParameterList and ModelEvaluator.
 TEUCHOS_UNIT_TEST(IntegratorBasic, PL_ME_Construction)
 {
@@ -60,7 +61,7 @@ TEUCHOS_UNIT_TEST(IntegratorBasic, PL_ME_Construction)
 }
 
 
-// Test integator construction, and then setParameterList, setStepper, and
+// Test integrator construction, and then setParameterList, setStepper, and
 // initialization.
 TEUCHOS_UNIT_TEST(IntegratorBasic, Construction)
 {
@@ -102,4 +103,46 @@ TEUCHOS_UNIT_TEST(IntegratorBasic, Construction)
   TEST_ASSERT(pass)
 }
 
+
+// Test that describe() on an IntegratorBasic built from Tempus_default.xml
+// writes the expected section headers at VERB_EXTREME.
+TEUCHOS_UNIT_TEST(IntegratorBasic, Describe)
+{
+  // 1) Setup the ParameterList (here we start with params from .xml file)
+  RCP<ParameterList> pl = getParametersFromXmlFile("Tempus_default.xml");
+
+  // 2) Setup the ModelEvaluator
+  RCP<SinCosModel<double> > model = Teuchos::rcp(new SinCosModel<double> ());
+
+  // 3) Setup the Integrator
+  RCP<ParameterList> tempusPL = sublist(pl, "Tempus", true);
+  RCP<Tempus::IntegratorBasic<double> > integrator =
+    Tempus::createIntegratorBasic<double>(tempusPL, model);
+
+  // 4) Capture the describe() output in a string stream so it can be searched.
+  std::ostringstream ss;
+  Teuchos::RCP<Teuchos::FancyOStream> myOut =
+    Teuchos::fancyOStream(Teuchos::rcpFromRef(ss));
+
+  integrator->describe(*myOut, Teuchos::VERB_EXTREME);
+
+  auto testS = ss.str();
+
+  // Find major headers.
+  auto npos = std::string::npos;
+  TEST_ASSERT(npos != testS.find("--- Tempus::IntegratorBasic ---"));
+  TEST_ASSERT(npos != testS.find("--- Tempus::SolutionHistory"));
+  TEST_ASSERT(npos != testS.find("--- SolutionState (index =     0; time =         0; dt =         1) ---"));
+  TEST_ASSERT(npos != testS.find("--- Tempus::SolutionStateMetaData ---"));
+  TEST_ASSERT(npos != testS.find("--- Tempus::StepperState"));
+  TEST_ASSERT(npos != testS.find("--- Tempus::PhysicsState"));
+  TEST_ASSERT(npos != testS.find("--- Tempus::TimeStepControl ---"));
+  TEST_ASSERT(npos != testS.find("--- Tempus::TimeStepControlStrategyConstant ---"));
+  TEST_ASSERT(npos != testS.find("--- Stepper ---"));
+  // Besides the headers, the output must report a Forward Euler stepper type.
+  TEST_ASSERT(npos != testS.find("stepperType_        = Forward Euler"));
+  TEST_ASSERT(npos != testS.find("--- StepperExplicit ---"));
+}
+
+
 } // namespace Tempus_Test
diff --git a/packages/thyra/adapters/tpetra/src/Thyra_TpetraEuclideanScalarProd_def.hpp b/packages/thyra/adapters/tpetra/src/Thyra_TpetraEuclideanScalarProd_def.hpp
index 72ba5ed8a972..7d5e0e4c8018 100644
--- a/packages/thyra/adapters/tpetra/src/Thyra_TpetraEuclideanScalarProd_def.hpp
+++ b/packages/thyra/adapters/tpetra/src/Thyra_TpetraEuclideanScalarProd_def.hpp
@@ -70,12 +70,6 @@ void TpetraEuclideanScalarProd<Scalar,LocalOrdinal,GlobalOrdinal,Node>::scalarPr
     // in EuclideanScalarProd transposes X...
     X_tpetra->dot(*Y_tpetra, scalarProds_out);
   } else {
-    // If one of the casts succeeded, sync that MV to host space
-    if (nonnull(X_tpetra))
-      Teuchos::rcp_const_cast<TMV>(X_tpetra)->sync_host ();
-    if (nonnull(Y_tpetra))
-      Teuchos::rcp_const_cast<TMV>(Y_tpetra)->sync_host ();
-
     EuclideanScalarProd<Scalar>::scalarProdsImpl(X, Y, scalarProds_out);
   }
 }
diff --git a/packages/thyra/adapters/tpetra/src/Thyra_TpetraLinearOp_def.hpp b/packages/thyra/adapters/tpetra/src/Thyra_TpetraLinearOp_def.hpp
index 3f26dcba8249..1e01c63e2d67 100644
--- a/packages/thyra/adapters/tpetra/src/Thyra_TpetraLinearOp_def.hpp
+++ b/packages/thyra/adapters/tpetra/src/Thyra_TpetraLinearOp_def.hpp
@@ -413,14 +413,16 @@ void TpetraLinearOp<Scalar,LocalOrdinal,GlobalOrdinal,Node>::getRowStatImpl(
 
     size_t numMyRows = tCrsMatrix->getNodeNumRows();
 
-    Teuchos::ArrayView<const LocalOrdinal> indices;
-    Teuchos::ArrayView<const Scalar> values;
+    using crs_t = Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+    typename crs_t::local_inds_host_view_type indices;
+    typename crs_t::values_host_view_type values;
+
 
     for (size_t row=0; row < numMyRows; ++row) {
       MT sum = STM::zero ();
       tCrsMatrix->getLocalRowView (row, indices, values);
 
-      for (int col = 0; col < values.size(); ++col) {
+      for (int col = 0; col < (int) values.size(); ++col) {
         sum += STS::magnitude (values[col]);
       }
 
diff --git a/packages/thyra/adapters/tpetra/src/Thyra_TpetraMultiVector_def.hpp b/packages/thyra/adapters/tpetra/src/Thyra_TpetraMultiVector_def.hpp
index 93aa0a45e584..8bf18ee68116 100644
--- a/packages/thyra/adapters/tpetra/src/Thyra_TpetraMultiVector_def.hpp
+++ b/packages/thyra/adapters/tpetra/src/Thyra_TpetraMultiVector_def.hpp
@@ -130,9 +130,6 @@ assignMultiVecImpl(const MultiVectorBase<Scalar>& mv)
   if (nonnull(tmv)) {
     tpetraMultiVector_.getNonconstObj()->assign(*tmv);
   } else {
-    // This version will require/modify the host view of this vector.
-    tpetraMultiVector_.getNonconstObj()->sync_host ();
-    tpetraMultiVector_.getNonconstObj()->modify_host ();
     MultiVectorDefaultBase<Scalar>::assignMultiVecImpl(mv);
   }
 }
@@ -160,9 +157,6 @@ void TpetraMultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::updateImpl(
     typedef Teuchos::ScalarTraits<Scalar> ST;
     tpetraMultiVector_.getNonconstObj()->update(alpha, *tmv, ST::one());
   } else {
-    // This version will require/modify the host view of this vector.
-    tpetraMultiVector_.getNonconstObj()->sync_host ();
-    tpetraMultiVector_.getNonconstObj()->modify_host ();
     MultiVectorDefaultBase<Scalar>::updateImpl(alpha, mv);
   }
 }
@@ -239,9 +233,6 @@ void TpetraMultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::linearCombinatio
         *alphaIter, *(*tmvIter), *(alphaIter+1), *(*(tmvIter+1)), ST::one());
     }
   } else {
-    // This version will require/modify the host view of this vector.
-    tpetraMultiVector_.getNonconstObj()->sync_host ();
-    tpetraMultiVector_.getNonconstObj()->modify_host ();
     MultiVectorDefaultBase<Scalar>::linearCombinationImpl(alpha, mv, beta);
   }
 }
@@ -260,9 +251,6 @@ void TpetraMultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::dotsImpl(
   if (nonnull(tmv)) {
     tpetraMultiVector_.getConstObj()->dot(*tmv, prods);
   } else {
-    // This version will require/modify the host view of this vector.
-    tpetraMultiVector_.getNonconstObj()->sync_host ();
-    tpetraMultiVector_.getNonconstObj()->modify_host ();
     MultiVectorDefaultBase<Scalar>::dotsImpl(mv, prods);
   }
 }
@@ -459,25 +447,6 @@ mvMultiReductApplyOpImpl(
   const Ordinal primary_global_offset
   ) const
 {
-  typedef TpetraMultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> TMV;
-
-  // Sync any non-target Tpetra MVs to host space
-  for (auto itr = multi_vecs.begin(); itr != multi_vecs.end(); ++itr) {
-    Ptr<const TMV> tmv = Teuchos::ptr_dynamic_cast<const TMV>(*itr);
-    if (nonnull(tmv)) {
-      Teuchos::rcp_const_cast<Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> >(
-      tmv->getConstTpetraMultiVector())-> sync_host ();
-    }
-  }
-
-  // Sync any target Tpetra MVs and mark modified
-  for (auto itr = targ_multi_vecs.begin(); itr != targ_multi_vecs.end(); ++itr) {
-    Ptr<TMV> tmv = Teuchos::ptr_dynamic_cast<TMV>(*itr);
-    if (nonnull(tmv)) {
-      tmv->getTpetraMultiVector()->sync_host ();
-      tmv->getTpetraMultiVector()->modify_host ();
-    }
-  }
 
   MultiVectorAdapterBase<Scalar>::mvMultiReductApplyOpImpl(
     primary_op, multi_vecs, targ_multi_vecs, reduct_objs, primary_global_offset);
@@ -492,11 +461,6 @@ acquireDetachedMultiVectorViewImpl(
   RTOpPack::ConstSubMultiVectorView<Scalar>* sub_mv
   ) const
 {
-  // Only viewing data, so just sync dual view to host space
-  typedef typename Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> TMV;
-  Teuchos::rcp_const_cast<TMV>(
-    tpetraMultiVector_.getConstObj())->sync_host ();
-
   SpmdMultiVectorDefaultBase<Scalar>::
     acquireDetachedMultiVectorViewImpl(rowRng, colRng, sub_mv);
 }
@@ -510,10 +474,6 @@ acquireNonconstDetachedMultiVectorViewImpl(
   RTOpPack::SubMultiVectorView<Scalar>* sub_mv
   )
 {
-  // Sync to host and mark as modified
-  tpetraMultiVector_.getNonconstObj()->sync_host ();
-  tpetraMultiVector_.getNonconstObj()->modify_host ();
-
   SpmdMultiVectorDefaultBase<Scalar>::
     acquireNonconstDetachedMultiVectorViewImpl(rowRng, colRng, sub_mv);
 }
@@ -528,10 +488,6 @@ commitNonconstDetachedMultiVectorViewImpl(
   SpmdMultiVectorDefaultBase<Scalar>::
     commitNonconstDetachedMultiVectorViewImpl(sub_mv);
 
-  // Sync changes from host view to execution space
-  typedef typename Tpetra::MultiVector<
-    Scalar,LocalOrdinal,GlobalOrdinal,Node>::execution_space execution_space;
-  tpetraMultiVector_.getNonconstObj()->template sync<execution_space>();
 }
 
 
@@ -627,13 +583,6 @@ void TpetraMultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::euclideanApply(
   // If the cast succeeded, call Tpetra directly.
   // Otherwise, fall back to the default implementation.
   if (nonnull(X_tpetra) && nonnull(Y_tpetra)) {
-    // Sync everything to the execution space
-    typedef typename TMV::execution_space execution_space;
-    Teuchos::rcp_const_cast<TMV>(X_tpetra)->template sync<execution_space>();
-    Y_tpetra->template sync<execution_space>();
-    Teuchos::rcp_const_cast<TMV>(
-      tpetraMultiVector_.getConstObj())->template sync<execution_space>();
-
     typedef Teuchos::ScalarTraits<Scalar> ST;
     TEUCHOS_TEST_FOR_EXCEPTION(ST::isComplex && (M_trans == CONJ),
       std::logic_error,
@@ -655,12 +604,9 @@ void TpetraMultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::euclideanApply(
         break;
     }
 
-    Y_tpetra->template modify<execution_space>();
     Y_tpetra->multiply(trans, Teuchos::NO_TRANS, alpha, *tpetraMultiVector_.getConstObj(), *X_tpetra, beta);
     Kokkos::fence();
   } else {
-    Teuchos::rcp_const_cast<TMV>(
-      tpetraMultiVector_.getConstObj())->sync_host ();
     SpmdMultiVectorDefaultBase<Scalar>::euclideanApply(M_trans, X, Y, alpha, beta);
   }
 
diff --git a/packages/thyra/adapters/tpetra/src/Thyra_TpetraVector_def.hpp b/packages/thyra/adapters/tpetra/src/Thyra_TpetraVector_def.hpp
index 88b93f1b7bc0..57bfb80cbdc3 100644
--- a/packages/thyra/adapters/tpetra/src/Thyra_TpetraVector_def.hpp
+++ b/packages/thyra/adapters/tpetra/src/Thyra_TpetraVector_def.hpp
@@ -176,9 +176,6 @@ void TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::absImpl(
   if (nonnull(tx)) {
     tpetraVector_.getNonconstObj()->abs(*tx);
   } else {
-    // This version will require/modify the host view of this vector.
-    tpetraVector_.getNonconstObj()->sync_host ();
-    tpetraVector_.getNonconstObj()->modify_host ();
     VectorDefaultBase<Scalar>::absImpl(x);
   }
 }
@@ -196,9 +193,6 @@ void TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::reciprocalImpl(
   if (nonnull(tx)) {
     tpetraVector_.getNonconstObj()->reciprocal(*tx);
   } else {
-    // This version will require/modify the host view of this vector.
-    tpetraVector_.getNonconstObj()->sync_host ();
-    tpetraVector_.getNonconstObj()->modify_host ();
     VectorDefaultBase<Scalar>::reciprocalImpl(x);
   }
 }
@@ -218,9 +212,6 @@ void TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::eleWiseScaleImpl(
     tpetraVector_.getNonconstObj()->elementWiseMultiply(
       ST::one(), *tx, *tpetraVector_.getConstObj(), ST::zero());
   } else {
-    // This version will require/modify the host view of this vector.
-    tpetraVector_.getNonconstObj()->sync_host ();
-    tpetraVector_.getNonconstObj()->modify_host ();
     VectorDefaultBase<Scalar>::eleWiseScaleImpl(x);
   }
 }
@@ -245,8 +236,6 @@ TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::norm2WeightedImpl(
       ST::one(), *tx, *tpetraVector_.getConstObj(), ST::zero());
     return ST::magnitude(ST::squareroot(tpetraVector_.getConstObj()->dot(*temp)));
   } else {
-    // This version will require the host view of this vector.
-    tpetraVector_.getNonconstObj()->sync_host ();
     return VectorDefaultBase<Scalar>::norm2WeightedImpl(x);
   }
 }
@@ -261,24 +250,6 @@ void TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::applyOpImpl(
   const Ordinal global_offset
   ) const
 {
-  // Sync any non-target Tpetra vecs to host space
-  for (auto itr = vecs.begin(); itr != vecs.end(); ++itr) {
-    auto tv = this->getConstTpetraVector(Teuchos::rcpFromPtr(*itr));
-    if (nonnull(tv)) {
-      typedef Tpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> TV;
-      Teuchos::rcp_const_cast<TV>(tv)->sync_host ();
-    }
-  }
-
-  // Sync any target Tpetra vecs and mark modified on host
-  for (auto itr = targ_vecs.begin(); itr != targ_vecs.end(); ++itr) {
-    auto tv = this->getTpetraVector(Teuchos::rcpFromPtr(*itr));
-    if (nonnull(tv)) {
-      tv->sync_host ();
-      tv->modify_host ();
-    }
-  }
-
   SpmdVectorDefaultBase<Scalar>::applyOpImpl(op, vecs, targ_vecs, reduct_obj, global_offset);
 }
 
@@ -290,11 +261,6 @@ acquireDetachedVectorViewImpl(
   RTOpPack::ConstSubVectorView<Scalar>* sub_vec
   ) const
 {
-  // Only viewing data, so just sync dual view to host space
-  typedef typename Tpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> TV;
-  Teuchos::rcp_const_cast<TV>(
-    tpetraVector_.getConstObj())->sync_host ();
-
   SpmdVectorDefaultBase<Scalar>::acquireDetachedVectorViewImpl(rng, sub_vec);
 }
 
@@ -306,9 +272,6 @@ acquireNonconstDetachedVectorViewImpl(
   RTOpPack::SubVectorView<Scalar>* sub_vec
   )
 {
-  // Sync to host and mark as modified
-  tpetraVector_.getNonconstObj()->sync_host ();
-  tpetraVector_.getNonconstObj()->modify_host ();
 
   SpmdVectorDefaultBase<Scalar>::acquireNonconstDetachedVectorViewImpl(rng, sub_vec);
 }
@@ -321,11 +284,6 @@ commitNonconstDetachedVectorViewImpl(
   )
 {
   SpmdVectorDefaultBase<Scalar>::commitNonconstDetachedVectorViewImpl(sub_vec);
-
-  // Sync changes from host view to execution space
-  typedef typename Tpetra::Vector<
-    Scalar,LocalOrdinal,GlobalOrdinal,Node>::execution_space execution_space;
-  tpetraVector_.getNonconstObj()->template sync<execution_space>();
 }
 
 
@@ -350,9 +308,6 @@ assignMultiVecImpl(const MultiVectorBase<Scalar>& mv)
   if (nonnull(tmv)) {
     tpetraVector_.getNonconstObj()->assign(*tmv);
   } else {
-    // This version will require/modify the host view of this vector.
-    tpetraVector_.getNonconstObj()->sync_host ();
-    tpetraVector_.getNonconstObj()->modify_host ();
     MultiVectorDefaultBase<Scalar>::assignMultiVecImpl(mv);
   }
 }
@@ -379,9 +334,6 @@ void TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::updateImpl(
   if (nonnull(tmv)) {
     tpetraVector_.getNonconstObj()->update(alpha, *tmv, ST::one());
   } else {
-    // This version will require/modify the host view of this vector.
-    tpetraVector_.getNonconstObj()->sync_host();
-    tpetraVector_.getNonconstObj()->modify_host();
     MultiVectorDefaultBase<Scalar>::updateImpl(alpha, mv);
   }
 }
@@ -458,9 +410,6 @@ void TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::linearCombinationImpl
         *alphaIter, *(*tmvIter), *(alphaIter+1), *(*(tmvIter+1)), ST::one());
     }
   } else {
-    // This version will require/modify the host view of this vector.
-    tpetraVector_.getNonconstObj()->sync_host ();
-    tpetraVector_.getNonconstObj()->modify_host ();
     MultiVectorDefaultBase<Scalar>::linearCombinationImpl(alpha, mv, beta);
   }
 }
@@ -479,9 +428,6 @@ void TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::dotsImpl(
   if (nonnull(tmv)) {
     tpetraVector_.getConstObj()->dot(*tmv, prods);
   } else {
-    // This version will require/modify the host view of this vector.
-    tpetraVector_.getNonconstObj()->sync_host ();
-    tpetraVector_.getNonconstObj()->modify_host ();
     MultiVectorDefaultBase<Scalar>::dotsImpl(mv, prods);
   }
 }
@@ -532,12 +478,6 @@ void TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::applyImpl(
   // If the cast succeeded, call Tpetra directly.
   // Otherwise, fall back to the default implementation.
   if (nonnull(X_tpetra) && nonnull(Y_tpetra)) {
-    // Sync everything to the execution space
-    typedef typename TMV::execution_space execution_space;
-    Teuchos::rcp_const_cast<TMV>(X_tpetra)->template sync<execution_space>();
-    Y_tpetra->template sync<execution_space>();
-    Teuchos::rcp_const_cast<TV>(tpetraVector_.getConstObj())->template sync<execution_space>();
-
     typedef Teuchos::ScalarTraits<Scalar> ST;
     TEUCHOS_TEST_FOR_EXCEPTION(ST::isComplex && (M_trans == CONJ),
       std::logic_error,
@@ -559,11 +499,9 @@ void TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::applyImpl(
         break;
     }
 
-    Y_tpetra->template modify<execution_space>();
     Y_tpetra->multiply(trans, Teuchos::NO_TRANS, alpha, *tpetraVector_.getConstObj(), *X_tpetra, beta);
     Kokkos::fence();
   } else {
-    Teuchos::rcp_const_cast<TV>(tpetraVector_.getConstObj())->sync_host ();
     VectorDefaultBase<Scalar>::applyImpl(M_trans, X, Y, alpha, beta);
   }
 
diff --git a/packages/thyra/adapters/tpetra/test/Simple2DTpetraModelEvaluator_UnitTests.cpp b/packages/thyra/adapters/tpetra/test/Simple2DTpetraModelEvaluator_UnitTests.cpp
index 3f8cf917de9f..016db9a0ba76 100644
--- a/packages/thyra/adapters/tpetra/test/Simple2DTpetraModelEvaluator_UnitTests.cpp
+++ b/packages/thyra/adapters/tpetra/test/Simple2DTpetraModelEvaluator_UnitTests.cpp
@@ -135,9 +135,9 @@ TEUCHOS_UNIT_TEST_TEMPLATE_1_DECL( Simple2DTpetraModelEvaluator, eval, Scalar )
   const RCP<const Tpetra::CrsMatrix<Scalar> > W_tpetra =
     rcp_dynamic_cast<Tpetra::CrsMatrix<Scalar> >(
       ConverterT::getTpetraOperator(W_op));
-
-  ArrayView<const LO> row_indices;
-  ArrayView<const Scalar> row_values;
+  using crs_t = Tpetra::CrsMatrix<Scalar>;
+    typename crs_t::local_inds_host_view_type row_indices;
+    typename crs_t::values_host_view_type row_values;
 
   W_tpetra->getLocalRowView(0, row_indices, row_values);
   // FIXME (mfh 22 Oct 2015) This test assumes that local indices
diff --git a/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrs.cpp b/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrs.cpp
index d71ca755d6de..9b49fa8cd2c9 100644
--- a/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrs.cpp
+++ b/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrs.cpp
@@ -281,9 +281,7 @@ int main (int argc, char *argv[])
       // - all internal views are allocated on device; mirror as mesh database is constructed on host
       const auto mesh_gids_host = mesh.getElementGlobalIDs();
       const auto mesh_gids =
-        Kokkos::create_mirror_view (typename exec_space::memory_space {},
-                                    mesh.getElementGlobalIDs ());
-      Kokkos::deep_copy(mesh_gids, mesh_gids_host);
+        Kokkos::create_mirror_view_and_copy (typename exec_space::memory_space(), mesh_gids_host);
 
       // for convenience, separate the access to owned and remote gids
       const auto owned_gids =
@@ -315,9 +313,9 @@ int main (int argc, char *argv[])
       // Graph Construction
       // ------------------
       // local graph is constructed on device space
-      typedef tpetra_crs_graph_type::local_graph_type local_graph_type;
-      typedef local_graph_type::row_map_type::non_const_type rowptr_view_type;
-      typedef typename local_graph_type::entries_type colidx_view_type;
+      typedef tpetra_crs_graph_type::local_graph_device_type local_graph_device_type;
+      typedef local_graph_device_type::row_map_type::non_const_type rowptr_view_type;
+      typedef typename local_graph_device_type::entries_type colidx_view_type;
 
       rowptr_view_type rowptr;
       colidx_view_type colidx;
@@ -344,8 +342,7 @@ int main (int argc, char *argv[])
         // the last entry of rowptr is the total number of nonzeros in the local graph
         // mirror to host to use the information in constructing colidx
         auto nnz = Kokkos::subview(rowptr, num_owned_elements);
-        const auto nnz_host = Kokkos::create_mirror_view(nnz);
-        Kokkos::deep_copy(nnz_host, nnz);
+        const auto nnz_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), nnz);
 
         // allocate colidx
         colidx = colidx_view_type("colidx", nnz_host());
@@ -370,7 +367,14 @@ int main (int argc, char *argv[])
       RCP<tpetra_crs_graph_type> bcrs_graph;
       {
         TimeMonitor timerGlobalGraphConstruction(*TimeMonitor::getNewTimer("1) GlobalGraphConstruction"));
-        bcrs_graph = rcp(new tpetra_crs_graph_type(row_map, col_map, local_graph_type(colidx, rowptr),
+        rowptr_view_type rowptr_tpetra = 
+          rowptr_view_type(Kokkos::ViewAllocateWithoutInitializing("rowptr_tpetra"), rowptr.extent(0));
+        colidx_view_type colidx_tpetra =
+          colidx_view_type(Kokkos::ViewAllocateWithoutInitializing("colidx_tpetra"), colidx.extent(0));
+        Kokkos::deep_copy(rowptr_tpetra, rowptr);
+        Kokkos::deep_copy(colidx_tpetra, colidx);
+        bcrs_graph = rcp(new tpetra_crs_graph_type(row_map, col_map, 
+                                                   local_graph_device_type(colidx_tpetra, rowptr_tpetra),
                                                    Teuchos::null));
       } // end global graph timer
 
@@ -398,29 +402,31 @@ int main (int argc, char *argv[])
 
         // Tpetra BlockCrsMatrix only has high level access functions
         // To fill this on device, we need an access to the meta data of blocks
-        const auto rowptr_host = Kokkos::create_mirror_view(rowptr);
-        const auto colidx_host = Kokkos::create_mirror_view(colidx);
-
-        Kokkos::deep_copy(rowptr_host, rowptr);
-        Kokkos::deep_copy(colidx_host, colidx);
+        const auto rowptr_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowptr);
+        const auto colidx_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), colidx);
 
         blocks = Kokkos::View<block_type*,exec_space>("blocks", rowptr_host(num_owned_elements));
-
-        const auto blocks_host = Kokkos::create_mirror_view(blocks);
+        auto blocks_host = Kokkos::create_mirror_view(Kokkos::HostSpace(), blocks);
         // This MUST run on host, since it invokes a host-only method,
         // getLocalBlock.  This means we must NOT use KOKKOS_LAMBDA,
         // since that would build the lambda for both host AND device.
 
-        Kokkos::parallel_for
-          (Kokkos::RangePolicy<host_space, LO> (0, num_owned_elements),
-           [&] (const LO row) {
-            const auto beg = rowptr_host(row);
-            const auto end = rowptr_host(row+1);
-            typedef typename std::remove_const<decltype (beg) >::type offset_type;
-            for (offset_type loc = beg; loc < end; ++loc) {
-              blocks_host(loc) = A_bcrs->getLocalBlock(row, colidx(loc));
-            }
-          });
+        /// Without UVM, getLocalBlockDeviceNonConst cannot be called inside a parallel_for,
+        /// even one running in the host execution space, because the method may launch a
+        /// kernel to transfer memory. A plain serial loop is used instead.
+        // Kokkos::parallel_for
+        //   (Kokkos::RangePolicy<host_space, LO> (0, num_owned_elements),
+        //    [&] (const LO row) {
+        for (LO row=0;row<LO(num_owned_elements);++row) {
+          const auto beg = rowptr_host(row);
+          const auto end = rowptr_host(row+1);
+          typedef typename std::remove_const<decltype (beg) >::type offset_type;
+          for (offset_type loc = beg; loc < end; ++loc) {
+            blocks_host(loc) = A_bcrs->getLocalBlockDeviceNonConst(row, colidx_host(loc));
+          }
+        }
+        //   });
+
         Kokkos::deep_copy(blocks, blocks_host);
 
         Kokkos::parallel_for
@@ -519,7 +525,7 @@ int main (int argc, char *argv[])
         // point-wise row map can be obtained from A_bcrs->getDomainMap().
         // A constructor exist for crs matrix with a local matrix and a row map.
         // see, Tpetra_CrsMatrix_decl.hpp, line 504
-        //     CrsMatrix (const local_matrix_type& lclMatrix,
+        //     CrsMatrix (const local_matrix_device_type& lclMatrix,
         //                const Teuchos::RCP<const map_type>& rowMap,
         //                const Teuchos::RCP<const map_type>& colMap = Teuchos::null,
         //                const Teuchos::RCP<const map_type>& domainMap = Teuchos::null,
@@ -545,7 +551,7 @@ int main (int argc, char *argv[])
 
         rowptr_view_type crs_rowptr = rowptr_view_type("crs_rowptr", num_owned_elements*blocksize+1);
         colidx_view_type crs_colidx = colidx_view_type("crs_colidx", colidx.extent(0)*blocksize*blocksize);
-        typename tpetra_crs_matrix_type::local_matrix_type::values_type
+        typename tpetra_crs_matrix_type::local_matrix_device_type::values_type
           crs_values("crs_values", colidx.extent(0)*blocksize*blocksize);
 
         Kokkos::parallel_for
@@ -579,11 +585,11 @@ int main (int argc, char *argv[])
             }
           });
 
-        typename tpetra_crs_matrix_type::local_matrix_type
+        typename tpetra_crs_matrix_type::local_matrix_device_type
           local_matrix("local_crs_matrix",
                        num_owned_and_remote_elements*blocksize,
                        crs_values,
-                       local_graph_type(crs_colidx, crs_rowptr));
+                       local_graph_device_type(crs_colidx, crs_rowptr));
 
         A_crs = rcp(new tpetra_crs_matrix_type(row_crs_map,
                                                col_crs_map,
diff --git a/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrsMeshDatabase.hpp b/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrsMeshDatabase.hpp
index bce848fbb7b7..d66687be669b 100644
--- a/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrsMeshDatabase.hpp
+++ b/packages/tpetra/core/example/BlockCrs/Tpetra_TestBlockCrsMeshDatabase.hpp
@@ -470,7 +470,7 @@ namespace BlockCrsTest {
     local_ordinal_range_type _remote_range_j;
     local_ordinal_range_type _remote_range_k;
 
-    typedef typename tpetra_crs_graph_type::local_graph_type::row_map_type::non_const_type rowptr_view_type;
+    typedef typename tpetra_crs_graph_type::local_graph_device_type::row_map_type::non_const_type rowptr_view_type;
     rowptr_view_type _rowptr;
 
     typedef typename rowptr_view_type::non_const_value_type scan_value_type;
@@ -536,8 +536,8 @@ namespace BlockCrsTest {
 
   struct LocalGraphFill {
   private:
-    typedef typename tpetra_crs_graph_type::local_graph_type::row_map_type::non_const_type rowptr_view_type;
-    typedef typename tpetra_crs_graph_type::local_graph_type::entries_type colidx_view_type;
+    typedef typename tpetra_crs_graph_type::local_graph_device_type::row_map_type::non_const_type rowptr_view_type;
+    typedef typename tpetra_crs_graph_type::local_graph_device_type::entries_type colidx_view_type;
 
     MeshDatabase::StructuredBlock _sb;
     MeshDatabase::global_ordinal_view_type _owned_gids;
diff --git a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_Element.hpp b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_Element.hpp
index b3f6da5620a9..4b7438e0549c 100644
--- a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_Element.hpp
+++ b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_Element.hpp
@@ -100,7 +100,6 @@ KOKKOS_INLINE_FUNCTION void ReferenceQuad4RHS(ViewType& rhs) {
     rhs[i] = static_cast<Scalar>(.25);
 }
 
-template<>
 void ReferenceQuad4RHS(Teuchos::Array<Scalar>& rhs) {
   for(int i=0; (int)i<rhs.size(); i++)
     rhs[i] = static_cast<Scalar>(.25);
diff --git a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_InsertGlobalIndices_FE.hpp b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_InsertGlobalIndices_FE.hpp
index 8938d7586b98..e54711a3472d 100644
--- a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_InsertGlobalIndices_FE.hpp
+++ b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_InsertGlobalIndices_FE.hpp
@@ -139,10 +139,10 @@ int executeInsertGlobalIndicesFESP_(const Teuchos::RCP<const Teuchos::Comm<int>
   // -----------------
   // -- https://trilinos.org/docs/dev/packages/tpetra/doc/html/classTpetra_1_1Map.html#a24490b938e94f8d4f31b6c0e4fc0ff77
   RCP<const map_type> row_map =
-    rcp(new map_type(GO_INVALID, mesh.getOwnedNodeGlobalIDs(),
+    rcp(new map_type(GO_INVALID, mesh.getOwnedNodeGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly),
                      0, comm));
   RCP<const map_type> owned_plus_shared_map =
-    rcp(new map_type(GO_INVALID, mesh.getOwnedAndGhostNodeGlobalIDs(),
+    rcp(new map_type(GO_INVALID, mesh.getOwnedAndGhostNodeGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly),
                      0, comm));
 
   if(opts.verbose) row_map->describe(out);
@@ -156,7 +156,7 @@ int executeInsertGlobalIndicesFESP_(const Teuchos::RCP<const Teuchos::Comm<int>
   auto domain_map = row_map;
   auto range_map  = row_map;
 
-  auto owned_element_to_node_ids = mesh.getOwnedElementToNode();
+  auto owned_element_to_node_ids = mesh.getOwnedElementToNode().getHostView(Tpetra::Access::ReadOnly);
 
   Teuchos::TimeMonitor::getStackedTimer()->startBaseTimer();
   RCP<TimeMonitor> timerElementLoopGraph = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("1) ElementLoop  (Graph)")));
@@ -176,7 +176,8 @@ int executeInsertGlobalIndicesFESP_(const Teuchos::RCP<const Teuchos::Comm<int>
     //   each row associated with this element's contribution.
     for(size_t element_node_idx=0; element_node_idx<owned_element_to_node_ids.extent(1); element_node_idx++)
     {
-      global_ids_in_row[element_node_idx] = owned_element_to_node_ids(element_gidx, element_node_idx);
+      global_ids_in_row[element_node_idx] = 
+	owned_element_to_node_ids(element_gidx, element_node_idx);
     }
 
     // Add the contributions from the current row into the graph.
@@ -262,12 +263,11 @@ int executeInsertGlobalIndicesFESP_(const Teuchos::RCP<const Teuchos::Comm<int>
       ReferenceQuad4(element_matrix);
       ReferenceQuad4RHS(element_rhs);
 
-      // Fill the global column ids array for this element
       for (size_t element_node_idx=0;
-           element_node_idx < owned_element_to_node_ids.extent(1);
-           ++element_node_idx) {
-        column_global_ids[element_node_idx] =
-          owned_element_to_node_ids(element_gidx, element_node_idx);
+	   element_node_idx < owned_element_to_node_ids.extent(1);
+	   ++element_node_idx) {
+	column_global_ids[element_node_idx] =
+	  owned_element_to_node_ids(element_gidx, element_node_idx);
       }
 
       // For each node (row) on the current element:
@@ -275,16 +275,16 @@ int executeInsertGlobalIndicesFESP_(const Teuchos::RCP<const Teuchos::Comm<int>
       // - add the values to the fe_matrix.
       // Note: hardcoded 4 here because we're using quads.
       for (size_t element_node_idx = 0; element_node_idx < 4;
-           ++element_node_idx) {
-        global_ordinal_type global_row_id =
-          owned_element_to_node_ids(element_gidx, element_node_idx);
-
-        for(size_t col_idx=0; col_idx<4; col_idx++) {
-          column_scalar_values[col_idx] = element_matrix(element_node_idx, col_idx);
-        }
-
-        fe_matrix->sumIntoGlobalValues(global_row_id, column_global_ids, column_scalar_values);
-        rhs->sumIntoGlobalValue(global_row_id, 0, element_rhs[element_node_idx]);
+	   ++element_node_idx) {
+	global_ordinal_type global_row_id =
+	  owned_element_to_node_ids(element_gidx, element_node_idx);
+	
+	for(size_t col_idx=0; col_idx<4; col_idx++) {
+	  column_scalar_values[col_idx] = element_matrix(element_node_idx, col_idx);
+	}
+
+	fe_matrix->sumIntoGlobalValues(global_row_id, column_global_ids, column_scalar_values);
+	rhs->sumIntoGlobalValue(global_row_id, 0, element_rhs[element_node_idx]);
       }
     }
   } // timerElementLoopMatrix
@@ -357,10 +357,10 @@ int executeInsertGlobalIndicesFESPKokkos_(const Teuchos::RCP<const Teuchos::Comm
   // -----------------
   // -- https://trilinos.org/docs/dev/packages/tpetra/doc/html/classTpetra_1_1Map.html#a24490b938e94f8d4f31b6c0e4fc0ff77
   RCP<const map_type> row_map =
-    rcp (new map_type (GO_INVALID, mesh.getOwnedNodeGlobalIDs (),
+    rcp (new map_type (GO_INVALID, mesh.getOwnedNodeGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly),
                        0, comm));
   RCP<const map_type> owned_plus_shared_map =
-    rcp (new map_type (GO_INVALID, mesh.getOwnedAndGhostNodeGlobalIDs (),
+    rcp (new map_type (GO_INVALID, mesh.getOwnedAndGhostNodeGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly),
                        0, comm));
 
   if (opts.verbose) {
@@ -376,7 +376,7 @@ int executeInsertGlobalIndicesFESPKokkos_(const Teuchos::RCP<const Teuchos::Comm
   auto domain_map = row_map;
   auto range_map  = row_map;
 
-  auto owned_element_to_node_ids = mesh.getOwnedElementToNode();
+  auto owned_element_to_node_ids = mesh.getOwnedElementToNode().getHostView(Tpetra::Access::ReadOnly);
 
   Teuchos::TimeMonitor::getStackedTimer()->startBaseTimer();
 
@@ -471,7 +471,7 @@ int executeInsertGlobalIndicesFESPKokkos_(const Teuchos::RCP<const Teuchos::Comm
   RCP<fe_multivector_type> rhs =
     rcp (new fe_multivector_type(domain_map, fe_graph->getImporter(), 1));
 
-  auto localMatrix  = fe_matrix->getLocalMatrix();
+  auto localMatrix  = fe_matrix->getLocalMatrixDevice();
   auto localRHS     = rhs->getLocalViewDevice(Tpetra::Access::OverwriteAll);
   auto localMap     = owned_plus_shared_map->getLocalMap();
   auto localColMap  = fe_matrix->getColMap()->getLocalMap();
@@ -482,7 +482,7 @@ int executeInsertGlobalIndicesFESPKokkos_(const Teuchos::RCP<const Teuchos::Comm
   pair_type alln = pair_type(0,nperel);
   scalar_2d_array_type all_element_matrix("all_element_matrix",nperel*numOwnedElements);
   scalar_1d_array_type all_element_rhs("all_element_rhs",nperel*numOwnedElements);
-  local_ordinal_view_type  all_lcids("all_lids",nperel*numOwnedElements);
+  local_ordinal_single_view_type  all_lcids("all_lids",nperel*numOwnedElements);
 
   timerElementLoopMemory=Teuchos::null;
 
@@ -518,7 +518,7 @@ int executeInsertGlobalIndicesFESPKokkos_(const Teuchos::RCP<const Teuchos::Comm
         // - add the values to the fe_matrix.
         for (int element_node_idx = 0; element_node_idx < nperel; ++element_node_idx) {
           const local_ordinal_type local_row_id =
-            localMap.getLocalElement (owned_element_to_node_ids(element_gidx, element_node_idx));
+            localMap.getLocalElement (owned_element_to_node_ids (element_gidx, element_node_idx));
           auto row_values = Kokkos::subview(element_matrix, element_node_idx, alln);
           // Force atomics on sums
           for (int col_idx = 0; col_idx < nperel; ++col_idx) {
diff --git a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_MeshDatabase.hpp b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_MeshDatabase.hpp
index 608d9c0930f1..5b4a680c845f 100644
--- a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_MeshDatabase.hpp
+++ b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_MeshDatabase.hpp
@@ -139,6 +139,7 @@ class MeshDatabase {
 
   void initializeOwnedAndGhostElementGlobalIDs(void);
 
+  // Wrapped dual views; obtain the data with getHostView()/getDeviceView().
   global_ordinal_view_type ownedElementGlobalIDs_;
   global_ordinal_view_type ghostElementGlobalIDs_;
 
@@ -182,7 +183,7 @@ MeshDatabase::MeshDatabase(Teuchos::RCP<const Teuchos::Comm<int> > comm,
 {
 
   // NOTE: Elements/nodes are numbered sequentially with x as the "fast" direction
-
+  // NOTE: assembly is all on host, so the overall scopeguard is sufficient here
   // Get processor decomp information
   MyRank_ = comm_->getRank();
   ij_from_idx(globalProcs_[0],MyRank_,myProcIJ_[0],myProcIJ_[1]);
@@ -202,40 +203,43 @@ MeshDatabase::MeshDatabase(Teuchos::RCP<const Teuchos::Comm<int> > comm,
   }
 
   // Generate the owned element ids
-  Kokkos::resize(ownedElementGlobalIDs_,num_my_elements);
+  auto ownedElementGlobalIDs = ownedElementGlobalIDs_.getHostView(Tpetra::Access::ReadWrite);
+  Kokkos::resize(ownedElementGlobalIDs,num_my_elements);
   int ect=0;
   for(global_ordinal_type j=myElementStart_[1]; j<myElementStop_[1]; j++) {
     for(global_ordinal_type i=myElementStart_[0]; i<myElementStop_[0]; i++) {
       global_ordinal_type idx=idx_from_ij(globalElements_[0],i,j);
-      ownedElementGlobalIDs_(ect) = idx;
+      ownedElementGlobalIDs(ect) = idx;
       ect++;
     }
   }
 
   // Generate the owned node ids
-  Kokkos::resize(ownedNodeGlobalIDs_,num_my_nodes);
+  auto _ownedNodeGlobalIDs = ownedNodeGlobalIDs_.getHostView(Tpetra::Access::ReadWrite);
+  Kokkos::resize(_ownedNodeGlobalIDs,num_my_nodes);
   int nct=0;
   for(global_ordinal_type j=myNodeStart_[1]; j<myNodeStop_[1]; j++) {
     for(global_ordinal_type i=myNodeStart_[0]; i<myNodeStop_[0]; i++) {
       global_ordinal_type idx=idx_from_ij(globalNodes_[0],i,j);
-      ownedNodeGlobalIDs_(nct) = idx;
+      _ownedNodeGlobalIDs(nct) = idx;
       nct++;
     }
   }
 
   // Generate the element-to-node map
   // NOTE: Hardwired to QUAD4's.  Nodes are ordered exodus-style (counter-clockwise) within an element
-  Kokkos::resize(ownedElementToNode_,num_my_elements);
+  auto _ownedElementToNode = ownedElementToNode_.getHostView(Tpetra::Access::ReadWrite);
+  Kokkos::resize(_ownedElementToNode,num_my_elements);
   int cct=0;
   for(global_ordinal_type j=myElementStart_[1]; j<myElementStop_[1]; j++) {
     for(global_ordinal_type i=myElementStart_[0]; i<myElementStop_[0]; i++) {
       // The (i,j) of the bottom left corner matches for elements & nodes
       global_ordinal_type nidx=idx_from_ij(globalNodes_[0],i,j);
 
-      ownedElementToNode_(cct,0) = nidx;
-      ownedElementToNode_(cct,1) = nidx+1;
-      ownedElementToNode_(cct,2) = nidx+globalNodes_[0]+1;
-      ownedElementToNode_(cct,3) = nidx+globalNodes_[0];
+      _ownedElementToNode(cct,0) = nidx;
+      _ownedElementToNode(cct,1) = nidx+1;
+      _ownedElementToNode(cct,2) = nidx+globalNodes_[0]+1;
+      _ownedElementToNode(cct,3) = nidx+globalNodes_[0];
       cct++;
     }
   }
@@ -259,37 +263,41 @@ MeshDatabase::MeshDatabase(Teuchos::RCP<const Teuchos::Comm<int> > comm,
   }
 
   // NOTE: This are not recorded in Aztec/Ifpack/ML ordering.  Because most apps don't do that.
-  Kokkos::resize(ghostElementGlobalIDs_,my_ghost_elements.size());
-  Kokkos::resize(ghostElementToNode_,my_ghost_elements.size());
+  auto _ghostElementGlobalIDs = ghostElementGlobalIDs_.getHostView(Tpetra::Access::ReadWrite);
+  auto _ghostElementToNode = ghostElementToNode_.getHostView(Tpetra::Access::ReadWrite);
+  Kokkos::resize(_ghostElementGlobalIDs,my_ghost_elements.size());
+  Kokkos::resize(_ghostElementToNode,my_ghost_elements.size());
   for(size_t k=0; k<my_ghost_elements.size(); k++) {
     global_ordinal_type i,j, eidx= my_ghost_elements[k];
-    ghostElementGlobalIDs_(k) = eidx;
+    _ghostElementGlobalIDs(k) = eidx;
     ij_from_idx(globalElements_[0],eidx,i,j);
 
     // The (i,j) of the bottom left corner matches for elements & nodes
     global_ordinal_type nidx=idx_from_ij(globalNodes_[0],i,j);
 
-    ghostElementToNode_(k,0) = nidx;
-    ghostElementToNode_(k,1) = nidx+1;
-    ghostElementToNode_(k,2) = nidx+globalNodes_[0]+1;
-    ghostElementToNode_(k,3) = nidx+globalNodes_[0];
+    _ghostElementToNode(k,0) = nidx;
+    _ghostElementToNode(k,1) = nidx+1;
+    _ghostElementToNode(k,2) = nidx+globalNodes_[0]+1;
+    _ghostElementToNode(k,3) = nidx+globalNodes_[0];
   }
 
   // Generate the list of "ghost" nodes (aka any node that exists on the ownedElement list that isn't owned
   std::set<global_ordinal_type> my_ghost_nodes;
-  for(size_t k=0; k<ownedElementToNode_.extent(0); k++) {
-    for(size_t l=0; l<ownedElementToNode_.extent(1); l++) {
-      global_ordinal_type nidx=ownedElementToNode_(k,l);
+  auto ownedElementToNodeView = ownedElementToNode_.getHostView(Tpetra::Access::ReadOnly);  
+  for(size_t k=0; k<ownedElementToNodeView.extent(0); k++) {
+    for(size_t l=0; l<ownedElementToNodeView.extent(1); l++) {
+      global_ordinal_type nidx=ownedElementToNodeView(k,l);
       if(!nodeIsOwned(nidx)) {
         my_ghost_nodes.insert(nidx);
       }
     }
   }
 
-  Kokkos::resize(ghostNodeGlobalIDs_,my_ghost_nodes.size());
+  auto _ghostNodeGlobalIDs = ghostNodeGlobalIDs_.getHostView(Tpetra::Access::ReadWrite);
+  Kokkos::resize(_ghostNodeGlobalIDs,my_ghost_nodes.size());
   for(auto k=my_ghost_nodes.begin(); k!=my_ghost_nodes.end(); k++) {
     size_t kk = std::distance(my_ghost_nodes.begin(),k);
-    ghostNodeGlobalIDs_(kk) = *k;
+    _ghostNodeGlobalIDs(kk) = *k;
   }
 
   initializeOwnedAndGhostNodeGlobalIDs();
@@ -301,16 +309,21 @@ void MeshDatabase::initializeOwnedAndGhostNodeGlobalIDs(void)
 {
   size_t total_size = getNumOwnedNodes() + getNumGhostNodes();
 
-  Kokkos::resize(ownedAndGhostNodeGlobalIDs_, total_size);
+  auto _ownedAndGhostNodeGlobalIDs = ownedAndGhostNodeGlobalIDs_.getHostView(Tpetra::Access::ReadWrite);
+  Kokkos::resize(_ownedAndGhostNodeGlobalIDs, total_size);
 
-  size_t insert_idx = 0;
-  for(size_t idx=0; idx < getNumOwnedNodes(); idx++)
-  {
-    ownedAndGhostNodeGlobalIDs_(insert_idx++) = getOwnedNodeGlobalIDs()(idx);
-  }
-  for(size_t idx=0; idx < getNumGhostNodes(); idx++)
   {
-    ownedAndGhostNodeGlobalIDs_(insert_idx++) = getGhostNodeGlobalIDs()(idx);
+    size_t insert_idx = 0;
+    auto ownedNodeGlobalIDs = getOwnedNodeGlobalIDs().getHostView(Tpetra::Access::ReadOnly);
+    auto ghostNodeGlobalIDs = getGhostNodeGlobalIDs().getHostView(Tpetra::Access::ReadOnly);
+    for(size_t idx=0; idx < getNumOwnedNodes(); idx++)
+    {
+      _ownedAndGhostNodeGlobalIDs(insert_idx++) = ownedNodeGlobalIDs(idx);
+    }
+    for(size_t idx=0; idx < getNumGhostNodes(); idx++)
+    {
+      _ownedAndGhostNodeGlobalIDs(insert_idx++) = ghostNodeGlobalIDs(idx);
+    }
   }
 }
 
@@ -318,16 +331,21 @@ void MeshDatabase::initializeOwnedAndGhostNodeGlobalIDs(void)
 void MeshDatabase::initializeOwnedAndGhostElementGlobalIDs(void)
 {
   size_t total_size = getNumOwnedElements() + getNumGhostElements();
-  Kokkos::resize(ownedAndGhostElementGlobalIDs_, total_size);
+  auto _ownedAndGhostElementGlobalIDs = ownedAndGhostElementGlobalIDs_.getHostView(Tpetra::Access::ReadWrite);
+  Kokkos::resize(_ownedAndGhostElementGlobalIDs, total_size);
 
-  size_t insert_idx = 0;
-  for(size_t idx=0; idx<getNumOwnedElements(); idx++)
-  {
-    ownedAndGhostElementGlobalIDs_(insert_idx++) = getOwnedElementGlobalIDs()(idx);
-  }
-  for(size_t idx=0; idx<getNumGhostElements(); idx++)
   {
-    ownedAndGhostElementGlobalIDs_(insert_idx++) = getGhostElementGlobalIDs()(idx);
+    size_t insert_idx = 0;
+    auto ownedElementGlobalIDs = getOwnedElementGlobalIDs().getHostView(Tpetra::Access::ReadOnly);
+    auto ghostElementGlobalIDs = getGhostElementGlobalIDs().getHostView(Tpetra::Access::ReadOnly);
+    for(size_t idx=0; idx<getNumOwnedElements(); idx++)
+    {
+      _ownedAndGhostElementGlobalIDs(insert_idx++) = ownedElementGlobalIDs(idx);
+    }
+    for(size_t idx=0; idx<getNumGhostElements(); idx++)
+    {
+      _ownedAndGhostElementGlobalIDs(insert_idx++) = ghostElementGlobalIDs(idx);
+    }
   }
 }
 
@@ -342,50 +360,75 @@ void MeshDatabase::print(std::ostream & oss)
   oss<<ss.str()<<" Stop/Start Nodes      = ["<<myNodeStart_[0]<<","<<myNodeStop_[0]<<")x["<<myNodeStart_[1]<<","<<myNodeStop_[1]<<")\n";
 
   oss<<ss.str()<<" Owned Global Elements = ";
-  for(size_t i=0; i<ownedElementGlobalIDs_.extent(0); i++) {
-    oss<<ownedElementGlobalIDs_[i]<<" ";
+  {
+    auto IDs = ownedElementGlobalIDs_.getHostView(Tpetra::Access::ReadOnly);
+    for(size_t i=0; i<IDs.extent(0); i++) {
+      oss<<IDs[i]<<" ";
+    }
   }
 
   oss<<"\n"<<ss.str()<<" Owned Global Nodes    = ";
-  for(size_t i=0; i<ownedNodeGlobalIDs_.extent(0); i++) {
-    oss<<ownedNodeGlobalIDs_[i]<<" ";
+  {
+    auto IDs = ownedNodeGlobalIDs_.getHostView(Tpetra::Access::ReadOnly);
+    for(size_t i=0; i<IDs.extent(0); i++) {
+      oss<<IDs[i]<<" ";
+    }
   }
 
   oss<<"\n"<<ss.str()<<" Owned Element2Node    = ";
-  for(size_t i=0; i<ownedElementToNode_.extent(0); i++) {
-    oss<<"(";
-    for(size_t j=0; j<ownedElementToNode_.extent(1); j++) {
-      oss<<ownedElementToNode_(i,j)<<" ";
+  {
+    auto IDs = ownedElementToNode_.getHostView(Tpetra::Access::ReadOnly);
+    for(size_t i=0; i<IDs.extent(0); i++) {
+      oss<<"(";
+      for(size_t j=0; j<IDs.extent(1); j++) {
+	oss<<IDs(i,j)<<" ";
+      }
+      oss<<") ";
     }
-    oss<<") ";
   }
 
   oss<<"\n"<<ss.str()<<" Ghost Global Elements = ";
-  for(size_t i=0; i<ghostElementGlobalIDs_.extent(0); i++) {
-    oss<<ghostElementGlobalIDs_[i]<<" ";
+  {
+    auto IDs = ghostElementGlobalIDs_.getHostView(Tpetra::Access::ReadOnly);
+    for(size_t i=0; i<IDs.extent(0); i++) {
+      oss<<IDs[i]<<" ";
+    }
   }
+
   oss<<"\n"<<ss.str()<<" Ghost Global Nodes    = ";
-  for(size_t i=0; i<ghostNodeGlobalIDs_.extent(0); i++) {
-    oss<<ghostNodeGlobalIDs_[i]<<" ";
+  {
+    auto IDs = ghostNodeGlobalIDs_.getHostView(Tpetra::Access::ReadOnly);
+    for(size_t i=0; i<IDs.extent(0); i++) {
+      oss<<IDs[i]<<" ";
+    }
   }
 
   oss<<"\n"<<ss.str()<<" Ghost Element2Node    = ";
-  for(size_t i=0; i<ghostElementToNode_.extent(0); i++) {
-    oss<<"(";
-    for(size_t j=0; j<ghostElementToNode_.extent(1); j++) {
-      oss<<ghostElementToNode_(i,j)<<" ";
+  {
+    auto IDs = ghostElementToNode_.getHostView(Tpetra::Access::ReadOnly);
+    for(size_t i=0; i<IDs.extent(0); i++) {
+      oss<<"(";
+      for(size_t j=0; j<IDs.extent(1); j++) {
+	oss<<IDs(i,j)<<" ";
+      }
+      oss<<") ";
     }
-    oss<<") ";
   }
 
   oss << "\n"<<ss.str()<<" Owned And Ghost Nodes = ";
-  for(size_t i=0; i<ownedAndGhostNodeGlobalIDs_.extent(0); i++) {
-    oss << ownedAndGhostNodeGlobalIDs_[i]<<" ";
+  {
+    auto IDs = ownedAndGhostNodeGlobalIDs_.getHostView(Tpetra::Access::ReadOnly);
+    for(size_t i=0; i<IDs.extent(0); i++) {
+      oss << IDs[i]<<" ";
+    }
   }
 
   oss << "\n"<<ss.str()<<" Owned And Ghost Elements = ";
-  for(size_t i=0; i<ownedAndGhostElementGlobalIDs_.extent(0); i++) {
-    oss << ownedAndGhostElementGlobalIDs_[i]<<" ";
+  {
+    auto IDs = ownedAndGhostElementGlobalIDs_.getHostView(Tpetra::Access::ReadOnly);
+    for(size_t i=0; i<IDs.extent(0); i++) {
+      oss << IDs[i]<<" ";
+    }
   }
 
   oss<<std::endl;
diff --git a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_TotalElementLoop.hpp b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_TotalElementLoop.hpp
index f99754450360..23b0df7f64d1 100644
--- a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_TotalElementLoop.hpp
+++ b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_TotalElementLoop.hpp
@@ -133,9 +133,9 @@ int executeTotalElementLoopSP_(const Teuchos::RCP<const Teuchos::Comm<int> >& co
   // Build Tpetra Maps
   // -----------------
   // -- https://trilinos.org/docs/dev/packages/tpetra/doc/html/classTpetra_1_1Map.html#a24490b938e94f8d4f31b6c0e4fc0ff77
-  RCP<const map_type> row_map = rcp(new map_type(GO_INVALID, mesh.getOwnedNodeGlobalIDs(), 0, comm));
-  RCP<const map_type> owned_element_map = rcp(new map_type(GO_INVALID, mesh.getOwnedElementGlobalIDs(), 0, comm));
-  RCP<const map_type> ghost_element_map = rcp(new map_type(GO_INVALID, mesh.getGhostElementGlobalIDs(), 0, comm));
+  RCP<const map_type> row_map = rcp(new map_type(GO_INVALID, mesh.getOwnedNodeGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm));
+  RCP<const map_type> owned_element_map = rcp(new map_type(GO_INVALID, mesh.getOwnedElementGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm));
+  RCP<const map_type> ghost_element_map = rcp(new map_type(GO_INVALID, mesh.getGhostElementGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm));
   RCP<const import_type> elementImporter = rcp(new import_type(owned_element_map,ghost_element_map));
 
   if(opts.verbose) row_map->describe(out);
@@ -149,8 +149,8 @@ int executeTotalElementLoopSP_(const Teuchos::RCP<const Teuchos::Comm<int> >& co
   auto domain_map = row_map;
   auto range_map  = row_map;
 
-  auto owned_element_to_node_ids = mesh.getOwnedElementToNode();
-  auto ghost_element_to_node_ids = mesh.getGhostElementToNode();
+  auto owned_element_to_node_ids = mesh.getOwnedElementToNode().getHostView(Tpetra::Access::ReadOnly);
+  auto ghost_element_to_node_ids = mesh.getGhostElementToNode().getHostView(Tpetra::Access::ReadOnly);
 
   Teuchos::TimeMonitor::getStackedTimer()->startBaseTimer();
   RCP<TimeMonitor> timerElementLoopGraph = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("1) ElementLoop  (Graph)")));
@@ -183,7 +183,7 @@ int executeTotalElementLoopSP_(const Teuchos::RCP<const Teuchos::Comm<int> >& co
     {
       if(mesh.nodeIsOwned(global_ids_in_row[element_node_idx]))
       {
-       crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row());
+	crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row());
       }
     }
   }
@@ -199,11 +199,12 @@ int executeTotalElementLoopSP_(const Teuchos::RCP<const Teuchos::Comm<int> >& co
     {
       if(mesh.nodeIsOwned(global_ids_in_row[element_node_idx]))
       {
-       crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row());
+	crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row());
       }
     }
   }
 
+
   timerElementLoopGraph = Teuchos::null;
 
   // 'finalize' the crs_graph by calling fillComplete().
@@ -285,10 +286,10 @@ int executeTotalElementLoopSP_(const Teuchos::RCP<const Teuchos::Comm<int> >& co
 
     // Fill the global column ids array for this element
     for (size_t element_node_idx = 0;
-         element_node_idx < owned_element_to_node_ids.extent(1);
-         ++element_node_idx) {
+	 element_node_idx < owned_element_to_node_ids.extent(1);
+	 ++element_node_idx) {
       column_global_ids[element_node_idx] =
-        owned_element_to_node_ids(element_gidx, element_node_idx);
+	owned_element_to_node_ids(element_gidx, element_node_idx);
     }
 
     // For each node (row) on the current element:
@@ -296,19 +297,19 @@ int executeTotalElementLoopSP_(const Teuchos::RCP<const Teuchos::Comm<int> >& co
     // - add values to crs_matrix if the row is owned.
     //   Note: hardcoded 4 here because we're using quads.
     for (size_t element_node_idx = 0; element_node_idx < 4;
-         ++element_node_idx) {
+	 ++element_node_idx) {
       const global_ordinal_type global_row_id =
-        owned_element_to_node_ids(element_gidx, element_node_idx);
+	owned_element_to_node_ids(element_gidx, element_node_idx);
       if (mesh.nodeIsOwned (global_row_id)) {
-        for (size_t col_idx = 0; col_idx < 4; ++col_idx) {
-          column_scalar_values[col_idx] =
-            element_matrix(element_node_idx, col_idx);
-        }
-        crs_matrix.sumIntoGlobalValues (global_row_id,
-                                        column_global_ids,
-                                        column_scalar_values);
-        rhs.sumIntoGlobalValue (global_row_id, 0,
-                                element_rhs[element_node_idx]);
+	for (size_t col_idx = 0; col_idx < 4; ++col_idx) {
+	  column_scalar_values[col_idx] =
+	    element_matrix(element_node_idx, col_idx);
+	}
+	crs_matrix.sumIntoGlobalValues (global_row_id,
+					column_global_ids,
+					column_scalar_values);
+	rhs.sumIntoGlobalValue (global_row_id, 0,
+				element_rhs[element_node_idx]);
       }
     }
   }
@@ -331,12 +332,12 @@ int executeTotalElementLoopSP_(const Teuchos::RCP<const Teuchos::Comm<int> >& co
       global_ordinal_type global_row_id = ghost_element_to_node_ids(element_gidx, element_node_idx);
       if(mesh.nodeIsOwned(global_row_id))
       {
-        for(size_t col_idx=0; col_idx<4; col_idx++)
+	for(size_t col_idx=0; col_idx<4; col_idx++)
         {
-          column_scalar_values[col_idx] = element_matrix(element_node_idx, col_idx);
-        }
-        crs_matrix.sumIntoGlobalValues(global_row_id, column_global_ids, column_scalar_values);
-        rhs.sumIntoGlobalValue(global_row_id, 0, element_rhs[element_node_idx]);
+	  column_scalar_values[col_idx] = element_matrix(element_node_idx, col_idx);
+	}
+	crs_matrix.sumIntoGlobalValues(global_row_id, column_global_ids, column_scalar_values);
+	rhs.sumIntoGlobalValue(global_row_id, 0, element_rhs[element_node_idx]);
       }
     }
   }
@@ -412,9 +413,9 @@ executeTotalElementLoopSPKokkos_
   // -----------------
   // -- https://trilinos.org/docs/dev/packages/tpetra/doc/html/classTpetra_1_1Map.html#a24490b938e94f8d4f31b6c0e4fc0ff77
   RCP<const map_type> row_map =
-    rcp(new map_type(GO_INVALID, mesh.getOwnedNodeGlobalIDs(), 0, comm));
-  RCP<const map_type> owned_element_map = rcp(new map_type(GO_INVALID, mesh.getOwnedElementGlobalIDs(), 0, comm));
-  RCP<const map_type> ghost_element_map = rcp(new map_type(GO_INVALID, mesh.getGhostElementGlobalIDs(), 0, comm));
+    rcp(new map_type(GO_INVALID, mesh.getOwnedNodeGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm));
+  RCP<const map_type> owned_element_map = rcp(new map_type(GO_INVALID, mesh.getOwnedElementGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm));
+  RCP<const map_type> ghost_element_map = rcp(new map_type(GO_INVALID, mesh.getGhostElementGlobalIDs().getDeviceView(Tpetra::Access::ReadOnly), 0, comm));
   RCP<const import_type> elementImporter = rcp(new import_type(owned_element_map,ghost_element_map));
 
   if(opts.verbose) row_map->describe(out);
@@ -428,8 +429,8 @@ executeTotalElementLoopSPKokkos_
   auto domain_map = row_map;
   auto range_map  = row_map;
 
-  auto owned_element_to_node_ids = mesh.getOwnedElementToNode();
-  auto ghost_element_to_node_ids = mesh.getGhostElementToNode();
+  auto owned_element_to_node_ids = mesh.getOwnedElementToNode().getHostView(Tpetra::Access::ReadOnly);
+  auto ghost_element_to_node_ids = mesh.getGhostElementToNode().getHostView(Tpetra::Access::ReadOnly);
 
   Teuchos::TimeMonitor::getStackedTimer()->startBaseTimer();
   RCP<TimeMonitor> timerElementLoopGraph = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("1) ElementLoop  (Graph)")));
@@ -462,7 +463,7 @@ executeTotalElementLoopSPKokkos_
     {
       if(mesh.nodeIsOwned(global_ids_in_row[element_node_idx]))
       {
-       crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row());
+	crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row());
       }
     }
   }
@@ -478,7 +479,7 @@ executeTotalElementLoopSPKokkos_
     {
       if(mesh.nodeIsOwned(global_ids_in_row[element_node_idx]))
       {
-       crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row());
+	crs_graph->insertGlobalIndices(global_ids_in_row[element_node_idx], global_ids_in_row());
       }
     }
   }
@@ -548,7 +549,7 @@ executeTotalElementLoopSPKokkos_
   RCP<crs_matrix_type> crs_matrix = rcp(new crs_matrix_type(crs_graph));
   RCP<multivector_type> rhs = rcp(new multivector_type(crs_graph->getRowMap(), 1));
 
-  auto localMatrix  = crs_matrix->getLocalMatrix();
+  auto localMatrix  = crs_matrix->getLocalMatrixDevice();
   auto localRHS     = rhs->getLocalViewDevice(Tpetra::Access::OverwriteAll);
   auto localRowMap  = crs_matrix->getRowMap()->getLocalMap();
   auto localColMap  = crs_matrix->getColMap()->getLocalMap();
@@ -560,7 +561,7 @@ executeTotalElementLoopSPKokkos_
   pair_type alln = pair_type(0,nperel);
   scalar_2d_array_type all_element_matrix("all_element_matrix",nperel*std::max(numOwnedElements,numGhostElements));
   scalar_1d_array_type all_element_rhs("all_element_rhs",nperel*std::max(numOwnedElements,numGhostElements));
-  local_ordinal_view_type  all_lcids("all_lids",nperel*std::max(numOwnedElements,numGhostElements));
+  local_ordinal_single_view_type  all_lcids("all_lids",nperel*std::max(numOwnedElements,numGhostElements));
 
 
   timerElementLoopMemory = Teuchos::null;
diff --git a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_typedefs.hpp b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_typedefs.hpp
index 5b163784b2db..81b7506db893 100644
--- a/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_typedefs.hpp
+++ b/packages/tpetra/core/example/Finite-Element-Assembly/fem_assembly_typedefs.hpp
@@ -50,12 +50,14 @@
 #include "Tpetra_FECrsMatrix.hpp"
 #include "Tpetra_MultiVector.hpp"
 #include "Tpetra_FEMultiVector.hpp"
+#include "Tpetra_Details_WrappedDualView.hpp"
 
 namespace TpetraExamples {
 
+using deviceType = Tpetra::Map<>::device_type;
 using local_ordinal_type = Tpetra::Map<>::local_ordinal_type;
 using global_ordinal_type = Tpetra::Map<>::global_ordinal_type;
-using execution_space = Tpetra::Map<>::device_type::execution_space;
+using execution_space = deviceType::execution_space;
 
 using map_type = Tpetra::Map<>;
 using crs_graph_type = Tpetra::CrsGraph<>;
@@ -69,20 +71,32 @@ using export_type = Tpetra::Export<>;
 using multivector_type = Tpetra::MultiVector<Scalar>;
 using fe_multivector_type = Tpetra::FEMultiVector<Scalar>;
 
+using globalDualViewType = Kokkos::DualView<global_ordinal_type*, deviceType>;  // 1-D array of global ordinals
+using localDualViewType = Kokkos::DualView<local_ordinal_type*, deviceType>;  // 1-D array of local ordinals
+using scalarDualViewType = Kokkos::DualView<Scalar*, deviceType>;  // 1-D array of scalars
+using global2DArrayDualViewType = Kokkos::DualView<global_ordinal_type*[4], deviceType>;  // N x 4 global ordinals
+using local2DArrayDualViewType = Kokkos::DualView<local_ordinal_type*[4], deviceType>;  // N x 4 local ordinals
+using scalar2DArrayDualViewType = Kokkos::DualView<Scalar*[4], deviceType>;  // N x 4 scalars
+using boolDualViewType = Kokkos::DualView<bool*, execution_space>;  // NOTE(review): execution_space here, deviceType on the aliases above -- confirm intended
 
 using global_ordinal_view_type =
-  Kokkos::View<global_ordinal_type*, execution_space>;
+  Tpetra::Details::WrappedDualView<globalDualViewType>;
 using local_ordinal_view_type =
+  Tpetra::Details::WrappedDualView<localDualViewType>;
+using local_ordinal_single_view_type = 
   Kokkos::View<local_ordinal_type*, execution_space>;
-using scalar_1d_array_type = Kokkos::View<Scalar*, execution_space>;
-using bool_1d_array_type = Kokkos::View<bool*, execution_space>;
+using scalar_1d_array_type = 
+  Kokkos::View<Scalar*, execution_space>;
+using bool_1d_array_type = 
+  Tpetra::Details::WrappedDualView<boolDualViewType>;
 
 // NOTE: Arrays are hardwired for QUAD4
 using local_ordinal_2d_array_type =
-  Kokkos::View<local_ordinal_type*[4], execution_space>;
+  Tpetra::Details::WrappedDualView<local2DArrayDualViewType>;
 using global_ordinal_2d_array_type =
-  Kokkos::View<global_ordinal_type*[4], execution_space>;
-using scalar_2d_array_type = Kokkos::View<Scalar*[4], execution_space>;
+  Tpetra::Details::WrappedDualView<global2DArrayDualViewType>;
+using scalar_2d_array_type = 
+  Kokkos::View<Scalar*[4], execution_space>;
 
 
 }
diff --git a/packages/tpetra/core/example/Lesson03-Power-Method/lesson03_power_method.cpp b/packages/tpetra/core/example/Lesson03-Power-Method/lesson03_power_method.cpp
index 02e6cf5d1818..f65ac3ef5d3b 100644
--- a/packages/tpetra/core/example/Lesson03-Power-Method/lesson03_power_method.cpp
+++ b/packages/tpetra/core/example/Lesson03-Power-Method/lesson03_power_method.cpp
@@ -217,6 +217,8 @@ main (int argc, char *argv[])
   typedef Tpetra::Vector<>::global_ordinal_type global_ordinal_type;
   typedef Tpetra::Vector<>::mag_type magnitude_type;
   typedef Tpetra::CrsMatrix<> crs_matrix_type;
+  typedef typename crs_matrix_type::nonconst_global_inds_host_view_type gids_type;
+  typedef typename crs_matrix_type::nonconst_values_host_view_type vals_type;
 
   Tpetra::ScopeGuard tpetraScope (&argc, &argv);
   {
@@ -322,8 +324,8 @@ main (int argc, char *argv[])
       // the matrix.
       const global_ordinal_type idOfFirstRow = 0;
       size_t numEntriesInRow = A->getNumEntriesInGlobalRow (idOfFirstRow);
-      Array<scalar_type>         rowvals (numEntriesInRow);
-      Array<global_ordinal_type> rowinds (numEntriesInRow);
+      vals_type rowvals ("vals",numEntriesInRow);
+      gids_type rowinds ("gids",numEntriesInRow);
 
       // Fill rowvals and rowinds with the values resp. (global)
       // column indices of the sparse matrix entries owned by the
@@ -341,7 +343,7 @@ main (int argc, char *argv[])
       // The parentheses after rowinds and rowvalues indicate "a view
       // of the Array's data."  Array::operator() returns an
       // ArrayView.
-      A->getGlobalRowCopy (idOfFirstRow, rowinds (), rowvals (), numEntriesInRow);
+      A->getGlobalRowCopy (idOfFirstRow, rowinds, rowvals, numEntriesInRow);
       for (size_t i = 0; i < numEntriesInRow; i++) {
 	if (rowinds[i] == idOfFirstRow) {
 	  // We have found the diagonal entry; modify it.
@@ -354,7 +356,7 @@ main (int argc, char *argv[])
       // method throws an exception.  If you want to modify the
       // structure (by adding new entries), you'll need to call
       // insertGlobalValues().
-      A->replaceGlobalValues (idOfFirstRow, rowinds (), rowvals ());
+      A->replaceGlobalValues (idOfFirstRow, rowinds, rowvals);
     }
 
     // Call fillComplete() again to signal that we are done changing the
diff --git a/packages/tpetra/core/example/Lesson07-Kokkos-Fill/05_solve.cpp b/packages/tpetra/core/example/Lesson07-Kokkos-Fill/05_solve.cpp
index 05dc8910235a..ff655facd15c 100644
--- a/packages/tpetra/core/example/Lesson07-Kokkos-Fill/05_solve.cpp
+++ b/packages/tpetra/core/example/Lesson07-Kokkos-Fill/05_solve.cpp
@@ -237,7 +237,7 @@ int main (int argc, char* argv[]) {
     // then construct a View of it.  (Note that a row offset needs to
     // have a type that can contain the sum of the row counts.)
     using row_offset_type =
-      Tpetra::CrsMatrix<double>::local_matrix_type::row_map_type::non_const_value_type;
+      Tpetra::CrsMatrix<double>::local_matrix_device_type::row_map_type::non_const_value_type;
 
     // Use a parallel scan (prefix sum) over the array of row counts, to
     // compute the array of row offsets for the sparse graph.
diff --git a/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixDenseRowUnpack.cpp b/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixDenseRowUnpack.cpp
index 44e7fd875e98..60b818d57308 100644
--- a/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixDenseRowUnpack.cpp
+++ b/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixDenseRowUnpack.cpp
@@ -233,7 +233,7 @@ RCP<Time> getTimer(const std::string& timerName) {
 
 class FillSourceMatrixValues {
 public:
-  FillSourceMatrixValues(const crs_matrix_type::local_matrix_type& A,
+  FillSourceMatrixValues(const crs_matrix_type::local_matrix_device_type& A,
                          const map_type::local_map_type& lclColMap)
     : A_(A), lclColMap_(lclColMap)
   {}
@@ -248,7 +248,7 @@ class FillSourceMatrixValues {
     }
   }
 private:
-  crs_matrix_type::local_matrix_type A_;
+  crs_matrix_type::local_matrix_device_type A_;
   map_type::local_map_type lclColMap_;
 };
 
@@ -256,7 +256,7 @@ void
 fillSourceMatrixValues(const crs_matrix_type& A)
 {
   TEUCHOS_ASSERT(! A.isFillComplete() );
-  auto A_lcl = A.getLocalMatrix();
+  auto A_lcl = A.getLocalMatrixDevice();
 
   TEUCHOS_ASSERT( A.hasColMap() );
   const auto lclColMap = A.getColMap()->getLocalMap();
@@ -275,12 +275,12 @@ fillSourceMatrixValues(const crs_matrix_type& A)
 // optimizations.
 
 class TestTargetMatrixValues {
-  using local_matrix_type = crs_matrix_type::local_matrix_type;
+  using local_matrix_device_type = crs_matrix_type::local_matrix_device_type;
   using device_type = crs_matrix_type::device_type;
   using local_map_type = map_type::local_map_type;
 
 public:
-  TestTargetMatrixValues(const local_matrix_type& A,
+  TestTargetMatrixValues(const local_matrix_device_type& A,
                          const local_map_type& lclColMap,
                          const Kokkos::View<const int*, device_type>& lclRowsToTest,
                          const bool replaceCombineMode)
@@ -316,14 +316,14 @@ class TestTargetMatrixValues {
     }
   }
 private:
-  local_matrix_type A_;
+  local_matrix_device_type A_;
   local_map_type lclColMap_;
   Kokkos::View<const int*, device_type> lclRowsToTest_;
   bool replaceCombineMode_;
 };
 
 int
-testTargetLocalMatrixValues(const crs_matrix_type::local_matrix_type& A,
+testTargetLocalMatrixValues(const crs_matrix_type::local_matrix_device_type& A,
                             const map_type::local_map_type& lclColMap,
                             const Kokkos::View<const int*, typename crs_matrix_type::device_type>& lclRowsToTest,
                             const Tpetra::CombineMode combineMode,
@@ -422,22 +422,10 @@ printLocalMatrix(std::ostream& out,
 
   out << "Proc " << myRank << ":" << endl;
 
-  auto A_lcl = A.getLocalMatrix();
-  Kokkos::HostSpace hostMemSpace;
-  auto val = Kokkos::create_mirror_view(hostMemSpace, A_lcl.values);
-  Kokkos::deep_copy(val, A_lcl.values);
-  auto ind = Kokkos::create_mirror_view(hostMemSpace, A_lcl.graph.entries);
-  Kokkos::deep_copy(ind, A_lcl.graph.entries);
-
-  // A_lcl.graph.row_map is a View of const, so the result of
-  // create_mirror_view is also a View of const.  This means we can't
-  // use it as the destination of deep_copy.
-  using row_map_type = decltype(A_lcl.graph.row_map);
-  Kokkos::View<row_map_type::non_const_data_type,
-    row_map_type::array_layout,
-    Kokkos::DefaultHostExecutionSpace> ptr
-      ("ptr", A_lcl.graph.row_map.extent(0));
-  Kokkos::deep_copy(ptr, A_lcl.graph.row_map);
+  auto A_lcl = A.getLocalMatrixHost();
+  auto val = A_lcl.values;
+  auto ind = A_lcl.graph.entries;
+  auto ptr = A_lcl.graph.row_map;
 
   for(LO lclRow = 0; lclRow < A_lcl.numRows(); ++lclRow) {
     const GO gblRow = rowMap.getGlobalElement(lclRow);
@@ -485,7 +473,7 @@ testTargetMatrixValues(const crs_matrix_type& A,
   auto comm = A.getMap()->getComm();
   const int myRank = comm->getRank();
 
-  auto A_lcl = A.getLocalMatrix();
+  auto A_lcl = A.getLocalMatrixDevice();
   TEUCHOS_ASSERT( A.hasColMap() );
   auto lclColMap = A.getColMap()->getLocalMap();
   const int lclSuccess =
diff --git a/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixMatVec.cpp b/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixMatVec.cpp
index 6721fbdcc664..f6cebb6dad65 100644
--- a/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixMatVec.cpp
+++ b/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrixMatVec.cpp
@@ -278,13 +278,16 @@ getTpetraCrsMatrix (Teuchos::FancyOStream& out,
   // Fill in the sparse matrix.
   out << "Fill the CrsMatrix" << endl;
   for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) { // for each of my rows
-    Teuchos::ArrayView<const LO> lclColInds;
+    matrix_type::local_inds_host_view_type lclColInds;
     graph->getLocalRowView (lclRow, lclColInds);
 
     // Put some entries in the matrix.
-    Teuchos::Array<SC> lclValues(lclColInds.size(), Teuchos::ScalarTraits<SC>::one());
-    const LO err = A->replaceLocalValues (lclRow, lclColInds, lclValues());
-    TEUCHOS_TEST_FOR_EXCEPTION(err != lclColInds.size(), std::logic_error, "Bug");
+    matrix_type::values_host_view_type::non_const_type
+                 lclValues("testLclValues", lclColInds.extent(0));
+    Kokkos::deep_copy(lclValues, Teuchos::ScalarTraits<SC>::one());
+    const LO err = A->replaceLocalValues (lclRow, lclColInds, lclValues);
+    TEUCHOS_TEST_FOR_EXCEPTION(size_t(err) != lclColInds.size(),
+                               std::logic_error, "Bug");
   }
   A->fillComplete();
 
diff --git a/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrix_sumIntoLocalValues.cpp b/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrix_sumIntoLocalValues.cpp
index d5d59e3ed763..e6bb4b53726d 100644
--- a/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrix_sumIntoLocalValues.cpp
+++ b/packages/tpetra/core/example/advanced/Benchmarks/CrsMatrix_sumIntoLocalValues.cpp
@@ -178,7 +178,7 @@ doKokkosSumIntoLocalValues (const std::string& label,
 {
   TM mon (*TM::getNewCounter (label));
 
-  auto A_lcl = A.getLocalMatrix ();
+  auto A_lcl = A.getLocalMatrixDevice ();
   const bool is_sorted = A.getCrsGraph ()->isSorted ();
   constexpr bool use_atomics = false;
 
diff --git a/packages/tpetra/core/example/advanced/Benchmarks/blockCrsMatrixMatVec.cpp b/packages/tpetra/core/example/advanced/Benchmarks/blockCrsMatrixMatVec.cpp
index fc16b7d0c0ee..2417724a1cca 100644
--- a/packages/tpetra/core/example/advanced/Benchmarks/blockCrsMatrixMatVec.cpp
+++ b/packages/tpetra/core/example/advanced/Benchmarks/blockCrsMatrixMatVec.cpp
@@ -102,14 +102,12 @@ localApplyBlockNoTrans (Tpetra::BlockCrsMatrix<Scalar, LO, GO, Node>& A,
 
   // Get the matrix values.  Blocks are stored contiguously, each
   // block in row-major order (Kokkos::LayoutRight).
-  auto val = A.getValuesHost ();
+  auto val = A.getValuesHostNonConst ();
 
   auto gblGraph = A.getCrsGraph ();
-  auto lclGraph = G.getLocalGraph ();
-  auto ptrHost = Kokkos::create_mirror_view (lclGraph.row_map);
-  Kokkos::deep_copy (ptrHost, lclGraph.row_map);
-  auto indHost = Kokkos::create_mirror_view (lclGraph.entries);
-  Kokkos::deep_copy (indHost, lclGraph.entries);
+  auto lclGraph = G.getLocalGraphHost ();
+  auto ptrHost = lclGraph.row_map;
+  auto indHost = lclGraph.entries;
   Teuchos::Array<IST> localMem (blockSize);
   little_vec_type Y_lcl (localMem.getRawPtr (), blockSize, 1);
 
@@ -566,7 +564,7 @@ getTpetraBlockCrsMatrix (Teuchos::FancyOStream& out,
   // Fill in the block sparse matrix.
   out << "Fill the BlockCrsMatrix" << endl;
   for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) { // for each of my rows
-    Teuchos::ArrayView<const LO> lclColInds;
+    Tpetra::CrsGraph<>::local_inds_host_view_type lclColInds;
     graph->getLocalRowView (lclRow, lclColInds);
 
     // Put some entries in the matrix.
diff --git a/packages/tpetra/core/example/advanced/Benchmarks/localView.cpp b/packages/tpetra/core/example/advanced/Benchmarks/localView.cpp
index 2d301cc31089..b2597cecb5dd 100644
--- a/packages/tpetra/core/example/advanced/Benchmarks/localView.cpp
+++ b/packages/tpetra/core/example/advanced/Benchmarks/localView.cpp
@@ -668,8 +668,8 @@ main (int argc, char* argv[])
 	for (int trial = 0; trial < opts.numTrials; ++trial) {
 	  const LO lclNumRows = opts.lclNumRows;
 	  for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
-	    Teuchos::ArrayView<const LO> ind;
-	    Teuchos::ArrayView<const SC> val;
+	    typename Tpetra::CrsMatrix<>::local_inds_host_view_type ind;
+	    typename Tpetra::CrsMatrix<>::values_host_view_type val;
 	    A.getLocalRowView (lclRow, ind, val);
 	    const size_t len = static_cast<size_t> (ind.size ());
 	    totalLclNumEnt += len;
@@ -727,7 +727,7 @@ main (int argc, char* argv[])
 
       auto timer = TimeMonitor::getNewCounter ("Kokkos sequential");
       auto A = getTpetraMatrix (comm, opts);
-      auto A_lcl = A->getLocalMatrix ();
+      auto A_lcl = A->getLocalMatrixDevice ();
       { // Start timing after matrix creation
 	TimeMonitor timeMon (*timer);
 
@@ -765,7 +765,7 @@ main (int argc, char* argv[])
 
       auto timer = TimeMonitor::getNewCounter ("Kokkos parallel");
       auto A = getTpetraMatrix (comm, opts);
-      auto A_lcl = A->getLocalMatrix ();
+      auto A_lcl = A->getLocalMatrixDevice ();
       { // Start timing after matrix creation
 	TimeMonitor timeMon (*timer);
 
diff --git a/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_Cuda.hpp b/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_Cuda.hpp
index 3133c74202b7..857ba10dfec8 100644
--- a/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_Cuda.hpp
+++ b/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_Cuda.hpp
@@ -154,7 +154,7 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCuda
 
   // Lots and lots of typedefs
   using Teuchos::RCP;
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
   typedef typename KCRS::device_type device_t;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
@@ -179,20 +179,24 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCuda
        typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
 
   // Grab the  Kokkos::SparseCrsMatrices
-  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
+  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
+  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
 
-  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
-  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
-  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
+  c_lno_view_t Arowptr = Amat.graph.row_map,
+               Browptr = Bmat.graph.row_map;
+  const lno_nnz_view_t Acolind = Amat.graph.entries,
+                       Bcolind = Bmat.graph.entries;
+  const scalar_view_t Avals = Amat.values,
+                      Bvals = Bmat.values;
 
   c_lno_view_t  Irowptr;
   lno_nnz_view_t  Icolind;
   scalar_view_t  Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
   }
 
 
@@ -258,12 +262,13 @@ template<class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class LocalOrdinalViewType>
-void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
+void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
+               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
                                                                                                CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
-                                                                                               const LocalOrdinalViewType & targetMapToOrigRow,
-                                                                                               const LocalOrdinalViewType & targetMapToImportRow,
-                                                                                               const LocalOrdinalViewType & Bcol2Ccol,
-                                                                                               const LocalOrdinalViewType & Icol2Ccol,
+                                                                                               const LocalOrdinalViewType & targetMapToOrigRow_dev,
+                                                                                               const LocalOrdinalViewType & targetMapToImportRow_dev,
+                                                                                               const LocalOrdinalViewType & Bcol2Ccol_dev,
+                                                                                               const LocalOrdinalViewType & Icol2Ccol_dev,
                                                                                                CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
                                                                                                Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
                                                                                                const std::string& label,
@@ -283,7 +288,7 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCuda
 
 
   // Lots and lots of typedefs
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::const_type c_lno_view_t;
   typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
@@ -298,8 +303,24 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCuda
   const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
   const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
 
-  // Since this is being run on Cuda, we need to fence because the below host code will use UVM
-  typename graph_t::execution_space().fence();
+  // The fence that the UVM version needed is disabled: the host code below reads host mirrors/host matrices, not UVM
+  // typename graph_t::execution_space().fence();
+  
+  // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
+  // KDDKDD UVM Ideally, this function would run on device and use 
+  // KDDKDD UVM KokkosKernels instead of this host implementation.
+  auto targetMapToOrigRow = 
+       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), 
+                                           targetMapToOrigRow_dev);
+  auto targetMapToImportRow = 
+       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), 
+                                           targetMapToImportRow_dev);
+  auto Bcol2Ccol =
+       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), 
+                                           Bcol2Ccol_dev);
+  auto Icol2Ccol = 
+       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), 
+                                           Icol2Ccol_dev);
 
   // Sizes
   RCP<const map_type> Ccolmap = C.getColMap();
@@ -307,12 +328,16 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCuda
   size_t n = Ccolmap->getNodeNumElements();
 
   // Grab the  Kokkos::SparseCrsMatrices & inner stuff
-  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
-  const KCRS & Cmat = C.getLocalMatrix();
-
-  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
-  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
+  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
+  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
+  const KCRS & Cmat = C.getLocalMatrixHost();
+
+  c_lno_view_t Arowptr = Amat.graph.row_map,
+               Browptr = Bmat.graph.row_map,
+               Crowptr = Cmat.graph.row_map;
+  const lno_nnz_view_t Acolind = Amat.graph.entries, 
+                       Bcolind = Bmat.graph.entries, 
+                       Ccolind = Cmat.graph.entries;
   const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
   scalar_view_t Cvals = Cmat.values;
 
@@ -320,9 +345,10 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCuda
   lno_nnz_view_t  Icolind;
   scalar_view_t  Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixHost();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
   }
 
 #ifdef HAVE_TPETRA_MMM_TIMINGS
@@ -468,10 +494,10 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCud
                                                                                                const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
                                                                                                CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
                                                                                                CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
-                                                                                               const LocalOrdinalViewType & targetMapToOrigRow,
-                                                                                               const LocalOrdinalViewType & targetMapToImportRow,
-                                                                                               const LocalOrdinalViewType & Bcol2Ccol,
-                                                                                               const LocalOrdinalViewType & Icol2Ccol,
+                                                                                               const LocalOrdinalViewType & targetMapToOrigRow_dev,
+                                                                                               const LocalOrdinalViewType & targetMapToImportRow_dev,
+                                                                                               const LocalOrdinalViewType & Bcol2Ccol_dev,
+                                                                                               const LocalOrdinalViewType & Icol2Ccol_dev,
                                                                                                CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
                                                                                                Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
                                                                                                const std::string& label,
@@ -490,7 +516,7 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCud
   using Teuchos::rcp;
 
   // Lots and lots of typedefs
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::const_type c_lno_view_t;
   typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
@@ -507,7 +533,24 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCud
   const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
 
   // Since this is being run on Cuda, we need to fence because the below host code will use UVM
-  typename graph_t::execution_space().fence();
+  // KDDKDD UVM fence disabled (host mirrors are created below): typename graph_t::execution_space().fence();
+
+  // KDDKDD UVM Without UVM, need to copy the targetMap and col2Ccol arrays
+  // KDDKDD UVM to host. Ideally, this function would run on device and use
+  // KDDKDD UVM KokkosKernels instead of this host implementation.
+  auto targetMapToOrigRow = 
+       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), 
+                                           targetMapToOrigRow_dev);
+  auto targetMapToImportRow = 
+       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), 
+                                           targetMapToImportRow_dev);
+  auto Bcol2Ccol =
+       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), 
+                                           Bcol2Ccol_dev);
+  auto Icol2Ccol = 
+       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), 
+                                           Icol2Ccol_dev);
+  
  
   // Sizes
   RCP<const map_type> Ccolmap = C.getColMap();
@@ -515,9 +558,9 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCud
   size_t n = Ccolmap->getNodeNumElements();
 
   // Grab the  Kokkos::SparseCrsMatrices & inner stuff
-  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
-  const KCRS & Cmat = C.getLocalMatrix();
+  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
+  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
+  const KCRS & Cmat = C.getLocalMatrixHost();
 
   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
   const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
@@ -528,9 +571,10 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCud
   lno_nnz_view_t  Icolind;
   scalar_view_t  Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixHost();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
   }
 
   // Jacobi-specific inner stuff
@@ -670,7 +714,7 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCud
 
   // Usings
   using device_t = typename Kokkos::Compat::KokkosCudaWrapperNode::device_type;
-  using matrix_t = typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode>::local_matrix_type;
+  using matrix_t = typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode>::local_matrix_device_type;
   using graph_t = typename matrix_t::StaticCrsGraphType;
   using lno_view_t = typename graph_t::row_map_type::non_const_type;
   using c_lno_view_t = typename graph_t::row_map_type::const_type;
@@ -687,17 +731,18 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCud
   lno_nnz_view_t Icolind;
   scalar_view_t Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
   }
 
   // Merge the B and Bimport matrices
   const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getNodeNumElements());
 
   // Get the properties and arrays of input matrices
-  const matrix_t & Amat = Aview.origMatrix->getLocalMatrix();
-  const matrix_t & Bmat = Bview.origMatrix->getLocalMatrix();
+  const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
+  const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
 
   typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
   typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
diff --git a/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_ExtraKernels_def.hpp b/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
index 5b3fda688c60..a6d5c1bf50ac 100644
--- a/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
+++ b/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
@@ -108,7 +108,7 @@ void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOr
 
 
   // Lots and lots of typedefs
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode>::local_matrix_device_type KCRS;
   //  typedef typename KCRS::device_type device_t;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
@@ -139,8 +139,8 @@ void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOr
   const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
 
   // Grab the  Kokkos::SparseCrsMatrices & inner stuff
-  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
+  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
+  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
 
   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
   const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
@@ -151,9 +151,10 @@ void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOr
   lno_nnz_view_t  Icolind;
   scalar_view_t  Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
     b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
   }
 
@@ -324,7 +325,7 @@ void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdina
   using Teuchos::rcp;
 
   // Lots and lots of typedefs
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode>::local_matrix_device_type KCRS;
   //  typedef typename KCRS::device_type device_t;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::const_type c_lno_view_t;
@@ -349,9 +350,9 @@ void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdina
   const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
 
   // Grab the  Kokkos::SparseCrsMatrices & inner stuff
-  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
-  const KCRS & Cmat = C.getLocalMatrix();
+  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
+  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
+  const KCRS & Cmat = C.getLocalMatrixDevice();
 
   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
   const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
@@ -362,9 +363,10 @@ void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdina
   c_lno_nnz_view_t  Icolind;
   scalar_view_t  Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
   }
 
   // Sizes
@@ -476,7 +478,7 @@ void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
 
   // Lots and lots of typedefs
   typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
   //  typedef typename KCRS::device_type device_t;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
@@ -510,8 +512,8 @@ void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
   const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
 
   // Grab the  Kokkos::SparseCrsMatrices & inner stuff
-  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
+  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
+  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
 
   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
   const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
@@ -522,9 +524,10 @@ void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
   lno_nnz_view_t  Icolind;
   scalar_view_t  Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
     b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
   }
 
@@ -724,7 +727,7 @@ void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
 
   // Lots and lots of typedefs
   typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
   //  typedef typename KCRS::device_type device_t;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::const_type c_lno_view_t;
@@ -752,9 +755,9 @@ void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
   const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
 
   // Grab the  Kokkos::SparseCrsMatrices & inner stuff
-  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
-  const KCRS & Cmat = C.getLocalMatrix();
+  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
+  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
+  const KCRS & Cmat = C.getLocalMatrixDevice();
 
   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
   const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
@@ -765,9 +768,10 @@ void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
   c_lno_nnz_view_t  Icolind;
   scalar_view_t  Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
   }
 
   // Jacobi-specific inner stuff
@@ -1047,7 +1051,7 @@ static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct
         typedef GlobalOrdinal GO;
         typedef Node          NO;
         typedef Map<LO,GO,NO> map_type;
-        typedef typename Tpetra::CrsMatrix<SC,LO,GO,NO>::local_matrix_type KCRS;
+        typedef typename Tpetra::CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type KCRS;
         typedef typename KCRS::StaticCrsGraphType graph_t;
         typedef typename graph_t::row_map_type::non_const_type lno_view_t;
         typedef typename graph_t::row_map_type::const_type c_lno_view_t;
@@ -1071,19 +1075,25 @@ static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct
         size_t n = Accolmap->getNodeNumElements();
 
         // Get raw Kokkos matrices, and the raw CSR views
-        const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
-        const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-        const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
-
-        c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Irowptr;
-        const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries;
+        const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
+        const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
+        const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
+
+        c_lno_view_t Rrowptr = Rmat.graph.row_map, 
+                     Arowptr = Amat.graph.row_map, 
+                     Prowptr = Pmat.graph.row_map, Irowptr;
+        const lno_nnz_view_t Rcolind = Rmat.graph.entries, 
+                             Acolind = Amat.graph.entries, 
+                             Pcolind = Pmat.graph.entries;
         lno_nnz_view_t Icolind;
-        const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
+        const scalar_view_t Rvals = Rmat.values, 
+                            Avals = Amat.values, 
+                            Pvals = Pmat.values;
         scalar_view_t Ivals;
 
         if (!Pview.importMatrix.is_null())
         {
-          const KCRS& Imat = Pview.importMatrix->getLocalMatrix();
+          const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
           Irowptr = Imat.graph.row_map;
           Icolind = Imat.graph.entries;
           Ivals = Imat.values;
@@ -1310,7 +1320,7 @@ static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Sca
         typedef GlobalOrdinal GO;
         typedef Node          NO;
         typedef Map<LO,GO,NO> map_type;
-        typedef typename Tpetra::CrsMatrix<SC,LO,GO,NO>::local_matrix_type KCRS;
+        typedef typename Tpetra::CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type KCRS;
         typedef typename KCRS::StaticCrsGraphType graph_t;
         typedef typename graph_t::row_map_type::const_type c_lno_view_t;
         typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
@@ -1329,10 +1339,10 @@ static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Sca
         size_t n = Accolmap->getNodeNumElements();
 
         // Get raw Kokkos matrices, and the raw CSR views
-        const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
-        const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-        const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
-        const KCRS & Cmat = Ac.getLocalMatrix();
+        const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
+        const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
+        const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
+        const KCRS & Cmat = Ac.getLocalMatrixDevice();
 
         c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
         const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
@@ -1343,7 +1353,7 @@ static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Sca
 
         if (!Pview.importMatrix.is_null())
         {
-          const KCRS& Imat = Pview.importMatrix->getLocalMatrix();
+          const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
           Irowptr = Imat.graph.row_map;
           Icolind = Imat.graph.entries;
           Ivals = Imat.values;
diff --git a/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_OpenMP.hpp b/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_OpenMP.hpp
index d53427eed117..53ee0052ebbf 100644
--- a/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_OpenMP.hpp
+++ b/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_OpenMP.hpp
@@ -205,7 +205,7 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpen
 
   // Lots and lots of typedefs
   using Teuchos::RCP;
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode>::local_matrix_device_type KCRS;
   typedef typename KCRS::device_type device_t;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
@@ -239,8 +239,8 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpen
        typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
 
     // Grab the  Kokkos::SparseCrsMatrices
-    const KCRS & Ak = Aview.origMatrix->getLocalMatrix();
-    // const KCRS & Bk = Bview.origMatrix->getLocalMatrix();
+    const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
+    // const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
 
     // Get the algorithm mode
     std::string alg = nodename+std::string(" algorithm");
@@ -544,7 +544,7 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpe
 
   // Usings
   using device_t = typename Kokkos::Compat::KokkosOpenMPWrapperNode::device_type;
-  using matrix_t = typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode>::local_matrix_type;
+  using matrix_t = typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode>::local_matrix_device_type;
   using graph_t = typename matrix_t::StaticCrsGraphType;
   using lno_view_t = typename graph_t::row_map_type::non_const_type;
   using c_lno_view_t = typename graph_t::row_map_type::const_type;
@@ -561,17 +561,18 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpe
   lno_nnz_view_t Icolind;
   scalar_view_t Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
   }
 
   // Merge the B and Bimport matrices
   const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getNodeNumElements());
 
   // Get the properties and arrays of input matrices
-  const matrix_t & Amat = Aview.origMatrix->getLocalMatrix();
-  const matrix_t & Bmat = Bview.origMatrix->getLocalMatrix();
+  const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
+  const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
 
   typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
   typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
diff --git a/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_decl.hpp b/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_decl.hpp
index 1b302c6a43bf..ba29d427d8ef 100644
--- a/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_decl.hpp
+++ b/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_decl.hpp
@@ -535,7 +535,7 @@ void setMaxNumEntriesPerRow(
 
   // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
   template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
-  inline const typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type 
+  inline const typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type 
   merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
                  const LocalOrdinalViewType & Acol2Brow,
@@ -559,7 +559,7 @@ struct AddKernels
   typedef typename device_type::execution_space execution_space;
   typedef typename device_type::memory_space memory_space;
   typedef typename crs_matrix_type::impl_scalar_type impl_scalar_type;
-  typedef typename crs_matrix_type::local_matrix_type KCRS;
+  typedef typename crs_matrix_type::local_matrix_device_type KCRS;
   typedef typename KCRS::values_type::non_const_type values_array;
   typedef typename KCRS::row_map_type::non_const_type row_ptrs_array;
   typedef typename KCRS::row_map_type row_ptrs_array_const;
diff --git a/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_def.hpp b/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_def.hpp
index f48a0e560bee..b1fcfc632925 100644
--- a/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_def.hpp
+++ b/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_def.hpp
@@ -508,8 +508,8 @@ void Add(
   }
 
   size_t a_numEntries;
-  Array<GO> a_inds(A.getNodeMaxNumRowEntries());
-  Array<SC> a_vals(A.getNodeMaxNumRowEntries());
+  typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds",A.getNodeMaxNumRowEntries());
+  typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals",A.getNodeMaxNumRowEntries());
   GO row;
 
   if (scalarB != Teuchos::ScalarTraits<SC>::one())
@@ -520,16 +520,16 @@ void Add(
   if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
     for (LO i = 0; (size_t)i < numMyRows; ++i) {
       row = B.getRowMap()->getGlobalElement(i);
-      Aprime->getGlobalRowCopy(row, a_inds(), a_vals(), a_numEntries);
+      Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
 
       if (scalarA != Teuchos::ScalarTraits<SC>::one())
         for (size_t j = 0; j < a_numEntries; ++j)
           a_vals[j] *= scalarA;
 
       if (bFilled)
-        B.sumIntoGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
+        B.sumIntoGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
       else
-        B.insertGlobalValues(row,  a_inds(0,a_numEntries), a_vals(0,a_numEntries));
+        B.insertGlobalValues(row,  a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
     }
   }
 }
@@ -759,8 +759,8 @@ add (const Scalar& alpha,
   {
     doFillComplete = params->get<bool>("Call fillComplete");
   }
-  auto Alocal = Aprime->getLocalMatrix();
-  auto Blocal = Bprime->getLocalMatrix();
+  auto Alocal = Aprime->getLocalMatrixDevice();
+  auto Blocal = Bprime->getLocalMatrixDevice();
   LO numLocalRows = Alocal.numRows();
   if(numLocalRows == 0)
   {
@@ -1017,8 +1017,8 @@ void Add(
 
   // do a loop over each matrix to add: A reordering might be more efficient
   for (int k = 0; k < 2; ++k) {
-    Array<GlobalOrdinal> Indices;
-    Array<Scalar> Values;
+    typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
+    typename crs_matrix_type::nonconst_values_host_view_type Values;
 
     // Loop over each locally owned row of the current matrix (either
     // Aprime or Bprime), and sum its entries into the corresponding
@@ -1041,9 +1041,9 @@ void Add(
       const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
       size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
       if (numEntries > 0) {
-        Indices.resize (numEntries);
-        Values.resize (numEntries);
-        Mat[k]->getGlobalRowCopy (globalRow, Indices (), Values (), numEntries);
+        Kokkos::resize(Indices,numEntries);
+        Kokkos::resize(Values,numEntries);
+        Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
 
         if (scalar[k] != STS::one ()) {
           for (size_t j = 0; j < numEntries; ++j) {
@@ -1052,9 +1052,11 @@ void Add(
         }
 
         if (C->isFillComplete ()) {
-          C->sumIntoGlobalValues (globalRow, Indices, Values);
+          C->sumIntoGlobalValues (globalRow, numEntries, 
+                                  reinterpret_cast<Scalar *>(Values.data()), Indices.data());
         } else {
-          C->insertGlobalValues (globalRow, Indices, Values);
+          C->insertGlobalValues (globalRow,  numEntries, 
+                                 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
         }
       }
     }
@@ -1567,7 +1569,7 @@ void mult_A_B_newmatrix(
 
   // Kokkos typedefs
   typedef typename map_type::local_map_type local_map_type;
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
   typedef typename NO::execution_space execution_space;
@@ -1733,7 +1735,7 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>
   using Teuchos::rcp;
 
   // Lots and lots of typedefs
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::const_type c_lno_view_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
@@ -1756,8 +1758,8 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>
   size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
 
   // Grab the  Kokkos::SparseCrsMatrices & inner stuff
-  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
+  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
+  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
 
   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
   const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
@@ -1767,9 +1769,10 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>
   lno_nnz_view_t  Icolind;
   scalar_view_t  Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixHost();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
     b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
   }
 
@@ -1955,7 +1958,7 @@ void mult_A_B_reuse(
 
   // Kokkos typedefs
   typedef typename map_type::local_map_type local_map_type;
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
   typedef typename NO::execution_space execution_space;
@@ -2050,7 +2053,7 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>
 
 
   // Lots and lots of typedefs
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::const_type c_lno_view_t;
   typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
@@ -2071,9 +2074,9 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>
   size_t n = Ccolmap->getNodeNumElements();
 
   // Grab the  Kokkos::SparseCrsMatrices & inner stuff
-  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
-  const KCRS & Cmat = C.getLocalMatrix();
+  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
+  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
+  const KCRS & Cmat = C.getLocalMatrixHost();
 
   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
   const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
@@ -2084,9 +2087,10 @@ void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>
   lno_nnz_view_t  Icolind;
   scalar_view_t  Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixHost();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
   }
 
 #ifdef HAVE_TPETRA_MMM_TIMINGS
@@ -2194,7 +2198,7 @@ void jacobi_A_B_newmatrix(
   typedef typename map_type::local_map_type local_map_type;
 
   // All of the Kokkos typedefs
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
   typedef typename NO::execution_space execution_space;
@@ -2362,7 +2366,7 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType
   using Teuchos::rcp;
 
   // Lots and lots of typedefs
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::const_type c_lno_view_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
@@ -2388,8 +2392,8 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType
   size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
 
   // Grab the  Kokkos::SparseCrsMatrices & inner stuff
-  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
+  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
+  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
 
   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
   const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
@@ -2399,9 +2403,10 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType
   lno_nnz_view_t  Icolind;
   scalar_view_t  Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixHost();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
     b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
   }
 
@@ -2600,7 +2605,7 @@ void jacobi_A_B_reuse(
 
   // Kokkos typedefs
   typedef typename map_type::local_map_type local_map_type;
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
   typedef typename NO::execution_space execution_space;
@@ -2702,7 +2707,7 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType
   using Teuchos::rcp;
 
   // Lots and lots of typedefs
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::const_type c_lno_view_t;
   typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
@@ -2724,9 +2729,9 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType
   size_t n = Ccolmap->getNodeNumElements();
 
   // Grab the  Kokkos::SparseCrsMatrices & inner stuff
-  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
-  const KCRS & Cmat = C.getLocalMatrix();
+  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
+  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
+  const KCRS & Cmat = C.getLocalMatrixHost();
 
   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
   const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
@@ -2737,9 +2742,10 @@ void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType
   lno_nnz_view_t  Icolind;
   scalar_view_t  Ivals;
   if(!Bview.importMatrix.is_null()) {
-    Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
-    Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
-    Ivals   = Bview.importMatrix->getLocalMatrix().values;
+    auto lclB = Bview.importMatrix->getLocalMatrixHost();
+    Irowptr = lclB.graph.row_map;
+    Icolind = lclB.graph.entries;
+    Ivals   = lclB.values;
   }
 
   // Jacobi-specific inner stuff
@@ -3056,7 +3062,7 @@ void import_and_extract_views(
 /*********************************************************************************************************/
  // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
 template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
-const typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type
+const typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type
 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                     CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
                     const LocalOrdinalViewType & Acol2Brow,
@@ -3066,14 +3072,14 @@ merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview
                     const size_t mergedNodeNumCols) {
 
   using Teuchos::RCP;
-  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
   typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
   typedef typename KCRS::values_type::non_const_type scalar_view_t;
   // Grab the  Kokkos::SparseCrsMatrices
-  const KCRS & Ak = Aview.origMatrix->getLocalMatrix();
-  const KCRS & Bk = Bview.origMatrix->getLocalMatrix();
+  const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
+  const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
 
   // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
   if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
@@ -3081,7 +3087,7 @@ merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview
     // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
     // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
     RCP<const KCRS> Ik_;
-    if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrix());
+    if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcp(new KCRS(Bview.importMatrix->getLocalMatrixDevice())); // owning copy: a non-owning rcpFromRef would dangle if the getter returns by value
     const KCRS * Ik     = Bview.importMatrix.is_null() ? 0 : &*Ik_;
     KCRS Iks;
     if(Ik!=0) Iks = *Ik;
diff --git a/packages/tpetra/core/ext/TpetraExt_TripleMatrixMultiply_def.hpp b/packages/tpetra/core/ext/TpetraExt_TripleMatrixMultiply_def.hpp
index 7352a058a2e3..ca79c151217c 100644
--- a/packages/tpetra/core/ext/TpetraExt_TripleMatrixMultiply_def.hpp
+++ b/packages/tpetra/core/ext/TpetraExt_TripleMatrixMultiply_def.hpp
@@ -378,7 +378,7 @@ namespace Tpetra {
 
       // Kokkos typedefs
       typedef typename map_type::local_map_type local_map_type;
-      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
       typedef typename KCRS::StaticCrsGraphType graph_t;
       typedef typename graph_t::row_map_type::non_const_type lno_view_t;
       typedef typename NO::execution_space execution_space;
@@ -543,7 +543,7 @@ namespace Tpetra {
 
       // Kokkos typedefs
       typedef typename map_type::local_map_type local_map_type;
-      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
       typedef typename KCRS::StaticCrsGraphType graph_t;
       typedef typename graph_t::row_map_type::non_const_type lno_view_t;
       typedef typename NO::execution_space execution_space;
@@ -643,7 +643,7 @@ namespace Tpetra {
 
       // Kokkos typedefs
       typedef typename map_type::local_map_type local_map_type;
-      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
       typedef typename KCRS::StaticCrsGraphType graph_t;
       typedef typename graph_t::row_map_type::non_const_type lno_view_t;
       typedef typename NO::execution_space execution_space;
@@ -807,7 +807,7 @@ namespace Tpetra {
 
       // Kokkos typedefs
       typedef typename map_type::local_map_type local_map_type;
-      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
       typedef typename KCRS::StaticCrsGraphType graph_t;
       typedef typename graph_t::row_map_type::non_const_type lno_view_t;
       typedef typename NO::execution_space execution_space;
@@ -890,10 +890,10 @@ namespace Tpetra {
     void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                                                                                                                            CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                                                            CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
-                                                                                                                           const LocalOrdinalViewType & Acol2Prow,
-                                                                                                                           const LocalOrdinalViewType & Acol2PIrow,
-                                                                                                                           const LocalOrdinalViewType & Pcol2Accol,
-                                                                                                                           const LocalOrdinalViewType & PIcol2Accol,
+                                                                                                                           const LocalOrdinalViewType & Acol2Prow_dev,
+                                                                                                                           const LocalOrdinalViewType & Acol2PIrow_dev,
+                                                                                                                           const LocalOrdinalViewType & Pcol2Accol_dev,
+                                                                                                                           const LocalOrdinalViewType & PIcol2Accol_dev,
                                                                                                                            CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                                                                                                            Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
                                                                                                                            const std::string& label,
@@ -911,7 +911,7 @@ namespace Tpetra {
       using Teuchos::rcp;
 
       // Lots and lots of typedefs
-      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
       typedef typename KCRS::StaticCrsGraphType graph_t;
       typedef typename graph_t::row_map_type::const_type c_lno_view_t;
       typedef typename graph_t::row_map_type::non_const_type lno_view_t;
@@ -933,22 +933,39 @@ namespace Tpetra {
       size_t n = Accolmap->getNodeNumElements();
       size_t p_max_nnz_per_row = Pview.origMatrix->getNodeMaxNumRowEntries();
 
-      // Grab the  Kokkos::SparseCrsMatrices & inner stuff
-      const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-      const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
-      const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
-
-      c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map,  Rrowptr = Rmat.graph.row_map;
-      const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries , Rcolind = Rmat.graph.entries;
-      const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
+      // Routine runs on host; have to put arguments on host, too
+      auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                           Acol2Prow_dev);
+      auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                            Acol2PIrow_dev);
+      auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                            Pcol2Accol_dev);
+      auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                             PIcol2Accol_dev);
 
-      c_lno_view_t  Irowptr;
-      lno_nnz_view_t  Icolind;
-      scalar_view_t  Ivals;
+      // Grab the  Kokkos::SparseCrsMatrices & inner stuff
+      const auto Amat = Aview.origMatrix->getLocalMatrixHost();
+      const auto Pmat = Pview.origMatrix->getLocalMatrixHost();
+      const auto Rmat = Rview.origMatrix->getLocalMatrixHost();
+
+      auto Arowptr = Amat.graph.row_map;
+      auto Prowptr = Pmat.graph.row_map;
+      auto Rrowptr = Rmat.graph.row_map;
+      const auto Acolind = Amat.graph.entries;
+      const auto Pcolind = Pmat.graph.entries;
+      const auto Rcolind = Rmat.graph.entries;
+      const auto Avals = Amat.values;
+      const auto Pvals = Pmat.values;
+      const auto Rvals = Rmat.values;
+
+      typename c_lno_view_t::HostMirror::const_type  Irowptr;
+      typename lno_nnz_view_t::HostMirror  Icolind;
+      typename scalar_view_t::HostMirror  Ivals;
       if(!Pview.importMatrix.is_null()) {
-        Irowptr = Pview.importMatrix->getLocalMatrix().graph.row_map;
-        Icolind = Pview.importMatrix->getLocalMatrix().graph.entries;
-        Ivals   = Pview.importMatrix->getLocalMatrix().values;
+        auto lclP = Pview.importMatrix->getLocalMatrixHost();
+        Irowptr = lclP.graph.row_map;
+        Icolind = lclP.graph.entries;
+        Ivals   = lclP.values;
         p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getNodeMaxNumRowEntries());
       }
 
@@ -964,9 +981,9 @@ namespace Tpetra {
       // ML; for the non-threaded case, ML found it faster to spend less
       // effort on estimation and risk an occasional reallocation.
       size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
-      lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
-      lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
-      scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
+      typename lno_view_t::HostMirror Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
+      typename lno_nnz_view_t::HostMirror Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
+      typename scalar_view_t::HostMirror Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
 
       // mfh 27 Sep 2016: The ac_status array is an implementation detail
       // of the local sparse matrix-matrix multiply routine.
@@ -1088,11 +1105,17 @@ namespace Tpetra {
 #ifdef HAVE_TPETRA_MMM_TIMINGS
       MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
 #endif
+      auto Crowptr_dev = Kokkos::create_mirror_view_and_copy(
+                         typename KCRS::device_type(), Crowptr);
+      auto Ccolind_dev = Kokkos::create_mirror_view_and_copy(
+                         typename KCRS::device_type(), Ccolind);
+      auto Cvals_dev = Kokkos::create_mirror_view_and_copy(
+                         typename KCRS::device_type(), Cvals);
 
       // Final sort & set of CRS arrays
       if (params.is_null() || params->get("sort entries",true))
-        Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
-      Ac.setAllValues(Crowptr, Ccolind, Cvals);
+        Import_Util::sortCrsEntries(Crowptr_dev, Ccolind_dev, Cvals_dev);
+      Ac.setAllValues(Crowptr_dev, Ccolind_dev, Cvals_dev);
 
 #ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = Teuchos::null;  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
@@ -1129,10 +1152,10 @@ namespace Tpetra {
     void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                                                                                                                            CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                                                            CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
-                                                                                                                           const LocalOrdinalViewType & Acol2Prow,
-                                                                                                                           const LocalOrdinalViewType & Acol2PIrow,
-                                                                                                                           const LocalOrdinalViewType & Pcol2Accol,
-                                                                                                                           const LocalOrdinalViewType & PIcol2Accol,
+                                                                                                                           const LocalOrdinalViewType & Acol2Prow_dev,
+                                                                                                                           const LocalOrdinalViewType & Acol2PIrow_dev,
+                                                                                                                           const LocalOrdinalViewType & Pcol2Accol_dev,
+                                                                                                                           const LocalOrdinalViewType & PIcol2Accol_dev,
                                                                                                                            CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                                                                                                            Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
                                                                                                                            const std::string& label,
@@ -1150,7 +1173,7 @@ namespace Tpetra {
       using Teuchos::rcp;
 
       // Lots and lots of typedefs
-      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
+      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
       typedef typename KCRS::StaticCrsGraphType graph_t;
       typedef typename graph_t::row_map_type::const_type c_lno_view_t;
       typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
@@ -1171,11 +1194,21 @@ namespace Tpetra {
       size_t n = Accolmap->getNodeNumElements();
       size_t p_max_nnz_per_row = Pview.origMatrix->getNodeMaxNumRowEntries();
 
+      // Routine runs on host; have to put arguments on host, too
+      auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                           Acol2Prow_dev);
+      auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                            Acol2PIrow_dev);
+      auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                            Pcol2Accol_dev);
+      auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                             PIcol2Accol_dev);
+
       // Grab the  Kokkos::SparseCrsMatrices & inner stuff
-      const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
-      const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
-      const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
-      const KCRS & Cmat = Ac.getLocalMatrix();
+      const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
+      const KCRS & Pmat = Pview.origMatrix->getLocalMatrixHost();
+      const KCRS & Rmat = Rview.origMatrix->getLocalMatrixHost();
+      const KCRS & Cmat = Ac.getLocalMatrixHost();
 
       c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map,  Rrowptr = Rmat.graph.row_map, Crowptr =  Cmat.graph.row_map;
       const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries , Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
@@ -1186,9 +1219,10 @@ namespace Tpetra {
       lno_nnz_view_t  Icolind;
       scalar_view_t  Ivals;
       if(!Pview.importMatrix.is_null()) {
-        Irowptr = Pview.importMatrix->getLocalMatrix().graph.row_map;
-        Icolind = Pview.importMatrix->getLocalMatrix().graph.entries;
-        Ivals   = Pview.importMatrix->getLocalMatrix().values;
+        auto lclP = Pview.importMatrix->getLocalMatrixHost();
+        Irowptr = lclP.graph.row_map;
+        Icolind = lclP.graph.entries;
+        Ivals   = lclP.values;
         p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getNodeMaxNumRowEntries());
       }
 
diff --git a/packages/tpetra/core/guide/src/Examples/SourceCode/power_method_1.cpp b/packages/tpetra/core/guide/src/Examples/SourceCode/power_method_1.cpp
index 98cc2f58de6c..f1e2ad42dd70 100644
--- a/packages/tpetra/core/guide/src/Examples/SourceCode/power_method_1.cpp
+++ b/packages/tpetra/core/guide/src/Examples/SourceCode/power_method_1.cpp
@@ -189,6 +189,10 @@ main (int argc, char *argv[])
   typedef Tpetra::Vector<>::global_ordinal_type global_ordinal_type;
   typedef Tpetra::Vector<>::mag_type magnitude_type;
   typedef Tpetra::CrsMatrix<> crs_matrix_type;
+  typedef Tpetra::CrsMatrix<>::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type;
+  typedef Tpetra::CrsMatrix<>::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
+  typedef Tpetra::CrsMatrix<>::nonconst_values_host_view_type nonconst_values_host_view_type;
+
   Teuchos::oblackholestream blackhole;
 
   Tpetra::ScopeGuard tpetraScope(&argc, &argv);
@@ -279,8 +283,8 @@ main (int argc, char *argv[])
       // matrix.
       const global_ordinal_type idOfFirstRow = 0;
       size_t numEntriesInRow = A->getNumEntriesInGlobalRow (idOfFirstRow);
-      Array<scalar_type>         rowvals (numEntriesInRow);
-      Array<global_ordinal_type> rowinds (numEntriesInRow);
+      nonconst_global_inds_host_view_type rowinds ("indices",numEntriesInRow);
+      nonconst_values_host_view_type rowvals ("vals",numEntriesInRow);
       // Fill rowvals and rowinds with the values resp. (global) column
       // indices of the sparse matrix entries owned by the calling
       // process.
@@ -296,7 +300,7 @@ main (int argc, char *argv[])
       //
       // The parentheses after rowinds and rowvalues indicate "a view of
       // the Array's data."  Array::operator() returns an ArrayView.
-      A->getGlobalRowCopy (idOfFirstRow, rowinds (), rowvals (), numEntriesInRow);
+      A->getGlobalRowCopy(idOfFirstRow, rowinds, rowvals, numEntriesInRow);
       for (size_t i = 0; i < numEntriesInRow; i++) {
         if (rowinds[i] == idOfFirstRow) {
           // We have found the diagonal entry; modify it.
@@ -309,7 +313,7 @@ main (int argc, char *argv[])
       // method throws an exception.  If you want to modify the
       // structure (by adding new entries), you'll need to call
       // insertGlobalValues().
-      A->replaceGlobalValues (idOfFirstRow, rowinds (), rowvals ());
+      A->replaceGlobalValues (idOfFirstRow, rowinds, rowvals);
     }
     // Call fillComplete() again to signal that we are done changing the
     // matrix.
diff --git a/packages/tpetra/core/inout/MatrixMarket_Tpetra.hpp b/packages/tpetra/core/inout/MatrixMarket_Tpetra.hpp
index 5754c97e74aa..7c0896c4300d 100644
--- a/packages/tpetra/core/inout/MatrixMarket_Tpetra.hpp
+++ b/packages/tpetra/core/inout/MatrixMarket_Tpetra.hpp
@@ -6011,22 +6011,19 @@ namespace Tpetra {
             for (GO globalRowIndex = minAllGlobalIndex;
                  globalRowIndex <= maxAllGlobalIndex; // inclusive range
                  ++globalRowIndex) {
-              ArrayView<const GO> ind;
-              ArrayView<const ST> val;
+              typename sparse_matrix_type::global_inds_host_view_type ind;
+              typename sparse_matrix_type::values_host_view_type val;
               newMatrix->getGlobalRowView (globalRowIndex, ind, val);
-              auto indIter = ind.begin ();
-              auto valIter = val.begin ();
-              for (; indIter != ind.end() && valIter != val.end();
-                   ++indIter, ++valIter) {
-                const GO globalColIndex = *indIter;
+              for (size_t ii = 0; ii < ind.extent(0); ii++) {
+                const GO globalColIndex = ind(ii);
                 // Convert row and column indices to 1-based.
                 // This works because the global index type is signed.
                 out << (globalRowIndex + 1 - rowIndexBase) << " "
                     << (globalColIndex + 1 - colIndexBase) << " ";
                 if (STS::isComplex) {
-                  out << STS::real (*valIter) << " " << STS::imag (*valIter);
+                  out << STS::real (val(ii)) << " " << STS::imag (val(ii));
                 } else {
-                  out << *valIter;
+                  out << val(ii);
                 }
                 out << endl;
               } // For each entry in the current row
@@ -6045,30 +6042,27 @@ namespace Tpetra {
                 "Failed to convert the supposed local row index "
                 << localRowIndex << " into a global row index.  "
                 "Please report this bug to the Tpetra developers.");
-              ArrayView<const LO> ind;
-              ArrayView<const ST> val;
+              typename sparse_matrix_type::local_inds_host_view_type ind;
+              typename sparse_matrix_type::values_host_view_type val;
               newMatrix->getLocalRowView (localRowIndex, ind, val);
-              auto indIter = ind.begin ();
-              auto valIter = val.begin ();
-              for (; indIter != ind.end() && valIter != val.end();
-                   ++indIter, ++valIter) {
+              for (size_t ii = 0; ii < ind.extent(0); ii++) {
                 // Convert the column index from local to global.
                 const GO globalColIndex =
-                  newMatrix->getColMap()->getGlobalElement (*indIter);
+                  newMatrix->getColMap()->getGlobalElement (ind(ii));
                 TEUCHOS_TEST_FOR_EXCEPTION(
                   globalColIndex == OTG::invalid(), std::logic_error,
                   "On local row " << localRowIndex << " of the sparse matrix: "
                   "Failed to convert the supposed local column index "
-                  << *indIter << " into a global column index.  Please report "
+                  << ind(ii) << " into a global column index.  Please report "
                   "this bug to the Tpetra developers.");
                 // Convert row and column indices to 1-based.
                 // This works because the global index type is signed.
                 out << (globalRowIndex + 1 - rowIndexBase) << " "
                     << (globalColIndex + 1 - colIndexBase) << " ";
                 if (STS::isComplex) {
-                  out << STS::real (*valIter) << " " << STS::imag (*valIter);
+                  out << STS::real (val(ii)) << " " << STS::imag (val(ii));
                 } else {
-                  out << *valIter;
+                  out << val(ii);
                 }
                 out << endl;
               } // For each entry in the current row
@@ -6311,10 +6305,10 @@ namespace Tpetra {
             for (GO globalRowIndex = minAllGlobalIndex;
                  globalRowIndex <= maxAllGlobalIndex; // inclusive range
                  ++globalRowIndex) {
-              ArrayView<const GO> ind;
+              typename crs_graph_type::global_inds_host_view_type ind;
               newGraph.getGlobalRowView (globalRowIndex, ind);
-              for (auto indIter = ind.begin (); indIter != ind.end (); ++indIter) {
-                const GO globalColIndex = *indIter;
+              for (size_t ii = 0; ii < ind.extent(0); ii++) {
+                const GO globalColIndex = ind(ii);
                 // Convert row and column indices to 1-based.
                 // This works because the global index type is signed.
                 out << (globalRowIndex + 1 - rowIndexBase) << " "
@@ -6336,17 +6330,17 @@ namespace Tpetra {
                  "to convert the supposed local row index " << localRowIndex <<
                  " into a global row index.  Please report this bug to the "
                  "Tpetra developers.");
-              ArrayView<const LO> ind;
+              typename crs_graph_type::local_inds_host_view_type ind;
               newGraph.getLocalRowView (localRowIndex, ind);
-              for (auto indIter = ind.begin (); indIter != ind.end (); ++indIter) {
+              for (size_t ii = 0; ii < ind.extent(0); ii++) {
                 // Convert the column index from local to global.
                 const GO globalColIndex =
-                  newGraph.getColMap ()->getGlobalElement (*indIter);
+                  newGraph.getColMap ()->getGlobalElement (ind(ii));
                 TEUCHOS_TEST_FOR_EXCEPTION(
                   globalColIndex == OTG::invalid(), std::logic_error,
                   "On local row " << localRowIndex << " of the sparse graph: "
                   "Failed to convert the supposed local column index "
-                  << *indIter << " into a global column index.  Please report "
+                  << ind(ii) << " into a global column index.  Please report "
                   "this bug to the Tpetra developers.");
                 // Convert row and column indices to 1-based.
                 // This works because the global index type is signed.
diff --git a/packages/tpetra/core/inout/mmio_Tpetra.c b/packages/tpetra/core/inout/mmio_Tpetra.c
index 4fc1d7429245..5b8da4ee45da 100644
--- a/packages/tpetra/core/inout/mmio_Tpetra.c
+++ b/packages/tpetra/core/inout/mmio_Tpetra.c
@@ -327,14 +327,13 @@ char  *mm_typecode_to_str(MM_typecode matcode)
 {
     char buffer[MM_MAX_LINE_LENGTH];
     char *types[4];
-	char *mm_strdup(const char *);
-    int error =0;
+    char *mm_strdup(const char *);
 
     /* check for MTX type */
     if (mm_is_matrix(matcode)) 
         types[0] = MM_MTX_STR;
     else
-        error=1;
+        return NULL;
 
     /* check for CRD or ARR matrix */
     if (mm_is_sparse(matcode))
diff --git a/packages/tpetra/core/src/CMakeLists.txt b/packages/tpetra/core/src/CMakeLists.txt
index a9653d0ba840..83b8da7c8f75 100644
--- a/packages/tpetra/core/src/CMakeLists.txt
+++ b/packages/tpetra/core/src/CMakeLists.txt
@@ -598,34 +598,37 @@ IF (${PACKAGE_NAME}_ENABLE_EXPLICIT_INSTANTIATION)
   TPETRA_PROCESS_ALL_SLGN_TEMPLATES(REPLACEDIAGONALCRSMATRIX_OUTPUT_FILES "Tpetra_ETI_SC_LO_GO_NT.tmpl" "replaceDiagonalCrsMatrix" "REPLACEDIAGONALCRSMATRIX" "${CrsMatrix_ETI_SCALARS}" "${TpetraCore_ETI_LORDS}" "${TpetraCore_ETI_GORDS}" "${TpetraCore_ETI_NODES}" FALSE)
   LIST(APPEND SOURCES ${REPLACEDIAGONALCRSMATRIX_OUTPUT_FILES})
 
-  # Generate ETI .cpp files for Tpetra::Details::localDeepCopy*RowMatrix.
-  # Do so for the same set of Scalar types as CrsMatrix (see above),
-  # because this function is an implementation detail of CrsMatrix.
-  TPETRA_PROCESS_ALL_SLGN_TEMPLATES(LOCALDEEPCOPYROWMATRIX_OUTPUT_FILES
-    "Tpetra_ETI_SC_LO_GO_NT.tmpl"
-    "Details_localDeepCopyRowMatrix"
-    "DETAILS_LOCALDEEPCOPYROWMATRIX"
-    "${CrsMatrix_ETI_SCALARS}"
-    "${TpetraCore_ETI_LORDS}"
-    "${TpetraCore_ETI_GORDS}"
-    "${TpetraCore_ETI_NODES}"
-    TRUE)
-  LIST(APPEND SOURCES ${LOCALDEEPCOPYROWMATRIX_OUTPUT_FILES})
-
-  # Generate ETI .cpp files for the RowMatrix -> CrsMatrix overload of
-  # Tpetra::createDeepCopy.  Do this only for non-integer Scalar
-  # types, since we really only need this function for linear solvers.
-  TPETRA_PROCESS_ALL_SLGN_TEMPLATES(CREATEDEEPCOPY_CRSMATRIX_OUTPUT_FILES
-    "Tpetra_ETI_SC_LO_GO_NT.tmpl"
-    "createDeepCopy_CrsMatrix"
-    "CREATEDEEPCOPY_CRSMATRIX"
-    "${TpetraCore_ETI_SCALARS_NO_ORDS}"
-    "${TpetraCore_ETI_LORDS}"
-    "${TpetraCore_ETI_GORDS}"
-    "${TpetraCore_ETI_NODES}"
-    FALSE)
-  LIST(APPEND SOURCES ${CREATEDEEPCOPY_CRSMATRIX_OUTPUT_FILES})
-
+  IF (Tpetra_ENABLE_DEPRECATED_CODE)
+
+    # Generate ETI .cpp files for Tpetra::Details::localDeepCopy*RowMatrix.
+    # Do so for the same set of Scalar types as CrsMatrix (see above),
+    # because this function is an implementation detail of CrsMatrix.
+    TPETRA_PROCESS_ALL_SLGN_TEMPLATES(LOCALDEEPCOPYROWMATRIX_OUTPUT_FILES
+      "Tpetra_ETI_SC_LO_GO_NT.tmpl"
+      "Details_localDeepCopyRowMatrix"
+      "DETAILS_LOCALDEEPCOPYROWMATRIX"
+      "${CrsMatrix_ETI_SCALARS}"
+      "${TpetraCore_ETI_LORDS}"
+      "${TpetraCore_ETI_GORDS}"
+      "${TpetraCore_ETI_NODES}"
+      TRUE)
+    LIST(APPEND SOURCES ${LOCALDEEPCOPYROWMATRIX_OUTPUT_FILES})
+  
+    # Generate ETI .cpp files for the RowMatrix -> CrsMatrix overload of
+    # Tpetra::createDeepCopy.  Do this only for non-integer Scalar
+    # types, since we really only need this function for linear solvers.
+    TPETRA_PROCESS_ALL_SLGN_TEMPLATES(CREATEDEEPCOPY_CRSMATRIX_OUTPUT_FILES
+      "Tpetra_ETI_SC_LO_GO_NT.tmpl"
+      "createDeepCopy_CrsMatrix"
+      "CREATEDEEPCOPY_CRSMATRIX"
+      "${TpetraCore_ETI_SCALARS_NO_ORDS}"
+      "${TpetraCore_ETI_LORDS}"
+      "${TpetraCore_ETI_GORDS}"
+      "${TpetraCore_ETI_NODES}"
+      FALSE)
+    LIST(APPEND SOURCES ${CREATEDEEPCOPY_CRSMATRIX_OUTPUT_FILES})
+  ENDIF ()
+  
   # Generate ETI .cpp files for Tpetra::LocalCrsMatrixOperator.
   TPETRA_PROCESS_ALL_SN_TEMPLATES(LOCALCRSMATRIXOPERATOR_OUTPUT_FILES
     "Tpetra_ETI_SC_NT.tmpl" "LocalCrsMatrixOperator"
diff --git a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_Helpers_def.hpp b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_Helpers_def.hpp
index 6559317b7dab..7f043c23676e 100644
--- a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_Helpers_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_Helpers_def.hpp
@@ -198,6 +198,10 @@ namespace Tpetra {
   void writeMatrixStrip(BlockCrsMatrix<Scalar,LO,GO,Node> const &A, std::ostream &os, Teuchos::ParameterList const &params) {
     using Teuchos::RCP;
     using map_type = Tpetra::Map<LO, GO, Node>;
+    using bcrs_type = BlockCrsMatrix<Scalar,LO,GO,Node>;
+    using bcrs_local_inds_host_view_type = typename bcrs_type::local_inds_host_view_type;
+    using bcrs_values_host_view_type = typename bcrs_type::values_host_view_type;
+    using impl_scalar_type = typename bcrs_type::impl_scalar_type;
 
     size_t numRows = A.getGlobalNumRows();
     RCP<const map_type> rowMap = A.getRowMap();
@@ -246,30 +250,28 @@ namespace Tpetra {
       for (localRowInd = 0; localRowInd < numLocalRows; ++localRowInd) {
 
         // Get a view of the current row.
-        const LO*     localColInds;
-        Scalar* vals;
+        bcrs_local_inds_host_view_type localColInds;
+        bcrs_values_host_view_type vals;
         LO numEntries;
-        err = A.getLocalRowView (localRowInd, localColInds, vals, numEntries);
-        if (err != 0)
-          break;
+        A.getLocalRowView (localRowInd, localColInds, vals); numEntries = localColInds.extent(0);
         GO globalMeshRowID = rowMap->getGlobalElement(localRowInd) - meshRowOffset;
 
         for (LO k = 0; k < numEntries; ++k) {
           GO globalMeshColID = colMap->getGlobalElement(localColInds[k]) - meshColOffset;
-          Scalar* const curBlock = vals + blockSize * blockSize * k;
+          const impl_scalar_type* curBlock = vals.data() + blockSize * blockSize * k;
           // Blocks are stored in row-major format.
           for (LO j = 0; j < blockSize; ++j) {
             GO globalPointRowID = globalMeshRowID * blockSize + j + pointOffset;
             for (LO i = 0; i < blockSize; ++i) {
               GO globalPointColID = globalMeshColID * blockSize + i + pointOffset;
-              const Scalar curVal = curBlock[i + j * blockSize];
+              const impl_scalar_type curVal = curBlock[i + j * blockSize];
 
               os << globalPointRowID << " " << globalPointColID << " ";
-              if (Teuchos::ScalarTraits<Scalar>::isComplex) {
+              if (Teuchos::ScalarTraits<impl_scalar_type>::isComplex) {
                 // Matrix Market format wants complex values to be
                 // written as space-delimited pairs.  See Bug 6469.
-                os << Teuchos::ScalarTraits<Scalar>::real (curVal) << " "
-                   << Teuchos::ScalarTraits<Scalar>::imag (curVal);
+                os << Teuchos::ScalarTraits<impl_scalar_type>::real (curVal) << " "
+                   << Teuchos::ScalarTraits<impl_scalar_type>::imag (curVal);
               }
               else {
                 os << curVal;
@@ -307,8 +309,9 @@ namespace Tpetra {
       using Teuchos::RCP;
 
       typedef Tpetra::BlockCrsMatrix<Scalar,LO,GO,Node> block_crs_matrix_type;
-      typedef Tpetra::Map<LO,GO,Node>                                 map_type;
-      typedef Tpetra::CrsGraph<LO,GO,Node>                            crs_graph_type;
+      typedef Tpetra::Map<LO,GO,Node>                   map_type;
+      typedef Tpetra::CrsGraph<LO,GO,Node>              crs_graph_type;
+      typedef Tpetra::CrsMatrix<Scalar, LO,GO,Node>     crs_matrix_type;
 
       const map_type &pointRowMap = *(pointMatrix.getRowMap());
       RCP<const map_type> meshRowMap = createMeshMap<LO,GO,Node>(blockSize, pointRowMap);
@@ -331,8 +334,8 @@ namespace Tpetra {
       // rows associated with it. The point column ids are converted to mesh column ids and put into an array.
       // As each point row collection is finished, the mesh column ids are sorted, made unique, and inserted
       // into the mesh graph.
-      ArrayView<const LO> pointColInds;
-      ArrayView<const Scalar> pointVals;
+      typename crs_matrix_type::local_inds_host_view_type pointColInds;
+      typename crs_matrix_type::values_host_view_type pointVals;
       Array<GO> meshColGids;
       meshColGids.reserve(pointMatrix.getGlobalMaxNumRowEntries());
       //again, I assume that point GIDs associated with a mesh GID are consecutive.
@@ -342,7 +345,7 @@ namespace Tpetra {
           LO rowLid = i*blockSize+j;
           pointMatrix.getLocalRowView(rowLid,pointColInds,pointVals); //TODO optimization: Since I don't care about values,
                                                                       //TODO I should use the graph instead.
-          for (int k=0; k<pointColInds.size(); ++k) {
+          for (size_t k=0; k<pointColInds.size(); ++k) {
             GO meshColInd = pointColMap.getGlobalElement(pointColInds[k]) / blockSize;
             meshColGids.push_back(meshColInd);
           }
@@ -378,7 +381,7 @@ namespace Tpetra {
         for (int j=0; j<blockSize; ++j) {
           LO rowLid = i*blockSize+j;
           pointMatrix.getLocalRowView(rowLid,pointColInds,pointVals);
-          for (int k=0; k<pointColInds.size(); ++k) {
+          for (size_t k=0; k<pointColInds.size(); ++k) {
             //convert point column to block col
             LO meshColInd = pointColInds[k] / blockSize;
             iter = bcol2bentry.find(meshColInd);
diff --git a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_decl.hpp b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_decl.hpp
index f7b0c181dca7..47ef471d79b9 100644
--- a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_decl.hpp
@@ -189,6 +189,8 @@ class BlockCrsMatrix :
                        device_type,
                        Kokkos::MemoryTraits<Kokkos::Unmanaged> >
           little_block_type;
+  typedef typename little_block_type::HostMirror little_block_host_type;
+
   //! The type used to access const matrix blocks.
   typedef Kokkos::View<const impl_scalar_type**,
                        Kokkos::LayoutRight,
@@ -203,6 +205,27 @@ class BlockCrsMatrix :
   typedef typename BMV::const_little_vec_type const_little_vec_type;
   typedef typename BMV::const_little_host_vec_type const_host_little_vec_type;
 
+  using row_matrix_type = RowMatrix<Scalar, LO, GO, node_type>;
+  using local_inds_device_view_type =
+        typename row_matrix_type::local_inds_device_view_type;
+  using local_inds_host_view_type =
+        typename row_matrix_type::local_inds_host_view_type;
+  using nonconst_local_inds_host_view_type =
+        typename row_matrix_type::nonconst_local_inds_host_view_type;
+
+  using global_inds_device_view_type =
+        typename row_matrix_type::global_inds_device_view_type;
+  using global_inds_host_view_type =
+        typename row_matrix_type::global_inds_host_view_type;
+  using nonconst_global_inds_host_view_type =
+        typename row_matrix_type::nonconst_global_inds_host_view_type;
+
+  using values_device_view_type =
+        typename row_matrix_type::values_device_view_type;
+  using values_host_view_type =
+        typename row_matrix_type::values_host_view_type;
+  using nonconst_values_host_view_type =
+        typename row_matrix_type::nonconst_values_host_view_type;
 
   //@}
   //! \name Constructors and destructor
@@ -409,6 +432,7 @@ class BlockCrsMatrix :
                       const Scalar vals[],
                       const LO numColInds) const;
 
+
   /// \brief Get a view of the (mesh, i.e., block) row, using local
   ///   (mesh, i.e., block) indices.
   ///
@@ -438,27 +462,62 @@ class BlockCrsMatrix :
   ///
   /// \return 0 if \c localRowInd is valid, else
   ///   <tt>Teuchos::OrdinalTraits<LO>::invalid()</tt>.
+  /// KK: we remove this interface because
+  ///     we cannot give out a raw pointer
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   LO
   getLocalRowView (const LO localRowInd,
                    const LO*& colInds,
                    Scalar*& vals,
                    LO& numInds) const;
 
+
   /// \brief Not implemented.
   void
   getLocalRowView (LO LocalRow,
                    Teuchos::ArrayView<const LO> &indices,
                    Teuchos::ArrayView<const Scalar> &values) const;
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+  /// KK: this is inherited from the row matrix interface and it returns const views;
+  ///     this cannot replace the deprecated pointer interface,
+  ///     so we need a nonconst version of this code
+  void
+  getLocalRowView (LO LocalRow,
+                   local_inds_host_view_type &indices,
+                   values_host_view_type &values) const override;
 
-  /// \brief Not implemented.
+  /// KK: this is a new addition to replace getLocalRowView with pointers and arrayviews
+  ///     we can change the name if it is not preferred
   void
+  getLocalRowViewNonConst (LO LocalRow,
+                           local_inds_host_view_type &indices,
+                           nonconst_values_host_view_type &values) const;
+  
+  /// \brief Not implemented.
+  virtual void
+  getLocalRowCopy (LO LocalRow,
+                   nonconst_local_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+  virtual void
   getLocalRowCopy (LO LocalRow,
                    const Teuchos::ArrayView<LO> &Indices,
                    const Teuchos::ArrayView<Scalar> &Values,
                    size_t &NumEntries) const;
+#endif
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   little_block_type
   getLocalBlock (const LO localRowInd, const LO localColInd) const;
+#endif
+
+  little_block_type
+  getLocalBlockDeviceNonConst (const LO localRowInd, const LO localColInd) const;
+
+  little_block_host_type
+  getLocalBlockHostNonConst (const LO localRowInd, const LO localColInd) const;
+
 
   /// \brief Get relative offsets corresponding to the given rows,
   ///   given by local row index.
@@ -500,6 +559,12 @@ class BlockCrsMatrix :
                                const Scalar vals[],
                                const LO numOffsets) const;
 
+  LO
+  absMaxLocalValuesByOffsets (const LO localRowInd,
+                              const ptrdiff_t offsets[],
+                              const Scalar vals[],
+                              const LO numOffsets) const;
+
   /// \brief Like sumIntoLocalValues, but avoids computing row offsets.
   ///
   /// \return The number of valid column indices in colInds.  This
@@ -625,11 +690,12 @@ class BlockCrsMatrix :
   /// This method uses the offsets of the diagonal entries, as
   /// precomputed by getLocalDiagOffsets(), to speed up copying the
   /// diagonal of the matrix.
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   void
   getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
                                        Kokkos::MemoryUnmanaged>& diag,
                     const Teuchos::ArrayView<const size_t>& offsets) const;
-
+#endif
 
 protected:
   //! Like sumIntoLocalValues, but for the ABSMAX combine mode.
@@ -639,13 +705,6 @@ class BlockCrsMatrix :
                      const Scalar vals[],
                      const LO numColInds) const;
 
-  //! Like sumIntoLocalValuesByOffsets, but for the ABSMAX combine mode.
-  LO
-  absMaxLocalValuesByOffsets (const LO localRowInd,
-                              const ptrdiff_t offsets[],
-                              const Scalar vals[],
-                              const LO numOffsets) const;
-
   /// \brief \name Implementation of Tpetra::DistObject.
   ///
   /// The methods here implement Tpetra::DistObject.  They let
@@ -740,21 +799,25 @@ class BlockCrsMatrix :
   /// Kokkos::DualView has extra Views in it for the "modified" flags,
   /// and we don't want the (modest) overhead of creating and storing
   /// those.
-  typename crs_graph_type::local_graph_type::row_map_type::HostMirror ptrHost_;
+  using graph_row_offset_host_type = typename crs_graph_type::local_graph_device_type::row_map_type::HostMirror;
+  graph_row_offset_host_type ptrHost_;
 
   /// \brief Host version of the graph's array of column indices.
   ///
   /// The device version of this is already stored in the graph.  We
   /// need the host version here, because this class' interface needs
   /// to access it on host.  See notes on ptrHost_ above.
-  typename crs_graph_type::local_graph_type::entries_type::HostMirror indHost_;
+  using graph_column_indices_host_type =   typename crs_graph_type::local_graph_device_type::entries_type::HostMirror;
+  graph_column_indices_host_type indHost_;
 
   /// \brief The array of values in the matrix.
   ///
   /// Each blockSize_ x blockSize_ block of values is stored
   /// contiguously, in row major format, with no padding either inside
   /// a block or between blocks.
-  typename Kokkos::DualView<impl_scalar_type*, device_type> val_;
+  using impl_scalar_type_dualview = Kokkos::DualView<impl_scalar_type*, device_type>;
+  using impl_scalar_type_wrapped_dualview = Details::WrappedDualView<impl_scalar_type_dualview>;
+  mutable impl_scalar_type_wrapped_dualview val_;
 
   /// \brief Column Map block multivector (only initialized if needed).
   ///
@@ -845,90 +908,78 @@ class BlockCrsMatrix :
   };
 
 public:
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+    // KK: sync modify syntax will not work
+    //     the interface is deprecated but the functionalities are removed
   //! \name Implementation of "dual view semantics"
   //@{
-
   //! Mark the matrix's valueas as modified in host space
   inline void modify_host()
   {
-    val_.modify_host();
+    //throw std::logic_error("do not use");
   }
 
   //! Mark the matrix's valueas as modified in device space
   inline void modify_device()
   {
-    val_.modify_device();
+    //throw std::logic_error("do not use");
   }
 
   //! Mark the matrix's values as modified in the given memory space.
   template<class MemorySpace>
   void modify ()
   {
-    if (is_cuda<MemorySpace>::value) {
-      this->modify_device ();
-    }
-    else {
-      this->modify_host ();
-    }
+    //throw std::logic_error("do not use");
   }
 
   //! Whether the matrix's values need sync'ing to host space
   inline bool need_sync_host() const
   {
-    return val_.need_sync_host();
+    //throw std::logic_error("do not use");
+    return false; 
   }
 
   //! Whether the matrix's values need sync'ing to device space
   inline bool need_sync_device() const
   {
-    return val_.need_sync_device();
+    //throw std::logic_error("do not use");
+    return false; 
   }
 
   //! Whether the matrix's values need sync'ing to the given memory space.
   template<class MemorySpace>
   bool need_sync () const
   {
-    if (is_cuda<MemorySpace>::value) {
-      return this->need_sync_device ();
-    }
-    else {
-      return this->need_sync_host ();
-    }
+    //throw std::logic_error("do not use");
+    return false;
   }
 
   //! Sync the matrix's values to host space
   inline void sync_host()
   {
-    val_.sync_host();
+    //throw std::logic_error("do not use");
   }
 
   //! Sync the matrix's values to device space
   inline void sync_device()
   {
-    val_.sync_device();
+    //throw std::logic_error("do not use");
   }
 
   //! Sync the matrix's values <i>to</i> the given memory space.
   template<class MemorySpace>
   void sync ()
   {
-    if (is_cuda<MemorySpace>::value) {
-      this->sync_device ();
-    }
-    else {
-      this->sync_host ();
-    }
+    //throw std::logic_error("do not use");
   }
+#endif
 
-  // \brief Get the host view of the matrix's values
-  typename Kokkos::DualView<impl_scalar_type*, device_type>::t_host getValuesHost () const {
-    return val_.view_host();
-  }
 
-  // \brief Get the device view of the matrix's values
-  typename Kokkos::DualView<impl_scalar_type*, device_type>::t_dev getValuesDevice () const {
-    return val_.view_device();
-  }
+    typename impl_scalar_type_dualview::t_host::const_type
+    getValuesHost() const;
+
+    typename impl_scalar_type_dualview::t_dev::const_type
+    getValuesDevice() const;
 
   /// \brief Get the host or device View of the matrix's values (\c val_).
   ///
@@ -947,20 +998,44 @@ class BlockCrsMatrix :
   ///
   /// CT: While we reserved the "right" we ignored this and explicitly did const cast away
   /// Hence I made the non-templated functions [getValuesHost and getValuesDevice; see above] const.
+  /// KK: This should be deprecated.
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template<class MemorySpace>
   typename std::conditional<is_cuda<MemorySpace>::value,
-                            typename Kokkos::DualView<impl_scalar_type*, device_type>::t_dev,
-                            typename Kokkos::DualView<impl_scalar_type*, device_type>::t_host>::type
-  getValues ()
+                            typename impl_scalar_type_dualview::t_dev,
+                            typename impl_scalar_type_dualview::t_host>::type
+  getValues () const
   {
     // Unlike std::conditional, if_c has a select method.
     return Kokkos::Impl::if_c<
-        is_cuda<MemorySpace>::value,
-        typename Kokkos::DualView<impl_scalar_type*, device_type>::t_dev,
-        typename Kokkos::DualView<impl_scalar_type*, device_type>::t_host
-      >::select (this->getValuesDevice (), this->getValuesHost ());
+      is_cuda<MemorySpace>::value,
+      typename impl_scalar_type_dualview::t_dev,
+      typename impl_scalar_type_dualview::t_host
+      >::select (this->getValuesDeviceNonConst (), this->getValuesHostNonConst ());
   }
+#endif
+
+    typename impl_scalar_type_dualview::t_host
+    getValuesHostNonConst() const;
+
+    typename impl_scalar_type_dualview::t_dev
+    getValuesDeviceNonConst() const;
+
+    /// \brief Get a const Host view of the locally owned values
+    typename impl_scalar_type_dualview::t_host::const_type
+    getValuesHost (const LO& lclRow) const;
 
+    /// \brief Get a const Device view of the locally owned values
+    typename impl_scalar_type_dualview::t_dev::const_type
+    getValuesDevice (const LO& lclRow) const;
+
+    /// \brief Get a non-const Host view of the locally owned values
+    typename impl_scalar_type_dualview::t_host
+    getValuesHostNonConst (const LO& lclRow);
+
+    /// \brief Get a non-const Device view of the locally owned values
+    typename impl_scalar_type_dualview::t_dev
+    getValuesDeviceNonConst (const LO& lclRow);
   //@}
 
 private:
@@ -1061,18 +1136,10 @@ class BlockCrsMatrix :
   little_block_type
   getNonConstLocalBlockFromInput (impl_scalar_type* val, const size_t pointOffset) const;
 
-  const_little_block_type
-  getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const;
+  little_block_host_type
+  getNonConstLocalBlockFromInputHost (impl_scalar_type* val, const size_t pointOffset) const;
 
-  little_block_type
-  getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const;
 
-  /// \c Block at the given local mesh row and relative (mesh) offset.
-  ///
-  /// Use this for 2-argument getLocalDiagCopy that writes to Kokkos::View.
-  const_little_block_type
-  getConstLocalBlockFromRelOffset (const LO lclMeshRow,
-                                   const size_t relMeshOffset) const;
 
 public:
   //! The communicator over which this matrix is distributed.
@@ -1163,11 +1230,18 @@ class BlockCrsMatrix :
   /// the calling process, then the method sets NumIndices to
   /// <tt>Teuchos::OrdinalTraits<size_t>::invalid()</tt>, and does
   /// not modify Indices or Values.
+  virtual void
+  getGlobalRowCopy (GO GlobalRow,
+                    nonconst_global_inds_host_view_type &Indices,
+                    nonconst_values_host_view_type &Values,
+                    size_t& NumEntries) const;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void
   getGlobalRowCopy (GO GlobalRow,
                     const Teuchos::ArrayView<GO> &Indices,
                     const Teuchos::ArrayView<Scalar> &Values,
                     size_t& NumEntries) const;
+#endif
 
   /// \brief Get a constant, nonpersisting, globally indexed view of
   ///   the given row of the matrix.
@@ -1193,10 +1267,16 @@ class BlockCrsMatrix :
   ///
   /// If \c GlobalRow does not belong to this node, then \c indices
   /// is set to \c null.
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   virtual void
   getGlobalRowView (GO GlobalRow,
                     Teuchos::ArrayView<const GO>& indices,
                     Teuchos::ArrayView<const Scalar>& values) const;
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+  virtual void
+  getGlobalRowView (GO GlobalRow,
+                    global_inds_host_view_type & indices,
+                    values_host_view_type & values) const;
 
   /// \brief Get a copy of the diagonal entries, distributed by the row Map.
   ///
diff --git a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp
index 334ced98c58c..6e35e3cf2b72 100644
--- a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp
@@ -584,8 +584,8 @@ class GetLocalDiagCopy {
   typedef Kokkos::View<const size_t*, device_type,
                        Kokkos::MemoryUnmanaged> diag_offsets_type;
   typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
-  typedef typename global_graph_type::local_graph_type local_graph_type;
-  typedef typename local_graph_type::row_map_type row_offsets_type;
+  typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
+  typedef typename local_graph_device_type::row_map_type row_offsets_type;
   typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
   typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
   typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
@@ -681,6 +681,7 @@ class GetLocalDiagCopy {
     localError_ (new bool (false)),
     errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
   {
+    /// KK: an additional check is needed that the graph is fill complete.
     TEUCHOS_TEST_FOR_EXCEPTION(
       ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
       "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
@@ -710,26 +711,18 @@ class GetLocalDiagCopy {
         (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
     }
     {
-      typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
-      typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
+      auto local_graph_h = graph.getLocalGraphHost ();
+      auto ptr_h = local_graph_h.row_map;
+      ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
+      Kokkos::deep_copy(ptrHost_, ptr_h);
 
-      row_map_type ptr_d = graph.getLocalGraph ().row_map;
-      nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
-      Kokkos::deep_copy (ptr_h_nc, ptr_d);
-      ptrHost_ = ptr_h_nc;
-    }
-    {
-      typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
-      typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
+      auto ind_h = local_graph_h.entries;
+      indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
+      Kokkos::deep_copy (indHost_, ind_h);
 
-      entries_type ind_d = graph.getLocalGraph ().entries;
-      nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
-      Kokkos::deep_copy (ind_h_nc, ind_d);
-      indHost_ = ind_h_nc;
+      const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
+      val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
     }
-
-    const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
-    val_ = decltype (val_) ("val", numValEnt);
   }
 
   template<class Scalar, class LO, class GO, class Node>
@@ -776,26 +769,18 @@ class GetLocalDiagCopy {
         (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
     }
     {
-      typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
-      typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
+      auto local_graph_h = graph.getLocalGraphHost ();
+      auto ptr_h = local_graph_h.row_map;
+      ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
+      Kokkos::deep_copy(ptrHost_, ptr_h);
 
-      row_map_type ptr_d = graph.getLocalGraph ().row_map;
-      nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
-      Kokkos::deep_copy (ptr_h_nc, ptr_d);
-      ptrHost_ = ptr_h_nc;
-    }
-    {
-      typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
-      typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
+      auto ind_h = local_graph_h.entries;
+      indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
+      Kokkos::deep_copy (indHost_, ind_h);
 
-      entries_type ind_d = graph.getLocalGraph ().entries;
-      nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
-      Kokkos::deep_copy (ind_h_nc, ind_d);
-      indHost_ = ind_h_nc;
+      const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
+      val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
     }
-
-    const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
-    val_ = decltype (val_) ("val", numValEnt);
   }
 
   template<class Scalar, class LO, class GO, class Node>
@@ -865,7 +850,7 @@ class GetLocalDiagCopy {
          Scalar alpha,
          Scalar beta) const
   {
-    typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
+    using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
     TEUCHOS_TEST_FOR_EXCEPTION(
       mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
       std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
@@ -954,27 +939,8 @@ class GetLocalDiagCopy {
   BlockCrsMatrix<Scalar, LO, GO, Node>::
   setAllToScalar (const Scalar& alpha)
   {
-    // Why do we need to follow the last touch rule in Tpetra ? 
-    // If our main goal is to use device as much as possible, then 
-    // we should give priority to device for almost all operations.
-    // We probably do follow the last touch rule to obtain a certain 
-    // locality but this also can cause unexpected performance behavior
-    // for different use case. This might give inconsistent user experience.
-    
-    // Version 1: giving priority to device
-    //Kokkos::deep_copy(execution_space(), val_.view_device(), alpha);
-    //val_.modify_device();
-
-    // Version 2: set both view with the scalar alpha concurrently and reset sync state
-    // Launch a kernel on device and return immediately
-    Kokkos::deep_copy(execution_space(), val_.view_device(), alpha); 
-    // If a host view has different pointer (not mirror of the device view),
-    // then initialize the host view as well.
-    if (val_.view_device().data() != val_.view_host().data()) 
-      Kokkos::deep_copy(val_.view_host(), alpha);
-
-    // Both host and device views are set with alpha. Clear the sync state.
-    val_.clear_sync_state();
+    auto val_d = val_.getDeviceView(Access::OverwriteAll);
+    Kokkos::deep_copy(val_d, alpha);
   }
 
   template<class Scalar, class LO, class GO, class Node>
@@ -985,67 +951,11 @@ class GetLocalDiagCopy {
                       const Scalar vals[],
                       const LO numColInds) const
   {
-#ifdef HAVE_TPETRA_DEBUG
-    const char prefix[] =
-      "Tpetra::BlockCrsMatrix::replaceLocalValues: ";
-#endif // HAVE_TPETRA_DEBUG
-
-    if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
-      // We modified no values, because the input local row index is
-      // invalid on the calling process.  That may not be an error, if
-      // numColInds is zero anyway; it doesn't matter.  This is the
-      // advantage of returning the number of valid indices.
-      return static_cast<LO> (0);
-    }
-    const impl_scalar_type* const vIn =
-      reinterpret_cast<const impl_scalar_type*> (vals);
-    const size_t absRowBlockOffset = ptrHost_[localRowInd];
-    const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
-    const LO perBlockSize = this->offsetPerBlock ();
-    LO hint = 0; // Guess for the relative offset into the current row
-    LO pointOffset = 0; // Current offset into input values
-    LO validCount = 0; // number of valid column indices in colInds
-
-#ifdef HAVE_TPETRA_DEBUG
-    TEUCHOS_TEST_FOR_EXCEPTION
-      (this->need_sync_host (), std::runtime_error,
-       prefix << "The matrix's data were last modified on device, but have "
-       "not been sync'd to host.  Please sync to host (by calling "
-       "sync<Kokkos::HostSpace>() on this matrix) before calling this "
-       "method.");
-#endif // HAVE_TPETRA_DEBUG
-
-    auto vals_host_out = getValuesHost ();
-    impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
-
-    for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
-      const LO relBlockOffset =
-        this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
-      if (relBlockOffset != LINV) {
-        // mfh 21 Dec 2015: Here we encode the assumption that blocks
-        // are stored contiguously, with no padding.  "Contiguously"
-        // means that all memory between the first and last entries
-        // belongs to the block (no striding).  "No padding" means
-        // that getBlockSize() * getBlockSize() is exactly the number
-        // of entries that the block uses.  For another place where
-        // this assumption is encoded, see sumIntoLocalValues.
-
-        const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
-        // little_block_type A_old =
-        //   getNonConstLocalBlockFromAbsOffset (absBlockOffset);
-        impl_scalar_type* const A_old =
-          vals_host_out_raw + absBlockOffset * perBlockSize;
-        // const_little_block_type A_new =
-        //   getConstLocalBlockFromInput (vIn, pointOffset);
-        const impl_scalar_type* const A_new = vIn + pointOffset;
-        // COPY (A_new, A_old);
-        for (LO i = 0; i < perBlockSize; ++i) {
-          A_old[i] = A_new[i];
-        }
-        hint = relBlockOffset + 1;
-        ++validCount;
-      }
-    }
+    Kokkos::View<ptrdiff_t*,Kokkos::HostSpace> 
+      offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
+    ptrdiff_t * offsets = offsets_host_view.data();
+    const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
+    const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
     return validCount;
   }
 
@@ -1067,7 +977,6 @@ class GetLocalDiagCopy {
                                        Kokkos::MemoryUnmanaged>& offsets) const
   {
     using Kokkos::parallel_for;
-    typedef typename device_type::execution_space execution_space;
     const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
 
     const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
@@ -1083,14 +992,6 @@ class GetLocalDiagCopy {
        prefix << "offsets.size() = " << offsets.size () << " < local number of "
        "diagonal blocks " << lclNumMeshRows << ".");
 
-#ifdef HAVE_TPETRA_DEBUG
-    TEUCHOS_TEST_FOR_EXCEPTION
-      (this->template need_sync<device_type> (), std::runtime_error,
-       prefix << "The matrix's data were last modified on host, but have "
-       "not been sync'd to device.  Please sync to device (by calling "
-       "sync<device_type>() on this matrix) before calling this method.");
-#endif // HAVE_TPETRA_DEBUG
-
     typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
     typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
 
@@ -1098,52 +999,26 @@ class GetLocalDiagCopy {
     // we reserve the right to do lazy allocation of device data.  (We
     // don't plan to do lazy allocation for host data; the host
     // version of the data always exists.)
-    typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
-    auto vals_dev =
-      const_cast<this_type*> (this)->template getValues<device_type> ();
-
+    auto val_d = val_.getDeviceView(Access::ReadOnly);
     parallel_for (policy_type (0, lclNumMeshRows),
-                  functor_type (diag, vals_dev, offsets,
-                                graph_.getLocalGraph ().row_map, blockSize_));
+                  functor_type (diag, val_d, offsets,
+                                graph_.getLocalGraphDevice ().row_map, blockSize_));
   }
 
-
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class Scalar, class LO, class GO, class Node>
   void
   BlockCrsMatrix<Scalar,LO,GO,Node>::
   getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
-                                       Kokkos::MemoryUnmanaged>& diag,
+                    Kokkos::MemoryUnmanaged>& diag,
                     const Teuchos::ArrayView<const size_t>& offsets) const
   {
-    using Kokkos::ALL;
-    using Kokkos::parallel_for;
-    typedef typename Kokkos::View<impl_scalar_type***, device_type,
-      Kokkos::MemoryUnmanaged>::HostMirror::execution_space host_exec_space;
-
-    const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
-    const LO blockSize = this->getBlockSize ();
-    TEUCHOS_TEST_FOR_EXCEPTION
-      (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
-       static_cast<LO> (diag.extent (1)) < blockSize ||
-       static_cast<LO> (diag.extent (2)) < blockSize,
-       std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
-       "The input Kokkos::View is not big enough to hold all the data.");
-    TEUCHOS_TEST_FOR_EXCEPTION
-      (static_cast<LO> (offsets.size ()) < lclNumMeshRows,
-       std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
-       "offsets.size() = " << offsets.size () << " < local number of diagonal "
-       "blocks " << lclNumMeshRows << ".");
-
-    // mfh 12 Dec 2015: Use the host execution space, since we haven't
-    // quite made everything work with CUDA yet.
-    typedef Kokkos::RangePolicy<host_exec_space, LO> policy_type;
-    parallel_for (policy_type (0, lclNumMeshRows), [=] (const LO& lclMeshRow) {
-        auto D_in = this->getConstLocalBlockFromRelOffset (lclMeshRow, offsets[lclMeshRow]);
-        auto D_out = Kokkos::subview (diag, lclMeshRow, ALL (), ALL ());
-        COPY (D_in, D_out);
-      });
+    auto offsets_view_host = Kokkos::View<size_t*,Kokkos::HostSpace>(const_cast<size_t*>(offsets.getRawPtr()), offsets.size());
+    auto offsets_view_device = Kokkos::create_mirror_view_and_copy(typename device_type::memory_space(), offsets_view_host);
+    getLocalDiagCopy(diag, offsets_view_device);
+    Kokkos::fence(); // offsets is input-only: wait for the kernel, but never copy back into the caller's const array
   }
-
+#endif
 
   template<class Scalar, class LO, class GO, class Node>
   LO
@@ -1153,37 +1028,11 @@ class GetLocalDiagCopy {
                      const Scalar vals[],
                      const LO numColInds) const
   {
-    if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
-      // We modified no values, because the input local row index is
-      // invalid on the calling process.  That may not be an error, if
-      // numColInds is zero anyway; it doesn't matter.  This is the
-      // advantage of returning the number of valid indices.
-      return static_cast<LO> (0);
-    }
-    const impl_scalar_type* const vIn =
-      reinterpret_cast<const impl_scalar_type*> (vals);
-    const size_t absRowBlockOffset = ptrHost_[localRowInd];
-    const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
-    const LO perBlockSize = this->offsetPerBlock ();
-    LO hint = 0; // Guess for the relative offset into the current row
-    LO pointOffset = 0; // Current offset into input values
-    LO validCount = 0; // number of valid column indices in colInds
-
-    for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
-      const LO relBlockOffset =
-        this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
-      if (relBlockOffset != LINV) {
-        const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
-        little_block_type A_old =
-          getNonConstLocalBlockFromAbsOffset (absBlockOffset);
-        const_little_block_type A_new =
-          getConstLocalBlockFromInput (vIn, pointOffset);
-
-        ::Tpetra::Impl::absMax (A_old, A_new);
-        hint = relBlockOffset + 1;
-        ++validCount;
-      }
-    }
+    Kokkos::View<ptrdiff_t*,Kokkos::HostSpace> 
+      offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
+    ptrdiff_t * offsets = offsets_host_view.data();
+    const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
+    const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
     return validCount;
   }
 
@@ -1196,71 +1045,14 @@ class GetLocalDiagCopy {
                       const Scalar vals[],
                       const LO numColInds) const
   {
-#ifdef HAVE_TPETRA_DEBUG
-    const char prefix[] =
-      "Tpetra::BlockCrsMatrix::sumIntoLocalValues: ";
-#endif // HAVE_TPETRA_DEBUG
-
-    if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
-      // We modified no values, because the input local row index is
-      // invalid on the calling process.  That may not be an error, if
-      // numColInds is zero anyway; it doesn't matter.  This is the
-      // advantage of returning the number of valid indices.
-      return static_cast<LO> (0);
-    }
-    //const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
-    const impl_scalar_type* const vIn =
-      reinterpret_cast<const impl_scalar_type*> (vals);
-    const size_t absRowBlockOffset = ptrHost_[localRowInd];
-    const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
-    const LO perBlockSize = this->offsetPerBlock ();
-    LO hint = 0; // Guess for the relative offset into the current row
-    LO pointOffset = 0; // Current offset into input values
-    LO validCount = 0; // number of valid column indices in colInds
-
-#ifdef HAVE_TPETRA_DEBUG
-    TEUCHOS_TEST_FOR_EXCEPTION
-      (this->need_sync_host (), std::runtime_error,
-       prefix << "The matrix's data were last modified on device, but have not "
-       "been sync'd to host.  Please sync to host (by calling "
-       "sync<Kokkos::HostSpace>() on this matrix) before calling this method.");
-#endif // HAVE_TPETRA_DEBUG
-
-    auto vals_host_out =
-      getValuesHost ();
-    impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
-
-    for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
-      const LO relBlockOffset =
-        this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
-      if (relBlockOffset != LINV) {
-        // mfh 21 Dec 2015: Here we encode the assumption that blocks
-        // are stored contiguously, with no padding.  "Contiguously"
-        // means that all memory between the first and last entries
-        // belongs to the block (no striding).  "No padding" means
-        // that getBlockSize() * getBlockSize() is exactly the number
-        // of entries that the block uses.  For another place where
-        // this assumption is encoded, see replaceLocalValues.
-
-        const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
-        // little_block_type A_old =
-        //   getNonConstLocalBlockFromAbsOffset (absBlockOffset);
-        impl_scalar_type* const A_old =
-          vals_host_out_raw + absBlockOffset * perBlockSize;
-        // const_little_block_type A_new =
-        //   getConstLocalBlockFromInput (vIn, pointOffset);
-        const impl_scalar_type* const A_new = vIn + pointOffset;
-        // AXPY (ONE, A_new, A_old);
-        for (LO i = 0; i < perBlockSize; ++i) {
-          A_old[i] += A_new[i];
-        }
-        hint = relBlockOffset + 1;
-        ++validCount;
-      }
-    }
+    Kokkos::View<ptrdiff_t*,Kokkos::HostSpace> 
+      offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
+    ptrdiff_t * offsets = offsets_host_view.data();
+    const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
+    const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
     return validCount;
   }
-
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template<class Scalar, class LO, class GO, class Node>
   LO
   BlockCrsMatrix<Scalar, LO, GO, Node>::
@@ -1269,11 +1061,7 @@ class GetLocalDiagCopy {
                    Scalar*& vals,
                    LO& numInds) const
   {
-#ifdef HAVE_TPETRA_DEBUG
-    const char prefix[] =
-      "Tpetra::BlockCrsMatrix::getLocalRowView: ";
-#endif // HAVE_TPETRA_DEBUG
-
+    
     if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
       colInds = NULL;
       vals = NULL;
@@ -1284,26 +1072,41 @@ class GetLocalDiagCopy {
       const size_t absBlockOffsetStart = ptrHost_[localRowInd];
       colInds = indHost_.data () + absBlockOffsetStart;
 
-#ifdef HAVE_TPETRA_DEBUG
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (this->need_sync_host (), std::runtime_error,
-         prefix << "The matrix's data were last modified on device, but have "
-         "not been sync'd to host.  Please sync to host (by calling "
-         "sync<Kokkos::HostSpace>() on this matrix) before calling this "
-         "method.");
-#endif // HAVE_TPETRA_DEBUG
-
-      auto vals_host_out = getValuesHost ();
-      impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
-      impl_scalar_type* const vOut = vals_host_out_raw +
-        absBlockOffsetStart * offsetPerBlock ();
-      vals = reinterpret_cast<Scalar*> (vOut);
-
+      auto vals_host_out = getValuesHost (localRowInd);
+      impl_scalar_type* vals_host_out_raw = const_cast<impl_scalar_type*>(vals_host_out.data ());
+      vals = reinterpret_cast<Scalar*> (vals_host_out_raw);
       numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
       return 0; // indicates no error
     }
   }
+#endif
+
+  template<class Scalar, class LO, class GO, class Node>
+  void
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getLocalRowCopy (LO LocalRow,
+                   nonconst_local_inds_host_view_type &Indices,
+                   nonconst_values_host_view_type &Values,
+                   size_t& NumEntries) const
+  {
+    // Copy row LocalRow (indices, then values) into the caller's host views; sizes are compared in size_t so extents are never narrowed to LO.
+    auto vals = getValuesHost(LocalRow);
+    const size_t numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
+    const size_t numVals = numInds*blockSize_*blockSize_;
+    TEUCHOS_TEST_FOR_EXCEPTION(numInds > Indices.extent(0) || numVals > Values.extent(0), std::runtime_error,
+                  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
+                  << numInds << " row entries");
+    const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
+    for (size_t i=0; i<numInds; ++i) {
+      Indices[i] = colInds[i];
+    }
+    for (size_t i=0; i<numVals; ++i) {
+      Values[i] = vals[i];
+    }
+    NumEntries = numInds;
+  }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template<class Scalar, class LO, class GO, class Node>
   void
   BlockCrsMatrix<Scalar, LO, GO, Node>::
@@ -1312,15 +1115,14 @@ class GetLocalDiagCopy {
                    const Teuchos::ArrayView<Scalar>& Values,
                    size_t &NumEntries) const
   {
-    const LO *colInds;
-    Scalar *vals;
-    LO numInds;
-    getLocalRowView(LocalRow,colInds,vals,numInds);
-    if (numInds > Indices.size() || numInds*blockSize_*blockSize_ > Values.size()) {
+    auto vals = getValuesHost(LocalRow);
+    const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
+    if (numInds > (LO)Indices.size() || numInds*blockSize_*blockSize_ > (LO)Values.size()) {
       TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
                   "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
                   << numInds << " row entries");
     }
+    const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
     for (LO i=0; i<numInds; ++i) {
       Indices[i] = colInds[i];
     }
@@ -1329,6 +1131,7 @@ class GetLocalDiagCopy {
     }
     NumEntries = numInds;
   }
+#endif
 
   template<class Scalar, class LO, class GO, class Node>
   LO
@@ -1379,21 +1182,20 @@ class GetLocalDiagCopy {
       return static_cast<LO> (0);
     }
     const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
+    using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
+    auto val_out = const_cast<this_type&>(*this).getValuesHostNonConst(localRowInd);
+    impl_scalar_type* vOut = val_out.data();
 
-    const size_t absRowBlockOffset = ptrHost_[localRowInd];
     const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
-    const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
+    const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
     size_t pointOffset = 0; // Current offset into input values
     LO validCount = 0; // number of valid offsets
 
     for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
-      const size_t relBlockOffset = offsets[k];
-      if (relBlockOffset != STINV) {
-        const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
-        little_block_type A_old =
-          getNonConstLocalBlockFromAbsOffset (absBlockOffset);
-        const_little_block_type A_new =
-          getConstLocalBlockFromInput (vIn, pointOffset);
+      const size_t blockOffset = offsets[k]*perBlockSize;
+      if (offsets[k] != STINV) {
+        little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
+        const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
         COPY (A_new, A_old);
         ++validCount;
       }
@@ -1418,21 +1220,19 @@ class GetLocalDiagCopy {
       return static_cast<LO> (0);
     }
     const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
+    auto val_out = const_cast<BlockCrsMatrix<Scalar, LO, GO, Node>&>(*this).getValuesHostNonConst(localRowInd); // writable view: this row is modified below
+    impl_scalar_type* vOut = val_out.data();
 
-    const size_t absRowBlockOffset = ptrHost_[localRowInd];
     const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
     const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
     size_t pointOffset = 0; // Current offset into input values
     LO validCount = 0; // number of valid offsets
 
     for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
-      const size_t relBlockOffset = offsets[k];
-      if (relBlockOffset != STINV) {
-        const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
-        little_block_type A_old =
-          getNonConstLocalBlockFromAbsOffset (absBlockOffset);
-        const_little_block_type A_new =
-          getConstLocalBlockFromInput (vIn, pointOffset);
+      // Test the raw offset: once scaled by perBlockSize it no longer equals the invalid sentinel.
+      if (static_cast<size_t> (offsets[k]) != STINV) {
+        little_block_type A_old = getNonConstLocalBlockFromInput (vOut, offsets[k]*perBlockSize);
+        const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
         ::Tpetra::Impl::absMax (A_old, A_new);
         ++validCount;
       }
@@ -1458,22 +1258,20 @@ class GetLocalDiagCopy {
     }
     const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
     const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
+    typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
+    auto val_out = const_cast<this_type&>(*this).getValuesHostNonConst(localRowInd);
+    impl_scalar_type* vOut = val_out.data();
 
-    const size_t absRowBlockOffset = ptrHost_[localRowInd];
     const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
     const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
     size_t pointOffset = 0; // Current offset into input values
     LO validCount = 0; // number of valid offsets
 
     for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
-      const size_t relBlockOffset = offsets[k];
-      if (relBlockOffset != STINV) {
-        const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
-        little_block_type A_old =
-          getNonConstLocalBlockFromAbsOffset (absBlockOffset);
-        const_little_block_type A_new =
-          getConstLocalBlockFromInput (vIn, pointOffset);
-        //A_old.update (ONE, A_new);
+      // Test the raw offset: once scaled by perBlockSize it no longer equals the invalid sentinel.
+      if (static_cast<size_t> (offsets[k]) != STINV) {
+        little_block_type A_old = getNonConstLocalBlockFromInput (vOut, offsets[k]*perBlockSize);
+        const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
         AXPY (ONE, A_new, A_old);
         ++validCount;
       }
@@ -1481,6 +1279,87 @@ class GetLocalDiagCopy {
     return validCount;
   }
 
+  template<class Scalar, class LO, class GO, class Node>
+  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
+  impl_scalar_type_dualview::t_host::const_type
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getValuesHost () const 
+  {
+    return val_.getHostView(Access::ReadOnly); // host view of all of val_, requested with read-only access
+  }
+
+  template<class Scalar, class LO, class GO, class Node>
+  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
+  impl_scalar_type_dualview::t_dev::const_type
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getValuesDevice () const 
+  {
+    return val_.getDeviceView(Access::ReadOnly); // device view of all of val_, requested with read-only access
+  }
+
+  template<class Scalar, class LO, class GO, class Node>
+  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
+  impl_scalar_type_dualview::t_host
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getValuesHostNonConst () const 
+  {
+    return val_.getHostView(Access::ReadWrite); // host view of all of val_, requested with read-write access
+  }
+
+  template<class Scalar, class LO, class GO, class Node>
+  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
+  impl_scalar_type_dualview::t_dev
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getValuesDeviceNonConst () const 
+  {
+    return val_.getDeviceView(Access::ReadWrite); // device view of all of val_, requested with read-write access
+  }
+
+  template<class Scalar, class LO, class GO, class Node>
+  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
+  impl_scalar_type_dualview::t_host::const_type
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getValuesHost (const LO& lclRow) const
+  {
+    // Read-only host view of local row lclRow's values. Bounds stay in size_t: a pair<LO,LO> would narrow them when LO is smaller than size_t.
+    const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
+    const Kokkos::pair<size_t,size_t> rowRange (ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize);
+    return Kokkos::subview(val_.getHostView(Access::ReadOnly), rowRange);
+  }
+
+  template<class Scalar, class LO, class GO, class Node>
+  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
+  impl_scalar_type_dualview::t_dev::const_type
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getValuesDevice (const LO& lclRow) const
+  {
+    // Read-only device view of local row lclRow's values. Bounds stay in size_t: a pair<LO,LO> would narrow them when LO is smaller than size_t.
+    const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
+    const Kokkos::pair<size_t,size_t> rowRange (ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize);
+    return Kokkos::subview(val_.getDeviceView(Access::ReadOnly), rowRange);
+  }
+
+  template<class Scalar, class LO, class GO, class Node>
+  typename BlockCrsMatrix<Scalar, LO, GO, Node>::impl_scalar_type_dualview::t_host
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getValuesHostNonConst (const LO& lclRow)
+  {
+    // Read-write host view of local row lclRow's values. Bounds stay in size_t: a pair<LO,LO> would narrow them when LO is smaller than size_t.
+    const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
+    const Kokkos::pair<size_t,size_t> rowRange (ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize);
+    return Kokkos::subview(val_.getHostView(Access::ReadWrite), rowRange);
+  }
+
+  template<class Scalar, class LO, class GO, class Node>
+  typename BlockCrsMatrix<Scalar, LO, GO, Node>::impl_scalar_type_dualview::t_dev
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getValuesDeviceNonConst (const LO& lclRow)
+  {
+    // Read-write device view of local row lclRow's values. Bounds stay in size_t: a pair<LO,LO> would narrow them when LO is smaller than size_t.
+    const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
+    const Kokkos::pair<size_t,size_t> rowRange (ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize);
+    return Kokkos::subview(val_.getDeviceView(Access::ReadWrite), rowRange);
+  }
 
   template<class Scalar, class LO, class GO, class Node>
   size_t
@@ -1630,7 +1509,7 @@ class GetLocalDiagCopy {
     using ::Tpetra::Impl::bcrsLocalApplyNoTrans;
 
     const impl_scalar_type alpha_impl = alpha;
-    const auto graph = this->graph_.getLocalGraph ();
+    const auto graph = this->graph_.getLocalGraphDevice ();
     const impl_scalar_type beta_impl = beta;
     const LO blockSize = this->getBlockSize ();
 
@@ -1641,7 +1520,7 @@ class GetLocalDiagCopy {
     //auto Y_lcl = Y_mv.template getLocalView<device_type> (Access::ReadWrite);
     auto X_lcl = X_mv.getLocalViewDevice (Access::ReadOnly);
     auto Y_lcl = Y_mv.getLocalViewDevice (Access::ReadWrite);
-    auto val = this->val_.view_device ();
+    auto val = val_.getDeviceView(Access::ReadWrite);
 
     bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
                            beta_impl, Y_lcl);
@@ -1731,125 +1610,79 @@ class GetLocalDiagCopy {
   }
 
   template<class Scalar, class LO, class GO, class Node>
-  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
+  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
   BlockCrsMatrix<Scalar, LO, GO, Node>::
-  getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
+  getNonConstLocalBlockFromInputHost (impl_scalar_type* val,
+                                  const size_t pointOffset) const
   {
-#ifdef HAVE_TPETRA_DEBUG
-    const char prefix[] =
-      "Tpetra::BlockCrsMatrix::getConstLocalBlockFromAbsOffset: ";
-#endif // HAVE_TPETRA_DEBUG
-
-    if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
-      // An empty block signifies an error.  We don't expect to see
-      // this error in correct code, but it's helpful for avoiding
-      // memory corruption in case there is a bug.
-      return const_little_block_type ();
-    }
-    else {
-#ifdef HAVE_TPETRA_DEBUG
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (this->need_sync_host (), std::runtime_error,
-         prefix << "The matrix's data were last modified on device, but have "
-         "not been sync'd to host.  Please sync to host (by calling "
-         "sync<Kokkos::HostSpace>() on this matrix) before calling this "
-         "method.");
-#endif // HAVE_TPETRA_DEBUG
-      const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
-
-      auto vals_host = getValuesHost ();
-      const impl_scalar_type* vals_host_raw = vals_host.data ();
-
-      return getConstLocalBlockFromInput (vals_host_raw, absPointOffset);
-    }
+    // Row major blocks; pointOffset is used as a block index (scaled by bs2 below).
+    const LO rowStride = blockSize_;
+    const size_t bs2 = blockSize_ * blockSize_;
+    return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
   }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template<class Scalar, class LO, class GO, class Node>
-  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
+  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
   BlockCrsMatrix<Scalar, LO, GO, Node>::
-  getConstLocalBlockFromRelOffset (const LO lclMeshRow,
-                                   const size_t relMeshOffset) const
+  getLocalBlock (const LO localRowInd, const LO localColInd) const
   {
-    typedef impl_scalar_type IST;
+    using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
 
-    const LO* lclColInds = NULL;
-    Scalar* lclVals = NULL;
-    LO numEnt = 0;
+    const size_t absRowBlockOffset = ptrHost_[localRowInd];
+    const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
 
-    LO err = this->getLocalRowView (lclMeshRow, lclColInds, lclVals, numEnt);
-    if (err != 0) {
-      // An empty block signifies an error.  We don't expect to see
-      // this error in correct code, but it's helpful for avoiding
-      // memory corruption in case there is a bug.
-      return const_little_block_type ();
+    if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
+      const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
+      auto vals = const_cast<this_type&>(*this).getValuesDeviceNonConst();
+      return getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
     }
     else {
-      const size_t relPointOffset = relMeshOffset * this->offsetPerBlock ();
-      IST* lclValsImpl = reinterpret_cast<IST*> (lclVals);
-      return this->getConstLocalBlockFromInput (const_cast<const IST*> (lclValsImpl),
-                                                relPointOffset);
+      return little_block_type ();
     }
   }
+#endif
 
   template<class Scalar, class LO, class GO, class Node>
   typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
   BlockCrsMatrix<Scalar, LO, GO, Node>::
-  getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
+  getLocalBlockDeviceNonConst (const LO localRowInd, const LO localColInd) const
   {
-#ifdef HAVE_TPETRA_DEBUG
-    const char prefix[] =
-      "Tpetra::BlockCrsMatrix::getNonConstLocalBlockFromAbsOffset: ";
-#endif // HAVE_TPETRA_DEBUG
+    using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
 
-    if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
-      // An empty block signifies an error.  We don't expect to see
-      // this error in correct code, but it's helpful for avoiding
-      // memory corruption in case there is a bug.
-      return little_block_type ();
+    const size_t absRowBlockOffset = ptrHost_[localRowInd];
+    const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
+    if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
+      const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
+      auto vals = const_cast<this_type&>(*this).getValuesDeviceNonConst();
+      auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);      
+      return r_val; 
     }
     else {
-      const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
-#ifdef HAVE_TPETRA_DEBUG
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (this->need_sync_host (), std::runtime_error,
-         prefix << "The matrix's data were last modified on device, but have "
-         "not been sync'd to host.  Please sync to host (by calling "
-         "sync<Kokkos::HostSpace>() on this matrix) before calling this "
-         "method.");
-#endif // HAVE_TPETRA_DEBUG
-      auto vals_host = getValuesHost();
-      impl_scalar_type* vals_host_raw = vals_host.data ();
-      return getNonConstLocalBlockFromInput (vals_host_raw, absPointOffset);
+      return little_block_type ();
     }
   }
 
   template<class Scalar, class LO, class GO, class Node>
-  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
+  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
   BlockCrsMatrix<Scalar, LO, GO, Node>::
-  getLocalBlock (const LO localRowInd, const LO localColInd) const
+  getLocalBlockHostNonConst (const LO localRowInd, const LO localColInd) const
   {
-    const size_t absRowBlockOffset = ptrHost_[localRowInd];
-    const LO relBlockOffset =
-      this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
+    using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
 
+    const size_t absRowBlockOffset = ptrHost_[localRowInd];
+    const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
     if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
       const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
-      return getNonConstLocalBlockFromAbsOffset (absBlockOffset);
+      auto vals = const_cast<this_type&>(*this).getValuesHostNonConst();
+      auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);      
+      return r_val; 
     }
     else {
-      return little_block_type ();
+      return little_block_host_type ();
     }
   }
 
-  // template<class Scalar, class LO, class GO, class Node>
-  // void
-  // BlockCrsMatrix<Scalar, LO, GO, Node>::
-  // clearLocalErrorStateAndStream ()
-  // {
-  //   typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
-  //   * (const_cast<this_type*> (this)->localError_) = false;
-  //   *errs_ = Teuchos::null;
-  // }
 
   template<class Scalar, class LO, class GO, class Node>
   bool
@@ -1985,15 +1818,6 @@ class GetLocalDiagCopy {
         "Please report this bug to the Tpetra developers." << endl;
       return;
     }
-    else {
-      // Kyungjoo: where is val_ modified ?
-      //    When we have dual view as a member variable,
-      //    which function should make sure the val_ is upto date ?
-      //    IMO, wherever it is used, the function should check its
-      //    availability.
-      const_cast<this_type*>(src)->sync_host();
-    }
-    this->sync_host();
 
     bool lclErr = false;
 #ifdef HAVE_TPETRA_DEBUG
@@ -2044,20 +1868,14 @@ class GetLocalDiagCopy {
         }
 #endif // HAVE_TPETRA_DEBUG
 
-        const LO* lclSrcCols;
-        Scalar* vals;
+        local_inds_host_view_type lclSrcCols;
+        values_host_view_type vals;
         LO numEntries;
         // If this call fails, that means the mesh row local index is
         // invalid.  That means the Import or Export is invalid somehow.
-        LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
-        if (err != 0) {
-          lclErr = true;
-#ifdef HAVE_TPETRA_DEBUG
-          (void) invalidSrcCopyRows.insert (localRow);
-#endif // HAVE_TPETRA_DEBUG
-        }
-        else {
-          err = this->replaceLocalValues (localRow, lclSrcCols, vals, numEntries);
+        src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
+        if (numEntries > 0) {
+          LO err = this->replaceLocalValues (localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
           if (err != numEntries) {
             lclErr = true;
             if (! dstRowMap.isNodeLocalElement (localRow)) {
@@ -2089,18 +1907,12 @@ class GetLocalDiagCopy {
         const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
         const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
 
-        const LO* lclSrcCols;
-        Scalar* vals;
+        local_inds_host_view_type lclSrcCols;
+        values_host_view_type vals;
         LO numEntries;
-        LO err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
-        if (err != 0) {
-          lclErr = true;
-#ifdef HAVE_TPETRA_DEBUG
-          invalidPermuteFromRows.insert (srcLclRow);
-#endif // HAVE_TPETRA_DEBUG
-        }
-        else {
-          err = this->replaceLocalValues (dstLclRow, lclSrcCols, vals, numEntries);
+        src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
+        if (numEntries > 0) {
+          LO err = this->replaceLocalValues (dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
           if (err != numEntries) {
             lclErr = true;
 #ifdef HAVE_TPETRA_DEBUG
@@ -2121,14 +1933,14 @@ class GetLocalDiagCopy {
 
       // Copy local rows that are the "same" in both source and target.
       for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
-        const LO* lclSrcCols;
-        Scalar* vals;
+        local_inds_host_view_type lclSrcCols;
+        values_host_view_type vals;
         LO numEntries;
+
         // If this call fails, that means the mesh row local index is
         // invalid.  That means the Import or Export is invalid somehow.
-        LO err = 0;
         try {
-          err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
+          src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
         } catch (std::exception& e) {
           if (debug) {
             std::ostringstream os;
@@ -2141,13 +1953,7 @@ class GetLocalDiagCopy {
           throw e;
         }
 
-        if (err != 0) {
-          lclErr = true;
-#ifdef HAVE_TPETRA_DEBUG
-          invalidSrcCopyRows.insert (localRow);
-#endif // HAVE_TPETRA_DEBUG
-        }
-        else {
+        if (numEntries > 0) {
           if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
             lclErr = true;
             if (debug) {
@@ -2172,8 +1978,9 @@ class GetLocalDiagCopy {
 #endif // HAVE_TPETRA_DEBUG
               }
             }
+            LO err(0);
             try {
-              err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), vals, numEntries);
+              err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
             } catch (std::exception& e) {
               if (debug) {
                 std::ostringstream os;
@@ -2206,12 +2013,12 @@ class GetLocalDiagCopy {
         const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
         const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
 
-        const LO* lclSrcCols;
-        Scalar* vals;
+        local_inds_host_view_type lclSrcCols;
+        values_host_view_type vals;
         LO numEntries;
-        LO err = 0;
+
         try {
-          err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
+          src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
         } catch (std::exception& e) {
           if (debug) {
             std::ostringstream os;
@@ -2224,13 +2031,7 @@ class GetLocalDiagCopy {
           throw e;
         }
 
-        if (err != 0) {
-          lclErr = true;
-#ifdef HAVE_TPETRA_DEBUG
-          invalidPermuteFromRows.insert (srcLclRow);
-#endif // HAVE_TPETRA_DEBUG
-        }
-        else {
+        if (numEntries > 0) {
           if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
             lclErr = true;
           }
@@ -2247,7 +2048,7 @@ class GetLocalDiagCopy {
 #endif // HAVE_TPETRA_DEBUG
               }
             }
-            err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), vals, numEntries);
+            LO err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
             if (err != numEntries) {
               lclErr = true;
             }
@@ -2665,9 +2466,13 @@ class GetLocalDiagCopy {
     const crs_graph_type& srcGraph = src->graph_;
     const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
     const size_t numExportLIDs = exportLIDs.extent (0);
-    const size_t numBytesPerValue =
-      PackTraits<impl_scalar_type>
-      ::packValueCount(this->val_.extent(0) ? this->val_.view_host()(0) : impl_scalar_type());
+    size_t numBytesPerValue(0);
+    {
+      auto val_host = val_.getHostView(Access::ReadOnly);
+      numBytesPerValue =
+        PackTraits<impl_scalar_type>
+        ::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
+    }
 
     // Compute the number of bytes ("packets") per row to pack.  While
     // we're at it, compute the total # of block entries to send, and
@@ -2766,16 +2571,14 @@ class GetLocalDiagCopy {
               gblColInds(member.team_scratch(0), maxRowLength);
 
             const LO  lclRowInd = exportLIDsHost(i);
-            const LO* lclColIndsRaw;
-            Scalar* valsRaw;
-            LO numEntLO;
+            local_inds_host_view_type lclColInds;
+            values_host_view_type vals;
+
             // It's OK to ignore the return value, since if the calling
             // process doesn't own that local row, then the number of
             // entries in that row on the calling process is zero.
-            (void) src->getLocalRowView (lclRowInd, lclColIndsRaw, valsRaw, numEntLO);
-
-            const size_t numEnt = static_cast<size_t> (numEntLO);
-            Kokkos::View<const LO*,host_exec> lclColInds (lclColIndsRaw, numEnt);
+            src->getLocalRowView (lclRowInd, lclColInds, vals); 
+            const size_t numEnt = lclColInds.extent(0);
 
             // Convert column indices from local to global.
             for (size_t j = 0; j < numEnt; ++j)
@@ -2791,7 +2594,7 @@ class GetLocalDiagCopy {
                offset(i),
                numEnt,
                Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
-               Kokkos::View<const impl_scalar_type*, host_exec>(reinterpret_cast<const impl_scalar_type*>(valsRaw), numEnt*blockSize*blockSize),
+               vals,
                numBytesPerValue,
                blockSize);
 
@@ -2929,9 +2732,13 @@ class GetLocalDiagCopy {
     // instances have the same size; that's not the issue here.)  This
     // could be bad if the calling process has no entries, but other
     // processes have entries that they want to send to us.
-    const size_t numBytesPerValue =
-      PackTraits<impl_scalar_type>::packValueCount
-        (this->val_.extent (0) ? this->val_.view_host () (0) : impl_scalar_type ());
+    size_t numBytesPerValue(0);
+    {
+      auto val_host = val_.getHostView(Access::ReadOnly);
+      numBytesPerValue =
+        PackTraits<impl_scalar_type>::packValueCount
+        (val_host.extent (0) ? val_host(0) : impl_scalar_type ());
+    }
     const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
     const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
 
@@ -3207,9 +3014,6 @@ class GetLocalDiagCopy {
     using Teuchos::RCP;
     using Teuchos::wait;
     using std::endl;
-#ifdef HAVE_TPETRA_DEBUG
-    const char prefix[] = "Tpetra::BlockCrsMatrix::describe: ";
-#endif // HAVE_TPETRA_DEBUG
 
     // Set default verbosity if applicable.
     const Teuchos::EVerbosityLevel vl =
@@ -3302,20 +3106,6 @@ class GetLocalDiagCopy {
     }
 
     if (vl >= VERB_EXTREME) {
-      // FIXME (mfh 26 May 2016) It's not nice for this method to sync
-      // to host, since it's supposed to be const.  However, that's
-      // the easiest and least memory-intensive way to implement this
-      // method.
-      typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
-      const_cast<this_type&> (*this).sync_host ();
-
-#ifdef HAVE_TPETRA_DEBUG
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (this->need_sync_host (), std::logic_error,
-         prefix << "Right after sync to host, the matrix claims that it needs "
-         "sync to host.  Please report this bug to the Tpetra developers.");
-#endif // HAVE_TPETRA_DEBUG
-
       const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
       const int myRank = comm.getRank ();
       const int numProcs = comm.getSize ();
@@ -3335,10 +3125,10 @@ class GetLocalDiagCopy {
         const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
         os << "Row " << meshGblRow << ": {";
 
-        const LO* lclColInds = NULL;
-        Scalar* vals = NULL;
+        local_inds_host_view_type lclColInds;
+        values_host_view_type vals;
         LO numInds = 0;
-        this->getLocalRowView (meshLclRow, lclColInds, vals, numInds);
+        this->getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);
 
         for (LO k = 0; k < numInds; ++k) {
           const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
@@ -3544,7 +3334,21 @@ class GetLocalDiagCopy {
     return false;
   }
 
+  template<class Scalar, class LO, class GO, class Node>
+  void
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getGlobalRowCopy (GO /*GlobalRow*/,
+                    nonconst_global_inds_host_view_type &/*Indices*/,
+                    nonconst_values_host_view_type &/*Values*/,
+                    size_t& /*NumEntries*/) const
+  { // Unsupported operation: all arguments are ignored and std::logic_error is raised.
+    TEUCHOS_TEST_FOR_EXCEPTION(
+      true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
+      "This class doesn't support global matrix indexing.");
+
+  }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template<class Scalar, class LO, class GO, class Node>
   void
   BlockCrsMatrix<Scalar, LO, GO, Node>::
@@ -3558,7 +3362,9 @@ class GetLocalDiagCopy {
       "This class doesn't support global matrix indexing.");
 
   }
+#endif
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template<class Scalar, class LO, class GO, class Node>
   void
   BlockCrsMatrix<Scalar, LO, GO, Node>::
@@ -3583,17 +3389,67 @@ class GetLocalDiagCopy {
       true, std::logic_error, "Tpetra::BlockCrsMatrix::getLocalRowView: "
       "This class doesn't support local matrix indexing.");
   }
+#endif
 
   template<class Scalar, class LO, class GO, class Node>
   void
   BlockCrsMatrix<Scalar, LO, GO, Node>::
-  getLocalDiagCopy (::Tpetra::Vector<Scalar,LO,GO,Node>& diag) const
+  getGlobalRowView (GO /* GlobalRow */,
+                    global_inds_host_view_type &/* indices */,
+                    values_host_view_type &/* values */) const
   {
-#ifdef HAVE_TPETRA_DEBUG
-    const char prefix[] =
-      "Tpetra::BlockCrsMatrix::getLocalDiagCopy: ";
-#endif // HAVE_TPETRA_DEBUG
+    TEUCHOS_TEST_FOR_EXCEPTION(
+      true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
+      "This class doesn't support global matrix indexing.");
+
+  }
+
+  template<class Scalar, class LO, class GO, class Node>
+  void
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getLocalRowView (LO localRowInd,
+                   local_inds_host_view_type &colInds,
+                   values_host_view_type &vals) const
+  {
+    if (! rowMeshMap_.isNodeLocalElement (localRowInd)) { // not a local row: hand back empty views
+      colInds = local_inds_host_view_type();
+      vals = values_host_view_type();
+    }
+    else {
+      const size_t absBlockOffsetStart = ptrHost_[localRowInd]; // first block entry of this row
+      const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart; // block entries in this row
+      colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds); // wraps indHost_ storage, no copy
+
+      vals = getValuesHost (localRowInd); // host values belonging to this row
+    }
+  }
+
+  template<class Scalar, class LO, class GO, class Node>
+  void
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getLocalRowViewNonConst (LO localRowInd,
+                           local_inds_host_view_type &colInds,
+                           nonconst_values_host_view_type &vals) const
+  {
+    if (! rowMeshMap_.isNodeLocalElement (localRowInd)) { // not a local row: hand back empty views
+      colInds = local_inds_host_view_type(); // same view type as the output parameter
+      vals = nonconst_values_host_view_type();
+    }
+    else {
+      const size_t absBlockOffsetStart = ptrHost_[localRowInd]; // first block entry of this row
+      const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart; // block entries in this row
+      colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
+
+      using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>; // needed to reach the non-const accessor from a const method
+      vals = const_cast<this_type&>(*this).getValuesHostNonConst(localRowInd);
+    }
+  }
 
+  template<class Scalar, class LO, class GO, class Node>
+  void
+  BlockCrsMatrix<Scalar, LO, GO, Node>::
+  getLocalDiagCopy (::Tpetra::Vector<Scalar,LO,GO,Node>& diag) const
+  {
     const size_t lclNumMeshRows = graph_.getNodeNumRows ();
 
     Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
@@ -3602,21 +3458,9 @@ class GetLocalDiagCopy {
     // The code below works on host, so use a host View.
     auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
     Kokkos::deep_copy (diagOffsetsHost, diagOffsets);
-    // We're filling diag on host for now.
-    //diag.template modify<typename decltype (diagOffsetsHost)::memory_space> ();
-
-#ifdef HAVE_TPETRA_DEBUG
-    TEUCHOS_TEST_FOR_EXCEPTION
-      (this->need_sync_host (), std::runtime_error,
-       prefix << "The matrix's data were last modified on device, but have "
-       "not been sync'd to host.  Please sync to host (by calling "
-       "sync<Kokkos::HostSpace>() on this matrix) before calling this "
-       "method.");
-#endif // HAVE_TPETRA_DEBUG
 
-    auto vals_host_out = getValuesHost ();
-    Scalar* vals_host_out_raw =
-      reinterpret_cast<Scalar*> (vals_host_out.data ());
+    auto vals_host_out = val_.getHostView(Access::ReadOnly);
+    const impl_scalar_type* vals_host_out_raw = vals_host_out.data();
 
     // TODO amk: This is a temporary measure to make the code run with Ifpack2
     size_t rowOffset = 0;
diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp
index 828f97991d7e..01f2b3a79916 100644
--- a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp
@@ -50,6 +50,7 @@
 #include "Tpetra_Exceptions.hpp"
 #include "Tpetra_RowGraph.hpp"
 #include "Tpetra_Util.hpp" // need this here for sort2
+#include "Tpetra_Details_WrappedDualView.hpp"
 
 #include "KokkosSparse_findRelOffset.hpp"
 #include "Kokkos_DualView.hpp"
@@ -249,11 +250,15 @@ namespace Tpetra {
     using node_type = Node;
 
     //! The type of the part of the sparse graph on each MPI process.
-    using local_graph_type = Kokkos::StaticCrsGraph<local_ordinal_type,
-                                                    Kokkos::LayoutLeft,
-                                                    device_type,
-                                                    void,
-                                                    size_t>;
+    using local_graph_device_type =
+           Kokkos::StaticCrsGraph<local_ordinal_type, Kokkos::LayoutLeft,
+                                  device_type, void, size_t>;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+    using local_graph_type = local_graph_device_type;
+#endif
+
+    //! The host-mirror type of the part of the sparse graph on each MPI process.
+    using local_graph_host_type = typename local_graph_device_type::HostMirror;
 
     //! The Map specialization used by this class.
     using map_type = ::Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>;
@@ -262,6 +267,57 @@ namespace Tpetra {
     //! The Export specialization used by this class.
     using export_type = ::Tpetra::Export<LocalOrdinal, GlobalOrdinal, Node>;
 
+protected:
+    // Types used for CrsGraph's storage of local column indices
+    using local_inds_dualv_type =
+          Kokkos::DualView<local_ordinal_type*, device_type>;
+    using local_inds_wdv_type = 
+          Details::WrappedDualView<local_inds_dualv_type>;
+
+    // Types used for CrsGraph's storage of global column indices
+    using global_inds_dualv_type =
+          Kokkos::DualView<global_ordinal_type*, device_type>;
+    using global_inds_wdv_type =
+          Details::WrappedDualView<global_inds_dualv_type>;
+
+public:
+    using row_graph_type = RowGraph<LocalOrdinal, GlobalOrdinal, Node>;
+    using row_ptrs_device_view_type =
+          typename row_graph_type::row_ptrs_device_view_type;
+    using row_ptrs_host_view_type =
+          typename row_graph_type::row_ptrs_host_view_type;
+
+    //! The Kokkos::View type for views of local ordinals on device and host
+    using local_inds_device_view_type =
+          typename row_graph_type::local_inds_device_view_type;
+    using local_inds_host_view_type =
+          typename row_graph_type::local_inds_host_view_type;
+    using nonconst_local_inds_host_view_type =
+          typename row_graph_type::nonconst_local_inds_host_view_type;
+
+    //! The Kokkos::View type for views of global ordinals on device and host
+    using global_inds_device_view_type =
+          typename row_graph_type::global_inds_device_view_type;
+    using global_inds_host_view_type =
+          typename row_graph_type::global_inds_host_view_type;
+    using nonconst_global_inds_host_view_type =
+          typename row_graph_type::nonconst_global_inds_host_view_type;
+
+
+//KDDKDD INROW    using local_inds_host_view_type = 
+//KDDKDD INROW          typename local_inds_dualv_type::t_host::const_type;
+
+//KDDKDD INROW    using global_inds_host_view_type = 
+//KDDKDD INROW          typename global_inds_dualv_type::t_host::const_type;
+
+    //! The Kokkos::View type for views of local ordinals on device
+//KDDKDD INROW    using local_inds_device_view_type = 
+//KDDKDD INROW          typename local_inds_dualv_type::t_dev::const_type;
+
+    //! The Kokkos::View type for views of global ordinals on device
+//KDDKDD INROW    using global_inds_device_view_type = 
+//KDDKDD INROW          typename global_inds_dualv_type::t_dev::const_type;
+
     //! @name Constructor/Destructor Methods
     //@{
 
@@ -394,6 +450,21 @@ namespace Tpetra {
               const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
 
 
+    /// \brief Constructor specifying a row Map and an existing graph to subview.
+    ///   The graph created will point to the views of the existing graph,
+    ///   but only have the rows contained in the passed-in rowMap.
+    ///   This constructor assumes it will alias the first N rows of the graph,
+    ///   where N is the number of rows in rowMap.
+    /// \param originalGraph Existing graph whose views the new graph points to.
+    /// \param rowMap [in] Distribution of rows of the graph.
+    ///
+    /// \param params [in/out] Optional list of parameters.  If not
+    ///   null, any missing parameters will be filled in with their
+    ///   default values.
+    CrsGraph (CrsGraph<local_ordinal_type, global_ordinal_type, node_type>& originalGraph,
+              const Teuchos::RCP<const map_type>& rowMap,
+              const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
+
     /// \brief Constructor specifying column Map and arrays containing
     ///   the graph. In almost all cases the indices must be sorted on input,
     ///   but if they aren't sorted, "sorted" must be set to false in params.
@@ -418,8 +489,8 @@ namespace Tpetra {
     ///   default values.
     CrsGraph (const Teuchos::RCP<const map_type>& rowMap,
               const Teuchos::RCP<const map_type>& colMap,
-              const typename local_graph_type::row_map_type& rowPointers,
-              const typename local_graph_type::entries_type::non_const_type& columnIndices,
+              const typename local_graph_device_type::row_map_type& rowPointers,
+              const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
               const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
 
     /// \brief Constructor specifying column Map and arrays containing
@@ -473,7 +544,7 @@ namespace Tpetra {
     ///   default values.
     CrsGraph (const Teuchos::RCP<const map_type>& rowMap,
               const Teuchos::RCP<const map_type>& colMap,
-              const local_graph_type& lclGraph,
+              const local_graph_device_type& lclGraph,
               const Teuchos::RCP<Teuchos::ParameterList>& params);
 
     /// \brief Constructor specifying column, domain and range maps, and a
@@ -502,7 +573,7 @@ namespace Tpetra {
     /// \param params [in/out] Optional list of parameters.  If not
     ///   null, any missing parameters will be filled in with their
     ///   default values.
-    CrsGraph (const local_graph_type& lclGraph,
+    CrsGraph (const local_graph_device_type& lclGraph,
               const Teuchos::RCP<const map_type>& rowMap,
               const Teuchos::RCP<const map_type>& colMap,
               const Teuchos::RCP<const map_type>& domainMap = Teuchos::null,
@@ -513,7 +584,7 @@ namespace Tpetra {
     /// \param lclGraph [in] The local graph.  In almost all cases the
     ///   local graph must be sorted on input,
     ///   but if it isn't sorted, "sorted" must be set to false in params.
-    CrsGraph (const local_graph_type& lclGraph,
+    CrsGraph (const local_graph_device_type& lclGraph,
               const Teuchos::RCP<const map_type>& rowMap,
               const Teuchos::RCP<const map_type>& colMap,
               const Teuchos::RCP<const map_type>& domainMap,
@@ -1035,11 +1106,16 @@ namespace Tpetra {
     /// \param gblRow [in] Global index of the row.
     /// \param gblColInds [out] On output: Global column indices.
     /// \param numColInds [out] Number of indices returned.
+    void
+    getGlobalRowCopy (global_ordinal_type gblRow,
+                      nonconst_global_inds_host_view_type &gblColInds,
+                      size_t& numColInds) const override;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     void
     getGlobalRowCopy (global_ordinal_type gblRow,
                       const Teuchos::ArrayView<global_ordinal_type>& gblColInds,
                       size_t& numColInds) const override;
-
+#endif
     /// \brief Get a copy of the given row, using local indices.
     ///
     /// \param lclRow [in] Local index of the row.
@@ -1047,11 +1123,18 @@ namespace Tpetra {
     /// \param numColInds [out] Number of indices returned.
     ///
     /// \pre <tt>hasColMap()</tt>
+    void
+    getLocalRowCopy (local_ordinal_type lclRow,
+                     nonconst_local_inds_host_view_type &lclColInds,
+                     size_t& numColInds) const override;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     void
     getLocalRowCopy (local_ordinal_type lclRow,
                      const Teuchos::ArrayView<local_ordinal_type>& lclColInds,
                      size_t& numColInds) const override;
+#endif
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     /// \brief Get a const, non-persisting view of the given global
     ///   row's global column indices, as a Teuchos::ArrayView.
     ///
@@ -1065,11 +1148,29 @@ namespace Tpetra {
     void
     getGlobalRowView (const global_ordinal_type gblRow,
                       Teuchos::ArrayView<const global_ordinal_type>& gblColInds) const override;
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+
+    /// \brief Get a const view of the given global
+    ///   row's global column indices
+    ///
+    /// \param gblRow [in] Global index of the row.
+    /// \param gblColInds [out] Global column indices in the row.  If
+    ///   the given row is not a valid row index on the calling
+    ///   process, then the result has no entries (its size is zero).
+    ///
+    /// \pre <tt>! isLocallyIndexed()</tt>
+    /// \post <tt>gblColInds.size() == getNumEntriesInGlobalRow(gblRow)</tt>
+    void
+    getGlobalRowView (
+      const global_ordinal_type gblRow,
+      global_inds_host_view_type &gblColInds) const override;
+
 
     /// \brief Whether this class implements getLocalRowView() and
     ///   getGlobalRowView() (it does).
     bool supportsRowViews () const override;
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     /// \brief Get a const, non-persisting view of the given local
     ///   row's local column indices, as a Teuchos::ArrayView.
     ///
@@ -1083,6 +1184,23 @@ namespace Tpetra {
     void
     getLocalRowView (const local_ordinal_type lclRow,
                      Teuchos::ArrayView<const local_ordinal_type>& lclColInds) const override;
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+
+    /// \brief Get a const view of the given local
+    ///   row's local column indices
+    ///
+    /// \param lclRow [in] Local index of the row.
+    /// \param lclColInds [out] Local column indices in the row.  If
+    ///   the given row is not a valid row index on the calling
+    ///   process, then the result has no entries (its size is zero).
+    ///
+    /// \pre <tt>! isGloballyIndexed()</tt>
+    /// \post <tt>lclColInds.size() == getNumEntriesInLocalRow(lclRow)</tt>
+    void
+    getLocalRowView (
+      const LocalOrdinal lclRow,
+      local_inds_host_view_type &lclColInds) const override;
+
 
     //@}
     //! @name Overridden from Teuchos::Describable
@@ -1319,8 +1437,8 @@ namespace Tpetra {
     /// \warning This method is intended for expert developer use
     ///   only, and should never be called by user code.
     void
-    setAllIndices (const typename local_graph_type::row_map_type& rowPointers,
-                   const typename local_graph_type::entries_type::non_const_type& columnIndices);
+    setAllIndices (const typename local_graph_device_type::row_map_type& rowPointers,
+                   const typename local_graph_device_type::entries_type::non_const_type& columnIndices);
 
     /// \brief Set the graph's data directly, using 1-D storage.
     ///
@@ -1442,17 +1560,22 @@ namespace Tpetra {
     removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap) override;
     //@}
 
-    template<class ViewType, class OffsetViewType >
+    template<class DestViewType, class SrcViewType, 
+             class DestOffsetViewType, class SrcOffsetViewType >
     struct pack_functor {
-      typedef typename ViewType::execution_space execution_space;
-      ViewType src;
-      ViewType dest;
-      OffsetViewType src_offset;
-      OffsetViewType dest_offset;
-      typedef typename OffsetViewType::non_const_value_type ScalarIndx;
-
-      pack_functor(ViewType dest_, ViewType src_, OffsetViewType dest_offset_, OffsetViewType src_offset_):
-        src(src_),dest(dest_),src_offset(src_offset_),dest_offset(dest_offset_) {};
+      typedef typename DestViewType::execution_space execution_space;
+      SrcViewType src;
+      DestViewType dest;
+      SrcOffsetViewType src_offset;
+      DestOffsetViewType dest_offset;
+      typedef typename DestOffsetViewType::non_const_value_type ScalarIndx;
+
+      pack_functor(DestViewType dest_, 
+                   const SrcViewType src_,
+                   DestOffsetViewType dest_offset_, 
+                   const SrcOffsetViewType src_offset_):
+        src(src_),dest(dest_),
+        src_offset(src_offset_),dest_offset(dest_offset_) {};
 
       KOKKOS_INLINE_FUNCTION
       void operator() (size_t row) const {
@@ -1947,7 +2070,7 @@ namespace Tpetra {
     /// \warning You MUST call fillLocalGraph (or
     ///   CrsMatrix::fillLocalGraphAndMatrix) before calling this
     ///   method!  This method depends on the Kokkos::StaticCrsGraph
-    ///   (local_graph_type) object being ready.
+    ///   (local_graph_device_type) object being ready.
     ///
     /// Local constants include:
     /// <ul>
@@ -1981,98 +2104,28 @@ namespace Tpetra {
     /// CrsMatrix::replaceGlobalValues().
     RowInfo getRowInfoFromGlobalRowIndex (const global_ordinal_type gblRow) const;
 
-    /// \brief Get a const, nonowned, locally indexed view of the
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+    /// \brief Get a const,  locally indexed view of the
     ///   locally owned row myRow, such that rowinfo =
     ///   getRowInfo(myRow).
+    //  Replaced by getLocalIndsViewHost
     Teuchos::ArrayView<const local_ordinal_type>
     getLocalView (const RowInfo& rowinfo) const;
-
-    /// \brief Get a nonconst, nonowned, locally indexed view of the
-    ///   locally owned row myRow, such that rowinfo =
-    ///   getRowInfo(myRow).
-    Teuchos::ArrayView<local_ordinal_type>
-    getLocalViewNonConst (const RowInfo& rowinfo);
-
-    /// \brief Get a pointer to the local column indices of a locally
-    ///   owned row, using the result of getRowInfo.
-    ///
-    /// \param lclInds [out] Pointer to the local column indices of
-    ///   the given row.
-    /// \param capacity [out] Capacity of (number of entries that can
-    ///   fit in) the given row.
-    /// \param rowInfo [in] Result of getRowInfo(lclRow) for the row
-    ///   \c lclRow to view.
-    ///
-    /// \return 0 if successful, else a nonzero error code.
-    local_ordinal_type
-    getLocalViewRawConst (const local_ordinal_type*& lclInds,
-                          local_ordinal_type& capacity,
-                          const RowInfo& rowInfo) const;
-
-  private:
-
-    /// \brief Get a const nonowned view of the local column indices
-    ///   indices of row rowinfo.localRow (only works if the matrix is
-    ///   locally indexed on the calling process).
-    ///
-    /// \param rowInfo [in] Result of calling getRowInfo with the
-    ///   index of the local row to view.
-    Kokkos::View<const local_ordinal_type*, device_type, Kokkos::MemoryUnmanaged>
-    getLocalKokkosRowView (const RowInfo& rowInfo) const;
-
-    /// \brief Get a nonconst nonowned view of the local column
-    ///   indices of row rowinfo.localRow (only works if the matrix is
-    ///   locally indexed on the calling process).
-    ///
-    /// \param rowInfo [in] Result of calling getRowInfo with the
-    ///   index of the local row to view.
-    Kokkos::View<local_ordinal_type*, device_type, Kokkos::MemoryUnmanaged>
-    getLocalKokkosRowViewNonConst (const RowInfo& rowInfo);
-
-    /// \brief Get a const nonowned view of the global column indices
-    ///   of row rowinfo.localRow (only works if the matrix is
-    ///   globally indexed).
-    ///
-    /// \param rowInfo [in] Result of calling getRowInfo with the
-    ///   index of the local row to view.
-    Kokkos::View<const global_ordinal_type*, device_type, Kokkos::MemoryUnmanaged>
-    getGlobalKokkosRowView (const RowInfo& rowInfo) const;
+#endif
 
   protected:
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     /// \brief Get a const, nonowned, globally indexed view of the
     ///   locally owned row myRow, such that rowinfo =
     ///   getRowInfo(myRow).
+    //  Replaced by getGlobalIndsViewHost
     Teuchos::ArrayView<const global_ordinal_type>
     getGlobalView (const RowInfo& rowinfo) const;
-
-    /// \brief Get a nonconst, nonowned, globally indexed view of the
-    ///   locally owned row myRow, such that rowinfo =
-    ///   getRowInfo(myRow).
-    Teuchos::ArrayView<global_ordinal_type>
-    getGlobalViewNonConst (const RowInfo& rowinfo);
-
-    /// \brief Get a pointer to the global column indices of a locally
-    ///   owned row, using the result of getRowInfoFromGlobalRowIndex.
-    ///
-    /// \param gblInds [out] Pointer to the global column indices of
-    ///   the given row.
-    /// \param capacity [out] Capacity of (number of entries that can
-    ///   fit in) the given row.
-    /// \param rowInfo [in] Result of
-    ///   getRowInfoFromGlobalRowIndex(gblRow) for the row to view,
-    ///   whose global row index is \c gblRow.
-    ///
-    /// \return 0 if successful, else a nonzero error code.
-    local_ordinal_type
-    getGlobalViewRawConst (const global_ordinal_type*& gblInds,
-                           local_ordinal_type& capacity,
-                           const RowInfo& rowInfo) const;
-
+#endif
 
   public:
 
-
     /// \brief Get the local graph.
     ///
     /// \warning THIS IS AN EXPERT MODE FUNCTION.  THIS IS AN
@@ -2080,12 +2133,15 @@ namespace Tpetra {
     ///
     /// This is only a valid representation of the local graph if the
     /// (global) graph is fill complete.
-    local_graph_type getLocalGraph () const;
-
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+    // TPETRA_DEPRECATED
+    local_graph_device_type getLocalGraph () const;
+#endif
+    local_graph_device_type getLocalGraphDevice () const;
+    local_graph_host_type getLocalGraphHost () const;
 
   protected:
 
-
     void fillLocalGraph (const Teuchos::RCP<Teuchos::ParameterList>& params);
 
     //! Throw an exception if the internal state is not consistent.
@@ -2124,9 +2180,6 @@ namespace Tpetra {
     /// is necessary in that case for sparse matrix-vector multiply.
     Teuchos::RCP<const export_type> exporter_;
 
-    //! Local graph; only initialized after first fillComplete() call.
-    local_graph_type lclGraph_;
-
     /// \brief Local maximum of the number of entries in each row.
     ///
     /// Computed in computeLocalConstants; only valid when
@@ -2147,6 +2200,122 @@ namespace Tpetra {
     global_size_t globalMaxNumRowEntries_ =
       Teuchos::OrdinalTraits<global_size_t>::invalid();
 
+    // Replacement for device view k_rowPtrs_
+    // Device view rowPtrsUnpacked_dev_ takes place of k_rowPtrs_ 
+    // Host view rowPtrsUnpacked_host_ takes place of copies and use of getEntryOnHost
+    // Wish this could be a WrappedDualView, but deep_copies in DualView
+    // don't work with const data views (e.g., StaticCrsGraph::row_map)
+    // rowPtrsUnpacked_ holds offsets wrt the ALLOCATED indices array, not
+    // necessarily the ACTUAL compressed indices array.
+    // When !OptimizedStorage, rowPtrsUnpacked_ may differ from the offsets into
+    // the ACTUAL compressed indices array.  (Karen is skeptical that !OptimizedStorage works)
+    // When OptimizedStorage, rowPtrsUnpacked_ == rowPtrsPacked_
+
+//KDDKDD INROW    using row_ptrs_device_view_type = 
+//KDDKDD INROW          Kokkos::View<const typename local_graph_device_type::size_type *, 
+//KDDKDD INROW                       device_type> ;
+//KDDKDD INROW    using row_ptrs_host_view_type = 
+//KDDKDD INROW          typename row_ptrs_device_view_type::HostMirror::const_type;
+    row_ptrs_device_view_type rowPtrsUnpacked_dev_;
+    row_ptrs_host_view_type rowPtrsUnpacked_host_;
+
+    void setRowPtrsUnpacked(const row_ptrs_device_view_type &dview) {
+      rowPtrsUnpacked_dev_ = dview;
+      rowPtrsUnpacked_host_ = 
+           Kokkos::create_mirror_view_and_copy(
+                          typename row_ptrs_device_view_type::host_mirror_space(),
+                          dview);
+    }
+
+    // Row offsets into the actual graph local indices 
+    // Device view rowPtrsPacked_dev_ takes place of lclGraph_.row_map
+
+    row_ptrs_device_view_type rowPtrsPacked_dev_;
+    row_ptrs_host_view_type rowPtrsPacked_host_;
+
+    void setRowPtrsPacked(const row_ptrs_device_view_type &dview) {
+      rowPtrsPacked_dev_ = dview;
+      rowPtrsPacked_host_ = 
+           Kokkos::create_mirror_view_and_copy(
+                          typename row_ptrs_device_view_type::host_mirror_space(),
+                          dview);
+    }
+    
+  
+//KDDKDD Make private -- matrix shouldn't access directly
+    /// \brief Local ordinals of column indices for all rows
+    /// KDDKDD UVM Removal:   Device view takes place of k_lclInds1D_
+    /// Valid when isLocallyIndexed is true
+    /// If OptimizedStorage, storage is PACKED after fillComplete
+    /// If not OptimizedStorage, storage is UNPACKED after fillComplete;
+    /// that is, the views have storage equal to sizes provided in CrsGraph
+    /// constructor.
+    ///
+    /// This is allocated only if
+    ///
+    ///   - The calling process has a nonzero number of entries
+    ///   - The graph is locally indexed
+    local_inds_wdv_type lclIndsUnpacked_wdv;
+
+    /// \brief Local ordinals of column indices for all rows
+    /// KDDKDD UVM Removal:   Device view takes place of lclGraph_.entries
+    /// Valid when isLocallyIndexed is true
+    /// Built during fillComplete or non-fillComplete constructors
+    /// Storage is PACKED after fillComplete
+    /// that is, the views have storage equal to sizes provided in CrsGraph
+    /// constructor.
+    ///
+    /// This is allocated only if
+    ///
+    ///   - The calling process has a nonzero number of entries
+    ///   - The graph is locally indexed
+    mutable local_inds_wdv_type lclIndsPacked_wdv;
+
+//KDDKDD Make private -- matrix shouldn't access directly
+    /// \brief Global ordinals of column indices for all rows
+    /// KDDKDD UVM Removal:   Device view takes place of k_gblInds1D_
+    ///
+    /// This is allocated only if
+    ///
+    ///   - The calling process has a nonzero number of entries
+    ///   - The graph is globally indexed
+
+    global_inds_wdv_type gblInds_wdv;
+
+    /// \brief Get a const, locally indexed view of the
+    ///   locally owned row myRow, such that rowinfo =
+    ///   getRowInfo(myRow).
+    typename local_inds_dualv_type::t_host::const_type
+    getLocalIndsViewHost (const RowInfo& rowinfo) const;
+
+    /// \brief Get a const, locally indexed view of the
+    ///   locally owned row myRow, such that rowinfo =
+    ///   getRowInfo(myRow).
+    typename local_inds_dualv_type::t_dev::const_type
+    getLocalIndsViewDevice (const RowInfo& rowinfo) const;
+
+    /// \brief Get a const, globally indexed view of the
+    ///   locally owned row myRow, such that rowinfo =
+    ///   getRowInfo(myRow).
+    typename global_inds_dualv_type::t_host::const_type
+    getGlobalIndsViewHost (const RowInfo& rowinfo) const;
+
+    /// \brief Get a const, globally indexed view of the
+    ///   locally owned row myRow, such that rowinfo =
+    ///   getRowInfo(myRow).
+    typename global_inds_dualv_type::t_dev::const_type
+    getGlobalIndsViewDevice (const RowInfo& rowinfo) const;
+
+    /// \brief Get a ReadWrite locally indexed view of the
+    ///   locally owned row myRow, such that rowinfo =
+    ///   getRowInfo(myRow).
+    typename local_inds_dualv_type::t_host
+    getLocalIndsViewHostNonConst (const RowInfo& rowinfo);
+
+    // FOR NOW...
+    // KEEP k_numRowEntries_ (though switch from HostMirror to Host)
+    // KEEP k_numAllocPerRow_ (though perhaps switch from HostMirror to Host)
+
     /// \brief The maximum number of entries to allow in each locally
     ///   owned row, per row.
     ///
@@ -2189,25 +2358,9 @@ namespace Tpetra {
     //! \name Graph data structures (packed and unpacked storage).
     //@{
 
-    /// \brief Local column indices for all rows.
-    ///
-    /// This is only allocated if
-    ///
-    ///   - The calling process has a nonzero number of entries
-    ///   - The graph is locally indexed
-    typename local_graph_type::entries_type::non_const_type k_lclInds1D_;
-
     //! Type of the k_gblInds1D_ array of global column indices.
     typedef Kokkos::View<global_ordinal_type*, device_type> t_GlobalOrdinal_1D;
 
-    /// \brief Global column indices for all rows.
-    ///
-    /// This is only allocated if
-    ///
-    ///   - The calling process has a nonzero number of entries
-    ///   - The graph is globally indexed
-    t_GlobalOrdinal_1D k_gblInds1D_;
-
     /// \brief Row offsets for "1-D" storage.
     ///
     /// This is only allocated if "1-D" storage is active.  In that
@@ -2229,7 +2382,6 @@ namespace Tpetra {
     /// If it is allocated, k_rowPtrs_ has length getNodeNumRows()+1.
     /// The k_numRowEntries_ array has has length getNodeNumRows(),
     /// again if it is allocated.
-    typename local_graph_type::row_map_type::const_type k_rowPtrs_;
 
     /// \brief The type of k_numRowEntries_ (see below).
     ///
diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp
index 4ae3cbe1cf05..7557b09a28a6 100644
--- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp
@@ -125,10 +125,11 @@ namespace Tpetra {
         if (gblColIndsStorage.size() < origNumEnt) {
           gblColIndsStorage.resize(origNumEnt);
         }
-        Teuchos::ArrayView<GO> gblColInds(gblColIndsStorage.data(),
-                                          origNumEnt);
+        typename CrsGraph<LO,GO,Node>::nonconst_global_inds_host_view_type gblColInds(gblColIndsStorage.data(),
+                                                       origNumEnt);
         graph.getGlobalRowCopy(gblRowInd, gblColInds, origNumEnt);
-        return gblColInds;
+        Teuchos::ArrayView<GO> retval(gblColIndsStorage.data(),origNumEnt);
+        return retval;
       }
 
       template<class LO, class GO, class DT, class OffsetType, class NumEntType>
@@ -521,12 +522,49 @@ namespace Tpetra {
   }
 
 
+  template <class LocalOrdinal, class GlobalOrdinal, class Node>
+  CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
+  CrsGraph (CrsGraph<local_ordinal_type, global_ordinal_type, node_type>& originalGraph,
+            const Teuchos::RCP<const map_type>& rowMap,
+            const Teuchos::RCP<Teuchos::ParameterList>& params) :
+    dist_object_type (rowMap)
+    , rowMap_(rowMap)
+    , colMap_(originalGraph.colMap_)
+    , numAllocForAllRows_(originalGraph.numAllocForAllRows_)
+    , storageStatus_(originalGraph.storageStatus_)
+    , indicesAreAllocated_(originalGraph.indicesAreAllocated_)
+    , indicesAreLocal_(originalGraph.indicesAreLocal_)
+    , indicesAreSorted_(originalGraph.indicesAreSorted_)
+  {
+    staticAssertions();
+
+    int numRows = rowMap->getNodeNumElements();
+    size_t numNonZeros = originalGraph.rowPtrsPacked_host_(numRows);
+    auto rowsToUse = Kokkos::pair<size_t, size_t>(0, numRows+1);
+
+    rowPtrsUnpacked_dev_ = Kokkos::subview(originalGraph.rowPtrsUnpacked_dev_, rowsToUse);
+    rowPtrsUnpacked_host_ = Kokkos::subview(originalGraph.rowPtrsUnpacked_host_, rowsToUse);
+
+    rowPtrsPacked_dev_ = Kokkos::subview(originalGraph.rowPtrsPacked_dev_, rowsToUse);
+    rowPtrsPacked_host_ = Kokkos::subview(originalGraph.rowPtrsPacked_host_, rowsToUse);
+
+    if (indicesAreLocal_) {
+      lclIndsUnpacked_wdv = local_inds_wdv_type(originalGraph.lclIndsUnpacked_wdv, 0, numNonZeros);
+      lclIndsPacked_wdv = local_inds_wdv_type(originalGraph.lclIndsPacked_wdv, 0, numNonZeros);
+    }
+    else {
+      gblInds_wdv = global_inds_wdv_type(originalGraph.gblInds_wdv, 0, numNonZeros);
+    }
+
+    checkInternalState();
+  }
+
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
   CrsGraph (const Teuchos::RCP<const map_type>& rowMap,
             const Teuchos::RCP<const map_type>& colMap,
-            const typename local_graph_type::row_map_type& rowPointers,
-            const typename local_graph_type::entries_type::non_const_type& columnIndices,
+            const typename local_graph_device_type::row_map_type& rowPointers,
+            const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
             const Teuchos::RCP<Teuchos::ParameterList>& params) :
     dist_object_type (rowMap)
     , rowMap_(rowMap)
@@ -579,7 +617,7 @@ namespace Tpetra {
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
   CrsGraph (const Teuchos::RCP<const map_type>& rowMap,
             const Teuchos::RCP<const map_type>& colMap,
-            const local_graph_type& k_local_graph_,
+            const local_graph_device_type& k_local_graph_,
             const Teuchos::RCP<Teuchos::ParameterList>& params)
     : CrsGraph (k_local_graph_,
                 rowMap,
@@ -591,7 +629,7 @@ namespace Tpetra {
 
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
-  CrsGraph (const local_graph_type& k_local_graph_,
+  CrsGraph (const local_graph_device_type& k_local_graph_,
             const Teuchos::RCP<const map_type>& rowMap,
             const Teuchos::RCP<const map_type>& colMap,
             const Teuchos::RCP<const map_type>& domainMap,
@@ -600,7 +638,6 @@ namespace Tpetra {
     : DistObject<GlobalOrdinal, LocalOrdinal, GlobalOrdinal, node_type> (rowMap)
     , rowMap_ (rowMap)
     , colMap_ (colMap)
-    , lclGraph_ (k_local_graph_)
     , numAllocForAllRows_ (0)
     , storageStatus_ (Details::STORAGE_1D_PACKED)
     , indicesAreAllocated_ (true)
@@ -628,7 +665,7 @@ namespace Tpetra {
     //   "number of rows.  The row Map claims " << getNodeNumRows () << " row(s), "
     //   "but the local graph claims " << k_local_graph_.numRows () << " row(s).");
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
-      k_lclInds1D_.extent (0) != 0 || k_gblInds1D_.extent (0) != 0, std::logic_error,
+      lclIndsUnpacked_wdv.extent (0) != 0 || gblInds_wdv.extent (0) != 0, std::logic_error,
       ": cannot have 1D data structures allocated.");
 
     if(! params.is_null() && params->isParameter("sorted") &&
@@ -644,8 +681,10 @@ namespace Tpetra {
     Teuchos::Array<int> remotePIDs (0); // unused output argument
     this->makeImportExport (remotePIDs, false);
 
-    k_lclInds1D_ = lclGraph_.entries;
-    k_rowPtrs_ = lclGraph_.row_map;
+    lclIndsPacked_wdv = local_inds_wdv_type(k_local_graph_.entries);
+    lclIndsUnpacked_wdv = lclIndsPacked_wdv;
+    this->setRowPtrsUnpacked(k_local_graph_.row_map);
+    this->setRowPtrsPacked(k_local_graph_.row_map);
 
     set_need_sync_host_uvm_access(); // lclGraph_ potentially still in a kernel
 
@@ -661,7 +700,7 @@ namespace Tpetra {
 
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
-  CrsGraph (const local_graph_type& lclGraph,
+  CrsGraph (const local_graph_device_type& lclGraph,
             const Teuchos::RCP<const map_type>& rowMap,
             const Teuchos::RCP<const map_type>& colMap,
             const Teuchos::RCP<const map_type>& domainMap,
@@ -676,22 +715,23 @@ namespace Tpetra {
     domainMap_ (domainMap.is_null () ? rowMap : domainMap),
     importer_ (importer),
     exporter_ (exporter),
-    lclGraph_ (lclGraph),
     numAllocForAllRows_ (0),
     storageStatus_ (Details::STORAGE_1D_PACKED),
     indicesAreAllocated_ (true),
     indicesAreLocal_ (true)
   {
     staticAssertions();
-    const char tfecfFuncName[] = "Tpetra::CrsGraph(local_graph_type,"
+    const char tfecfFuncName[] = "Tpetra::CrsGraph(local_graph_device_type,"
       "Map,Map,Map,Map,Import,Export,params): ";
 
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
       (colMap.is_null (), std::runtime_error,
        "The input column Map must be nonnull.");
 
-    k_lclInds1D_ = lclGraph_.entries;
-    k_rowPtrs_ = lclGraph_.row_map;
+    lclIndsPacked_wdv = local_inds_wdv_type(lclGraph.entries);
+    lclIndsUnpacked_wdv = lclIndsPacked_wdv;
+    setRowPtrsUnpacked(lclGraph.row_map);
+    setRowPtrsPacked(lclGraph.row_map);
 
     set_need_sync_host_uvm_access(); // lclGraph_ potentially still in a kernel
 
@@ -919,12 +959,12 @@ namespace Tpetra {
         auto numEntPerRow = this->k_numRowEntries_;
         const LO numNumEntPerRow = numEntPerRow.extent (0);
         if (numNumEntPerRow == 0) {
-          if (static_cast<LO> (this->lclGraph_.row_map.extent (0)) <
+          if (static_cast<LO> (this->rowPtrsPacked_dev_.extent (0)) <
               static_cast<LO> (lclNumRows + 1)) {
             return static_cast<size_t> (0);
           }
           else {
-            return ::Tpetra::Details::getEntryOnHost (this->lclGraph_.row_map, lclNumRows);
+            return this->rowPtrsPacked_host_(lclNumRows);
           }
         }
         else { // k_numRowEntries_ is populated
@@ -1024,20 +1064,20 @@ namespace Tpetra {
         return static_cast<size_t> (0);
       }
       else if (storageStatus_ == Details::STORAGE_1D_PACKED) {
-        if (static_cast<LO> (this->lclGraph_.row_map.extent (0)) <
+        if (static_cast<LO> (this->rowPtrsPacked_dev_.extent (0)) <
             static_cast<LO> (lclNumRows + 1)) {
           return static_cast<size_t> (0);
         }
         else {
-          return ::Tpetra::Details::getEntryOnHost (this->lclGraph_.row_map, lclNumRows);
+          return this->rowPtrsPacked_host_(lclNumRows);
         }
       }
       else if (storageStatus_ == Details::STORAGE_1D_UNPACKED) {
-        if (this->k_rowPtrs_.extent (0) == 0) {
+        if (rowPtrsUnpacked_host_.extent (0) == 0) {
           return static_cast<size_t> (0);
         }
         else {
-          return ::Tpetra::Details::getEntryOnHost (this->k_rowPtrs_, lclNumRows);
+          return rowPtrsUnpacked_host_(lclNumRows);
         }
       }
       else {
@@ -1118,13 +1158,8 @@ namespace Tpetra {
     using Teuchos::ArrayRCP;
     using std::endl;
     typedef Teuchos::ArrayRCP<size_t>::size_type size_type;
-    typedef typename local_graph_type::row_map_type::non_const_type
+    typedef typename local_graph_device_type::row_map_type::non_const_type
       non_const_row_map_type;
-    typedef typename local_graph_type::entries_type::non_const_type
-      lcl_col_inds_type;
-    typedef Kokkos::View<GlobalOrdinal*,
-      typename lcl_col_inds_type::array_layout,
-      device_type> gbl_col_inds_type;
     const char tfecfFuncName[] = "allocateIndices: ";
     const char suffix[] =
       "  Please report this bug to the Tpetra developers.";
@@ -1160,6 +1195,7 @@ namespace Tpetra {
     //
     //  STATIC ALLOCATION PROFILE
     //
+  {
     if (verbose) {
       std::ostringstream os;
       os << *prefix << "Allocate k_rowPtrs: " << (numRows+1) << endl;
@@ -1206,27 +1242,29 @@ namespace Tpetra {
     }
 
     // "Commit" the resulting row offsets.
-    this->k_rowPtrs_ = k_rowPtrs;
+    setRowPtrsUnpacked(k_rowPtrs);
+  }
 
-    const size_type numInds =
-      Details::getEntryOnHost(this->k_rowPtrs_, numRows);
+    const size_type numInds = rowPtrsUnpacked_host_(numRows);
     if (lg == LocalIndices) {
       if (verbose) {
         std::ostringstream os;
         os << *prefix << "Allocate local column indices "
-          "k_lclInds1D_: " << numInds << endl;
+          "lclIndsUnpacked_wdv: " << numInds << endl;
         std::cerr << os.str();
       }
-      k_lclInds1D_ = lcl_col_inds_type ("Tpetra::CrsGraph::ind", numInds);
+      lclIndsUnpacked_wdv = local_inds_wdv_type (
+                    local_inds_dualv_type("Tpetra::CrsGraph::lclInd",numInds));
     }
     else {
       if (verbose) {
         std::ostringstream os;
         os << *prefix << "Allocate global column indices "
-          "k_gblInds1D_: " << numInds << endl;
+          "gblInds_wdv: " << numInds << endl;
         std::cerr << os.str();
       }
-      k_gblInds1D_ = gbl_col_inds_type ("Tpetra::CrsGraph::ind", numInds);
+      gblInds_wdv = global_inds_wdv_type (
+                    global_inds_dualv_type("Tpetra::CrsGraph::gblInd",numInds));
     }
     storageStatus_ = Details::STORAGE_1D_UNPACKED;
 
@@ -1283,187 +1321,107 @@ namespace Tpetra {
   }
 
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
-  Teuchos::ArrayView<const LocalOrdinal>
+  typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
+                    local_inds_dualv_type::t_host::const_type
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
-  getLocalView (const RowInfo& rowinfo) const
+  getLocalIndsViewHost (const RowInfo& rowinfo) const
   {
-    using Kokkos::subview;
-    typedef LocalOrdinal LO;
-    typedef Kokkos::View<const LO*, device_type,
-      Kokkos::MemoryUnmanaged> row_view_type;
-
-    if (rowinfo.allocSize == 0) {
-      return Teuchos::ArrayView<const LO> ();
-    }
-    else { // nothing in the row to view
-      if (k_lclInds1D_.extent (0) != 0) { // 1-D storage
-        const size_t start = rowinfo.offset1D;
-        const size_t len = rowinfo.allocSize;
-        const std::pair<size_t, size_t> rng (start, start + len);
-        // mfh 23 Nov 2015: Don't just create a subview of
-        // k_lclInds1D_ directly, because that first creates a
-        // _managed_ subview, then returns an unmanaged version of
-        // that.  That touches the reference count, which costs
-        // performance in a measurable way.
-        row_view_type rowView = subview (row_view_type (k_lclInds1D_), rng);
-        const LO* const rowViewRaw = (len == 0) ? nullptr : rowView.data ();
-        return Teuchos::ArrayView<const LO> (rowViewRaw, len, Teuchos::RCP_DISABLE_NODE_LOOKUP);
-      }
-      else {
-        return Teuchos::ArrayView<const LO> (); // nothing in the row to view
-      }
-    }
+    // Read-only host view of the slots allocated to this row in the unpacked
+    // local-index storage; a default-constructed view when there are none.
+    const bool noStorage = rowinfo.allocSize == 0 || lclIndsUnpacked_wdv.extent(0) == 0;
+    if (noStorage) return typename local_inds_dualv_type::t_host::const_type ();
+    return lclIndsUnpacked_wdv.getHostSubview(rowinfo.offset1D, rowinfo.allocSize,
+                                              Access::ReadOnly);
   }
 
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
-  LocalOrdinal
+  typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
+                    local_inds_dualv_type::t_host
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
-  getLocalViewRawConst (const LocalOrdinal*& lclInds,
-                        LocalOrdinal& capacity,
-                        const RowInfo& rowInfo) const
+  getLocalIndsViewHostNonConst (const RowInfo& rowinfo)
   {
-    lclInds = nullptr;
-    capacity = 0;
-
-    if (rowInfo.allocSize != 0 && k_lclInds1D_.extent (0) != 0) {
-      if (debug_) {
-        if (rowInfo.offset1D + rowInfo.allocSize >
-            static_cast<size_t> (k_lclInds1D_.extent (0))) {
-          return static_cast<LocalOrdinal> (-1);
-        }
-      }
-      lclInds = k_lclInds1D_.data () + rowInfo.offset1D;
-      capacity = rowInfo.allocSize;
-    }
-    return static_cast<LocalOrdinal> (0);
+    // Read-write host view of the slots allocated to this row in the unpacked
+    // local-index storage; a default-constructed view when there are none.
+    using host_view_type = typename local_inds_dualv_type::t_host;
+    const bool hasStorage = rowinfo.allocSize != 0 && lclIndsUnpacked_wdv.extent(0) != 0;
+    if (! hasStorage) return host_view_type ();
+    return lclIndsUnpacked_wdv.getHostSubview(rowinfo.offset1D, rowinfo.allocSize, Access::ReadWrite);
   }
 
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
-  Teuchos::ArrayView<LocalOrdinal>
+  typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
+                    global_inds_dualv_type::t_host::const_type
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
-  getLocalViewNonConst (const RowInfo& rowinfo)
+  getGlobalIndsViewHost (const RowInfo& rowinfo) const
   {
-    using Kokkos::subview;
-    typedef LocalOrdinal LO;
-    typedef Kokkos::View<LO*, device_type,
-      Kokkos::MemoryUnmanaged> row_view_type;
-
-    if (rowinfo.allocSize == 0) { // nothing in the row to view
-      return Teuchos::ArrayView<LO> ();
-    }
-    else {
-      if (k_lclInds1D_.extent (0) != 0) { // 1-D storage
-        const size_t start = rowinfo.offset1D;
-        const size_t len = rowinfo.allocSize;
-        const std::pair<size_t, size_t> rng (start, start + len);
-        // mfh 23 Nov 2015: Don't just create a subview of
-        // k_lclInds1D_ directly, because that first creates a
-        // _managed_ subview, then returns an unmanaged version of
-        // that.  That touches the reference count, which costs
-        // performance in a measurable way.
-        row_view_type rowView = subview (row_view_type (k_lclInds1D_), rng);
-        LO* const rowViewRaw = (len == 0) ? nullptr : rowView.data ();
-        return Teuchos::ArrayView<LO> (rowViewRaw, len, Teuchos::RCP_DISABLE_NODE_LOOKUP);
-      }
-      else {
-        return Teuchos::ArrayView<LO> (); // nothing in the row to view
-      }
-    }
+    // Read-only host view of the slots allocated to this row in the
+    // global-index storage; a default-constructed view when there are none.
+    using host_view_type = typename global_inds_dualv_type::t_host::const_type;
+    const bool hasStorage = rowinfo.allocSize != 0 && gblInds_wdv.extent(0) != 0;
+    if (! hasStorage) return host_view_type ();
+    return gblInds_wdv.getHostSubview(rowinfo.offset1D, rowinfo.allocSize, Access::ReadOnly);
   }
 
-
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
-  Kokkos::View<const LocalOrdinal*,
-               typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::device_type,
-               Kokkos::MemoryUnmanaged>
+  typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
+                    local_inds_dualv_type::t_dev::const_type
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
-  getLocalKokkosRowView (const RowInfo& rowInfo) const
+  getLocalIndsViewDevice (const RowInfo& rowinfo) const
   {
-    typedef LocalOrdinal LO;
-    typedef Kokkos::View<const LO*, device_type,
-      Kokkos::MemoryUnmanaged> row_view_type;
-
-    if (rowInfo.allocSize == 0) {
-      return row_view_type ();
-    }
-    else { // nothing in the row to view
-      if (k_lclInds1D_.extent (0) != 0) { // 1-D storage
-        const size_t start = rowInfo.offset1D;
-        const size_t len = rowInfo.allocSize;
-        const std::pair<size_t, size_t> rng (start, start + len);
-        // mfh 23 Nov 2015: Don't just create a subview of
-        // k_lclInds1D_ directly, because that first creates a
-        // _managed_ subview, then returns an unmanaged version of
-        // that.  That touches the reference count, which costs
-        // performance in a measurable way.
-        return Kokkos::subview (row_view_type (k_lclInds1D_), rng);
-      }
-      else {
-        return row_view_type (); // nothing in the row to view
-      }
-    }
+    // Read-only device view of the slots allocated to this row in the unpacked
+    // local-index storage; a default-constructed view when there are none.
+    const bool noStorage = rowinfo.allocSize == 0 || lclIndsUnpacked_wdv.extent(0) == 0;
+    if (noStorage) return typename local_inds_dualv_type::t_dev::const_type ();
+    return lclIndsUnpacked_wdv.getDeviceSubview(rowinfo.offset1D, rowinfo.allocSize,
+                                                Access::ReadOnly);
   }
 
-
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
-  Kokkos::View<LocalOrdinal*,
-               typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::device_type,
-               Kokkos::MemoryUnmanaged>
+  typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
+                    global_inds_dualv_type::t_dev::const_type
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
-  getLocalKokkosRowViewNonConst (const RowInfo& rowInfo)
+  getGlobalIndsViewDevice (const RowInfo& rowinfo) const
   {
-    using row_view_type = Kokkos::View<LocalOrdinal*,
-      device_type, Kokkos::MemoryUnmanaged>;
-
-    if (rowInfo.allocSize == 0) {
-      return row_view_type ();
-    }
-    else { // nothing in the row to view
-      if (k_lclInds1D_.extent (0) != 0) { // 1-D storage
-        const size_t start = rowInfo.offset1D;
-        const size_t len = rowInfo.allocSize;
-        const std::pair<size_t, size_t> rng (start, start + len);
-        // mfh 23 Nov 2015: Don't just create a subview of
-        // k_lclInds1D_ directly, because that first creates a
-        // _managed_ subview, then returns an unmanaged version of
-        // that.  That touches the reference count, which costs
-        // performance in a measurable way.
-        return Kokkos::subview (row_view_type (this->k_lclInds1D_), rng);
-      }
-      else {
-        return row_view_type (); // nothing in the row to view
-      }
-    }
+    // Read-only device view of the slots allocated to this row in the
+    // global-index storage; a default-constructed view when there are none.
+    using dev_view_type = typename global_inds_dualv_type::t_dev::const_type;
+    const bool hasStorage = rowinfo.allocSize != 0 && gblInds_wdv.extent(0) != 0;
+    if (! hasStorage) return dev_view_type ();
+    return gblInds_wdv.getDeviceSubview(rowinfo.offset1D, rowinfo.allocSize, Access::ReadOnly);
   }
 
-
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
-  Kokkos::View<const GlobalOrdinal*,
-               typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::device_type,
-               Kokkos::MemoryUnmanaged>
+  Teuchos::ArrayView<const LocalOrdinal>
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
-  getGlobalKokkosRowView (const RowInfo& rowinfo) const
+  getLocalView (const RowInfo& rowinfo) const // deprecated: host ArrayView of the row's allocated local-index slots
   {
-    using row_view_type = Kokkos::View<const GlobalOrdinal*,
-      device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+    using Kokkos::subview;
+    typedef LocalOrdinal LO;
 
     if (rowinfo.allocSize == 0) {
-      return row_view_type ();
+      return Teuchos::ArrayView<const LO> ();
     }
-    else { // nothing in the row to view
-      if (this->k_gblInds1D_.extent (0) != 0) { // 1-D storage
+    else { // the row has a nonzero allocation
+      if (lclIndsUnpacked_wdv.extent (0) != 0) { // 1-D storage
         const size_t start = rowinfo.offset1D;
         const size_t len = rowinfo.allocSize;
         const std::pair<size_t, size_t> rng (start, start + len);
         // mfh 23 Nov 2015: Don't just create a subview of
-        // k_gblInds1D_ directly, because that first creates a
+        // lclIndsUnpacked_wdv directly, because that first creates a
         // _managed_ subview, then returns an unmanaged version of
         // that.  That touches the reference count, which costs
         // performance in a measurable way.
-        return Kokkos::subview (row_view_type (this->k_gblInds1D_), rng);
+        // KDDKDD  This function is deprecated, so the advice above about
+        // KDDKDD  unmanaged views is ignored.  The returned ArrayView wraps a raw
+        // KDDKDD  pointer, so it is not tracked by the host view's reference count.
+        auto rowViewHost = lclIndsUnpacked_wdv.getHostView(Access::ReadOnly);
+        auto rowView = subview(rowViewHost, rng);
+        const LO* const rowViewRaw = (len == 0) ? nullptr : rowView.data ();
+        return Teuchos::ArrayView<const LO> (rowViewRaw, len, Teuchos::RCP_DISABLE_NODE_LOOKUP);
       }
       else {
-        return row_view_type (); // nothing in the row to view
+        return Teuchos::ArrayView<const LO> (); // nothing in the row to view
       }
     }
   }
@@ -1477,77 +1435,26 @@ namespace Tpetra {
     using GO = global_ordinal_type;
 
     Teuchos::ArrayView<const GO> view;
-    if (rowinfo.allocSize > 0 && k_gblInds1D_.extent (0) != 0) {
+    if (rowinfo.allocSize > 0 && gblInds_wdv.extent (0) != 0) {
       const auto rng =
         std::make_pair (rowinfo.offset1D,
                         rowinfo.offset1D + rowinfo.allocSize);
       // mfh 23 Nov 2015: Don't just create a subview of
-      // k_gblInds1D_ directly, because that first creates a
+      // gblInds_wdv directly, because that first creates a
       // _managed_ subview, then returns an unmanaged version of
       // that.  That touches the reference count, which costs
       // performance in a measurable way.
-      using row_view_type = Kokkos::View<const GO*,
-        device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-      row_view_type k_gblInds1D_unmanaged = k_gblInds1D_;
+      // KDDKDD  This method is deprecated; we ignore the unmanaged bit above
+      // KDDKDD  Breaks the reference counting paradigm; unmanaged
+      // KDDKDD  memory does not do reference counting
+      auto gblInds = gblInds_wdv.getHostView(Access::ReadOnly);
       using Kokkos::Compat::getConstArrayView;
       using Kokkos::subview;
-      view = getConstArrayView (subview (k_gblInds1D_unmanaged, rng));
+      view = getConstArrayView (subview (gblInds, rng));
     }
     return view;
   }
-
-
-  template <class LocalOrdinal, class GlobalOrdinal, class Node>
-  LocalOrdinal
-  CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
-  getGlobalViewRawConst (const GlobalOrdinal*& gblInds,
-                         LocalOrdinal& capacity,
-                         const RowInfo& rowInfo) const
-  {
-    gblInds = nullptr;
-    capacity = 0;
-
-    if (rowInfo.allocSize != 0 && k_gblInds1D_.extent (0) != 0) {
-      if (debug_) {
-        if (rowInfo.offset1D + rowInfo.allocSize >
-            static_cast<size_t> (k_gblInds1D_.extent (0))) {
-          return static_cast<LocalOrdinal> (-1);
-        }
-      }
-      gblInds = k_gblInds1D_.data () + rowInfo.offset1D;
-      capacity = rowInfo.allocSize;
-    }
-    return static_cast<LocalOrdinal> (0);
-  }
-
-
-  template <class LocalOrdinal, class GlobalOrdinal, class Node>
-  Teuchos::ArrayView<GlobalOrdinal>
-  CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
-  getGlobalViewNonConst (const RowInfo& rowinfo)
-  {
-    using GO = global_ordinal_type;
-
-    Teuchos::ArrayView<GO> view;
-    if (rowinfo.allocSize > 0 && k_gblInds1D_.extent (0) != 0) {
-      const auto rng =
-        std::make_pair (rowinfo.offset1D,
-                        rowinfo.offset1D + rowinfo.allocSize);
-      // mfh 23 Nov 2015: Don't just create a subview of
-      // k_gblInds1D_ directly, because that first creates a
-      // _managed_ subview, then returns an unmanaged version of
-      // that.  That touches the reference count, which costs
-      // performance in a measurable way.
-      using row_view_type = Kokkos::View<GO*, device_type,
-        Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-      row_view_type k_gblInds1D_unmanaged = k_gblInds1D_;
-      using Kokkos::Compat::getArrayView;
-      using Kokkos::subview;
-      view = getArrayView (subview (k_gblInds1D_unmanaged, rng));
-    }
-    return view;
-  }
-
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
 
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
   RowInfo
@@ -1567,13 +1474,13 @@ namespace Tpetra {
     ret.localRow = static_cast<size_t> (myRow);
     if (this->indicesAreAllocated ()) {
       // Offsets tell us the allocation size in this case.
-      if (this->k_rowPtrs_.extent (0) == 0) {
+      if (rowPtrsUnpacked_host_.extent (0) == 0) {
         ret.offset1D  = 0;
         ret.allocSize = 0;
       }
       else {
-        ret.offset1D  = this->k_rowPtrs_(myRow);
-        ret.allocSize = this->k_rowPtrs_(myRow+1) - this->k_rowPtrs_(myRow);
+        ret.offset1D  = rowPtrsUnpacked_host_(myRow);
+        ret.allocSize = rowPtrsUnpacked_host_(myRow+1) - rowPtrsUnpacked_host_(myRow);
       }
 
       ret.numEntries = (this->k_numRowEntries_.extent (0) == 0) ?
@@ -1623,13 +1530,13 @@ namespace Tpetra {
       // graph data structures have the info that we need
       //
       // if static graph, offsets tell us the allocation size
-      if (this->k_rowPtrs_.extent (0) == 0) {
+      if (rowPtrsUnpacked_host_.extent (0) == 0) {
         ret.offset1D  = 0;
         ret.allocSize = 0;
       }
       else {
-        ret.offset1D  = this->k_rowPtrs_(myRow);
-        ret.allocSize = this->k_rowPtrs_(myRow+1) - this->k_rowPtrs_(myRow);
+        ret.offset1D  = rowPtrsUnpacked_host_(myRow);
+        ret.allocSize = rowPtrsUnpacked_host_(myRow+1) - rowPtrsUnpacked_host_(myRow);
       }
 
       ret.numEntries = (this->k_numRowEntries_.extent (0) == 0) ?
@@ -1722,7 +1629,7 @@ namespace Tpetra {
       ArrayView<const GO> new_ginds = newInds.ginds;
       numNewInds = new_ginds.size();
       if (I == GlobalIndices) { // store global indices
-        ArrayView<GO> gind_view = this->getGlobalViewNonConst (rowinfo);
+        auto gind_view = gblInds_wdv.getHostView(Access::ReadWrite);
         if (debug_) {
           TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
             (static_cast<size_t> (gind_view.size ()) <
@@ -1731,13 +1638,14 @@ namespace Tpetra {
              << " < rowinfo.numEntries (= " << rowinfo.numEntries
              << ") + numNewInds (= " << numNewInds << ").");
         }
-        GO* const gblColInds_out = gind_view.getRawPtr () + rowinfo.numEntries;
+        GO* const gblColInds_out = gind_view.data () + rowinfo.offset1D
+                                                     + rowinfo.numEntries;
         for (size_t k = 0; k < numNewInds; ++k) {
           gblColInds_out[k] = new_ginds[k];
         }
       }
       else if (I == LocalIndices) { // store local indices
-        ArrayView<LO> lind_view = this->getLocalViewNonConst (rowinfo);
+        auto lind_view = lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
         if (debug_) {
           TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
             (static_cast<size_t> (lind_view.size ()) <
@@ -1746,7 +1654,8 @@ namespace Tpetra {
              << " < rowinfo.numEntries (= " << rowinfo.numEntries
              << ") + numNewInds (= " << numNewInds << ").");
         }
-        LO* const lclColInds_out = lind_view.getRawPtr () + rowinfo.numEntries;
+        LO* const lclColInds_out = lind_view.data () + rowinfo.offset1D
+                                                     + rowinfo.numEntries;
         for (size_t k = 0; k < numNewInds; ++k) {
           lclColInds_out[k] = colMap_->getLocalElement (new_ginds[k]);
         }
@@ -1756,7 +1665,7 @@ namespace Tpetra {
       ArrayView<const LO> new_linds = newInds.linds;
       numNewInds = new_linds.size();
       if (I == LocalIndices) { // store local indices
-        ArrayView<LO> lind_view = this->getLocalViewNonConst (rowinfo);
+        auto lind_view = lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
         if (debug_) {
           TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
             (static_cast<size_t> (lind_view.size ()) <
@@ -1765,7 +1674,8 @@ namespace Tpetra {
              << " < rowinfo.numEntries (= " << rowinfo.numEntries
              << ") + numNewInds (= " << numNewInds << ").");
         }
-        LO* const lclColInds_out = lind_view.getRawPtr () + rowinfo.numEntries;
+        LO* const lclColInds_out = lind_view.data () + rowinfo.offset1D
+                                                     + rowinfo.numEntries;
         for (size_t k = 0; k < numNewInds; ++k) {
           lclColInds_out[k] = new_linds[k];
         }
@@ -1830,8 +1740,13 @@ namespace Tpetra {
     auto numEntries = rowInfo.numEntries;
     using inp_view_type = View<const GO*, Kokkos::HostSpace, MemoryUnmanaged>;
     inp_view_type inputInds(inputGblColInds, numInputInds);
-    size_t numInserted = Details::insertCrsIndices(lclRow, k_rowPtrs_,
-      this->k_gblInds1D_, numEntries, inputInds, fun);
+    size_t numInserted;
+    {
+      auto gblIndsHostView = this->gblInds_wdv.getHostView(Access::ReadWrite);
+      numInserted = Details::insertCrsIndices(lclRow, this->rowPtrsUnpacked_host_,
+                                              gblIndsHostView,
+                                              numEntries, inputInds, fun);
+    }
 
     const bool insertFailed =
       numInserted == Teuchos::OrdinalTraits<size_t>::invalid();
@@ -1855,9 +1770,8 @@ namespace Tpetra {
       verbosePrintArray(os, inputGblColIndsView, "Input global "
                         "column indices", maxNumToPrint);
       os << ", ";
-      const GO* const curGblColInds =
-        k_gblInds1D_.data() + rowInfo.offset1D;
-      ArrayView<const GO> curGblColIndsView(curGblColInds,
+      auto curGblColInds = getGlobalIndsViewHost(rowInfo);
+      ArrayView<const GO> curGblColIndsView(curGblColInds.data(),
                                             rowInfo.numEntries);
       verbosePrintArray(os, curGblColIndsView, "Current global "
                         "column indices", maxNumToPrint);
@@ -1866,6 +1780,7 @@ namespace Tpetra {
     }
 
     this->k_numRowEntries_(lclRow) += numInserted;
+
     this->setLocallyModified();
     return numInserted;
   }
@@ -1893,8 +1808,12 @@ namespace Tpetra {
     // Note: Teuchos::ArrayViews are in HostSpace
     using inp_view_type = View<const LO*, Kokkos::HostSpace, MemoryUnmanaged>;
     inp_view_type inputInds(indices.getRawPtr(), indices.size());
-    auto numInserted = Details::insertCrsIndices(myRow, k_rowPtrs_,
-      this->k_lclInds1D_, numEntries, inputInds, fun);
+    size_t numInserted = 0;
+    {
+      auto lclInds = lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
+      numInserted = Details::insertCrsIndices(myRow, rowPtrsUnpacked_host_, lclInds,
+                                              numEntries, inputInds, fun);
+    }
 
     const bool insertFailed =
       numInserted == Teuchos::OrdinalTraits<size_t>::invalid();
@@ -1941,7 +1860,7 @@ namespace Tpetra {
     using Kokkos::MemoryUnmanaged;
     auto invalidCount = Teuchos::OrdinalTraits<size_t>::invalid();
 
-    using inp_view_type = View<const GO*, device_type, MemoryUnmanaged>;
+    using inp_view_type = View<const GO*, Kokkos::HostSpace, MemoryUnmanaged>;
     inp_view_type inputInds(indices.getRawPtr(), indices.size());
 
     size_t numFound = 0;
@@ -1952,13 +1871,15 @@ namespace Tpetra {
         return invalidCount;
       const auto& colMap = *(this->colMap_);
       auto map = [&](GO const gblInd){return colMap.getLocalElement(gblInd);};
-      numFound = Details::findCrsIndices(lclRow, k_rowPtrs_, rowInfo.numEntries,
-        this->k_lclInds1D_, inputInds, map, fun);
+      numFound = Details::findCrsIndices(lclRow, rowPtrsUnpacked_host_,
+        rowInfo.numEntries,
+        lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun);
     }
     else if (this->isGloballyIndexed())
     {
-      numFound = Details::findCrsIndices(lclRow, k_rowPtrs_, rowInfo.numEntries,
-        this->k_gblInds1D_, inputInds, fun);
+      numFound = Details::findCrsIndices(lclRow, rowPtrsUnpacked_host_,
+        rowInfo.numEntries,
+        gblInds_wdv.getHostView(Access::ReadOnly), inputInds, fun);
     }
     return numFound;
   }
@@ -1974,18 +1895,16 @@ namespace Tpetra {
     const size_t origNumEnt = rowInfo.numEntries;
     if (origNumEnt != Tpetra::Details::OrdinalTraits<size_t>::invalid () &&
         origNumEnt != 0) {
-      auto lclColInds = this->getLocalKokkosRowViewNonConst (rowInfo);
+      auto lclColInds = this->getLocalIndsViewHostNonConst (rowInfo);
 
       LocalOrdinal* const lclColIndsRaw = lclColInds.data ();
       if (! sorted) {
-        // FIXME (mfh 08 May 2017) This assumes CUDA UVM.
         std::sort (lclColIndsRaw, lclColIndsRaw + origNumEnt);
       }
 
       if (! merged) {
         LocalOrdinal* const beg = lclColIndsRaw;
         LocalOrdinal* const end = beg + rowInfo.numEntries;
-        // FIXME (mfh 08 May 2017) This assumes CUDA UVM.
         LocalOrdinal* const newend = std::unique (beg, end);
         const size_t newNumEnt = newend - beg;
 
@@ -2138,45 +2057,56 @@ namespace Tpetra {
          "nonzero, or k_numAllocPerRow_ has nonzero dimension.  In other words, "
          "the graph is supposed to release its \"allocation specifications\" "
          "when it allocates its indices." << suffix);
-      if (isGloballyIndexed() && k_rowPtrs_.extent(0) != 0) {
+      TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+        (rowPtrsUnpacked_host_.extent(0) != rowPtrsUnpacked_dev_.extent(0),
+         std::logic_error, "The host and device views of k_rowPtrs_ have "
+         "different sizes; rowPtrsUnpacked_host_ has size  "
+         << rowPtrsUnpacked_host_.extent(0)
+         << ", but rowPtrsUnpacked_dev_ has size "
+         << rowPtrsUnpacked_dev_.extent(0)
+         << "." << suffix);
+      if (isGloballyIndexed() && rowPtrsUnpacked_host_.extent(0) != 0) {
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (size_t(k_rowPtrs_.extent(0)) != size_t(lclNumRows + 1),
+          (size_t(rowPtrsUnpacked_host_.extent(0)) != size_t(lclNumRows + 1),
            std::logic_error, "The graph is globally indexed and "
-           "k_rowPtrs_ has nonzero size " << k_rowPtrs_.extent(0)
+           "k_rowPtrs has nonzero size " << rowPtrsUnpacked_host_.extent(0)
            << ", but that size does not equal lclNumRows+1 = "
            << (lclNumRows+1) << "." << suffix);
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (k_rowPtrs_(lclNumRows) != size_t(k_gblInds1D_.extent(0)),
+          (rowPtrsUnpacked_host_(lclNumRows) != size_t(gblInds_wdv.extent(0)),
            std::logic_error, "The graph is globally indexed and "
-           "k_rowPtrs_ has nonzero size " << k_rowPtrs_.extent(0)
+           "k_rowPtrs_ has nonzero size " << rowPtrsUnpacked_host_.extent(0)
            << ", but k_rowPtrs_(lclNumRows=" << lclNumRows << ")="
-           << k_rowPtrs_(lclNumRows) << " != k_gblInds1D_.extent(0)="
-           << k_gblInds1D_.extent(0) << "." << suffix);
+           << rowPtrsUnpacked_host_(lclNumRows) 
+           << " != gblInds_wdv.extent(0)="
+           << gblInds_wdv.extent(0) << "." << suffix);
       }
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (this->isLocallyIndexed () &&
-         this->k_rowPtrs_.extent (0) != 0 &&
-         (static_cast<size_t> (k_rowPtrs_.extent (0)) != static_cast<size_t> (lclNumRows + 1) ||
-          this->k_rowPtrs_(lclNumRows) != static_cast<size_t> (this->k_lclInds1D_.extent (0))),
+         this->rowPtrsUnpacked_host_.extent (0) != 0 &&
+         (static_cast<size_t> (rowPtrsUnpacked_host_.extent (0)) != 
+              static_cast<size_t> (lclNumRows + 1) ||
+          this->rowPtrsUnpacked_host_(lclNumRows) != 
+              static_cast<size_t> (this->lclIndsUnpacked_wdv.extent (0))),
          std::logic_error, "If k_rowPtrs_ has nonzero size and "
          "the graph is locally indexed, then "
          "k_rowPtrs_ must have N+1 rows, and "
-         "k_rowPtrs_(N) must equal k_lclInds1D_.extent(0)." << suffix);
+         "k_rowPtrs_(N) must equal lclIndsUnpacked_wdv.extent(0)." << suffix);
 
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (this->indicesAreAllocated () &&
          nodeAllocSize > 0 &&
-         this->k_lclInds1D_.extent (0) == 0 &&
-         this->k_gblInds1D_.extent (0) == 0,
+         this->lclIndsUnpacked_wdv.extent (0) == 0 &&
+         this->gblInds_wdv.extent (0) == 0,
          std::logic_error, "Graph is allocated nontrivially, but "
          "but 1-D allocations are not present." << suffix);
 
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (! this->indicesAreAllocated () &&
-         ((this->k_rowPtrs_.extent (0) != 0 ||
+         ((this->rowPtrsUnpacked_host_.extent (0) != 0 ||
            this->k_numRowEntries_.extent (0) != 0) ||
-          this->k_lclInds1D_.extent (0) != 0 ||
-          this->k_gblInds1D_.extent (0) != 0),
+          this->lclIndsUnpacked_wdv.extent (0) != 0 ||
+          this->gblInds_wdv.extent (0) != 0),
          std::logic_error, "If indices are not allocated, "
          "then none of the buffers should be." << suffix);
       // indices may be local or global only if they are allocated
@@ -2191,58 +2121,58 @@ namespace Tpetra {
         (this->indicesAreLocal_ && this->indicesAreGlobal_,
          std::logic_error, "Indices may not be both local and global." << suffix);
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-        (indicesAreLocal_ && k_gblInds1D_.extent (0) != 0,
+        (indicesAreLocal_ && gblInds_wdv.extent (0) != 0,
          std::logic_error, "Indices are local, but "
-         "k_gblInds1D_.extent(0) (= " << k_gblInds1D_.extent (0)
+         "gblInds_wdv.extent(0) (= " << gblInds_wdv.extent (0)
          << ") != 0.  In other words, if indices are local, then "
          "allocations of global indices should not be present."
          << suffix);
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-        (indicesAreGlobal_ && k_lclInds1D_.extent (0) != 0,
+        (indicesAreGlobal_ && lclIndsUnpacked_wdv.extent (0) != 0,
          std::logic_error, "Indices are global, but "
-         "k_lclInds1D_.extent(0) (= " << k_lclInds1D_.extent(0)
+         "lclIndsUnpacked_wdv.extent(0) (= " << lclIndsUnpacked_wdv.extent(0)
          << ") != 0.  In other words, if indices are global, "
          "then allocations for local indices should not be present."
          << suffix);
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (indicesAreLocal_ && nodeAllocSize > 0 &&
-         k_lclInds1D_.extent (0) == 0 && getNodeNumRows () > 0,
+         lclIndsUnpacked_wdv.extent (0) == 0 && getNodeNumRows () > 0,
          std::logic_error, "Indices are local and "
          "getNodeAllocationSize() = " << nodeAllocSize << " > 0, but "
-         "k_lclInds1D_.extent(0) = 0 and getNodeNumRows() = "
+         "lclIndsUnpacked_wdv.extent(0) = 0 and getNodeNumRows() = "
          << getNodeNumRows () << " > 0." << suffix);
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (indicesAreGlobal_ && nodeAllocSize > 0 &&
-         k_gblInds1D_.extent (0) == 0 && getNodeNumRows () > 0,
+         gblInds_wdv.extent (0) == 0 && getNodeNumRows () > 0,
          std::logic_error, "Indices are global and "
          "getNodeAllocationSize() = " << nodeAllocSize << " > 0, but "
-         "k_gblInds1D_.extent(0) = 0 and getNodeNumRows() = "
+         "gblInds_wdv.extent(0) = 0 and getNodeNumRows() = "
          << getNodeNumRows () << " > 0." << suffix);
       // check the actual allocations
       if (this->indicesAreAllocated () &&
-          this->k_rowPtrs_.extent (0) != 0) {
+          this->rowPtrsUnpacked_host_.extent (0) != 0) {
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (static_cast<size_t> (this->k_rowPtrs_.extent (0)) !=
+          (static_cast<size_t> (this->rowPtrsUnpacked_host_.extent (0)) !=
            this->getNodeNumRows () + 1,
            std::logic_error, "Indices are allocated and "
-           "k_rowPtrs_ has nonzero length, but k_rowPtrs_.extent(0) = "
-           << this->k_rowPtrs_.extent (0) << " != getNodeNumRows()+1 = "
+           "k_rowPtrs_ has nonzero length, but rowPtrsUnpacked_host_.extent(0) = "
+           << this->rowPtrsUnpacked_host_.extent (0) << " != getNodeNumRows()+1 = "
            << (this->getNodeNumRows () + 1) << "." << suffix);
-        const size_t actualNumAllocated =
-          ::Tpetra::Details::getEntryOnHost (this->k_rowPtrs_, this->getNodeNumRows ());
+        const size_t actualNumAllocated = 
+              this->rowPtrsUnpacked_host_(this->getNodeNumRows());
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
           (this->isLocallyIndexed () &&
-           static_cast<size_t> (this->k_lclInds1D_.extent (0)) != actualNumAllocated,
+           static_cast<size_t> (this->lclIndsUnpacked_wdv.extent (0)) != actualNumAllocated,
            std::logic_error, "Graph is locally indexed, indices are "
            "are allocated, and k_rowPtrs_ has nonzero length, but "
-           "k_lclInds1D_.extent(0) = " << this->k_lclInds1D_.extent (0)
+           "lclIndsUnpacked_wdv.extent(0) = " << this->lclIndsUnpacked_wdv.extent (0)
            << " != actualNumAllocated = " << actualNumAllocated << suffix);
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
           (this->isGloballyIndexed () &&
-           static_cast<size_t> (this->k_gblInds1D_.extent (0)) != actualNumAllocated,
+           static_cast<size_t> (this->gblInds_wdv.extent (0)) != actualNumAllocated,
            std::logic_error, "Graph is globally indexed, indices "
            "are allocated, and k_rowPtrs_ has nonzero length, but "
-           "k_gblInds1D_.extent(0) = " << this->k_gblInds1D_.extent (0)
+           "gblInds_wdv.extent(0) = " << this->gblInds_wdv.extent (0)
            << " != actualNumAllocated = " << actualNumAllocated << suffix);
       }
 
@@ -2321,14 +2251,13 @@ namespace Tpetra {
   getNodeRowPtrs () const
   {
     using Kokkos::ViewAllocateWithoutInitializing;
-    using Kokkos::create_mirror_view;
     using Teuchos::ArrayRCP;
-    typedef typename local_graph_type::row_map_type row_map_type;
+    typedef typename local_graph_device_type::row_map_type row_map_type;
     typedef typename row_map_type::non_const_value_type row_offset_type;
     const char prefix[] = "Tpetra::CrsGraph::getNodeRowPtrs: ";
     const char suffix[] = "  Please report this bug to the Tpetra developers.";
 
-    const size_t size = k_rowPtrs_.extent (0);
+    const size_t size = rowPtrsUnpacked_host_.extent (0);
     constexpr bool same = std::is_same<size_t, row_offset_type>::value;
 
     if (size == 0) {
@@ -2338,35 +2267,17 @@ namespace Tpetra {
     ArrayRCP<const row_offset_type> ptr_rot;
     ArrayRCP<const size_t> ptr_st;
     if (same) { // size_t == row_offset_type
-      // NOTE (mfh 22 Mar 2015) In a debug build of Kokkos, the result
-      // of create_mirror_view might actually be a new allocation.
-      // This helps with debugging when there are two memory spaces.
-      typename row_map_type::HostMirror ptr_h = create_mirror_view (k_rowPtrs_);
-      Kokkos::deep_copy (ptr_h, k_rowPtrs_);
-      if (debug_) {
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (ptr_h.extent (0) != k_rowPtrs_.extent (0), std::logic_error,
-           prefix << "size_t == row_offset_type, but ptr_h.extent(0) = "
-           << ptr_h.extent (0) << " != k_rowPtrs_.extent(0) = "
-           << k_rowPtrs_.extent (0) << ".");
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (same && size != 0 && k_rowPtrs_.data () == nullptr, std::logic_error,
-           prefix << "size_t == row_offset_type and k_rowPtrs_.extent(0) = "
-           << size << " != 0, but k_rowPtrs_.data() == nullptr." << suffix);
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (same && size != 0 && ptr_h.data () == nullptr, std::logic_error,
-           prefix << "size_t == row_offset_type and k_rowPtrs_.extent(0) = "
-           << size << " != 0, but create_mirror_view(k_rowPtrs_).data() "
-           "== nullptr." << suffix);
-      }
-      ptr_rot = Kokkos::Compat::persistingView (ptr_h);
+      ptr_rot = Kokkos::Compat::persistingView (rowPtrsUnpacked_host_);
     }
     else { // size_t != row_offset_type
       typedef Kokkos::View<size_t*, device_type> ret_view_type;
       ret_view_type ptr_d (ViewAllocateWithoutInitializing ("ptr"), size);
-      ::Tpetra::Details::copyOffsets (ptr_d, k_rowPtrs_);
-      typename ret_view_type::HostMirror ptr_h = create_mirror_view (ptr_d);
-      Kokkos::deep_copy (ptr_h, ptr_d);
+
+      ::Tpetra::Details::copyOffsets (ptr_d, rowPtrsUnpacked_dev_);
+
+      typename ret_view_type::HostMirror ptr_h = 
+                                         Kokkos::create_mirror_view (ptr_d);
+      Kokkos::deep_copy(ptr_h, ptr_d);
       ptr_st = Kokkos::Compat::persistingView (ptr_h);
     }
     if (debug_) {
@@ -2400,10 +2311,64 @@ namespace Tpetra {
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
   getNodePackedIndices () const
   {
-    return Kokkos::Compat::persistingView (k_lclInds1D_);
+    // KDDKDD  UVM REMOVAL:  3/21
+    // KDDKDD  This function used to return k_lclInds1D_.
+    // KDDKDD  I retain its behavior by returning lclIndsUnpacked_wdv.getHostView.
+    // KDDKDD  However, k_lclInds1D_ was not necessarily PACKED;
+    // KDDKDD  PACKED indices are in the static graph.
+    // KDDKDD  However, with OptimizeStorage, k_lclInds1D_ was PACKED.
+    // return Kokkos::Compat::persistingView (k_lclInds1D_);
+    return Kokkos::Compat::persistingView (
+                           lclIndsUnpacked_wdv.getHostView(Access::ReadOnly));
   }
 
 
+  template <class LocalOrdinal, class GlobalOrdinal, class Node>
+  void
+  CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
+  getLocalRowCopy (LocalOrdinal localRow,
+                   nonconst_local_inds_host_view_type & indices,
+                   size_t& numEntries) const
+  {
+    using Teuchos::ArrayView;
+    const char tfecfFuncName[] = "getLocalRowCopy: ";
+    // Copy row localRow's column indices into indices as local indices; numEntries gets the row's entry count.
+    TEUCHOS_TEST_FOR_EXCEPTION(
+      isGloballyIndexed () && ! hasColMap (), std::runtime_error,
+      "Tpetra::CrsGraph::getLocalRowCopy: The graph is globally indexed and "
+      "does not have a column Map yet.  That means we don't have local indices "
+      "for columns yet, so it doesn't make sense to call this method.  If the "
+      "graph doesn't have a column Map yet, you should call fillComplete on "
+      "it first.");
+
+    // This does the right thing (reports an empty row) if the input
+    // row is invalid.
+    const RowInfo rowinfo = this->getRowInfo (localRow);
+    // No side effects on error: numEntries is written only after the size check.
+    const size_t theNumEntries = rowinfo.numEntries;
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (static_cast<size_t> (indices.size ()) < theNumEntries,std::runtime_error,
+       "Specified storage (size==" << indices.size () << ") does not suffice "
+       "to hold all " << theNumEntries << " entry/ies for this row.");
+    numEntries = theNumEntries;
+    // indices is written only when getRowInfo returned a valid row.
+    if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
+      if (isLocallyIndexed ()) {
+        auto lclInds = getLocalIndsViewHost(rowinfo);
+        for (size_t j = 0; j < theNumEntries; ++j) {
+          indices[j] = lclInds(j); // stored indices are already local
+        }
+      }
+      else if (isGloballyIndexed ()) {
+        auto gblInds = getGlobalIndsViewHost(rowinfo);
+        for (size_t j = 0; j < theNumEntries; ++j) {
+          indices[j] = colMap_->getLocalElement (gblInds(j)); // global -> local via the column Map
+        }
+      }
+    }
+  }
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
   void
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
@@ -2412,8 +2377,6 @@ namespace Tpetra {
                    size_t& numEntries) const
   {
     using Teuchos::ArrayView;
-    typedef LocalOrdinal LO;
-    typedef GlobalOrdinal GO;
     const char tfecfFuncName[] = "getLocalRowCopy: ";
 
     TEUCHOS_TEST_FOR_EXCEPTION(
@@ -2430,28 +2393,67 @@ namespace Tpetra {
     // No side effects on error.
     const size_t theNumEntries = rowinfo.numEntries;
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-      (static_cast<size_t> (indices.size ()) < theNumEntries, std::runtime_error,
+      (static_cast<size_t> (indices.size ()) < theNumEntries,std::runtime_error,
        "Specified storage (size==" << indices.size () << ") does not suffice "
        "to hold all " << theNumEntries << " entry/ies for this row.");
     numEntries = theNumEntries;
 
     if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
       if (isLocallyIndexed ()) {
-        ArrayView<const LO> lview = getLocalView (rowinfo);
+        auto lclInds = getLocalIndsViewHost(rowinfo);
+        for (size_t j = 0; j < theNumEntries; ++j) {
+          indices[j] = lclInds(j);
+        }
+      }
+      else if (isGloballyIndexed ()) {
+        auto gblInds = getGlobalIndsViewHost(rowinfo);
+        for (size_t j = 0; j < theNumEntries; ++j) {
+          indices[j] = colMap_->getLocalElement (gblInds(j));
+        }
+      }
+    }
+  }
+#endif
+
+
+  template <class LocalOrdinal, class GlobalOrdinal, class Node>
+  void
+  CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
+  getGlobalRowCopy (GlobalOrdinal globalRow,
+                    nonconst_global_inds_host_view_type &indices,
+                    size_t& numEntries) const
+  {
+    using Teuchos::ArrayView;
+    const char tfecfFuncName[] = "getGlobalRowCopy: ";
+    // Copy row globalRow's column indices into indices as global indices; numEntries gets the row's entry count.
+    // This does the right thing (reports an empty row) if the input
+    // row is invalid.
+    const RowInfo rowinfo = getRowInfoFromGlobalRowIndex (globalRow);
+    const size_t theNumEntries = rowinfo.numEntries;
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
+      static_cast<size_t> (indices.size ()) < theNumEntries, std::runtime_error,
+      "Specified storage (size==" << indices.size () << ") does not suffice "
+      "to hold all " << theNumEntries << " entry/ies for this row.");
+    numEntries = theNumEntries; // first side effect
+    // indices is written only when the row lookup returned a valid row.
+    if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
+      if (isLocallyIndexed ()) {
+        auto lclInds = getLocalIndsViewHost(rowinfo);
         for (size_t j = 0; j < theNumEntries; ++j) {
-          indices[j] = lview[j];
+          indices[j] = colMap_->getGlobalElement (lclInds(j)); // local -> global via the column Map
         }
       }
       else if (isGloballyIndexed ()) {
-        ArrayView<const GO> gview = getGlobalView (rowinfo);
+        auto gblInds = getGlobalIndsViewHost(rowinfo);
         for (size_t j = 0; j < theNumEntries; ++j) {
-          indices[j] = colMap_->getLocalElement (gview[j]);
+          indices[j] = gblInds(j); // stored indices are already global
         }
       }
     }
   }
 
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
   void
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
@@ -2474,23 +2476,102 @@ namespace Tpetra {
 
     if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
       if (isLocallyIndexed ()) {
-        ArrayView<const LocalOrdinal> lview = getLocalView (rowinfo);
+        auto lclInds = getLocalIndsViewHost(rowinfo);
         for (size_t j = 0; j < theNumEntries; ++j) {
-          indices[j] = colMap_->getGlobalElement (lview[j]);
+          indices[j] = colMap_->getGlobalElement (lclInds(j));
         }
       }
       else if (isGloballyIndexed ()) {
-        ArrayView<const GlobalOrdinal> gview = getGlobalView (rowinfo);
+        auto gblInds = getGlobalIndsViewHost(rowinfo);
         for (size_t j = 0; j < theNumEntries; ++j) {
-          indices[j] = gview[j];
+          indices[j] = gblInds(j);
         }
       }
     }
   }
+#endif
 
 
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
   void
+  CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
+  getLocalRowView (
+    const LocalOrdinal localRow, 
+    local_inds_host_view_type &indices) const
+  {
+    const char tfecfFuncName[] = "getLocalRowView: ";
+    // Point indices at a read-only host subview of the local column indices stored for row localRow.
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (isGloballyIndexed (), std::runtime_error, "The graph's indices are "
+       "currently stored as global indices, so we cannot return a view with "
+       "local column indices, whether or not the graph has a column Map.  If "
+       "the graph _does_ have a column Map, use getLocalRowCopy() instead.");
+
+    const RowInfo rowInfo = getRowInfo (localRow);
+    if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
+        rowInfo.numEntries > 0) {
+      indices = lclIndsUnpacked_wdv.getHostSubview(rowInfo.offset1D, 
+                                           rowInfo.numEntries,
+                                           Access::ReadOnly);
+    }
+    else {
+      // This does the right thing (reports an empty row) if the input
+      // row is invalid or has no entries.
+      indices = local_inds_host_view_type();
+    }
+    // When debug_ is set, cross-check the view length against getNumEntriesInLocalRow.
+    if (debug_) {
+      TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+        (static_cast<size_t> (indices.size ()) !=
+         getNumEntriesInLocalRow (localRow), std::logic_error, "indices.size() "
+         "= " << indices.extent(0) << " != getNumEntriesInLocalRow(localRow=" <<
+         localRow << ") = " << getNumEntriesInLocalRow(localRow) <<
+         ".  Please report this bug to the Tpetra developers.");
+    }
+  }
+
+
+  template <class LocalOrdinal, class GlobalOrdinal, class Node>
+  void
+  CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
+  getGlobalRowView (
+    const GlobalOrdinal globalRow,
+    global_inds_host_view_type &indices) const
+  {
+    const char tfecfFuncName[] = "getGlobalRowView: ";
+    // Point indices at a read-only host subview of the global column indices stored for row globalRow.
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (isLocallyIndexed (), std::runtime_error, "The graph's indices are "
+       "currently stored as local indices, so we cannot return a view with "
+       "global column indices.  Use getGlobalRowCopy() instead.");
+
+    // This does the right thing (reports an empty row) if the input
+    // row is invalid.
+    const RowInfo rowInfo = getRowInfoFromGlobalRowIndex (globalRow);
+    if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
+        rowInfo.numEntries > 0) {
+      indices = gblInds_wdv.getHostSubview(rowInfo.offset1D,
+                                           rowInfo.numEntries,
+                                           Access::ReadOnly);
+    }
+    else { // invalid row or zero entries: hand back an empty view
+      indices = global_inds_host_view_type(); // same type as the output parameter, as in getLocalRowView
+    }
+    if (debug_) { // cross-check the view length against getNumEntriesInGlobalRow
+      TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+        (static_cast<size_t> (indices.size ()) !=
+         getNumEntriesInGlobalRow (globalRow),
+         std::logic_error, "indices.size() = " << indices.extent(0)
+         << " != getNumEntriesInGlobalRow(globalRow=" << globalRow << ") = "
+         << getNumEntriesInGlobalRow (globalRow)
+         << ".  Please report this bug to the Tpetra developers.");
+    }
+  }
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+  template <class LocalOrdinal, class GlobalOrdinal, class Node>
+  void
+//  TPETRA_DEPRECATED
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
   getLocalRowView (const LocalOrdinal localRow,
                    Teuchos::ArrayView<const LocalOrdinal>& indices) const
@@ -2527,9 +2608,12 @@ namespace Tpetra {
     }
   }
 
+#endif
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
   void
+//  TPETRA_DEPRECATED
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
   getGlobalRowView (const GlobalOrdinal globalRow,
                     Teuchos::ArrayView<const GlobalOrdinal>& indices) const
@@ -2560,6 +2644,7 @@ namespace Tpetra {
          << ".  Please report this bug to the Tpetra developers.");
     }
   }
+#endif
 
 
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -2855,8 +2940,8 @@ namespace Tpetra {
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
   void
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
-  setAllIndices (const typename local_graph_type::row_map_type& rowPointers,
-                 const typename local_graph_type::entries_type::non_const_type& columnIndices)
+  setAllIndices (const typename local_graph_device_type::row_map_type& rowPointers,
+                 const typename local_graph_device_type::entries_type::non_const_type& columnIndices)
   {
     const char tfecfFuncName[] = "setAllIndices: ";
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
@@ -2881,8 +2966,8 @@ namespace Tpetra {
     if (debug_ && this->isSorted()) {
       // Verify that the local indices are actually sorted
       int notSorted = 0;
-      using exec_space = typename local_graph_type::execution_space;
-      using size_type = typename local_graph_type::size_type;
+      using exec_space = typename local_graph_device_type::execution_space;
+      using size_type = typename local_graph_device_type::size_type;
       Kokkos::parallel_reduce(Kokkos::RangePolicy<exec_space>(0, numLocalRows),
         KOKKOS_LAMBDA (const LocalOrdinal i, int& lNotSorted)
         {
@@ -2918,7 +3003,7 @@ namespace Tpetra {
     // since the future model will be allocation at construction, not
     // lazy allocation on first insert.
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-      ((this->k_lclInds1D_.extent (0) != 0 || this->k_gblInds1D_.extent (0) != 0),
+      ((this->lclIndsUnpacked_wdv.extent (0) != 0 || this->gblInds_wdv.extent (0) != 0),
        std::runtime_error, "You may not call this method if 1-D data "
        "structures are already allocated.");
 
@@ -2926,8 +3011,10 @@ namespace Tpetra {
     indicesAreLocal_     = true;
     indicesAreSorted_    = true;
     noRedundancies_      = true;
-    k_lclInds1D_         = columnIndices;
-    k_rowPtrs_           = rowPointers;
+    lclIndsPacked_wdv= local_inds_wdv_type(columnIndices);
+    lclIndsUnpacked_wdv          = lclIndsPacked_wdv;
+    setRowPtrsUnpacked(rowPointers);
+    setRowPtrsPacked(rowPointers);
 
     set_need_sync_host_uvm_access(); // columnIndices and rowPointers potentially still in a kernel
 
@@ -2935,9 +3022,6 @@ namespace Tpetra {
     // way to indicate any extra space at the end of each row.
     storageStatus_       = Details::STORAGE_1D_PACKED;
 
-    // Build the local graph.
-    lclGraph_ = local_graph_type (k_lclInds1D_, k_rowPtrs_);
-
     // These normally get cleared out at the end of allocateIndices.
     // It makes sense to clear them out here, because at the end of
     // this method, the graph is allocated on the calling process.
@@ -2955,7 +3039,7 @@ namespace Tpetra {
                  const Teuchos::ArrayRCP<LocalOrdinal>& columnIndices)
   {
     using Kokkos::View;
-    typedef typename local_graph_type::row_map_type row_map_type;
+    typedef typename local_graph_device_type::row_map_type row_map_type;
     typedef typename row_map_type::array_layout layout_type;
     typedef typename row_map_type::non_const_value_type row_offset_type;
     typedef View<size_t*, layout_type , Kokkos::HostSpace,
@@ -3034,7 +3118,7 @@ namespace Tpetra {
         // left with the case that we have optimized storage. in this
         // case, we have to construct a list of row sizes.
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (numRows != 0 && k_rowPtrs_.extent (0) == 0, std::logic_error,
+          (numRows != 0 && rowPtrsUnpacked_host_.extent (0) == 0, std::logic_error,
            "The graph has " << numRows << " (> 0) row"
            << (numRows != 1 ? "s" : "") << " on the calling process, "
            "but the k_rowPtrs_ array has zero entries." << suffix);
@@ -3047,7 +3131,7 @@ namespace Tpetra {
         // might as well check whether all rows' bounds are the same.
         bool allRowsReallySame = false;
         for (ptrdiff_t i = 0; i < numRows; ++i) {
-          numEnt[i] = this->k_rowPtrs_(i+1) - this->k_rowPtrs_(i);
+          numEnt[i] = rowPtrsUnpacked_host_(i+1) - rowPtrsUnpacked_host_(i);
           if (i != 0 && numEnt[i] != numEnt[i-1]) {
             allRowsReallySame = false;
           }
@@ -3526,6 +3610,7 @@ namespace Tpetra {
     // The method doesn't do any work if the indices are already local.
     const std::pair<size_t, std::string> makeIndicesLocalResult =
       this->makeIndicesLocal(verbose);
+
     if (debug_) {
       using Details::gathervPrint;
       using Teuchos::RCP;
@@ -3624,14 +3709,14 @@ namespace Tpetra {
       isFillComplete () || ! hasColMap (), std::runtime_error, "You may not "
       "call this method unless the graph has a column Map.");
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
-      getNodeNumRows () > 0 && k_rowPtrs_.extent (0) == 0,
+      getNodeNumRows () > 0 && rowPtrsUnpacked_host_.extent (0) == 0,
       std::runtime_error, "The calling process has getNodeNumRows() = "
       << getNodeNumRows () << " > 0 rows, but the row offsets array has not "
       "been set.");
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
-      static_cast<size_t> (k_rowPtrs_.extent (0)) != getNodeNumRows () + 1,
+      static_cast<size_t> (rowPtrsUnpacked_host_.extent (0)) != getNodeNumRows () + 1,
       std::runtime_error, "The row offsets array has length " <<
-      k_rowPtrs_.extent (0) << " != getNodeNumRows()+1 = " <<
+      rowPtrsUnpacked_host_.extent (0) << " != getNodeNumRows()+1 = " <<
       (getNodeNumRows () + 1) << ".");
 
     // Note: We don't need to do the following things which are normally done in fillComplete:
@@ -3746,21 +3831,15 @@ namespace Tpetra {
   {
     using ::Tpetra::Details::computeOffsetsFromCounts;
     typedef decltype (k_numRowEntries_) row_entries_type;
-    typedef typename local_graph_type::row_map_type row_map_type;
+    typedef typename local_graph_device_type::row_map_type row_map_type;
     typedef typename row_map_type::non_const_type non_const_row_map_type;
-    typedef typename local_graph_type::entries_type::non_const_type lclinds_1d_type;
+    typedef typename local_graph_device_type::entries_type::non_const_type lclinds_1d_type;
     const char tfecfFuncName[] = "fillLocalGraph (called from fillComplete or "
       "expertStaticFillComplete): ";
     const size_t lclNumRows = this->getNodeNumRows ();
 
     // This method's goal is to fill in the two arrays (compressed
     // sparse row format) that define the sparse graph's structure.
-    //
-    // Use the nonconst version of row_map_type for ptr_d, because
-    // the latter is const and we need to modify ptr_d here.
-    non_const_row_map_type ptr_d;
-    row_map_type ptr_d_const;
-    lclinds_1d_type ind_d;
 
     bool requestOptimizedStorage = true;
     if (! params.is_null () && ! params->get ("Optimize Storage", true)) {
@@ -3768,27 +3847,26 @@ namespace Tpetra {
     }
 
     // The graph's column indices are currently stored in a 1-D
-    // format, with row offsets in k_rowPtrs_ and local column indices
+    // format, with row offsets in rowPtrsUnpacked_host_ and local column indices
     // in k_lclInds1D_.
 
     if (debug_) {
       // The graph's array of row offsets must already be allocated.
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-        (k_rowPtrs_.extent (0) == 0, std::logic_error,
+        (rowPtrsUnpacked_host_.extent (0) == 0, std::logic_error,
          "k_rowPtrs_ has size zero, but shouldn't");
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-        (k_rowPtrs_.extent (0) != lclNumRows + 1, std::logic_error,
-         "k_rowPtrs_.extent(0) = "
-         << k_rowPtrs_.extent (0) << " != (lclNumRows + 1) = "
+        (rowPtrsUnpacked_host_.extent (0) != lclNumRows + 1, std::logic_error,
+         "rowPtrsUnpacked_host_.extent(0) = "
+         << rowPtrsUnpacked_host_.extent (0) << " != (lclNumRows + 1) = "
          << (lclNumRows + 1) << ".");
-      const size_t numOffsets = k_rowPtrs_.extent (0);
-      const auto valToCheck =
-        ::Tpetra::Details::getEntryOnHost (k_rowPtrs_, numOffsets - 1);
+      const size_t numOffsets = rowPtrsUnpacked_host_.extent (0);
+      const auto valToCheck = rowPtrsUnpacked_host_(numOffsets-1);
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (numOffsets != 0 &&
-         k_lclInds1D_.extent (0) != valToCheck,
+         lclIndsUnpacked_wdv.extent (0) != valToCheck,
          std::logic_error, "numOffsets=" << numOffsets << " != 0 "
-         " and k_lclInds1D_.extent(0)=" << k_lclInds1D_.extent(0)
+         " and lclIndsUnpacked_wdv.extent(0)=" << lclIndsUnpacked_wdv.extent(0)
          << " != k_rowPtrs_(" << numOffsets << ")=" << valToCheck
          << ".");
     }
@@ -3819,6 +3897,11 @@ namespace Tpetra {
     }
 
     if (this->getNodeNumEntries () != allocSize) {
+      // Use the nonconst version of row_map_type for ptr_d, because
+      // the latter is const and we need to modify ptr_d here.
+      non_const_row_map_type ptr_d;
+      row_map_type ptr_d_const;
+
       // The graph's current 1-D storage is "unpacked."  This means
       // the row offsets may differ from what the final row offsets
       // should be.  This could happen, for example, if the user set
@@ -3826,17 +3909,16 @@ namespace Tpetra {
       // didn't fill all those entries.
 
       if (debug_) {
-        if (k_rowPtrs_.extent (0) != 0) {
+        if (rowPtrsUnpacked_host_.extent (0) != 0) {
           const size_t numOffsets =
-            static_cast<size_t> (k_rowPtrs_.extent (0));
-          const auto valToCheck =
-            ::Tpetra::Details::getEntryOnHost (k_rowPtrs_, numOffsets - 1);
+            static_cast<size_t> (rowPtrsUnpacked_host_.extent (0));
+          const auto valToCheck = rowPtrsUnpacked_host_(numOffsets - 1);
           TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-            (valToCheck != size_t(k_lclInds1D_.extent(0)),
+            (valToCheck != size_t(lclIndsUnpacked_wdv.extent(0)),
              std::logic_error, "(Unpacked branch) Before allocating "
              "or packing, k_rowPtrs_(" << (numOffsets-1) << ")="
-             << valToCheck << " != k_lclInds1D_.extent(0)="
-             << k_lclInds1D_.extent (0) << ".");
+             << valToCheck << " != lclIndsUnpacked_wdv.extent(0)="
+             << lclIndsUnpacked_wdv.extent (0) << ".");
         }
       }
 
@@ -3849,7 +3931,8 @@ namespace Tpetra {
       size_t lclTotalNumEntries = 0;
       {
         // Allocate the packed row offsets array.
-        ptr_d = non_const_row_map_type ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
+        ptr_d = 
+          non_const_row_map_type ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
         ptr_d_const = ptr_d;
 
         // It's ok that k_numRowEntries_ is a host View; the
@@ -3884,9 +3967,10 @@ namespace Tpetra {
       }
 
       // Allocate the array of packed column indices.
-      ind_d = lclinds_1d_type ("Tpetra::CrsGraph::ind", lclTotalNumEntries);
+      lclinds_1d_type ind_d =
+         lclinds_1d_type ("Tpetra::CrsGraph::lclInd", lclTotalNumEntries);
 
-      // k_rowPtrs_ and k_lclInds1D_ are currently unpacked.  Pack
+      // rowPtrsUnpacked_dev_ and lclIndsUnpacked_wdv are currently unpacked.  Pack
       // them, using the packed row offsets array ptr_d that we
       // created above.
       //
@@ -3894,12 +3978,16 @@ namespace Tpetra {
       // CrsMatrix?), we need to keep around the unpacked row
       // offsets and column indices.
 
-      // Pack the column indices from unpacked k_lclInds1D_ into
-      // packed ind_d.  We will replace k_lclInds1D_ below.
+      // Pack the column indices from unpacked lclIndsUnpacked_wdv into
+      // packed ind_d.  We will replace lclIndsUnpacked_wdv below.
       typedef pack_functor<
-        typename local_graph_type::entries_type::non_const_type,
-        row_map_type> inds_packer_type;
-      inds_packer_type f (ind_d, k_lclInds1D_, ptr_d, k_rowPtrs_);
+        typename local_graph_device_type::entries_type::non_const_type,
+        typename local_inds_dualv_type::t_dev::const_type,
+        row_map_type,
+        typename local_graph_device_type::row_map_type> inds_packer_type;
+      inds_packer_type f (ind_d, 
+                          lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
+                          ptr_d, rowPtrsUnpacked_dev_);
       {
         typedef typename decltype (ind_d)::execution_space exec_space;
         typedef Kokkos::RangePolicy<exec_space, LocalOrdinal> range_type;
@@ -3924,46 +4012,51 @@ namespace Tpetra {
              << ind_d.extent(0) << ".");
         }
       }
+      // Build the local graph.
+      setRowPtrsPacked(ptr_d_const);
+      lclIndsPacked_wdv = local_inds_wdv_type(ind_d);
     }
     else { // We don't have to pack, so just set the pointers.
-      ptr_d_const = k_rowPtrs_;
-      ind_d = k_lclInds1D_;
+      setRowPtrsPacked(rowPtrsUnpacked_dev_);
+      lclIndsPacked_wdv = lclIndsUnpacked_wdv; 
 
       if (debug_) {
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (ptr_d_const.extent (0) == 0, std::logic_error,
+          (rowPtrsPacked_dev_.extent (0) == 0, std::logic_error,
            "(\"Optimize Storage\"=false branch) "
-           "ptr_d_const.extent(0) = 0.  This probably means that "
+           "rowPtrsPacked_dev_.extent(0) = 0.  "
+           "This probably means that "
            "k_rowPtrs_ was never allocated.");
-        if (ptr_d_const.extent (0) != 0) {
+        if (rowPtrsPacked_dev_.extent (0) != 0) {
           const size_t numOffsets =
-            static_cast<size_t> (ptr_d_const.extent (0));
+            static_cast<size_t> (rowPtrsPacked_dev_.extent (0));
           const size_t valToCheck =
-            ::Tpetra::Details::getEntryOnHost (ptr_d_const, numOffsets - 1);
+            rowPtrsPacked_host_(numOffsets - 1);
           TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-            (valToCheck != size_t(ind_d.extent (0)),
+            (valToCheck != size_t(lclIndsPacked_wdv.extent (0)),
              std::logic_error, "(\"Optimize Storage\"=false branch) "
-             "ptr_d_const(" << (numOffsets-1) << ")=" << valToCheck
-             << " != ind_d.extent(0)=" << ind_d.extent (0) << ".");
+             "rowPtrsPacked_dev_(" << (numOffsets-1) << ")=" 
+             << valToCheck
+             << " != lclIndsPacked_wdv.extent(0)=" 
+             << lclIndsPacked_wdv.extent (0) << ".");
         }
       }
     }
 
     if (debug_) {
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-        (static_cast<size_t> (ptr_d_const.extent (0)) != lclNumRows + 1,
-         std::logic_error, "After packing, ptr_d_const.extent(0) = " <<
-         ptr_d_const.extent (0) << " != lclNumRows+1 = " << (lclNumRows+1)
+        (static_cast<size_t> (rowPtrsPacked_dev_.extent (0)) != lclNumRows + 1,
+         std::logic_error, "After packing, rowPtrsPacked_dev_.extent(0) = " <<
+         rowPtrsPacked_dev_.extent (0) << " != lclNumRows+1 = " << (lclNumRows+1)
          << ".");
-      if (ptr_d_const.extent (0) != 0) {
-        const size_t numOffsets = static_cast<size_t> (ptr_d_const.extent (0));
-        const auto valToCheck =
-          ::Tpetra::Details::getEntryOnHost (ptr_d_const, numOffsets - 1);
+      if (rowPtrsPacked_dev_.extent (0) != 0) {
+        const size_t numOffsets = static_cast<size_t> (rowPtrsPacked_dev_.extent (0));
+        const auto valToCheck = rowPtrsPacked_host_(numOffsets - 1);
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (static_cast<size_t> (valToCheck) != ind_d.extent (0),
-           std::logic_error, "After packing, ptr_d_const(" << (numOffsets-1)
-           << ") = " << valToCheck << " != ind_d.extent(0) = "
-           << ind_d.extent (0) << ".");
+          (static_cast<size_t> (valToCheck) != lclIndsPacked_wdv.extent (0),
+           std::logic_error, "After packing, rowPtrsPacked_dev_(" << (numOffsets-1)
+           << ") = " << valToCheck << " != lclIndsPacked_wdv.extent(0) = "
+           << lclIndsPacked_wdv.extent (0) << ".");
       }
     }
 
@@ -3976,17 +4069,12 @@ namespace Tpetra {
       k_numRowEntries_ = row_entries_type ();
 
       // Keep the new 1-D packed allocations.
-      k_rowPtrs_   = ptr_d_const;
-      k_lclInds1D_ = ind_d;
+      setRowPtrsUnpacked(rowPtrsPacked_dev_);
+      lclIndsUnpacked_wdv = lclIndsPacked_wdv;
 
       storageStatus_ = Details::STORAGE_1D_PACKED;
     }
 
-    // FIXME (mfh 28 Aug 2014) "Local Graph" sublist no longer used.
-
-    // Build the local graph.
-    lclGraph_ = local_graph_type (ind_d, ptr_d_const);
-
     set_need_sync_host_uvm_access(); // make sure kernel setup of indices is fenced before a host access
   }
 
@@ -4021,7 +4109,7 @@ namespace Tpetra {
     using Teuchos::RCP;
     typedef GlobalOrdinal GO;
     typedef LocalOrdinal LO;
-    typedef typename local_graph_type::entries_type::non_const_type col_inds_type;
+    typedef typename local_inds_dualv_type::t_host col_inds_type;
     const char tfecfFuncName[] = "reindexColumns: ";
 
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
@@ -4073,8 +4161,8 @@ namespace Tpetra {
     // (is not empty) on the calling process.  In that case, we
     // allocate the first (1-D storage) if the graph has a static
     // profile, else we allocate the second (2-D storage).
-    typename local_graph_type::entries_type::non_const_type newLclInds1D;
-    Teuchos::ArrayRCP<Teuchos::Array<LO> > newLclInds2D;
+    col_inds_type newLclInds1D;
+    auto oldLclInds1D = lclIndsUnpacked_wdv.getHostView(Access::ReadOnly);
 
     // If indices aren't allocated, that means the calling process
     // owns no entries in the graph.  Thus, there is nothing to
@@ -4085,16 +4173,15 @@ namespace Tpetra {
           const map_type& oldColMap = * (getColMap ());
           // Allocate storage for the new local indices.
           const size_t allocSize = this->getNodeAllocationSize ();
-          newLclInds1D = col_inds_type ("Tpetra::CrsGraph::ind", allocSize);
+          newLclInds1D = col_inds_type("Tpetra::CrsGraph::lclIndsReindexedHost",
+                                       allocSize);
           // Attempt to convert the new indices locally.
           for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
             const RowInfo rowInfo = this->getRowInfo (lclRow);
             const size_t beg = rowInfo.offset1D;
             const size_t end = beg + rowInfo.numEntries;
             for (size_t k = beg; k < end; ++k) {
-              // FIXME (mfh 21 Aug 2014) This assumes UVM.  Should
-              // use a DualView instead.
-              const LO oldLclCol = k_lclInds1D_(k);
+              const LO oldLclCol = oldLclInds1D(k);
               if (oldLclCol == Teuchos::OrdinalTraits<LO>::invalid ()) {
                 allCurColIndsValid = false;
                 break; // Stop at the first invalid index
@@ -4114,8 +4201,6 @@ namespace Tpetra {
                   localSuffices = false;
                   break; // Stop at the first invalid index
                 }
-                // FIXME (mfh 21 Aug 2014) This assumes UVM.  Should
-                // use a DualView instead.
                 newLclInds1D(k) = newLclCol;
               }
             } // for each entry in the current row
@@ -4146,9 +4231,9 @@ namespace Tpetra {
         // column Map on the calling process.
         for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
           const RowInfo rowInfo = this->getRowInfo (lclRow);
-          Teuchos::ArrayView<const GO> oldGblRowView = getGlobalView (rowInfo);
+          auto oldGblRowView = this->getGlobalIndsViewHost (rowInfo);
           for (size_t k = 0; k < rowInfo.numEntries; ++k) {
-            const GO gblCol = oldGblRowView[k];
+            const GO gblCol = oldGblRowView(k);
             if (! newColMap->isNodeGlobalElement (gblCol)) {
               localSuffices = false;
               break; // Stop at the first invalid index
@@ -4189,7 +4274,15 @@ namespace Tpetra {
 
     // Commit the results.
     if (isLocallyIndexed ()) {
-      k_lclInds1D_ = newLclInds1D;
+      { // scope the device view; sortAndMergeAllIndices needs host
+        typename local_inds_dualv_type::t_dev newLclInds1D_dev(
+                 Kokkos::view_alloc("Tpetra::CrsGraph::lclIndReindexed",
+                                    Kokkos::WithoutInitializing),
+                 newLclInds1D.extent(0));
+        Kokkos::deep_copy(newLclInds1D_dev, newLclInds1D);
+        lclIndsUnpacked_wdv = local_inds_wdv_type(newLclInds1D_dev);
+      }
+
       // We've reindexed, so we don't know if the indices are sorted.
       //
       // FIXME (mfh 17 Sep 2014) It could make sense to check this,
@@ -4275,12 +4368,34 @@ namespace Tpetra {
     importer_ = Teuchos::rcp_const_cast<import_type> (newImporter);
   }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
-  typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_type
+  typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_device_type
   CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
   getLocalGraph () const
   {
-    return lclGraph_;
+    return getLocalGraphDevice(); // kept only as a forwarding alias for the device accessor
+  }
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+
+  template <class LocalOrdinal, class GlobalOrdinal, class Node>
+  typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_device_type
+  CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
+  getLocalGraphDevice () const // constructs the returned graph on each call from the packed members
+  {
+    return local_graph_device_type(
+                 lclIndsPacked_wdv.getDeviceView(Access::ReadWrite), // packed column indices; NOTE(review): read-write access requested from a const getter -- confirm intended
+                 rowPtrsPacked_dev_); // packed row offsets (device copy)
+  }
+
+  template <class LocalOrdinal, class GlobalOrdinal, class Node>
+  typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_host_type
+  CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
+  getLocalGraphHost () const // constructs the returned graph on each call from the packed members
+  {
+    return local_graph_host_type(
+                 lclIndsPacked_wdv.getHostView(Access::ReadWrite), // packed column indices; NOTE(review): read-write access requested from a const getter -- confirm intended
+                 rowPtrsPacked_host_); // packed row offsets (host copy)
+  }
 
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -4347,7 +4462,7 @@ namespace Tpetra {
 
     using LO = local_ordinal_type;
 
-    auto ptr = this->lclGraph_.row_map;
+    auto ptr = this->rowPtrsPacked_dev_;
     const LO lclNumRows = ptr.extent(0) == 0 ?
       static_cast<LO> (0) :
       (static_cast<LO> (ptr.extent(0)) - static_cast<LO> (1));
@@ -4372,13 +4487,9 @@ namespace Tpetra {
     typedef LocalOrdinal LO;
     typedef GlobalOrdinal GO;
     typedef device_type DT;
-    typedef typename local_graph_type::row_map_type::non_const_value_type offset_type;
+    typedef typename local_graph_device_type::row_map_type::non_const_value_type offset_type;
     typedef decltype (k_numRowEntries_) row_entries_type;
     typedef typename row_entries_type::non_const_value_type num_ent_type;
-    typedef typename local_graph_type::entries_type::non_const_type
-      lcl_col_inds_type;
-    typedef Kokkos::View<GO*, typename lcl_col_inds_type::array_layout,
-      device_type> gbl_col_inds_type;
     const char tfecfFuncName[] = "makeIndicesLocal: ";
     ProfilingRegion regionMakeIndicesLocal ("Tpetra::CrsGraph::makeIndicesLocal");
 
@@ -4417,58 +4528,47 @@ namespace Tpetra {
         this->k_numRowEntries_;
 
       // Allocate space for local indices.
-      // If GO and LO are the same size, we can reuse the existing
-      // array of 1-D index storage to convert column indices from
-      // GO to LO.  Otherwise, we'll just allocate a new buffer.
-      constexpr bool LO_GO_same = std::is_same<LO, GO>::value;
-      if (LO_GO_same) {
-        // This prevents a build error (illegal assignment) if
-        // LO_GO_same is _not_ true.  Only the first branch
-        // (returning k_gblInds1D_) should ever get taken.
-        k_lclInds1D_ = Kokkos::Impl::if_c<LO_GO_same,
-          t_GlobalOrdinal_1D,
-          lcl_col_inds_type>::select (k_gblInds1D_, k_lclInds1D_);
-      }
-      else {
-        if (k_rowPtrs_.extent (0) == 0) {
-          errStrm << "k_rowPtrs_.extent(0) == 0.  This should never "
-            "happen here.  Please report this bug to the Tpetra developers."
-            << endl;
-          // Need to return early.
-          return std::make_pair (Tpetra::Details::OrdinalTraits<size_t>::invalid (),
-                                 errStrm.str ());
-        }
-        const auto numEnt = ::Tpetra::Details::getEntryOnHost (k_rowPtrs_, lclNumRows);
-
-        // mfh 17 Dec 2016: We don't need initial zero-fill of
-        // k_lclInds1D_, because we will fill it below anyway.
-        // AllowPadding would only help for aligned access (e.g.,
-        // for vectorization) if we also were to pad each row to the
-        // same alignment, so we'll skip AllowPadding for now.
-
-        // using Kokkos::AllowPadding;
-        using Kokkos::view_alloc;
-        using Kokkos::WithoutInitializing;
-
-        // When giving the label as an argument to
-        // Kokkos::view_alloc, the label must be a string and not a
-        // char*, else the code won't compile.  This is because
-        // view_alloc also allows a raw pointer as its first
-        // argument.  See
-        // https://github.com/kokkos/kokkos/issues/434.  This is a
-        // large allocation typically, so the overhead of creating
-        // an std::string is minor.
-        const std::string label ("Tpetra::CrsGraph::lclind");
-        if (verbose) {
-          std::ostringstream os;
-          os << *prefix << "(Re)allocate k_lclInds1D_: old="
-             << k_lclInds1D_.extent(0) << ", new=" << numEnt << endl;
-          std::cerr << os.str();
-        }
-        k_lclInds1D_ =
-          lcl_col_inds_type (view_alloc (label, WithoutInitializing), numEnt);
+      if (rowPtrsUnpacked_host_.extent (0) == 0) { // no row offsets at all: nothing valid to read the entry count from
+        errStrm << "rowPtrsUnpacked_host_.extent(0) == 0.  This should never "
+          "happen here.  Please report this bug to the Tpetra developers."
+          << endl;
+        // Need to return early: hand back an invalid count together with the message.
+        return std::make_pair(Tpetra::Details::OrdinalTraits<size_t>::invalid (),
+                              errStrm.str ());
+      }
+      const auto numEnt = rowPtrsUnpacked_host_(lclNumRows);
+
+      // mfh 17 Dec 2016: We don't need initial zero-fill of
+      // lclIndsUnpacked_wdv, because we will fill it below anyway.
+      // AllowPadding would only help for aligned access (e.g.,
+      // for vectorization) if we also were to pad each row to the
+      // same alignment, so we'll skip AllowPadding for now.
+
+      // using Kokkos::AllowPadding;
+      using Kokkos::view_alloc;
+      using Kokkos::WithoutInitializing;
+
+      // When giving the label as an argument to
+      // Kokkos::view_alloc, the label must be a string and not a
+      // char*, else the code won't compile.  This is because
+      // view_alloc also allows a raw pointer as its first
+      // argument.  See
+      // https://github.com/kokkos/kokkos/issues/434.  This is a
+      // large allocation typically, so the overhead of creating
+      // an std::string is minor.
+      const std::string label ("Tpetra::CrsGraph::lclInd");
+      if (verbose) {
+        std::ostringstream os;
+        os << *prefix << "(Re)allocate lclInd_wdv: old="
+           << lclIndsUnpacked_wdv.extent(0) << ", new=" << numEnt << endl;
+        std::cerr << os.str();
       }
 
+      local_inds_dualv_type lclInds_dualv = 
+          local_inds_dualv_type(view_alloc(label, WithoutInitializing),
+                                  numEnt);
+      lclIndsUnpacked_wdv = local_inds_wdv_type(lclInds_dualv);
+
       auto lclColMap = colMap.getLocalMap ();
       // This is a "device mirror" of the host View h_numRowEnt.
       //
@@ -4483,15 +4583,17 @@ namespace Tpetra {
            << h_numRowEnt.extent(0) << endl;
         std::cerr << os.str();
       }
-      auto k_numRowEnt = Kokkos::create_mirror_view (device_type (), h_numRowEnt);
+      auto k_numRowEnt = 
+           Kokkos::create_mirror_view_and_copy (device_type (), h_numRowEnt);
 
       using ::Tpetra::Details::convertColumnIndicesFromGlobalToLocal;
       lclNumErrs =
-        convertColumnIndicesFromGlobalToLocal<LO, GO, DT, offset_type, num_ent_type> (k_lclInds1D_,
-                                                                                      k_gblInds1D_,
-                                                                                      k_rowPtrs_,
-                                                                                      lclColMap,
-                                                                                      k_numRowEnt);
+        convertColumnIndicesFromGlobalToLocal<LO, GO, DT, offset_type, num_ent_type> (
+          lclIndsUnpacked_wdv.getDeviceView(Access::OverwriteAll),
+          gblInds_wdv.getDeviceView(Access::ReadOnly),
+          rowPtrsUnpacked_dev_,
+          lclColMap,
+          k_numRowEnt);
       if (lclNumErrs != 0) {
         const int myRank = [this] () {
           auto map = this->getMap ();
@@ -4516,14 +4618,13 @@ namespace Tpetra {
       // in 1-D storage, because the graph has static profile).
       if (verbose) {
         std::ostringstream os;
-        os << *prefix << "Free k_gblInds1D_: "
-           << k_gblInds1D_.extent(0) << endl;
+        os << *prefix << "Free gblInds_wdv: "
+           << gblInds_wdv.extent(0) << endl;
         std::cerr << os.str();
       }
-      k_gblInds1D_ = gbl_col_inds_type ();
+      gblInds_wdv = global_inds_wdv_type ();
     } // globallyIndexed() && lclNumRows > 0
 
-    this->lclGraph_ = local_graph_type (this->k_lclInds1D_, this->k_rowPtrs_);
     this->indicesAreLocal_  = true;
     this->indicesAreGlobal_ = false;
     this->checkInternalState ();
@@ -4646,7 +4747,6 @@ namespace Tpetra {
       const LO lclNumRows(this->getNodeNumRows());
       auto range = range_type(0, lclNumRows);
 
-      // FIXME (mfh 08 May 2017) Loops below assume CUDA UVM.
       if (verbose_) {
         size_t totalNumDups = 0;
         Kokkos::parallel_reduce(range,
@@ -4661,7 +4761,6 @@ namespace Tpetra {
         std::cerr << os.str();
       }
       else {
-        // FIXME (mfh 08 May 2017) This may assume CUDA UVM.
         Kokkos::parallel_for(range,
           [this, sorted, merged] (const LO lclRow)
           {
@@ -4868,12 +4967,18 @@ namespace Tpetra {
               if (vl == VERB_EXTREME) {
                 out << " ";
                 if (isGloballyIndexed()) {
-                  ArrayView<const GlobalOrdinal> rowview = getGlobalView(rowinfo);
-                  for (size_t j=0; j < rowinfo.numEntries; ++j) out << rowview[j] << " ";
+                  auto rowview = gblInds_wdv.getHostView(Access::ReadOnly); // whole 1-D index array, not a per-row view
+                  for (size_t j=0; j < rowinfo.numEntries; ++j) {
+                    GlobalOrdinal colgid = rowview[j + rowinfo.offset1D]; // offset1D is where this row starts in the array
+                    out << colgid << " ";
+                  }
                 }
                 else if (isLocallyIndexed()) {
-                  ArrayView<const LocalOrdinal> rowview = getLocalView(rowinfo);
-                  for (size_t j=0; j < rowinfo.numEntries; ++j) out << colMap_->getGlobalElement(rowview[j]) << " ";
+                  auto rowview = lclIndsUnpacked_wdv.getHostView(Access::ReadOnly); // whole 1-D index array, not a per-row view
+                  for (size_t j=0; j < rowinfo.numEntries; ++j) {
+                    LocalOrdinal collid = rowview[j + rowinfo.offset1D]; // offset1D is where this row starts in the array
+                    out << colMap_->getGlobalElement(collid) << " ";
+                  }
                 }
               }
               out << std::endl;
@@ -4956,7 +5061,7 @@ namespace Tpetra {
     const map_type& srcRowMap = *(srcRowGraph.getRowMap());
     const map_type& tgtRowMap = *(getRowMap());
     const bool src_filled = srcRowGraph.isFillComplete();
-    Teuchos::Array<GO> row_copy;
+    nonconst_global_inds_host_view_type row_copy;
     LO myid = 0;
 
     //
@@ -4976,10 +5081,10 @@ namespace Tpetra {
       for (size_t i = 0; i < numSameIDs; ++i, ++myid) {
         const GO gid = srcRowMap.getGlobalElement (myid);
         size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (gid);
-        row_copy.resize (row_length);
+        Kokkos::resize(row_copy,row_length);
         size_t check_row_length = 0;
-        srcRowGraph.getGlobalRowCopy (gid, row_copy (), check_row_length);
-        this->insertGlobalIndices (gid, row_copy ());
+        srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length);
+        this->insertGlobalIndices (gid, row_length, row_copy.data());
       }
     } else {
       if (verbose) {
@@ -4989,9 +5094,9 @@ namespace Tpetra {
       }
       for (size_t i = 0; i < numSameIDs; ++i, ++myid) {
         const GO gid = srcRowMap.getGlobalElement (myid);
-        Teuchos::ArrayView<const GO> row;
+        global_inds_host_view_type row;
         srcCrsGraph->getGlobalRowView (gid, row);
-        this->insertGlobalIndices (gid, row);
+        this->insertGlobalIndices (gid, row.extent(0), row.data());
       }
     }
 
@@ -5006,18 +5111,18 @@ namespace Tpetra {
         const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]);
         const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]);
         size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (srcgid);
-        row_copy.resize (row_length);
+        Kokkos::resize(row_copy,row_length);
         size_t check_row_length = 0;
-        srcRowGraph.getGlobalRowCopy (srcgid, row_copy (), check_row_length);
-        this->insertGlobalIndices (mygid, row_copy ());
+        srcRowGraph.getGlobalRowCopy (srcgid, row_copy, check_row_length);
+        this->insertGlobalIndices (mygid, row_length, row_copy.data());
       }
     } else {
       for (LO i = 0; i < static_cast<LO> (permuteToLIDs_h.extent (0)); ++i) {
         const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]);
         const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]);
-        Teuchos::ArrayView<const GO> row;
+        global_inds_host_view_type row;
         srcCrsGraph->getGlobalRowView (srcgid, row);
-        this->insertGlobalIndices (mygid, row);
+        this->insertGlobalIndices (mygid, row.extent(0), row.data());
       }
     }
 
@@ -5038,9 +5143,8 @@ namespace Tpetra {
     using Details::padCrsArrays;
     using std::endl;
     using LO = local_ordinal_type;
-    using execution_space = typename device_type::execution_space;
     using row_ptrs_type =
-      typename local_graph_type::row_map_type::non_const_type;
+      typename local_graph_device_type::row_map_type::non_const_type;
     using range_policy =
       Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
     const char tfecfFuncName[] = "applyCrsPadding";
@@ -5089,15 +5193,15 @@ namespace Tpetra {
     if (verbose) {
       std::ostringstream os;
       os << *prefix << "Allocate row_ptrs_beg: "
-         << k_rowPtrs_.extent(0) << endl;
+         << rowPtrsUnpacked_dev_.extent(0) << endl;
       std::cerr << os.str();
     }
     using Kokkos::view_alloc;
     using Kokkos::WithoutInitializing;
     row_ptrs_type row_ptrs_beg(
       view_alloc("row_ptrs_beg", WithoutInitializing),
-      k_rowPtrs_.extent(0));
-    Kokkos::deep_copy(row_ptrs_beg, k_rowPtrs_);
+      rowPtrsUnpacked_dev_.extent(0));
+    Kokkos::deep_copy(row_ptrs_beg, rowPtrsUnpacked_dev_);
 
     const size_t N = row_ptrs_beg.extent(0) == 0 ? size_t(0) :
       size_t(row_ptrs_beg.extent(0) - 1);
@@ -5108,12 +5212,15 @@ namespace Tpetra {
     }
     row_ptrs_type row_ptrs_end(
       view_alloc("row_ptrs_end", WithoutInitializing), N);
+    row_ptrs_type num_row_entries;
 
     const bool refill_num_row_entries = k_numRowEntries_.extent(0) != 0;
     if (refill_num_row_entries) { // Case 1: Unpacked storage
       // We can't assume correct *this capture until C++17, and it's
       // likely more efficient just to capture what we need anyway.
-      auto num_row_entries = this->k_numRowEntries_;
+      num_row_entries = 
+          row_ptrs_type(view_alloc("num_row_entries", WithoutInitializing), N);
+      Kokkos::deep_copy(num_row_entries, this->k_numRowEntries_);
       Kokkos::parallel_for
         ("Fill end row pointers", range_policy(0, N),
          KOKKOS_LAMBDA (const size_t i) {
@@ -5132,31 +5239,32 @@ namespace Tpetra {
     }
 
     if (isGloballyIndexed()) {
-      padCrsArrays(row_ptrs_beg, row_ptrs_end, k_gblInds1D_,
+      padCrsArrays(row_ptrs_beg, row_ptrs_end, gblInds_wdv,
                    padding, myRank, verbose);
     }
     else {
-      padCrsArrays(row_ptrs_beg, row_ptrs_end, k_lclInds1D_,
+      padCrsArrays(row_ptrs_beg, row_ptrs_end, lclIndsUnpacked_wdv,
                    padding, myRank, verbose);
     }
 
     if (refill_num_row_entries) {
-      auto num_row_entries = this->k_numRowEntries_;
       Kokkos::parallel_for
         ("Fill num entries", range_policy(0, N),
          KOKKOS_LAMBDA (const size_t i) {
           num_row_entries(i) = row_ptrs_end(i) - row_ptrs_beg(i);
         });
+      Kokkos::deep_copy(this->k_numRowEntries_, num_row_entries);
     }
     if (verbose) {
       std::ostringstream os;
       os << *prefix << "Reassign k_rowPtrs_; old size: "
-         << k_rowPtrs_.extent(0) << ", new size: "
+         << rowPtrsUnpacked_dev_.extent(0) << ", new size: "
          << row_ptrs_beg.extent(0) << endl;
       std::cerr << os.str();
-      TEUCHOS_ASSERT( k_rowPtrs_.extent(0) == row_ptrs_beg.extent(0) );
+      TEUCHOS_ASSERT( rowPtrsUnpacked_dev_.extent(0) == row_ptrs_beg.extent(0) );
     }
-    this->k_rowPtrs_ = row_ptrs_beg;
+
+    setRowPtrsUnpacked(row_ptrs_beg);
 
     set_need_sync_host_uvm_access(); // need fence before host UVM access of k_rowPtrs_
   }
@@ -5665,9 +5773,9 @@ namespace Tpetra {
       exports.modify_host ();
       Kokkos::deep_copy (exports.view_host (), exports_a_h);
     }
-    // packCrsGraphNew requires a valid localGraph.
+    // packCrsGraphNew requires rowPtrsPacked_dev_ to be set
     else if (! getColMap ().is_null () &&
-        (lclGraph_.row_map.extent (0) != 0 ||
+        (rowPtrsPacked_dev_.extent (0) != 0 ||
          getRowMap ()->getNodeNumElements () == 0)) {
       if (verbose) {
         std::ostringstream os;
@@ -5706,8 +5814,8 @@ namespace Tpetra {
         Distributor& distor) const
   {
     auto col_map = this->getColMap();
-    // packCrsGraph requires a valid localGraph.
-    if( !col_map.is_null() && (lclGraph_.row_map.extent(0) != 0  ||  getRowMap()->getNodeNumElements() ==0)) {
+    // packCrsGraph requires rowPtrsPacked_dev_ to be set
+    if( !col_map.is_null() && (rowPtrsPacked_dev_.extent(0) != 0  ||  getRowMap()->getNodeNumElements() ==0)) {
       using Tpetra::Details::packCrsGraph;
       packCrsGraph<LocalOrdinal,GlobalOrdinal,Node>(*this, exports, numPacketsPerLID,
                                                     exportLIDs, constantNumPackets, distor);
@@ -5862,47 +5970,31 @@ namespace Tpetra {
         else {
           const LO numEnt = static_cast<LO> (rowInfo.numEntries);
           if (this->isLocallyIndexed ()) {
-            const LO* lclColInds = nullptr;
-            LO capacity = 0;
-            const LO errCode =
-              this->getLocalViewRawConst (lclColInds, capacity, rowInfo);
-            if (errCode == 0) {
-              if (final) {
-                for (LO k = 0; k < numEnt; ++k) {
-                  const LO lclColInd = lclColInds[k];
-                  const GO gblColInd = colMapPtr->getGlobalElement (lclColInd);
-                  // Pack it, even if it's wrong.  Let the receiving
-                  // process deal with it.  Otherwise, we'll miss out
-                  // on any correct data.
-                  exports_raw[curOffset + k] = gblColInd;
-                } // for each entry in the row
-              } // final pass?
-              exportsOffset = curOffset + numEnt;
-            }
-            else { // error in getting local row view
-              Kokkos::atomic_add (&errCountView(), ONE);
-            }
+            auto lclColInds = getLocalIndsViewHost (rowInfo);
+            if (final) {
+              for (LO k = 0; k < numEnt; ++k) {
+                const LO lclColInd = lclColInds(k);
+                const GO gblColInd = colMapPtr->getGlobalElement (lclColInd);
+                // Pack it, even if it's wrong.  Let the receiving
+                // process deal with it.  Otherwise, we'll miss out
+                // on any correct data.
+                exports_raw[curOffset + k] = gblColInd;
+              } // for each entry in the row
+            } // final pass?
+            exportsOffset = curOffset + numEnt;
           }
           else if (this->isGloballyIndexed ()) {
-            const GO* gblColInds = nullptr;
-            LO capacity = 0;
-            const LO errCode =
-              this->getGlobalViewRawConst (gblColInds, capacity, rowInfo);
-            if (errCode == 0) {
-              if (final) {
-                for (LO k = 0; k < numEnt; ++k) {
-                  const GO gblColInd = gblColInds[k];
-                  // Pack it, even if it's wrong.  Let the receiving
-                  // process deal with it.  Otherwise, we'll miss out
-                  // on any correct data.
-                  exports_raw[curOffset + k] = gblColInd;
-                } // for each entry in the row
-              } // final pass?
-              exportsOffset = curOffset + numEnt;
-            }
-            else { // error in getting global row view
-              Kokkos::atomic_add (&errCountView(), ONE);
-            }
+            auto gblColInds = getGlobalIndsViewHost (rowInfo);
+            if (final) {
+              for (LO k = 0; k < numEnt; ++k) {
+                const GO gblColInd = gblColInds(k);
+                // Pack it, even if it's wrong.  Let the receiving
+                // process deal with it.  Otherwise, we'll miss out
+                // on any correct data.
+                exports_raw[curOffset + k] = gblColInd;
+              } // for each entry in the row
+            } // final pass?
+            exportsOffset = curOffset + numEnt;
           }
           // If neither globally nor locally indexed, then the graph
           // has no entries in this row (or indeed, in any row on this
@@ -6107,62 +6199,31 @@ namespace Tpetra {
 
          const LO numEnt = static_cast<LO> (rowInfo.numEntries);
          if (this->isLocallyIndexed ()) {
-           const LO* lclColInds = nullptr;
-           LO capacity = 0;
-           const LO errCode =
-             this->getLocalViewRawConst (lclColInds, capacity, rowInfo);
-           if (errCode == 0) {
-             if (final) {
-               for (LO k = 0; k < numEnt; ++k) {
-                 const LO lclColInd = lclColInds[k];
-                 const GO gblColInd = colMapPtr->getGlobalElement (lclColInd);
-                 // Pack it, even if it's wrong.  Let the receiving
-                 // process deal with it.  Otherwise, we'll miss out
-                 // on any correct data.
-                 exports_h(curOffset + k) = gblColInd;
-               } // for each entry in the row
-             } // final pass?
-             exportsOffset = curOffset + numEnt;
-           }
-           else { // error in getting local row view
-             if (verbose) {
-               std::ostringstream os;
-               os << *prefix << "For i=" << i << ", lclRow=" << lclRow
-                  << ", gblRow=" << gblRow << ": getLocalViewRawConst"
-                 "returned nonzero error code " << errCode << endl;
-               std::cerr << os.str();
-             }
-             Kokkos::atomic_add (&errCountView(), ONE);
-           }
+           auto lclColInds = getLocalIndsViewHost(rowInfo);
+           if (final) {
+             for (LO k = 0; k < numEnt; ++k) {
+               const LO lclColInd = lclColInds(k);
+               const GO gblColInd = colMapPtr->getGlobalElement (lclColInd);
+               // Pack it, even if it's wrong.  Let the receiving
+               // process deal with it.  Otherwise, we'll miss out
+               // on any correct data.
+               exports_h(curOffset + k) = gblColInd;
+             } // for each entry in the row
+           } // final pass?
+           exportsOffset = curOffset + numEnt;
          }
          else if (this->isGloballyIndexed ()) {
-           const GO* gblColInds = nullptr;
-           LO capacity = 0;
-           const LO errCode =
-             this->getGlobalViewRawConst (gblColInds, capacity, rowInfo);
-           if (errCode == 0) {
-             if (final) {
-               for (LO k = 0; k < numEnt; ++k) {
-                 const GO gblColInd = gblColInds[k];
-                 // Pack it, even if it's wrong.  Let the receiving
-                 // process deal with it.  Otherwise, we'll miss out
-                 // on any correct data.
-                 exports_h(curOffset + k) = gblColInd;
-               } // for each entry in the row
-             } // final pass?
-             exportsOffset = curOffset + numEnt;
-           }
-           else { // error in getting global row view
-             if (verbose) {
-               std::ostringstream os;
-               os << *prefix << "For i=" << i << ", lclRow=" << lclRow
-                  << ", gblRow=" << gblRow << ": "
-                 "getGlobalViewRawConst returned nonzero error code "
-                  << errCode << endl;
-               std::cerr << os.str();
-             }
-             Kokkos::atomic_add (&errCountView(), ONE);
-           }
+           auto gblColInds = getGlobalIndsViewHost(rowInfo);
+           if (final) {
+             for (LO k = 0; k < numEnt; ++k) {
+               const GO gblColInd = gblColInds(k);
+               // Pack it, even if it's wrong.  Let the receiving
+               // process deal with it.  Otherwise, we'll miss out
+               // on any correct data.
+               exports_h(curOffset + k) = gblColInd;
+             } // for each entry in the row
+           } // final pass?
+           exportsOffset = curOffset + numEnt;
          }
          // If neither globally nor locally indexed, then the graph
          // has no entries in this row (or indeed, in any row on this
@@ -6499,7 +6560,7 @@ namespace Tpetra {
 
     const bool sorted = this->isSorted ();
     if (isFillComplete ()) {
-      auto lclGraph = this->getLocalGraph ();
+      auto lclGraph = this->getLocalGraphDevice ();
       ::Tpetra::Details::getGraphDiagOffsets (offsets, lclRowMap, lclColMap,
                                               lclGraph.row_map,
                                               lclGraph.entries, sorted);
@@ -6527,7 +6588,7 @@ namespace Tpetra {
           if (static_cast<LO> (rowInfo.localRow) == lclRowInd &&
               rowInfo.numEntries > 0) {
 
-            auto colInds = this->getLocalKokkosRowView (rowInfo);
+            auto colInds = this->getLocalIndsViewHost (rowInfo);
             const size_t hint = 0; // not needed for this algorithm
             const size_t offset =
               KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
@@ -6539,23 +6600,23 @@ namespace Tpetra {
               // that it really does point to the diagonal entry.  Offsets
               // are _relative_ to each row, not absolute (for the whole
               // (local) graph).
-              Teuchos::ArrayView<const LO> lclColInds;
+              typename local_inds_dualv_type::t_host::const_type lclColInds;
               try {
-                this->getLocalRowView (lclRowInd, lclColInds);
+                lclColInds = this->getLocalIndsViewHost (rowInfo);
               }
               catch (...) {
                 noOtherWeirdness = false;
               }
               // Don't continue with error checking if the above failed.
               if (noOtherWeirdness) {
-                const size_t numEnt = lclColInds.size ();
+                const size_t numEnt = lclColInds.extent (0);
                 if (offset >= numEnt) {
                   // Offsets are relative to each row, so this means that
                   // the offset is out of bounds.
                   allOffsetsCorrect = false;
                   wrongOffsets.push_back (std::make_pair (lclRowInd, offset));
                 } else {
-                  const LO actualLclColInd = lclColInds[offset];
+                  const LO actualLclColInd = lclColInds(offset);
                   const GO actualGblColInd = lclColMap.getGlobalElement (actualLclColInd);
                   if (actualGblColInd != gblColInd) {
                     allOffsetsCorrect = false;
@@ -6789,7 +6850,6 @@ namespace Tpetra {
     using NT = node_type;
     using this_type = CrsGraph<LO, GO, NT>;
     using ivector_type = Vector<int, LO, GO, NT>;
-    using packet_type = typename this_type::packet_type;
 
     const char* prefix = "Tpetra::CrsGraph::transferAndFillComplete: ";
 
@@ -7479,7 +7539,8 @@ namespace Tpetra {
     std::swap(graph.importer_, this->importer_);
     std::swap(graph.exporter_, this->exporter_);
 
-    std::swap(graph.lclGraph_, this->lclGraph_);
+    std::swap(graph.rowPtrsPacked_dev_, this->rowPtrsPacked_dev_);
+    std::swap(graph.rowPtrsPacked_host_, this->rowPtrsPacked_host_);
 
     std::swap(graph.nodeMaxNumRowEntries_, this->nodeMaxNumRowEntries_);
 
@@ -7488,10 +7549,12 @@ namespace Tpetra {
 
     std::swap(graph.numAllocForAllRows_, this->numAllocForAllRows_);
 
-    std::swap(graph.k_rowPtrs_, this->k_rowPtrs_);
+    std::swap(graph.rowPtrsUnpacked_dev_, this->rowPtrsUnpacked_dev_);
+    std::swap(graph.rowPtrsUnpacked_host_, this->rowPtrsUnpacked_host_);
 
-    std::swap(graph.k_lclInds1D_, this->k_lclInds1D_);
-    std::swap(graph.k_gblInds1D_, this->k_gblInds1D_);
+    std::swap(graph.lclIndsUnpacked_wdv, this->lclIndsUnpacked_wdv);
+    std::swap(graph.gblInds_wdv, this->gblInds_wdv);
+    std::swap(graph.lclIndsPacked_wdv, this->lclIndsPacked_wdv);
 
     std::swap(graph.storageStatus_, this->storageStatus_);
 
@@ -7590,82 +7653,86 @@ namespace Tpetra {
     }
 
     // Compare this->k_rowPtrs_ isa Kokkos::View<LocalOrdinal*, ...>
-    output = this->k_rowPtrs_.extent(0) == graph.k_rowPtrs_.extent(0) ? output : false;
-    if(output && this->k_rowPtrs_.extent(0) > 0)
+    output = this->rowPtrsUnpacked_host_.extent(0) == graph.rowPtrsUnpacked_host_.extent(0) ? output : false;
+    if(output && this->rowPtrsUnpacked_host_.extent(0) > 0)
     {
-      typename local_graph_type::row_map_type::const_type::HostMirror k_rowPtrs_host_this = Kokkos::create_mirror_view(this->k_rowPtrs_);
-      typename local_graph_type::row_map_type::const_type::HostMirror k_rowPtrs_host_graph= Kokkos::create_mirror_view(graph.k_rowPtrs_);
-      Kokkos::deep_copy(k_rowPtrs_host_this, this->k_rowPtrs_);
-      Kokkos::deep_copy(k_rowPtrs_host_graph, graph.k_rowPtrs_);
-      for(size_t i=0; output && i<k_rowPtrs_host_this.extent(0); i++)
-        output = k_rowPtrs_host_this(i) == k_rowPtrs_host_graph(i) ? output : false;
+      auto rowPtrsThis = this->rowPtrsUnpacked_host_;
+      auto rowPtrsGraph = graph.rowPtrsUnpacked_host_;
+      for(size_t i=0; output && i<rowPtrsThis.extent(0); i++)
+        output = rowPtrsThis(i) == rowPtrsGraph(i) ? output : false;
     }
 
-    // Compare k_lclInds1D_ isa Kokkos::View<LocalOrdinal*, ...>
-    output = this->k_lclInds1D_.extent(0) == graph.k_lclInds1D_.extent(0) ? output : false;
-    if(output && this->k_lclInds1D_.extent(0) > 0)
+    // Compare lclIndsUnpacked_wdv isa Kokkos::View<LocalOrdinal*, ...>
+    output = this->lclIndsUnpacked_wdv.extent(0) == graph.lclIndsUnpacked_wdv.extent(0) ? output : false;
+    if(output && this->lclIndsUnpacked_wdv.extent(0) > 0)
     {
-      typename local_graph_type::entries_type::non_const_type::HostMirror k_lclInds1D_host_this = Kokkos::create_mirror_view(this->k_lclInds1D_);
-      typename local_graph_type::entries_type::non_const_type::HostMirror k_lclInds1D_host_graph= Kokkos::create_mirror_view(graph.k_lclInds1D_);
-      Kokkos::deep_copy(k_lclInds1D_host_this, this->k_lclInds1D_);
-      Kokkos::deep_copy(k_lclInds1D_host_graph, graph.k_lclInds1D_);
-      for(size_t i=0; output && i < k_lclInds1D_host_this.extent(0); i++)
-        output = k_lclInds1D_host_this(i) == k_lclInds1D_host_graph(i) ? output : false;
+      auto indThis = this->lclIndsUnpacked_wdv.getHostView(Access::ReadOnly);
+      auto indGraph = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadOnly);
+      for(size_t i=0; output && i < indThis.extent(0); i++)
+        output = indThis(i) == indGraph(i) ? output : false;
     }
 
-    // Compare k_gblInds1D_ isa Kokkos::View<GlobalOrdinal*, ...>
-    output = this->k_gblInds1D_.extent(0) == graph.k_gblInds1D_.extent(0) ? output : false;
-    if(output && this->k_gblInds1D_.extent(0) > 0)
+    // Compare gblInds_wdv isa Kokkos::View<GlobalOrdinal*, ...>
+    output = this->gblInds_wdv.extent(0) == graph.gblInds_wdv.extent(0) ? output : false;
+    if(output && this->gblInds_wdv.extent(0) > 0)
     {
-      typename t_GlobalOrdinal_1D::HostMirror k_gblInds1D_host_this  = Kokkos::create_mirror_view(this->k_gblInds1D_);
-      typename t_GlobalOrdinal_1D::HostMirror k_gblInds1D_host_graph = Kokkos::create_mirror_view(graph.k_gblInds1D_);
-      Kokkos::deep_copy(k_gblInds1D_host_this, this->k_gblInds1D_);
-      Kokkos::deep_copy(k_gblInds1D_host_graph, graph.k_gblInds1D_);
-      for(size_t i=0; output && i<k_gblInds1D_host_this.extent(0); i++)
-        output = k_gblInds1D_host_this(i) == k_gblInds1D_host_graph(i) ? output : false;
+      auto indtThis = this->gblInds_wdv.getHostView(Access::ReadOnly);
+      auto indtGraph = graph.gblInds_wdv.getHostView(Access::ReadOnly);
+      for(size_t i=0; output && i<indtThis.extent(0); i++)
+        output = indtThis(i) == indtGraph(i) ? output : false;
     }
 
-    // Check lclGraph_      // isa Kokkos::StaticCrsGraph<LocalOrdinal, Kokkos::LayoutLeft, execution_space>
+    // Check the local graph (from getLocalGraphHost()), which isa
+    // Kokkos::StaticCrsGraph<LocalOrdinal, Kokkos::LayoutLeft, execution_space>
     // Kokkos::StaticCrsGraph has 3 data members in it:
-    //   Kokkos::View<size_type*, ...> row_map            (local_graph_type::row_map_type)
-    //   Kokkos::View<data_type*, ...> entries            (local_graph_type::entries_type)
-    //   Kokkos::View<size_type*, ...> row_block_offsets  (local_graph_type::row_block_type)
-    // There is currently no Kokkos::StaticCrsGraph comparison function that's built-in, so we will just compare
-    // the three data items here. This can be replaced if Kokkos ever puts in its own comparison routine.
-    output = this->lclGraph_.row_map.extent(0) == graph.lclGraph_.row_map.extent(0) ? output : false;
-    if(output && this->lclGraph_.row_map.extent(0) > 0)
+    //   Kokkos::View<size_type*, ...> row_map            
+    //           (local_graph_device_type::row_map_type)
+    //   Kokkos::View<data_type*, ...> entries            
+    //           (local_graph_device_type::entries_type)
+    //   Kokkos::View<size_type*, ...> row_block_offsets  
+    //           (local_graph_device_type::row_block_type)
+    // There is currently no Kokkos::StaticCrsGraph comparison function 
+    // that's built-in, so we will just compare
+    // the three data items here. This can be replaced if Kokkos ever 
+    // puts in its own comparison routine.
+    local_graph_host_type thisLclGraph = this->getLocalGraphHost();
+    local_graph_host_type graphLclGraph = graph.getLocalGraphHost();
+
+    output = thisLclGraph.row_map.extent(0) == graphLclGraph.row_map.extent(0) 
+           ? output : false;
+    if(output && thisLclGraph.row_map.extent(0) > 0)
     {
-      typename local_graph_type::row_map_type::HostMirror lclGraph_rowmap_host_this  = Kokkos::create_mirror_view(this->lclGraph_.row_map);
-      typename local_graph_type::row_map_type::HostMirror lclGraph_rowmap_host_graph = Kokkos::create_mirror_view(graph.lclGraph_.row_map);
-      Kokkos::deep_copy(lclGraph_rowmap_host_this, this->lclGraph_.row_map);
-      Kokkos::deep_copy(lclGraph_rowmap_host_graph, graph.lclGraph_.row_map);
-      for(size_t i=0; output && i<lclGraph_rowmap_host_this.extent(0); i++)
-        output = lclGraph_rowmap_host_this(i) == lclGraph_rowmap_host_graph(i) ? output : false;
+      auto lclGraph_rowmap_host_this = thisLclGraph.row_map;
+      auto lclGraph_rowmap_host_graph = graphLclGraph.row_map;
+      for (size_t i=0; output && i < lclGraph_rowmap_host_this.extent(0); i++)
+        output = lclGraph_rowmap_host_this(i) == lclGraph_rowmap_host_graph(i)
+               ? output : false;
     }
 
-    output = this->lclGraph_.entries.extent(0) == graph.lclGraph_.entries.extent(0) ? output : false;
-    if(output && this->lclGraph_.entries.extent(0) > 0)
+    output = thisLclGraph.entries.extent(0) == graphLclGraph.entries.extent(0)
+           ? output : false;
+    if(output && thisLclGraph.entries.extent(0) > 0)
     {
-      typename local_graph_type::entries_type::HostMirror lclGraph_entries_host_this = Kokkos::create_mirror_view(this->lclGraph_.entries);
-      typename local_graph_type::entries_type::HostMirror lclGraph_entries_host_graph = Kokkos::create_mirror_view(graph.lclGraph_.entries);
-      Kokkos::deep_copy(lclGraph_entries_host_this, this->lclGraph_.entries);
-      Kokkos::deep_copy(lclGraph_entries_host_graph, graph.lclGraph_.entries);
-      for(size_t i=0; output && i<lclGraph_entries_host_this.extent(0); i++)
-        output = lclGraph_entries_host_this(i) == lclGraph_entries_host_graph(i) ? output : false;
+      auto lclGraph_entries_host_this = thisLclGraph.entries;
+      auto lclGraph_entries_host_graph = graphLclGraph.entries;
+      for (size_t i=0; output && i < lclGraph_entries_host_this.extent(0); i++)
+        output = lclGraph_entries_host_this(i) == lclGraph_entries_host_graph(i)
+               ? output : false;
     }
 
-    output = this->lclGraph_.row_block_offsets.extent(0) == graph.lclGraph_.row_block_offsets.extent(0) ? output : false;
-    if(output && this->lclGraph_.row_block_offsets.extent(0) > 0)
+    output = 
+      thisLclGraph.row_block_offsets.extent(0) == 
+      graphLclGraph.row_block_offsets.extent(0) ? output : false;
+    if(output && thisLclGraph.row_block_offsets.extent(0) > 0)
     {
-      typename local_graph_type::row_block_type::HostMirror lclGraph_rbo_host_this = Kokkos::create_mirror_view(this->lclGraph_.row_block_offsets);
-      typename local_graph_type::row_block_type::HostMirror lclGraph_rbo_host_graph = Kokkos::create_mirror_view(graph.lclGraph_.row_block_offsets);
-      Kokkos::deep_copy(lclGraph_rbo_host_this, this->lclGraph_.row_block_offsets);
-      Kokkos::deep_copy(lclGraph_rbo_host_graph, graph.lclGraph_.row_block_offsets);
-      for(size_t i=0; output && i < lclGraph_rbo_host_this.extent(0); i++)
-        output = lclGraph_rbo_host_this(i) == lclGraph_rbo_host_graph(i) ? output : false;
+      auto lclGraph_rbo_host_this = thisLclGraph.row_block_offsets;
+      auto lclGraph_rbo_host_graph = graphLclGraph.row_block_offsets;
+      for (size_t i=0; output && i < lclGraph_rbo_host_this.extent(0); i++)
+        output = lclGraph_rbo_host_this(i) == lclGraph_rbo_host_graph(i) 
+               ? output : false;
     }
 
-    // For the Importer and Exporter, we shouldn't need to explicitly check them since
+    // For Importer and Exporter, we don't need to explicitly check them since
     // they will be consistent with the maps.
     // Note: importer_  isa Teuchos::RCP<const import_type>
     //       exporter_  isa Teuchos::RCP<const export_type>
diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrixMultiplyOp.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrixMultiplyOp.hpp
index c88e36486e99..f025f39c65d6 100644
--- a/packages/tpetra/core/src/Tpetra_CrsMatrixMultiplyOp.hpp
+++ b/packages/tpetra/core/src/Tpetra_CrsMatrixMultiplyOp.hpp
@@ -108,8 +108,8 @@ namespace Tpetra {
     using map_type = Map<LocalOrdinal, GlobalOrdinal, Node>;
 
   private:
-    using local_matrix_type =
-      typename crs_matrix_type::local_matrix_type;
+    using local_matrix_device_type =
+      typename crs_matrix_type::local_matrix_device_type;
 
   public:
     //! @name Constructor and destructor
@@ -121,7 +121,8 @@ namespace Tpetra {
     ///   <tt>Operator<Scalar, ...></tt>.
     CrsMatrixMultiplyOp (const Teuchos::RCP<const crs_matrix_type>& A) :
       matrix_ (A),
-      localMultiply_ (std::make_shared<local_matrix_type> (A->getLocalMatrix ()))
+      localMultiply_ (std::make_shared<local_matrix_device_type> (
+                                       A->getLocalMatrixDevice ()))
     {}
 
     //! Destructor (virtual for memory safety of derived classes).
diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp
index 4172331ba017..ec397c885c3d 100644
--- a/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp
@@ -429,16 +429,6 @@ namespace Tpetra {
 
     //! The type of each entry in the matrix.
     using scalar_type = Scalar;
-    /// \brief The type used internally in place of \c Scalar.
-    ///
-    /// Some \c Scalar types might not work with Kokkos on all
-    /// execution spaces, due to missing CUDA device macros or
-    /// volatile overloads.  The C++ standard type std::complex<T> has
-    /// this problem.  To fix this, we replace std::complex<T> values
-    /// internally with the (usually) bitwise identical type
-    /// Kokkos::complex<T>.  The latter is the \c impl_scalar_type
-    /// corresponding to \c Scalar = std::complex.
-    using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
     //! The type of each local index in the matrix.
     using local_ordinal_type = LocalOrdinal;
     //! The type of each global index in the matrix.
@@ -454,13 +444,6 @@ namespace Tpetra {
     /// See e.g., GitHub Issue #57.
     using node_type = Node;
 
-    /// \brief Type of a norm result.
-    ///
-    /// This is usually the same as the type of the magnitude
-    /// (absolute value) of <tt>Scalar</tt>, but may differ for
-    /// certain <tt>Scalar</tt> types.
-    using mag_type = typename Kokkos::ArithTraits<impl_scalar_type>::mag_type;
-
     //! The Map specialization suitable for this CrsMatrix specialization.
     using map_type = Map<LocalOrdinal, GlobalOrdinal, Node>;
 
@@ -470,20 +453,50 @@ namespace Tpetra {
     //! The Export specialization suitable for this CrsMatrix specialization.
     using export_type = Export<LocalOrdinal, GlobalOrdinal, Node>;
 
+    //! The RowMatrix specialization that is the base class of this CrsMatrix.
+    using row_matrix_type = RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
+
+    /// \brief The type used internally in place of \c Scalar.
+    ///
+    /// Some \c Scalar types might not work with Kokkos on all
+    /// execution spaces, due to missing CUDA device macros or
+    /// volatile overloads.  The C++ standard type std::complex<T> has
+    /// this problem.  To fix this, we replace std::complex<T> values
+    /// internally with the (usually) bitwise identical type
+    /// Kokkos::complex<T>.  The latter is the \c impl_scalar_type
+    /// corresponding to \c Scalar = std::complex.
+    using impl_scalar_type = typename row_matrix_type::impl_scalar_type;
+    /// \brief Type of a norm result.
+    ///
+    /// This is usually the same as the type of the magnitude
+    /// (absolute value) of <tt>Scalar</tt>, but may differ for
+    /// certain <tt>Scalar</tt> types.
+    using mag_type = typename Kokkos::ArithTraits<impl_scalar_type>::mag_type;
+
     //! The CrsGraph specialization suitable for this CrsMatrix specialization.
     using crs_graph_type = CrsGraph<LocalOrdinal, GlobalOrdinal, Node>;
 
     //! The part of the sparse matrix's graph on each MPI process.
-    using local_graph_type = typename crs_graph_type::local_graph_type;
+    using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
+    using local_graph_host_type = typename crs_graph_type::local_graph_host_type;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+    using local_graph_type = local_graph_device_type;
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
 
     /// \brief The specialization of Kokkos::CrsMatrix that represents
     ///   the part of the sparse matrix on each MPI process.
-    using local_matrix_type =
+    using local_matrix_device_type =
       KokkosSparse::CrsMatrix<impl_scalar_type,
                               local_ordinal_type,
                               device_type,
                               void,
-                              typename local_graph_type::size_type>;
+                              typename local_graph_device_type::size_type>;
+    using local_matrix_host_type = 
+          typename local_matrix_device_type::HostMirror;
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+    using local_matrix_type = local_matrix_device_type;
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
 
     /// \brief The type of the local matrix-vector operator (a wrapper of \c KokkosSparse::CrsMatrix )
     using local_multiply_op_type =
@@ -606,6 +619,38 @@ namespace Tpetra {
                const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
 
 
+    /// \brief Constructor specifying a matrix and a previously
+    ///   constructed graph, presumably a subset of the matrix's graph.
+    ///   This matrix will alias the first N values of the passed-in
+    ///   matrix, where N is the number of entries in the graph.
+    ///
+    /// Calling this constructor fixes the graph structure of the
+    /// sparse matrix.  We say in this case that the matrix has a
+    /// "static graph."  If you create a CrsMatrix with this
+    /// constructor, you are not allowed to insert new entries into
+    /// the matrix, but you are allowed to change values in the
+    /// matrix.
+    ///
+    /// The given graph must be fill complete.  Note that calling
+    /// resumeFill() on the graph makes it not fill complete, even if
+    /// you had previously called fillComplete() on the graph.  In
+    /// that case, you must call fillComplete() on the graph again
+    /// before invoking this CrsMatrix constructor.
+    ///
+    /// This constructor is marked \c explicit so that you can't
+    /// create a CrsMatrix by accident when passing a CrsGraph into a
+    /// function that takes a CrsMatrix.
+    ///
+    /// \param matrix [in] The existing matrix whose values this one will alias.
+    /// \param graph [in] The graph structure of the sparse matrix.
+    ///   The graph <i>must</i> be fill complete.
+    /// \param params [in/out] Optional list of parameters.  If not
+    ///   null, any missing parameters will be filled in with their
+    ///   default values.
+    explicit CrsMatrix (CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& matrix,
+                        const Teuchos::RCP<const crs_graph_type>& graph,
+                        const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
+
     /// \brief Constructor specifying a previously constructed graph.
     ///
     /// Calling this constructor fixes the graph structure of the
@@ -662,7 +707,7 @@ namespace Tpetra {
     ///   null, any missing parameters will be filled in with their
     ///   default values.
     explicit CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
-                        const typename local_matrix_type::values_type& values,
+                        const typename local_matrix_device_type::values_type& values,
                         const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
 
     /// \brief Constructor specifying column Map and arrays containing
@@ -693,9 +738,9 @@ namespace Tpetra {
     ///   default values.
     CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
                const Teuchos::RCP<const map_type>& colMap,
-               const typename local_matrix_type::row_map_type& rowPointers,
-               const typename local_graph_type::entries_type::non_const_type& columnIndices,
-               const typename local_matrix_type::values_type& values,
+               const typename local_graph_device_type::row_map_type& rowPointers,
+               const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
+               const typename local_matrix_device_type::values_type& values,
                const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
 
     /// \brief Constructor specifying column Map and arrays containing
@@ -754,7 +799,7 @@ namespace Tpetra {
     ///   default values.
     CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
                const Teuchos::RCP<const map_type>& colMap,
-               const local_matrix_type& lclMatrix,
+               const local_matrix_device_type& lclMatrix,
                const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
 
     /// \brief Constructor specifying column, domain and range Maps,
@@ -787,7 +832,7 @@ namespace Tpetra {
     /// \param params [in/out] Optional list of parameters.  If not
     ///   null, any missing parameters will be filled in with their
     ///   default values.
-    CrsMatrix (const local_matrix_type& lclMatrix,
+    CrsMatrix (const local_matrix_device_type& lclMatrix,
                const Teuchos::RCP<const map_type>& rowMap,
                const Teuchos::RCP<const map_type>& colMap,
                const Teuchos::RCP<const map_type>& domainMap = Teuchos::null,
@@ -798,7 +843,7 @@ namespace Tpetra {
     ///   \param lclMatrix [in] In almost all cases the local matrix
     ///     must be sorted on input, but if it isn't sorted,
     ///     "sorted" must be set to false in params.
-    CrsMatrix (const local_matrix_type& lclMatrix,
+    CrsMatrix (const local_matrix_device_type& lclMatrix,
                const Teuchos::RCP<const map_type>& rowMap,
                const Teuchos::RCP<const map_type>& colMap,
                const Teuchos::RCP<const map_type>& domainMap,
@@ -1025,7 +1070,7 @@ namespace Tpetra {
                              const RowInfo& rowInfo,
                              const GlobalOrdinal inds[],
                              const impl_scalar_type newVals[],
-                             const LocalOrdinal numElts) const;
+                             const LocalOrdinal numElts);
 
   public:
     /// \brief Replace one or more entries' values, using global indices.
@@ -1068,14 +1113,14 @@ namespace Tpetra {
     replaceGlobalValues(
       const global_ordinal_type globalRow,
       const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
-      const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals) const;
+      const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals);
 
     /// \brief Overload of replaceGlobalValues (see above), that takes
     ///   Teuchos::ArrayView (host pointers) instead of Kokkos::View.
     LocalOrdinal
     replaceGlobalValues (const GlobalOrdinal globalRow,
                          const Teuchos::ArrayView<const GlobalOrdinal>& cols,
-                         const Teuchos::ArrayView<const Scalar>& vals) const;
+                         const Teuchos::ArrayView<const Scalar>& vals);
 
     /// \brief Overload of replaceGlobalValues (see above), that takes
     ///   raw pointers instead of Kokkos::View.
@@ -1095,7 +1140,7 @@ namespace Tpetra {
     replaceGlobalValues (const GlobalOrdinal globalRow,
                          const LocalOrdinal numEnt,
                          const Scalar vals[],
-                         const GlobalOrdinal cols[]) const;
+                         const GlobalOrdinal cols[]);
 
   private:
     /// \brief Implementation detail of replaceLocalValues.
@@ -1114,7 +1159,7 @@ namespace Tpetra {
                             const RowInfo& rowInfo,
                             const LocalOrdinal inds[],
                             const impl_scalar_type newVals[],
-                            const LocalOrdinal numElts) const;
+                            const LocalOrdinal numElts);
 
   public:
     /// \brief Replace one or more entries' values, using local
@@ -1156,7 +1201,7 @@ namespace Tpetra {
     replaceLocalValues(
       const local_ordinal_type localRow,
       const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
-      const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals) const;
+      const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals);
 
     /// \brief Backwards compatibility version of replaceLocalValues
     ///   (see above), that takes Teuchos::ArrayView (host pointers)
@@ -1164,7 +1209,7 @@ namespace Tpetra {
     LocalOrdinal
     replaceLocalValues (const LocalOrdinal localRow,
                         const Teuchos::ArrayView<const LocalOrdinal>& cols,
-                        const Teuchos::ArrayView<const Scalar>& vals) const;
+                        const Teuchos::ArrayView<const Scalar>& vals);
 
     /// \brief Epetra compatibility version of replaceLocalValues,
     ///   that takes raw pointers instead of Kokkos::View.
@@ -1187,7 +1232,7 @@ namespace Tpetra {
     replaceLocalValues (const LocalOrdinal localRow,
                         const LocalOrdinal numEnt,
                         const Scalar inputVals[],
-                        const LocalOrdinal inputCols[]) const;
+                        const LocalOrdinal inputCols[]);
 
   private:
     /// \brief Whether sumIntoLocalValues and sumIntoGlobalValues
@@ -1231,7 +1276,7 @@ namespace Tpetra {
                              const GlobalOrdinal inds[],
                              const impl_scalar_type newVals[],
                              const LocalOrdinal numElts,
-                             const bool atomic = useAtomicUpdatesByDefault) const;
+                             const bool atomic = useAtomicUpdatesByDefault);
 
   public:
     /// \brief Sum into one or more sparse matrix entries, using
@@ -1325,7 +1370,7 @@ namespace Tpetra {
                             const LocalOrdinal inds[],
                             const impl_scalar_type newVals[],
                             const LocalOrdinal numElts,
-                            const bool atomic = useAtomicUpdatesByDefault) const;
+                            const bool atomic = useAtomicUpdatesByDefault);
 
   public:
     /// \brief Sum into one or more sparse matrix entries, using local
@@ -1369,7 +1414,7 @@ namespace Tpetra {
       const local_ordinal_type localRow,
       const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
       const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
-      const bool atomic = useAtomicUpdatesByDefault) const;
+      const bool atomic = useAtomicUpdatesByDefault);
 
     /// \brief Sum into one or more sparse matrix entries, using local
     ///   row and column indices.
@@ -1404,7 +1449,7 @@ namespace Tpetra {
     sumIntoLocalValues (const LocalOrdinal localRow,
                         const Teuchos::ArrayView<const LocalOrdinal>& cols,
                         const Teuchos::ArrayView<const Scalar>& vals,
-                        const bool atomic = useAtomicUpdatesByDefault) const;
+                        const bool atomic = useAtomicUpdatesByDefault);
 
     /// \brief Epetra compatibility version of sumIntoLocalValues (see
     ///   above) that takes raw pointers instead of Kokkos::View.
@@ -1432,7 +1477,7 @@ namespace Tpetra {
                         const LocalOrdinal numEnt,
                         const Scalar vals[],
                         const LocalOrdinal cols[],
-                        const bool atomic = useAtomicUpdatesByDefault) const;
+                        const bool atomic = useAtomicUpdatesByDefault);
 
   private:
     /// \brief Transform the given values using local indices.
@@ -1473,7 +1518,7 @@ namespace Tpetra {
                           const impl_scalar_type newVals[],
                           const LocalOrdinal numElts,
                           std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
-                          const bool atomic = useAtomicUpdatesByDefault) const;
+                          const bool atomic = useAtomicUpdatesByDefault);
 
     /// \brief Transform the given values using global indices.
     ///
@@ -1513,7 +1558,7 @@ namespace Tpetra {
                            const impl_scalar_type newVals[],
                            const LocalOrdinal numElts,
                            std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
-                           const bool atomic = useAtomicUpdatesByDefault) const;
+                           const bool atomic = useAtomicUpdatesByDefault);
 
     /// \brief Transform the given values using local indices.
     ///
@@ -1547,7 +1592,7 @@ namespace Tpetra {
                           const impl_scalar_type inputVals[],
                           const LocalOrdinal inputCols[],
                           std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
-                          const bool atomic = useAtomicUpdatesByDefault) const;
+                          const bool atomic = useAtomicUpdatesByDefault);
 
     /// \brief Transform the given values using global indices.
     ///
@@ -1581,7 +1626,7 @@ namespace Tpetra {
                            const impl_scalar_type inputVals[],
                            const GlobalOrdinal inputCols[],
                            std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
-                           const bool atomic = useAtomicUpdatesByDefault) const;
+                           const bool atomic = useAtomicUpdatesByDefault);
 
   public:
     /// \brief Transform CrsMatrix entries in place, using local
@@ -1635,7 +1680,7 @@ namespace Tpetra {
                           const typename UnmanagedView<LocalIndicesViewType>::type& inputInds,
                           const typename UnmanagedView<ImplScalarViewType>::type& inputVals,
                           BinaryFunction f,
-                          const bool atomic = useAtomicUpdatesByDefault) const
+                          const bool atomic = useAtomicUpdatesByDefault)
     {
       // We use static_assert here to check the template parameters,
       // rather than std::enable_if (e.g., on the return value, to
@@ -1729,7 +1774,7 @@ namespace Tpetra {
                              InputMemorySpace,
                              Kokkos::MemoryUnmanaged>& inputVals,
                            BinaryFunction f,
-                           const bool atomic = useAtomicUpdatesByDefault) const
+                           const bool atomic = useAtomicUpdatesByDefault)
     {
       typedef LocalOrdinal LO;
       const LO numInputEnt = inputInds.extent (0);
@@ -1775,9 +1820,9 @@ namespace Tpetra {
     ///   shallow copy.  Any method that changes the matrix's values
     ///   may then change this.
     void
-    setAllValues (const typename local_matrix_type::row_map_type& ptr,
-                  const typename local_graph_type::entries_type::non_const_type& ind,
-                  const typename local_matrix_type::values_type& val);
+    setAllValues (const typename local_graph_device_type::row_map_type& ptr,
+                  const typename local_graph_device_type::entries_type::non_const_type& ind,
+                  const typename local_matrix_device_type::values_type& val);
 
     /// \brief Set the local matrix using three (compressed sparse row) arrays.
     ///
@@ -1813,6 +1858,14 @@ namespace Tpetra {
                   Teuchos::ArrayRCP<const LocalOrdinal>& columnIndices,
                   Teuchos::ArrayRCP<const Scalar>& values) const;
 
+    /// Gets just the values array.  This *will* be a shallow copy
+    /// of the array (at least on the host memory space).  This
+    /// is not a const function, since the user can change these
+    /// values.
+    ///
+    /// \param values [out] Array of values. 
+    void getAllValues(Teuchos::ArrayRCP<Scalar>& values);
+
     //@}
     //! @name Transformational methods
     //@{
@@ -2169,9 +2222,14 @@ namespace Tpetra {
     ///   least once.  This method will do no error checking, so you
     ///   are responsible for knowing when it is safe to call this
     ///   method.
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     local_matrix_type getLocalMatrix () const;
+#endif
+    local_matrix_device_type getLocalMatrixDevice () const;
+    local_matrix_host_type getLocalMatrixHost () const;
 
-    /// \brief The local sparse matrix operator (a wrapper of \c getLocalMatrix()
+    /// \brief The local sparse matrix operator 
+    ///   (a wrapper of \c getLocalMatrixDevice()
     ///   that supports local matrix-vector multiply)
     ///
     /// \warning It is only valid to call this method if this->isFillComplete().
@@ -2389,6 +2447,64 @@ namespace Tpetra {
     ///   getGlobalRowView() are valid for this object.
     virtual bool supportsRowViews () const override;
 
+protected:
+    using values_dualv_type =
+          Kokkos::DualView<impl_scalar_type*, device_type>;
+    using values_wdv_type = 
+          Details::WrappedDualView<values_dualv_type>;
+    values_wdv_type valuesUnpacked_wdv;
+    mutable values_wdv_type valuesPacked_wdv;
+
+public:
+
+    using row_ptrs_device_view_type = 
+          typename row_matrix_type::row_ptrs_device_view_type;
+    using row_ptrs_host_view_type = 
+          typename row_matrix_type::row_ptrs_host_view_type;
+
+
+    using local_inds_device_view_type = 
+          typename row_matrix_type::local_inds_device_view_type;
+    using local_inds_host_view_type = 
+          typename row_matrix_type::local_inds_host_view_type;
+    using nonconst_local_inds_host_view_type = 
+          typename row_matrix_type::nonconst_local_inds_host_view_type;
+
+    using global_inds_device_view_type = 
+          typename row_matrix_type::global_inds_device_view_type;
+    using global_inds_host_view_type = 
+          typename row_matrix_type::global_inds_host_view_type;
+    using nonconst_global_inds_host_view_type = 
+          typename row_matrix_type::nonconst_global_inds_host_view_type;
+
+    using values_device_view_type = 
+          typename row_matrix_type::values_device_view_type;
+    using values_host_view_type = 
+          typename row_matrix_type::values_host_view_type;
+    using nonconst_values_host_view_type = 
+          typename row_matrix_type::nonconst_values_host_view_type;
+
+//KDDKDD INROW    using values_host_view_type = 
+//KDDKDD INROW          typename values_dualv_type::t_host::const_type;
+//KDDKDD INROW    using values_device_view_type = 
+//KDDKDD INROW          typename values_dualv_type::t_dev::const_type;
+
+//KDDKDD INROW    using local_inds_host_view_type =
+//KDDKDD INROW          typename crs_graph_type::local_inds_host_view_type;
+//KDDKDD INROW    using local_inds_device_view_type =
+//KDDKDD INROW          typename crs_graph_type::local_inds_device_view_type;
+
+//KDDKDD INROW    using global_inds_host_view_type =
+//KDDKDD INROW          typename crs_graph_type::global_inds_host_view_type;
+//KDDKDD INROW    using global_inds_device_view_type =
+//KDDKDD INROW          typename crs_graph_type::global_inds_device_view_type;
+
+//KDDKDD INROW    using row_ptrs_host_view_type =
+//KDDKDD INROW          typename crs_graph_type::row_ptrs_host_view_type;
+//KDDKDD INROW    using row_ptrs_device_view_type =
+//KDDKDD INROW          typename crs_graph_type::row_ptrs_device_view_type;
+
+
     /// \brief Fill given arrays with a deep copy of the locally owned
     ///   entries of the matrix in a given row, using global column
     ///   indices.
@@ -2437,20 +2553,26 @@ namespace Tpetra {
     /// <tt>GlobalRow</tt> is not owned by the calling process, then
     /// \c Indices and \c Values are unchanged and \c NumIndices is
     /// returned as Teuchos::OrdinalTraits<size_t>::invalid().
+    void
+    getGlobalRowCopy (GlobalOrdinal GlobalRow,
+                      nonconst_global_inds_host_view_type &Indices,
+                      nonconst_values_host_view_type &Values,
+                      size_t& NumEntries) const override;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     void
     getGlobalRowCopy (GlobalOrdinal GlobalRow,
                       const Teuchos::ArrayView<GlobalOrdinal>& Indices,
                       const Teuchos::ArrayView<Scalar>& Values,
                       size_t& NumEntries) const override;
-
+#endif
     /// \brief Fill given arrays with a deep copy of the locally owned
     ///   entries of the matrix in a given row, using local column
     ///   indices.
     ///
-    /// \param localRow   [in]  Local index of the row for which to return entries.
-    /// \param colInds    [out] Local column indices corresponding to values.
-    /// \param vals       [out] Matrix values.
-    /// \param numEntries [out] Number of entries returned.
+    /// \param LocalRow   [in]  Local index of the row for which to return entries.
+    /// \param Indices    [out] Local column indices corresponding to values.
+    /// \param Values       [out] Matrix values.
+    /// \param NumEntries [out] Number of entries returned.
     ///
     /// Note: A std::runtime_error exception is thrown if either
     /// <tt>colInds</tt> or \c vals is not large enough to hold the
@@ -2458,12 +2580,18 @@ namespace Tpetra {
     /// not owned by the calling process, then <tt>colInds</tt> and
     /// <tt>vals</tt> are unchanged and <tt>numEntries</tt> is
     /// returned as Teuchos::OrdinalTraits<size_t>::invalid().
+    void
+    getLocalRowCopy (LocalOrdinal LocalRow,
+                     nonconst_local_inds_host_view_type &Indices,
+                     nonconst_values_host_view_type &Values,
+                     size_t& NumEntries) const override;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     void
     getLocalRowCopy (LocalOrdinal localRow,
                      const Teuchos::ArrayView<LocalOrdinal>& colInds,
                      const Teuchos::ArrayView<Scalar>& vals,
-                     size_t& numEntries) const override;
-
+                     size_t& NumEntries) const override;
+#endif
     /// \brief Get a constant, nonpersisting view of a row of this
     ///   matrix, using global row and column indices.
     ///
@@ -2476,12 +2604,18 @@ namespace Tpetra {
     ///
     /// If \c GlobalRow is not a valid global row index on the calling
     /// process, then \c indices is set to null.
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     void
     getGlobalRowView (GlobalOrdinal GlobalRow,
                       Teuchos::ArrayView<const GlobalOrdinal>& indices,
                       Teuchos::ArrayView<const Scalar>& values) const override;
+#endif  // TPETRA_ENABLE_DEPRECATED_CODE
+    void
+    getGlobalRowView (GlobalOrdinal GlobalRow,
+                      global_inds_host_view_type &indices,
+                      values_host_view_type &values) const override;
 
-    /// \brief Get a constant, nonpersisting view of a row of this
+    /// \brief Get a constant view of a row of this
     ///   matrix, using local row and column indices.
     ///
     /// \param LocalRow [in]  Local index of the row to view.
@@ -2493,11 +2627,19 @@ namespace Tpetra {
     ///
     /// If \c LocalRow is not a valid local row index on the calling
     /// process, then \c indices is set to null.
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     void
     getLocalRowView (LocalOrdinal LocalRow,
                      Teuchos::ArrayView<const LocalOrdinal>& indices,
                      Teuchos::ArrayView<const Scalar>& values) const override;
+#endif  // TPETRA_ENABLE_DEPRECATED_CODE
+
+    void
+    getLocalRowView(LocalOrdinal LocalRow,
+                    local_inds_host_view_type &indices,
+                    values_host_view_type &values) const override;
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     /// \brief Get a constant, nonpersisting, locally indexed view of
     ///   the given row of the matrix, using "raw" pointers instead of
     ///   Teuchos::ArrayView.
@@ -2527,7 +2669,9 @@ namespace Tpetra {
                         LocalOrdinal& numEnt,
                         const LocalOrdinal*& lclColInds,
                         const Scalar*& vals) const override;
+#endif  // TPETRA_ENABLE_DEPRECATED_CODE
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     /// \brief Get a constant, nonpersisting view of a row of this
     ///   matrix, using local row and column indices, with raw
     ///   pointers.
@@ -2556,6 +2700,7 @@ namespace Tpetra {
                      LocalOrdinal& numEnt,
                      const impl_scalar_type*& val,
                      const LocalOrdinal*& ind) const;
+#endif  // TPETRA_ENABLE_DEPRECATED_CODE
 
     /// \brief Get a constant, nonpersisting view of a row of this
     ///   matrix, using local row and column indices, with raw
@@ -2564,6 +2709,7 @@ namespace Tpetra {
     /// This overload exists only if Scalar differs from
     /// impl_scalar_type.  In that case, this overload takes a Scalar
     /// pointer.
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     template<class OutputScalarType>
     typename std::enable_if<! std::is_same<OutputScalarType, impl_scalar_type>::value &&
                             std::is_convertible<impl_scalar_type, OutputScalarType>::value,
@@ -2580,6 +2726,7 @@ namespace Tpetra {
       val = reinterpret_cast<const OutputScalarType*> (valTmp);
       return err;
     }
+#endif  // TPETRA_ENABLE_DEPRECATED_CODE
 
     /// \brief Get a copy of the diagonal entries of the matrix.
     ///
@@ -3207,8 +3354,9 @@ namespace Tpetra {
 
   public:
     //! Get the Kokkos local values
-    typename local_matrix_type::values_type getLocalValuesView () const {
-      return k_values1D_;
+    typename local_matrix_device_type::values_type getLocalValuesView () const {
+// KDDKDD UVM SHOULD ADD ACCESS TAGS; SAFEST TO ASSUME ReadWrite FOR NOW
+      return valuesPacked_wdv.getDeviceView(Access::ReadWrite);
     }
 
   private:
@@ -3515,7 +3663,7 @@ namespace Tpetra {
                            const Teuchos::ArrayView<const GlobalOrdinal>& indices,
                            const Teuchos::ArrayView<const Scalar>& values,
                            BinaryFunction f,
-                           const bool atomic = useAtomicUpdatesByDefault) const
+                           const bool atomic = useAtomicUpdatesByDefault)
     {
       typedef impl_scalar_type IST;
       typedef LocalOrdinal LO;
@@ -3781,76 +3929,27 @@ namespace Tpetra {
 
     // matrix data accessors
 
-    /// \brief Const pointer to all entries (including extra space) in
-    ///   the given row.
-    ///
-    /// Unlike getGlobalRowView(), this method returns
-    /// <tt>impl_scalar_type</tt>, not \c Scalar.  This is because
-    /// this method is <i>not</i> part of the public interface of
-    /// CrsMatrix.
-    ///
-    /// \param vals [out] On output: Const pointer to all entries,
-    ///   including any extra space, in the given row.  \c numEnt
-    ///   includes the empty space, if any.
-    /// \param numEnt [out] Number of available entries in the row.
-    ///   "Available" includes extra empty space, if any.
-    /// \param rowinfo [in] Result of getRowInfo (for a local row
-    ///   index) or getRowInfoFromGlobalRowIndex (for a global row
-    ///   index) for the row.
-    ///
-    /// \return Zero if no error, else a nonzero error code.
-    LocalOrdinal
-    getViewRawConst (const impl_scalar_type*& vals,
-                     LocalOrdinal& numEnt,
-                     const RowInfo& rowinfo) const;
+    /// \brief Get a const host view of the locally owned values in
+    ///   row myRow, such that rowinfo = getRowInfo(myRow).
+    typename values_dualv_type::t_host::const_type
+    getValuesViewHost (const RowInfo& rowinfo) const;
 
-    /// \brief Nonconst pointer to all entries (including extra space)
-    ///   in the given row.
-    ///
-    /// Unlike getGlobalRowView(), this method returns
-    /// <tt>impl_scalar_type</tt>, not \c Scalar.  This is because
-    /// this method is <i>not</i> part of the public interface of
-    /// CrsMatrix.
-    ///
-    /// \param vals [out] On output: Const pointer to all entries,
-    ///   including any extra space, in the given row.  \c numEnt
-    ///   includes the empty space, if any.
-    /// \param numEnt [out] Number of available entries in the row.
-    ///   "Available" includes extra empty space, if any.
-    /// \param rowinfo [in] Result of getRowInfo (for a local row
-    ///   index) or getRowInfoFromGlobalRowIndex (for a global row
-    ///   index) for the row.
-    ///
-    /// \return Zero if no error, else a nonzero error code.
-    LocalOrdinal
-    getViewRaw (impl_scalar_type*& vals,
-                LocalOrdinal& numEnt,
-                const RowInfo& rowinfo) const;
+    /// \brief Get a const device view of the locally owned values in
+    ///   row myRow, such that rowinfo = getRowInfo(myRow).
+    typename values_dualv_type::t_dev::const_type
+    getValuesViewDevice (const RowInfo& rowinfo) const;
 
-    /// \brief Constant view of all entries (including extra space) in
-    ///   the given row.
-    ///
-    /// Unlike getGlobalRowView(), this method returns
-    /// <tt>impl_scalar_type</tt>, not \c Scalar.  This is because
-    /// this method is <i>not</i> part of the public interface of
-    /// CrsMatrix.
-    Teuchos::ArrayView<const impl_scalar_type> getView (RowInfo rowinfo) const;
+    /// \brief Get a non-const host view of the locally owned values in
+    ///   row myRow, such that rowinfo = getRowInfo(myRow).
+    typename values_dualv_type::t_host
+    getValuesViewHostNonConst (const RowInfo& rowinfo);
 
-    /// \brief Nonconst view of all entries (including extra space) in
-    ///   the given row.
-    ///
-    /// Unlike getGlobalRowView(), this method returns
-    /// <tt>impl_scalar_type</tt>, not \c Scalar.  This is because
-    /// this method is <i>not</i> part of the public interface of
-    /// CrsMatrix.
-    ///
-    /// This method is \c const because it doesn't change allocations
-    /// (and thus doesn't change pointers).  Consider the difference
-    /// between <tt>const double*</tt> and <tt>double* const</tt>.
-    Teuchos::ArrayView<impl_scalar_type>
-    getViewNonConst (const RowInfo& rowinfo) const;
+    /// \brief Get a non-const device view of the locally owned values in
+    ///   row myRow, such that rowinfo = getRowInfo(myRow).
+    typename values_dualv_type::t_dev
+    getValuesViewDeviceNonConst (const RowInfo& rowinfo);
 
-  private:
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     /// \brief Constant view of all entries (including extra space) in
     ///   the given row.
     ///
@@ -3858,23 +3957,8 @@ namespace Tpetra {
     /// <tt>impl_scalar_type</tt>, not \c Scalar.  This is because
     /// this method is <i>not</i> part of the public interface of
     /// CrsMatrix.
-    Kokkos::View<const impl_scalar_type*, device_type, Kokkos::MemoryUnmanaged>
-    getRowView (const RowInfo& rowInfo) const;
-
-    /// \brief Nonconst view of all entries (including extra space) in
-    ///   the given row.
-    ///
-    /// Unlike getGlobalRowView(), this method returns
-    /// <tt>impl_scalar_type</tt>, not \c Scalar.  This is because
-    /// this method is <i>not</i> part of the public interface of
-    /// CrsMatrix.
-    ///
-    /// This method is \c const because it doesn't change allocations
-    /// (and thus doesn't change pointers).  Consider the difference
-    /// between <tt>const double*</tt> and <tt>double* const</tt>.
-    Kokkos::View<impl_scalar_type*, device_type, Kokkos::MemoryUnmanaged>
-    getRowViewNonConst (const RowInfo& rowInfo) const;
-
+    Teuchos::ArrayView<const impl_scalar_type> getView (RowInfo rowinfo) const;
+#endif
 
   protected:
 
@@ -3923,14 +4007,19 @@ namespace Tpetra {
     //@}
 
     //! The local sparse matrix, wrapped in a multiply operator.
-    std::shared_ptr<local_multiply_op_type> lclMatrix_;
+// KDDKDD DELETE
+//    std::shared_ptr<local_multiply_op_type> lclMatrix_;
+// KDDKDD DELETE
 
     /// \brief Sparse matrix values, as part of compressed sparse row
     ///   ("1-D") storage.
     ///
     /// Before allocation, this array is empty.
-    typename local_matrix_type::values_type k_values1D_;
+// KDDKDD DELETE
+    typename local_matrix_device_type::values_type k_values1D_;
+// KDDKDD DELETE
 
+protected:
     /// \brief Status of the matrix's storage, when not in a
     ///   fill-complete state.
     ///
@@ -3988,17 +4077,20 @@ namespace Tpetra {
     // public inner class of CrsMatrix?  It looks like it doesn't
     // depend on any implementation details of CrsMatrix at all.  It
     // should really be declared and defined outside of CrsMatrix.
-    template<class ViewType, class OffsetViewType>
+    template<class DestViewType, class SrcViewType,
+             class DestOffsetViewType, class SrcOffsetViewType>
     struct pack_functor {
-      typedef typename ViewType::execution_space execution_space;
-      ViewType src_;
-      ViewType dst_;
-      OffsetViewType src_offset_;
-      OffsetViewType dst_offset_;
-      typedef typename OffsetViewType::non_const_value_type scalar_index_type;
-
-      pack_functor (ViewType dst, ViewType src,
-                    OffsetViewType dst_offset, OffsetViewType src_offset) :
+      typedef typename DestViewType::execution_space execution_space;
+      SrcViewType src_;
+      DestViewType dst_;
+      SrcOffsetViewType src_offset_;
+      DestOffsetViewType dst_offset_;
+      typedef typename DestOffsetViewType::non_const_value_type scalar_index_type;
+
+      pack_functor (DestViewType dst, 
+                    const SrcViewType src,
+                    DestOffsetViewType dst_offset,
+                    const SrcOffsetViewType src_offset) :
         src_ (src),
         dst_ (dst),
         src_offset_ (src_offset),
diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
index 653c799484e0..5a380cf1f95e 100644
--- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
@@ -75,6 +75,7 @@
 #include "Teuchos_RCP.hpp"
 #include "Teuchos_DataAccess.hpp"
 #include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
+#include "KokkosBlas.hpp"
 
 #include <memory>
 #include <sstream>
@@ -289,7 +290,7 @@ namespace Tpetra {
     storageStatus_ (Details::STORAGE_1D_PACKED)
   {
     using std::endl;
-    typedef typename local_matrix_type::values_type values_type;
+    typedef typename local_matrix_device_type::values_type values_type;
     const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
       "RCP<ParameterList>]): ";
     const bool verbose = Details::Behavior::verbose("CrsMatrix");
@@ -318,31 +319,21 @@ namespace Tpetra {
     // local matrix's number of columns comes from the column Map, not
     // the domain Map.
 
-    const size_t numCols = graph->getColMap ()->getNodeNumElements ();
-    auto lclGraph = graph->getLocalGraph ();
-    const size_t numEnt = lclGraph.entries.extent (0);
+    const size_t numEnt = graph->lclIndsPacked_wdv.extent (0);
     if (verbose) {
       std::ostringstream os;
       os << *prefix << "Allocate values: " << numEnt << endl;
       std::cerr << os.str ();
     }
-    values_type val ("Tpetra::CrsMatrix::val", numEnt);
 
-    auto lclMat = std::make_shared<local_matrix_type>
-      ("Tpetra::CrsMatrix::lclMatrix_", numCols, val, lclGraph);
-    lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
+    values_type val ("Tpetra::CrsMatrix::values", numEnt);
+    valuesPacked_wdv = values_wdv_type(val);
+    valuesUnpacked_wdv = valuesPacked_wdv;
 
     // FIXME (22 Jun 2016) I would very much like to get rid of
     // k_values1D_ at some point.  I find it confusing to have all
     // these extra references lying around.
-    if (verbose) {
-      std::ostringstream os;
-      os << *prefix << "Assign k_values1D_: old="
-         << k_values1D_.extent(0) << ", new="
-         << lclMat->values.extent(0) << endl;
-      std::cerr << os.str ();
-    }
-    k_values1D_ = lclMat->values;
+//    k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
 
     checkInternalState ();
 
@@ -353,17 +344,49 @@ namespace Tpetra {
     }
   }
 
+  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  CrsMatrix(CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& matrix,
+            const Teuchos::RCP<const crs_graph_type>& graph,
+            const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
+    dist_object_type (graph->getRowMap ()),
+    staticGraph_ (graph),
+    storageStatus_ (matrix.storageStatus_)
+  {
+    // Error-message prefix; it must name THIS constructor's signature.
+    const char tfecfFuncName[] = "CrsMatrix(CrsMatrix&, RCP<const CrsGraph>"
+      "[, RCP<ParameterList>]): ";
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (graph.is_null (), std::runtime_error, "Input graph is null.");
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (! graph->isFillComplete (), std::runtime_error, "Input graph "
+       "is not fill complete. You must call fillComplete on the "
+       "graph before using it to construct a CrsMatrix.  Note that "
+       "calling resumeFill on the graph makes it not fill complete, "
+       "even if you had previously called fillComplete.  In that "
+       "case, you must call fillComplete on the graph again.");
+    // Packed values: built from matrix's packed values, sized by graph.
+    size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
+    valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
+    // Unpacked values: same pattern (presumably a [0,n) subview; confirm).
+    size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
+    valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
+
+    checkInternalState();
+  }
+
+
   template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
-             const typename local_matrix_type::values_type& values,
+             const typename local_matrix_device_type::values_type& values,
              const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
     dist_object_type (graph->getRowMap ()),
     staticGraph_ (graph),
     storageStatus_ (Details::STORAGE_1D_PACKED)
   {
     const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
-      "local_matrix_type::values_type, "
+      "local_matrix_device_type::values_type, "
       "[,RCP<ParameterList>]): ";
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
       (graph.is_null (), std::runtime_error, "Input graph is null.");
@@ -381,17 +404,14 @@ namespace Tpetra {
     // local matrix's number of columns comes from the column Map, not
     // the domain Map.
 
-    const size_t numCols = graph->getColMap ()->getNodeNumElements ();
-    auto lclGraph = graph->getLocalGraph ();
-
-    auto lclMat = std::make_shared<local_matrix_type>
-      ("Tpetra::CrsMatrix::lclMatrix_", numCols, values, lclGraph);
-    lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
+    valuesPacked_wdv = values_wdv_type(values);
+    valuesUnpacked_wdv = valuesPacked_wdv;
 
     // FIXME (22 Jun 2016) I would very much like to get rid of
     // k_values1D_ at some point.  I find it confusing to have all
     // these extra references lying around.
-    k_values1D_ = lclMat->values;
+    // KDDKDD ALMOST THERE, MARK!
+//    k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
 
     checkInternalState ();
   }
@@ -400,9 +420,9 @@ namespace Tpetra {
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
              const Teuchos::RCP<const map_type>& colMap,
-             const typename local_matrix_type::row_map_type& rowPointers,
-             const typename local_graph_type::entries_type::non_const_type& columnIndices,
-             const typename local_matrix_type::values_type& values,
+             const typename local_graph_device_type::row_map_type& rowPointers,
+             const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
+             const typename local_matrix_device_type::values_type& values,
              const Teuchos::RCP<Teuchos::ParameterList>& params) :
     dist_object_type (rowMap),
     storageStatus_ (Details::STORAGE_1D_PACKED)
@@ -465,7 +485,7 @@ namespace Tpetra {
     // deep-copies or shallow-copies the input, but the dimensions
     // have to be right.  That's how we tell whether the CrsGraph has
     // a local graph.
-    auto lclGraph = graph->getLocalGraph ();
+    auto lclGraph = graph->getLocalGraphDevice ();
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
       (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
        lclGraph.entries.extent (0) != columnIndices.extent (0),
@@ -491,24 +511,13 @@ namespace Tpetra {
     // Note that the local matrix's number of columns comes from the
     // column Map, not the domain Map.
 
-    const size_t numCols = graph->getColMap ()->getNodeNumElements ();
-
-    auto lclMat = std::make_shared<local_matrix_type>
-      ("Tpetra::CrsMatrix::lclMatrix_", numCols, values, lclGraph);
-    lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
-
-    auto newValues = lclMat->values;
-    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-      (newValues.extent (0) != values.extent (0),
-       std::logic_error, "Local matrix's constructor did not set the "
-       "values correctly.  newValues.extent(0) = " <<
-       newValues.extent (0) << " != values.extent(0) = " <<
-       values.extent (0) << suffix);
+    valuesPacked_wdv = values_wdv_type(values);
+    valuesUnpacked_wdv = valuesPacked_wdv;
 
     // FIXME (22 Jun 2016) I would very much like to get rid of
     // k_values1D_ at some point.  I find it confusing to have all
     // these extra references lying around.
-    this->k_values1D_ = newValues;
+//    this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
 
     checkInternalState ();
     if (verbose) {
@@ -532,7 +541,7 @@ namespace Tpetra {
     using Kokkos::Compat::getKokkosViewDeepCopy;
     using Teuchos::av_reinterpret_cast;
     using Teuchos::RCP;
-    using values_type = typename local_matrix_type::values_type;
+    using values_type = typename local_matrix_device_type::values_type;
     using IST = impl_scalar_type;
     const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
       "RCP<const Map>, ptr, ind, val[, params]): ";
@@ -565,7 +574,7 @@ namespace Tpetra {
     // really care whether CrsGraph's constructor deep-copies or
     // shallow-copies the input, but the dimensions have to be right.
     // That's how we tell whether the CrsGraph has a local graph.
-    auto lclGraph = staticGraph_->getLocalGraph ();
+    auto lclGraph = staticGraph_->getLocalGraphDevice ();
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
       (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
        size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
@@ -573,19 +582,15 @@ namespace Tpetra {
        "ptr, ind[, params]) did not set the local graph correctly.  "
        "Please report this bug to the Tpetra developers.");
 
-    const size_t numCols =
-      staticGraph_->getColMap ()->getNodeNumElements ();
     values_type valIn =
       getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
-
-    auto lclMat = std::make_shared<local_matrix_type>
-      ("Tpetra::CrsMatrix::lclMatrix_", numCols, valIn, lclGraph);
-    lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
+    valuesPacked_wdv = values_wdv_type(valIn);
+    valuesUnpacked_wdv = valuesPacked_wdv;
 
     // FIXME (22 Jun 2016) I would very much like to get rid of
     // k_values1D_ at some point.  I find it confusing to have all
     // these extra references lying around.
-    this->k_values1D_ = lclMat->values;
+//    this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
 
     checkInternalState ();
   }
@@ -594,17 +599,14 @@ namespace Tpetra {
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
              const Teuchos::RCP<const map_type>& colMap,
-             const local_matrix_type& lclMatrix,
+             const local_matrix_device_type& lclMatrix,
              const Teuchos::RCP<Teuchos::ParameterList>& params) :
     dist_object_type (rowMap),
-    lclMatrix_ (std::make_shared<local_multiply_op_type>
-                (std::make_shared<local_matrix_type> (lclMatrix))),
-    k_values1D_ (lclMatrix.values),
     storageStatus_ (Details::STORAGE_1D_PACKED),
     fillComplete_ (true)
   {
     const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
-      "RCP<const Map>, local_matrix_type[, RCP<ParameterList>]): ";
+      "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
     const char suffix[] =
       "  Please report this bug to the Tpetra developers.";
 
@@ -616,12 +618,12 @@ namespace Tpetra {
     catch (std::exception& e) {
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
-         "RCP<const Map>, local_graph_type[, RCP<ParameterList>]) threw an "
+         "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
          "exception: " << e.what ());
     }
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
       (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
-       "<const Map>, RCP<const Map>, local_graph_type[, RCP<ParameterList>]) "
+       "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
        "did not produce a fill-complete graph.  Please report this bug to the "
        "Tpetra developers.");
     // myGraph_ not null means that the matrix owns the graph.  This
@@ -630,6 +632,11 @@ namespace Tpetra {
     myGraph_ = graph;
     staticGraph_ = graph;
 
+    valuesPacked_wdv = values_wdv_type(lclMatrix.values);
+    valuesUnpacked_wdv = valuesPacked_wdv;
+
+//    k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
+
     const bool callComputeGlobalConstants = params.get () == nullptr ||
       params->get ("compute global constants", true);
     if (callComputeGlobalConstants) {
@@ -649,22 +656,19 @@ namespace Tpetra {
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-  CrsMatrix (const local_matrix_type& lclMatrix,
+  CrsMatrix (const local_matrix_device_type& lclMatrix,
              const Teuchos::RCP<const map_type>& rowMap,
              const Teuchos::RCP<const map_type>& colMap,
              const Teuchos::RCP<const map_type>& domainMap,
              const Teuchos::RCP<const map_type>& rangeMap,
              const Teuchos::RCP<Teuchos::ParameterList>& params) :
     dist_object_type (rowMap),
-    lclMatrix_ (std::make_shared<local_multiply_op_type>
-                (std::make_shared<local_matrix_type> (lclMatrix))),
-    k_values1D_ (lclMatrix.values),
     storageStatus_ (Details::STORAGE_1D_PACKED),
     fillComplete_ (true)
   {
     const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
       "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
-      "local_matrix_type[, RCP<ParameterList>]): ";
+      "local_matrix_device_type[, RCP<ParameterList>]): ";
     const char suffix[] =
       "  Please report this bug to the Tpetra developers.";
 
@@ -676,13 +680,13 @@ namespace Tpetra {
     catch (std::exception& e) {
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
-         "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_type[, "
+         "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
          "RCP<ParameterList>]) threw an exception: " << e.what ());
     }
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
       (! graph->isFillComplete (), std::logic_error, "CrsGraph "
        "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
-       "RCP<const Map>, local_graph_type[, RCP<ParameterList>]) did "
+       "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
        "not produce a fillComplete graph." << suffix);
     // myGraph_ not null means that the matrix owns the graph.  This
     // is true because the column indices come in as nonconst through
@@ -690,6 +694,10 @@ namespace Tpetra {
     myGraph_ = graph;
     staticGraph_ = graph;
 
+    valuesPacked_wdv = values_wdv_type(lclMatrix.values);
+    valuesUnpacked_wdv = valuesPacked_wdv;
+//    k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
+
     const bool callComputeGlobalConstants = params.get () == nullptr ||
       params->get ("compute global constants", true);
     if (callComputeGlobalConstants) {
@@ -709,7 +717,7 @@ namespace Tpetra {
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-  CrsMatrix (const local_matrix_type& lclMatrix,
+  CrsMatrix (const local_matrix_device_type& lclMatrix,
              const Teuchos::RCP<const map_type>& rowMap,
              const Teuchos::RCP<const map_type>& colMap,
              const Teuchos::RCP<const map_type>& domainMap,
@@ -718,9 +726,6 @@ namespace Tpetra {
              const Teuchos::RCP<const export_type>& exporter,
              const Teuchos::RCP<Teuchos::ParameterList>& params) :
     dist_object_type (rowMap),
-    lclMatrix_ (std::make_shared<local_multiply_op_type>
-                (std::make_shared<local_matrix_type> (lclMatrix))),
-    k_values1D_ (lclMatrix.values),
     storageStatus_ (Details::STORAGE_1D_PACKED),
     fillComplete_ (true)
   {
@@ -739,12 +744,12 @@ namespace Tpetra {
     catch (std::exception& e) {
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (true, std::runtime_error, "CrsGraph constructor "
-         "(local_graph_type, Map, Map, Map, Map, Import, Export, "
+         "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
          "params) threw: " << e.what ());
     }
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
       (!graph->isFillComplete (), std::logic_error, "CrsGraph "
-       "constructor (local_graph_type, Map, Map, Map, Map, Import, "
+       "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
        "Export, params) did not produce a fill-complete graph.  "
        "Please report this bug to the Tpetra developers.");
     // myGraph_ not null means that the matrix owns the graph.  This
@@ -753,6 +758,10 @@ namespace Tpetra {
     myGraph_ = graph;
     staticGraph_ = graph;
 
+    valuesPacked_wdv = values_wdv_type(lclMatrix.values);
+    valuesUnpacked_wdv = valuesPacked_wdv;
+//    k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
+
     const bool callComputeGlobalConstants = params.get () == nullptr ||
       params->get ("compute global constants", true);
     if (callComputeGlobalConstants) {
@@ -783,14 +792,16 @@ namespace Tpetra {
        "Source graph must be fillComplete().");
 
     if (copyOrView == Teuchos::Copy) {
-      using values_type = typename local_matrix_type::values_type;
+      using values_type = typename local_matrix_device_type::values_type;
       values_type vals = source.getLocalValuesView ();
       using Kokkos::view_alloc;
       using Kokkos::WithoutInitializing;
       values_type newvals (view_alloc ("val", WithoutInitializing),
                            vals.extent (0));
       Kokkos::deep_copy (newvals, vals);
-      k_values1D_ = newvals;
+      valuesPacked_wdv = values_wdv_type(newvals);
+      valuesUnpacked_wdv = valuesPacked_wdv;
+//      k_values1D_ = newvals;
       if (source.isFillComplete ()) {
         fillComplete (source.getDomainMap (), source.getRangeMap ());
       }
@@ -812,16 +823,16 @@ namespace Tpetra {
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   swap(CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> & crs_matrix)
   {
-    std::swap(crs_matrix.importMV_,      this->importMV_);        // mutable Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
-    std::swap(crs_matrix.exportMV_,      this->exportMV_);        // mutable Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
-    std::swap(crs_matrix.staticGraph_,   this->staticGraph_);     // Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>>
-    std::swap(crs_matrix.myGraph_,       this->myGraph_);         // Teuchos::RCP<      CrsGraph<LocalOrdinal, GlobalOrdinal, Node>>
-    std::swap(crs_matrix.lclMatrix_,     this->lclMatrix_);       // KokkosSparse::CrsMatrix<impl_scalar_type, LocalOrdinal, execution_space, void, typename local_graph_type::size_type>
-    std::swap(crs_matrix.k_values1D_,    this->k_values1D_);      // KokkosSparse::CrsMatrix<impl_scalar_type, LocalOrdinal, execution_space, void, typename local_graph_type::size_type>::values_type
-    std::swap(crs_matrix.storageStatus_, this->storageStatus_);   // ::Tpetra::Details::EStorageStatus (enum f/m Tpetra_CrsGraph_decl.hpp)
-    std::swap(crs_matrix.fillComplete_,  this->fillComplete_);    // bool
-    std::swap(crs_matrix.nonlocals_,     this->nonlocals_);       // std::map<GO, pair<Teuchos::Array<GO>,Teuchos::Array<Scalar>>
-    std::swap(crs_matrix.frobNorm_,      this->frobNorm_);        // mutable Kokkos::Details::ArithTraits<impl_scalar_type>::mag_type
+    std::swap(crs_matrix.importMV_,      this->importMV_);
+    std::swap(crs_matrix.exportMV_,      this->exportMV_);
+    std::swap(crs_matrix.staticGraph_,   this->staticGraph_);
+    std::swap(crs_matrix.myGraph_,       this->myGraph_);
+    std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
+    std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
+    std::swap(crs_matrix.storageStatus_, this->storageStatus_);
+    std::swap(crs_matrix.fillComplete_,  this->fillComplete_);
+    std::swap(crs_matrix.nonlocals_,     this->nonlocals_);
+    std::swap(crs_matrix.frobNorm_,      this->frobNorm_);
   }
 
   template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -1032,22 +1043,49 @@ namespace Tpetra {
     }
   }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
-  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_type
+  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   getLocalMatrix () const
   {
-    return lclMatrix_.get () == nullptr ?
-      local_matrix_type () :
-      lclMatrix_->getLocalMatrix ();
+    return getLocalMatrixDevice();
+  }
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
+  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  getLocalMatrixDevice () const
+  {
+    auto numCols = staticGraph_->getColMap()->getNodeNumElements();
+    return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
+                              numCols,
+                              valuesPacked_wdv.getDeviceView(Access::ReadWrite),
+                              staticGraph_->getLocalGraphDevice());
   }
 
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
+  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  getLocalMatrixHost () const
+  {
+    auto numCols = staticGraph_->getColMap()->getNodeNumElements();
+    return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
+                                valuesPacked_wdv.getHostView(Access::ReadWrite),
+                                staticGraph_->getLocalGraphHost());
+  }
+
+// KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   std::shared_ptr<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_multiply_op_type>
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   getLocalMultiplyOperator () const
   {
-    return lclMatrix_;
+    // A new operator wrapping a freshly built local matrix is created on every call.
+    return std::make_shared<local_multiply_op_type>(
+                           std::make_shared<local_matrix_device_type>(
+                                           getLocalMatrixDevice()));
   }
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -1171,28 +1209,30 @@ namespace Tpetra {
     }
 
     const size_t lclNumRows = this->staticGraph_->getNodeNumRows ();
-    typename Graph::local_graph_type::row_map_type k_ptrs =
-      this->staticGraph_->k_rowPtrs_;
+    typename Graph::local_graph_device_type::row_map_type k_ptrs =
+                                      this->staticGraph_->rowPtrsUnpacked_dev_;
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
       (k_ptrs.extent (0) != lclNumRows+1, std::logic_error,
       "With StaticProfile, row offsets array has length "
       << k_ptrs.extent (0) << " != (lclNumRows+1) = "
       << (lclNumRows+1) << ".");
 
-    const size_t lclTotalNumEntries =
-      ::Tpetra::Details::getEntryOnHost (k_ptrs, lclNumRows);
+    const size_t lclTotalNumEntries = 
+                 this->staticGraph_->rowPtrsUnpacked_host_(lclNumRows);
 
     // Allocate array of (packed???) matrix values.
-    using values_type = typename local_matrix_type::values_type;
+    using values_type = typename local_matrix_device_type::values_type;
     if (verbose) {
       std::ostringstream os;
-      os << *prefix << "Allocate k_values1D_: Pre "
-         << k_values1D_.extent(0) << ", post "
+      os << *prefix << "Allocate values_wdv: Pre "
+         << valuesUnpacked_wdv.extent(0) << ", post "
          << lclTotalNumEntries << endl;
       std::cerr << os.str();
     }
-    this->k_values1D_ =
-      values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
+//    this->k_values1D_ =
+    valuesUnpacked_wdv = values_wdv_type(
+                                    values_type("Tpetra::CrsMatrix::values",
+                                    lclTotalNumEntries));
   }
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -1231,10 +1271,27 @@ namespace Tpetra {
         << e.what ());
     }
     Teuchos::ArrayRCP<const impl_scalar_type> vals =
-      Kokkos::Compat::persistingView (k_values1D_);
+//      Kokkos::Compat::persistingView (k_values1D_);
+      Kokkos::Compat::persistingView (valuesUnpacked_wdv.getHostView(Access::ReadOnly));
     values = Teuchos::arcp_reinterpret_cast<const Scalar> (vals);
   }
 
+  // Nonconst overload: exposes the unpacked values as a writable, persisting
+  // host ArrayRCP, mirroring the const getAllValues; throws if the graph is null.
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  void
+  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  getAllValues(Teuchos::ArrayRCP<Scalar>& values) {
+    const char tfecfFuncName[] = "getAllValues: ";
+    Teuchos::RCP<const crs_graph_type> relevantGraph = getCrsGraph ();
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
+      relevantGraph.is_null (), std::runtime_error,
+      "Requires that getCrsGraph() is not null.");
+    Teuchos::ArrayRCP<impl_scalar_type> vals = Kokkos::Compat::persistingView (
+      valuesUnpacked_wdv.getHostView(Access::ReadWrite));
+    values = Teuchos::arcp_reinterpret_cast<Scalar> (vals);
+  }
+
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
@@ -1242,7 +1299,6 @@ namespace Tpetra {
   {
     using ::Tpetra::Details::computeOffsetsFromCounts;
     using ::Tpetra::Details::getEntryOnHost;
-    using Kokkos::create_mirror_view;
     using Teuchos::arcp_const_cast;
     using Teuchos::Array;
     using Teuchos::ArrayRCP;
@@ -1250,9 +1306,9 @@ namespace Tpetra {
     using Teuchos::RCP;
     using Teuchos::rcp;
     using std::endl;
-    using row_map_type = typename local_matrix_type::row_map_type;
-    using lclinds_1d_type = typename Graph::local_graph_type::entries_type::non_const_type;
-    using values_type = typename local_matrix_type::values_type;
+    using row_map_type = typename local_graph_device_type::row_map_type;
+    using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
+    using values_type = typename local_matrix_device_type::values_type;
     Details::ProfilingRegion regionFLGAM
       ("Tpetra::CrsGraph::fillLocalGraphAndMatrix");
 
@@ -1288,28 +1344,20 @@ namespace Tpetra {
     // sparse row format) that define the sparse graph's and matrix's
     // structure, and the sparse matrix's values.
     //
-    // Use the nonconst version of row_map_type for k_ptrs,
-    // because row_map_type is const and we need to modify k_ptrs here.
-    typename row_map_type::non_const_type k_ptrs;
-    row_map_type k_ptrs_const;
-    lclinds_1d_type k_inds;
-    values_type k_vals;
-
     // Get references to the data in myGraph_, so we can modify them
     // as well.  Note that we only call fillLocalGraphAndMatrix() if
     // the matrix owns the graph, which means myGraph_ is not null.
-    lclinds_1d_type k_lclInds1D_ = myGraph_->k_lclInds1D_;
 
     typedef decltype (myGraph_->k_numRowEntries_) row_entries_type;
 
     // StaticProfile means that the matrix's column indices and
     // values are currently stored in a 1-D format, with row offsets
-    // in k_rowPtrs_ and local column indices in k_lclInds1D_.
+    // in rowPtrsUnpacked_ and local column indices in lclIndsUnpacked_wdv.
 
     // StaticProfile also means that the graph's array of row
     // offsets must already be allocated.
-    typename Graph::local_graph_type::row_map_type curRowOffsets =
-      myGraph_->k_rowPtrs_;
+    typename Graph::local_graph_device_type::row_map_type curRowOffsets = 
+                                                   myGraph_->rowPtrsUnpacked_dev_;
 
     if (debug) {
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
@@ -1321,19 +1369,26 @@ namespace Tpetra {
          << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
          << (lclNumRows + 1) << ".");
       const size_t numOffsets = curRowOffsets.extent (0);
-      const auto valToCheck =
-        getEntryOnHost (curRowOffsets, numOffsets - 1);
+      const auto valToCheck = myGraph_->rowPtrsUnpacked_host_(numOffsets - 1);
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (numOffsets != 0 &&
-         myGraph_->k_lclInds1D_.extent (0) != valToCheck,
+         myGraph_->lclIndsUnpacked_wdv.extent (0) != valToCheck,
          std::logic_error, "(StaticProfile branch) numOffsets = " <<
-         numOffsets << " != 0 and myGraph_->k_lclInds1D_.extent(0) = "
-         << myGraph_->k_lclInds1D_.extent (0) << " != curRowOffsets("
+         numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = "
+         << myGraph_->lclIndsUnpacked_wdv.extent (0) << " != curRowOffsets("
          << numOffsets << ") = " << valToCheck << ".");
     }
 
     if (myGraph_->getNodeNumEntries() !=
         myGraph_->getNodeAllocationSize()) {
+
+      // Use the nonconst version of row_map_type for k_ptrs,
+      // because row_map_type is const and we need to modify k_ptrs here.
+      typename row_map_type::non_const_type k_ptrs;
+      row_map_type k_ptrs_const;
+      lclinds_1d_type k_inds;
+      values_type k_vals;
+
       if (verbose) {
         std::ostringstream os;
         const auto numEnt = myGraph_->getNodeNumEntries();
@@ -1351,23 +1406,22 @@ namespace Tpetra {
       if (debug && curRowOffsets.extent (0) != 0) {
         const size_t numOffsets =
           static_cast<size_t> (curRowOffsets.extent (0));
-        const auto valToCheck =
-          getEntryOnHost (curRowOffsets, numOffsets - 1);
+        const auto valToCheck = myGraph_->rowPtrsUnpacked_host_(numOffsets - 1);
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
           (static_cast<size_t> (valToCheck) !=
-           static_cast<size_t> (k_values1D_.extent (0)),
+           static_cast<size_t> (valuesUnpacked_wdv.extent (0)),
            std::logic_error, "(StaticProfile unpacked branch) Before "
            "allocating or packing, curRowOffsets(" << (numOffsets-1)
-           << ") = " << valToCheck << " != k_values1D_.extent(0)"
-           " = " << k_values1D_.extent (0) << ".");
+           << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
+           " = " << valuesUnpacked_wdv.extent (0) << ".");
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
           (static_cast<size_t> (valToCheck) !=
-           static_cast<size_t> (myGraph_->k_lclInds1D_.extent (0)),
+           static_cast<size_t> (myGraph_->lclIndsUnpacked_wdv.extent (0)),
            std::logic_error, "(StaticProfile unpacked branch) Before "
            "allocating or packing, curRowOffsets(" << (numOffsets-1)
            << ") = " << valToCheck
-           << " != myGraph_->k_lclInds1D_.extent(0) = "
-           << myGraph_->k_lclInds1D_.extent (0) << ".");
+           << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
+           << myGraph_->lclIndsUnpacked_wdv.extent (0) << ".");
       }
       // Pack the row offsets into k_ptrs, by doing a sum-scan of
       // the array of valid entry counts per row.
@@ -1376,8 +1430,6 @@ namespace Tpetra {
       // process.  We will compute this in the loop below.  It's
       // cheap to compute and useful as a sanity check.
       size_t lclTotalNumEntries = 0;
-      // This will be a host view of packed row offsets.
-      typename row_map_type::non_const_type::HostMirror h_ptrs;
       {
         // Allocate the packed row offsets array.  We use a nonconst
         // temporary (packedRowOffsets) here, because k_ptrs is
@@ -1425,41 +1477,51 @@ namespace Tpetra {
            << lclTotalNumEntries << endl;
         std::cerr << os.str ();
       }
-      k_inds = lclinds_1d_type ("Tpetra::CrsGraph::ind", lclTotalNumEntries);
+      k_inds = lclinds_1d_type ("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
       if (verbose) {
         std::ostringstream os;
         os << *prefix << "Allocate packed values: "
            << lclTotalNumEntries << endl;
         std::cerr << os.str ();
       }
-      k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
+      k_vals = values_type ("Tpetra::CrsMatrix::values", lclTotalNumEntries);
 
-      // curRowOffsets (myGraph_->k_rowPtrs_) (???), k_lclInds1D_,
-      // and k_values1D_ are currently unpacked.  Pack them, using
+      // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
+      // and valuesUnpacked_wdv are currently unpacked.  Pack them, using
       // the packed row offsets array k_ptrs that we created above.
       //
       // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
       // need to keep around the unpacked row offsets, column
       // indices, and values arrays.
 
-      // Pack the column indices from unpacked k_lclInds1D_ into
-      // packed k_inds.  We will replace k_lclInds1D_ below.
+      // Pack the column indices from unpacked lclIndsUnpacked_wdv into
+      // packed k_inds.  We will replace lclIndsUnpacked_wdv below.
       using inds_packer_type = pack_functor<
-        typename Graph::local_graph_type::entries_type::non_const_type,
-        typename Graph::local_graph_type::row_map_type>;
-      inds_packer_type indsPacker (k_inds, myGraph_->k_lclInds1D_,
-                                   k_ptrs, curRowOffsets);
+        typename Graph::local_graph_device_type::entries_type::non_const_type,
+        typename Graph::local_inds_dualv_type::t_dev::const_type,
+        typename Graph::local_graph_device_type::row_map_type::non_const_type,
+        typename Graph::local_graph_device_type::row_map_type>;
+      inds_packer_type indsPacker (
+                  k_inds,
+                  myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
+                  k_ptrs, curRowOffsets);
       using exec_space = typename decltype (k_inds)::execution_space;
       using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
       Kokkos::parallel_for
         ("Tpetra::CrsMatrix pack column indices",
          range_type (0, lclNumRows), indsPacker);
 
-      // Pack the values from unpacked k_values1D_ into packed
-      // k_vals.  We will replace k_values1D_ below.
-      using vals_packer_type = pack_functor<values_type, row_map_type>;
-      vals_packer_type valsPacker (k_vals, this->k_values1D_,
-                                   k_ptrs, curRowOffsets);
+      // Pack the values from unpacked valuesUnpacked_wdv into packed
+      // k_vals.  We will replace valuesPacked_wdv below.
+      using vals_packer_type = pack_functor<
+        typename values_type::non_const_type,
+        typename values_type::const_type, 
+        typename row_map_type::non_const_type, 
+        typename row_map_type::const_type>;
+      vals_packer_type valsPacker (
+                       k_vals,
+                       this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
+                       k_ptrs, curRowOffsets);
       Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
                             range_type (0, lclNumRows), valsPacker);
 
@@ -1469,7 +1531,7 @@ namespace Tpetra {
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
           (k_ptrs.extent (0) == 0, std::logic_error, myPrefix
            << "k_ptrs.extent(0) = 0.  This probably means that "
-           "k_rowPtrs_ was never allocated.");
+           "rowPtrsUnpacked_ was never allocated.");
         if (k_ptrs.extent (0) != 0) {
           const size_t numOffsets (k_ptrs.extent (0));
           const auto valToCheck =
@@ -1486,41 +1548,48 @@ namespace Tpetra {
              " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
         }
       }
+      // Build the local graph.
+      myGraph_->setRowPtrsPacked(k_ptrs_const);
+      myGraph_->lclIndsPacked_wdv = 
+                typename crs_graph_type::local_inds_wdv_type(k_inds);
+      valuesPacked_wdv = values_wdv_type(k_vals);
     }
     else { // We don't have to pack, so just set the pointers.
+      myGraph_->setRowPtrsPacked(myGraph_->rowPtrsUnpacked_dev_);
+      myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
+      valuesPacked_wdv = valuesUnpacked_wdv;
+
       if (verbose) {
         std::ostringstream os;
-        os << *prefix << "Storage already packed: k_rowPtrs_: "
-           << myGraph_->k_rowPtrs_.extent(0) << ", k_lclInds1D_: "
-           << myGraph_->k_lclInds1D_.extent(0) << ", k_values1D_: "
-           << k_values1D_.extent(0) << endl;
+        os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
+           << myGraph_->rowPtrsUnpacked_host_.extent(0) << ", lclIndsUnpacked_wdv: "
+           << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
+           << valuesUnpacked_wdv.extent(0) << endl;
         std::cerr << os.str();
       }
-      k_ptrs_const = myGraph_->k_rowPtrs_;
-      k_inds = myGraph_->k_lclInds1D_;
-      k_vals = this->k_values1D_;
 
       if (debug) {
         const char myPrefix[] =
           "(StaticProfile \"Optimize Storage\"=false branch) ";
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (k_ptrs_const.extent (0) == 0, std::logic_error, myPrefix
-           << "k_ptrs_const.extent(0) = 0.  This probably means "
-           "that k_rowPtrs_ was never allocated.");
-        if (k_ptrs_const.extent (0) != 0) {
-          const size_t numOffsets (k_ptrs_const.extent (0));
-          const auto valToCheck =
-            getEntryOnHost (k_ptrs_const, numOffsets - 1);
+          (myGraph_->rowPtrsUnpacked_dev_.extent (0) == 0, std::logic_error, myPrefix
+           << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0.  This probably means "
+           "that rowPtrsUnpacked_ was never allocated.");
+        if (myGraph_->rowPtrsUnpacked_dev_.extent (0) != 0) {
+          const size_t numOffsets (myGraph_->rowPtrsUnpacked_host_.extent (0));
+          const auto valToCheck = myGraph_->rowPtrsUnpacked_host_(numOffsets - 1);
           TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-            (size_t (valToCheck) != k_vals.extent (0),
+            (size_t (valToCheck) != valuesPacked_wdv.extent (0),
              std::logic_error, myPrefix <<
              "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
-             << " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
+             << " != valuesPacked_wdv.extent(0) = " 
+             << valuesPacked_wdv.extent (0) << ".");
           TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-            (size_t (valToCheck) != k_inds.extent (0),
+            (size_t (valToCheck) != myGraph_->lclIndsPacked_wdv.extent (0),
              std::logic_error, myPrefix <<
              "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
-             << " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
+             << " != myGraph_->lclIndsPacked.extent(0) = " 
+             << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
         }
       }
     }
@@ -1528,24 +1597,25 @@ namespace Tpetra {
     if (debug) {
       const char myPrefix[] = "After packing, ";
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-        (size_t (k_ptrs_const.extent (0)) != size_t (lclNumRows + 1),
-         std::logic_error, myPrefix << "k_ptrs_const.extent(0) = "
-         << k_ptrs_const.extent (0) << " != lclNumRows+1 = " <<
+        (size_t (myGraph_->rowPtrsPacked_dev_.extent (0)) != size_t (lclNumRows + 1),
+         std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_dev_.extent(0) = "
+         << myGraph_->rowPtrsPacked_dev_.extent (0) << " != lclNumRows+1 = " <<
          (lclNumRows+1) << ".");
-      if (k_ptrs_const.extent (0) != 0) {
-        const size_t numOffsets (k_ptrs_const.extent (0));
-        const size_t k_ptrs_const_numOffsetsMinus1 =
-          getEntryOnHost (k_ptrs_const, numOffsets - 1);
+      if (myGraph_->rowPtrsPacked_dev_.extent (0) != 0) { // validate the PACKED offsets set via setRowPtrsPacked
+        const size_t numOffsets (myGraph_->rowPtrsPacked_dev_.extent (0));
+        const size_t valToCheck = getEntryOnHost (myGraph_->rowPtrsPacked_dev_, numOffsets-1);
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (k_ptrs_const_numOffsetsMinus1 != size_t (k_vals.extent (0)),
+          (valToCheck != size_t (valuesPacked_wdv.extent (0)),
            std::logic_error, myPrefix << "k_ptrs_const(" <<
-           (numOffsets-1) << ") = " << k_ptrs_const_numOffsetsMinus1
-           << " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
+           (numOffsets-1) << ") = " << valToCheck
+           << " != valuesPacked_wdv.extent(0) = "
+           << valuesPacked_wdv.extent (0) << ".");
         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (k_ptrs_const_numOffsetsMinus1 != size_t (k_inds.extent (0)),
+          (valToCheck != size_t (myGraph_->lclIndsPacked_wdv.extent (0)),
            std::logic_error, myPrefix << "k_ptrs_const(" <<
-           (numOffsets-1) << ") = " << k_ptrs_const_numOffsetsMinus1
-           << " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
+           (numOffsets-1) << ") = " << valToCheck
+           << " != myGraph_->lclIndsPacked_wdv.extent(0) = "
+           << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
       }
     }
 
@@ -1577,33 +1647,14 @@ namespace Tpetra {
            << myGraph_->k_numRowEntries_.extent(0) << endl;
         std::cerr << os.str();
       }
+
       myGraph_->k_numRowEntries_ = row_entries_type ();
 
       // Keep the new 1-D packed allocations.
-      if (verbose) {
-        std::ostringstream os;
-        os << *prefix << "Assign k_rowPtrs_: old="
-           << myGraph_->k_rowPtrs_.extent(0) << ", new="
-           << k_ptrs_const.extent(0) << endl;
-        std::cerr << os.str();
-      }
-      myGraph_->k_rowPtrs_ = k_ptrs_const;
-      if (verbose) {
-        std::ostringstream os;
-        os << *prefix << "Assign k_lclInds1D_: old="
-           << myGraph_->k_lclInds1D_.extent(0) << ", new="
-           << k_inds.extent(0) << endl;
-        std::cerr << os.str();
-      }
-      myGraph_->k_lclInds1D_ = k_inds;
-      if (verbose) {
-        std::ostringstream os;
-        os << *prefix << "Assign k_values1D_: old="
-           << k_values1D_.extent(0) << ", new="
-           << k_vals.extent(0) << endl;
-        std::cerr << os.str();
-      }
-      this->k_values1D_ = k_vals;
+      myGraph_->setRowPtrsUnpacked(myGraph_->rowPtrsPacked_dev_);
+      myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
+      valuesUnpacked_wdv = valuesPacked_wdv;
+//      k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
 
       myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
       this->storageStatus_ = Details::STORAGE_1D_PACKED;
@@ -1611,28 +1662,11 @@ namespace Tpetra {
     else {
       if (verbose) {
         std::ostringstream os;
-        os << *prefix << "User requestetd NOT to optimize storage"
+        os << *prefix << "User requested NOT to optimize storage"
            << endl;
         std::cerr << os.str();
       }
     }
-
-    // Make the local graph, using the arrays of row offsets and
-    // column indices that we built above.  The local graph should be
-    // null, but we delete it first so that any memory can be freed
-    // before we allocate the new one.
-    //
-    // FIXME (mfh 06,28 Aug 2014) It would make more sense for
-    // Tpetra::CrsGraph to have a protected method that accepts k_inds
-    // and k_ptrs, and creates the local graph lclGraph_.
-    myGraph_->lclGraph_ =
-      typename Graph::local_graph_type (k_inds, k_ptrs_const);
-
-    // Make the local matrix, using the local graph and vals array.
-    auto lclMat = std::make_shared<local_matrix_type>
-      ("Tpetra::CrsMatrix::lclMatrix_", getNodeNumCols (),
-       k_vals, myGraph_->lclGraph_);
-    lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
   }
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -1641,16 +1675,15 @@ namespace Tpetra {
   fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
   {
     using ::Tpetra::Details::ProfilingRegion;
-    using Kokkos::create_mirror_view;
     using Teuchos::ArrayRCP;
     using Teuchos::Array;
     using Teuchos::null;
     using Teuchos::RCP;
     using Teuchos::rcp;
     using std::endl;
-    using row_map_type = typename Graph::local_graph_type::row_map_type;
+    using row_map_type = typename Graph::local_graph_device_type::row_map_type;
     using non_const_row_map_type = typename row_map_type::non_const_type;
-    using values_type = typename local_matrix_type::values_type;
+    using values_type = typename local_matrix_device_type::values_type;
     ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
     const size_t lclNumRows = getNodeNumRows();
 
@@ -1674,7 +1707,7 @@ namespace Tpetra {
     // get data from staticGraph_
     size_t nodeNumEntries   = staticGraph_->getNodeNumEntries ();
     size_t nodeNumAllocated = staticGraph_->getNodeAllocationSize ();
-    row_map_type k_rowPtrs_ = staticGraph_->lclGraph_.row_map;
+    row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_; 
 
     row_map_type k_ptrs; // "packed" row offsets array
     values_type k_vals; // "packed" values array
@@ -1749,11 +1782,11 @@ namespace Tpetra {
       size_t lclTotalNumEntries = 0;
       k_ptrs = tmpk_ptrs;
       {
-        typename row_entries_type::const_type numRowEnt_d =
+        typename row_entries_type::const_type numRowEnt_h =
           staticGraph_->k_numRowEntries_;
         // This function can handle the counts being a host View.
         lclTotalNumEntries =
-          Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_d);
+          Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_h);
       }
 
       // Allocate the "packed" values array.
@@ -1766,42 +1799,39 @@ namespace Tpetra {
       }
       k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
 
-      // Pack k_values1D_ into k_vals.  We will replace k_values1D_ below.
-      pack_functor<values_type, row_map_type> valsPacker
-        (k_vals, k_values1D_, tmpk_ptrs, k_rowPtrs_);
+      // Pack valuesUnpacked_wdv into k_vals.  We may replace valuesUnpacked_wdv below.
+      pack_functor<
+        typename values_type::non_const_type,
+        typename values_type::const_type, 
+        typename row_map_type::non_const_type, 
+        typename row_map_type::const_type> valsPacker
+        (k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
+         tmpk_ptrs, k_rowPtrs);
 
       using exec_space = typename decltype (k_vals)::execution_space;
       using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
       Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
                             range_type (0, lclNumRows), valsPacker);
+      valuesPacked_wdv = values_wdv_type(k_vals);
     }
     else { // We don't have to pack, so just set the pointer.
+      valuesPacked_wdv = valuesUnpacked_wdv;
       if (verbose) {
         std::ostringstream os;
         os << *prefix << "Storage already packed: "
-           << "k_values1D_: " << k_values1D_.extent(0) << endl;
+           << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
         std::cerr << os.str();
       }
-      k_vals = k_values1D_;
     }
 
     // May we ditch the old allocations for the packed one?
     if (requestOptimizedStorage) {
       // The user requested optimized storage, so we can dump the
       // unpacked 1-D storage, and keep the packed storage.
-      k_values1D_ = k_vals;
+      valuesUnpacked_wdv = valuesPacked_wdv;
+//      k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
       this->storageStatus_ = Details::STORAGE_1D_PACKED;
     }
-
-    // Build the local sparse matrix object.  At this point, the local
-    // matrix certainly has a column Map.  Remember that the local
-    // matrix's number of columns comes from the column Map, not the
-    // domain Map.
-    auto lclMat = std::make_shared<local_matrix_type>
-      ("Tpetra::CrsMatrix::lclMatrix_",
-       getColMap ()->getNodeNumElements (),
-       k_vals, staticGraph_->getLocalGraph ());
-    lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
   }
 
   template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -1906,7 +1936,7 @@ namespace Tpetra {
 
     RowInfo rowInfo = graph.getRowInfo (lclRow);
 
-    Teuchos::ArrayView<IST> valsView = this->getViewNonConst(rowInfo);
+    auto valsView = this->getValuesViewHostNonConst(rowInfo);
     auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
                  valsView[offset] += values[k]; };
     std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
@@ -1955,8 +1985,8 @@ namespace Tpetra {
       rowInfo = graph.getRowInfo (rowInfo.localRow);
     }
 
-    Teuchos::ArrayView<IST> valsView = this->getViewNonConst(rowInfo);
-    auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
+    auto valsView = this->getValuesViewHostNonConst(rowInfo);
+    auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset){
                  valsView[offset] += vals[k];
                  };
     std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
@@ -1995,13 +2025,15 @@ namespace Tpetra {
       os << "]" << std::endl;
 
       if (this->supportsRowViews ()) {
-        Teuchos::ArrayView<const Scalar> vals2;
+        values_host_view_type vals2;
         if (this->isGloballyIndexed ()) {
-          Teuchos::ArrayView<const GlobalOrdinal> gblColInds2;
+          global_inds_host_view_type gblColInds2;
           const GlobalOrdinal gblRow =
             graph.rowMap_->getGlobalElement (rowInfo.localRow);
-          if (gblRow == Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
-            os << "Local row index " << rowInfo.localRow << " is invalid!" << std::endl;
+          if (gblRow == 
+              Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
+            os << "Local row index " << rowInfo.localRow << " is invalid!" 
+               << std::endl;
           }
           else {
             bool getViewThrew = false;
@@ -2014,18 +2046,28 @@ namespace Tpetra {
                  << e.what () << std::endl;
             }
             if (! getViewThrew) {
-              os << "\tNew global column indices: "
-                 << Teuchos::toString (gblColInds2) << std::endl
-                 << "\tNew values: " << Teuchos::toString (vals2) << std::endl;
+              os << "\tNew global column indices: ";
+              for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
+                 os << gblColInds2[jjj] << " ";
+              os << std::endl;
+              os << "\tNew values: "; 
+              for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
+                 os << vals2[jjj] << " ";
+              os << std::endl;
             }
           }
         }
         else if (this->isLocallyIndexed ()) {
-          Teuchos::ArrayView<const LocalOrdinal> lclColInds2;
+          local_inds_host_view_type lclColInds2;
           this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
-          os << "\tNew local column indices: " << Teuchos::toString (lclColInds2)
-             << std::endl;
-          os << "\tNew values: " << Teuchos::toString (vals2) << std::endl;
+          os << "\tNew local column indices: ";
+          for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
+             os << lclColInds2[jjj] << " ";
+          os << std::endl;
+          os << "\tNew values: "; 
+          for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
+             os << vals2[jjj] << " ";
+          os << std::endl;
         }
       }
 
@@ -2348,7 +2390,7 @@ namespace Tpetra {
                           const RowInfo& rowInfo,
                           const LocalOrdinal inds[],
                           const impl_scalar_type newVals[],
-                          const LocalOrdinal numElts) const
+                          const LocalOrdinal numElts) 
   {
     typedef LocalOrdinal LO;
     typedef GlobalOrdinal GO;
@@ -2357,14 +2399,10 @@ namespace Tpetra {
     size_t hint = 0; // Guess for the current index k into rowVals
     LO numValid = 0; // number of valid local column indices
 
-    // NOTE (mfh 11 Oct 2015) This method assumes UVM.  More
-    // accurately, it assumes that the host execution space can
-    // access data in both InputMemorySpace and ValsMemorySpace.
-
     if (graph.isLocallyIndexed ()) {
       // Get a view of the column indices in the row.  This amortizes
       // the cost of getting the view over all the entries of inds.
-      auto colInds = graph.getLocalKokkosRowView (rowInfo);
+      auto colInds = graph.getLocalIndsViewHost (rowInfo);
 
       for (LO j = 0; j < numElts; ++j) {
         const LO lclColInd = inds[j];
@@ -2386,7 +2424,7 @@ namespace Tpetra {
 
       // Get a view of the column indices in the row.  This amortizes
       // the cost of getting the view over all the entries of inds.
-      auto colInds = graph.getGlobalKokkosRowView (rowInfo);
+      auto colInds = graph.getGlobalIndsViewHost (rowInfo);
 
       for (LO j = 0; j < numElts; ++j) {
         const GO gblColInd = colMap.getGlobalElement (inds[j]);
@@ -2419,7 +2457,7 @@ namespace Tpetra {
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   replaceLocalValues (const LocalOrdinal localRow,
                       const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
-                      const Teuchos::ArrayView<const Scalar>& vals) const
+                      const Teuchos::ArrayView<const Scalar>& vals)
   {
     typedef LocalOrdinal LO;
 
@@ -2440,7 +2478,7 @@ namespace Tpetra {
   replaceLocalValues(
     const local_ordinal_type localRow,
     const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
-    const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals) const
+    const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
   {
     using LO = local_ordinal_type;
     const LO numInputEnt = inputInds.extent(0);
@@ -2459,7 +2497,7 @@ namespace Tpetra {
   replaceLocalValues (const LocalOrdinal localRow,
                       const LocalOrdinal numEnt,
                       const Scalar inputVals[],
-                      const LocalOrdinal inputCols[]) const
+                      const LocalOrdinal inputCols[])
   {
     typedef impl_scalar_type IST;
     typedef LocalOrdinal LO;
@@ -2476,7 +2514,7 @@ namespace Tpetra {
       // allowed to modify its values.
       return static_cast<LO> (0);
     }
-    auto curRowVals = this->getRowViewNonConst (rowInfo);
+    auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
     const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
     return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
                                          inputCols, inVals, numEnt);
@@ -2490,7 +2528,7 @@ namespace Tpetra {
                            const RowInfo& rowInfo,
                            const GlobalOrdinal inds[],
                            const impl_scalar_type newVals[],
-                           const LocalOrdinal numElts) const
+                           const LocalOrdinal numElts)
   {
     Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
     auto fun =
@@ -2506,7 +2544,7 @@ namespace Tpetra {
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   replaceGlobalValues (const GlobalOrdinal globalRow,
                        const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
-                       const Teuchos::ArrayView<const Scalar>& inputVals) const
+                       const Teuchos::ArrayView<const Scalar>& inputVals)
   {
     typedef LocalOrdinal LO;
 
@@ -2525,7 +2563,7 @@ namespace Tpetra {
   replaceGlobalValues (const GlobalOrdinal globalRow,
                        const LocalOrdinal numEnt,
                        const Scalar inputVals[],
-                       const GlobalOrdinal inputGblColInds[]) const
+                       const GlobalOrdinal inputGblColInds[])
   {
     typedef impl_scalar_type IST;
     typedef LocalOrdinal LO;
@@ -2543,7 +2581,7 @@ namespace Tpetra {
       return static_cast<LO> (0);
     }
 
-    auto curRowVals = this->getRowViewNonConst (rowInfo);
+    auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
     const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
     return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
                                           inputGblColInds, inVals, numEnt);
@@ -2556,7 +2594,7 @@ namespace Tpetra {
   replaceGlobalValues(
     const global_ordinal_type globalRow,
     const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
-    const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals) const
+    const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
   {
     // We use static_assert here to check the template parameters,
     // rather than std::enable_if (e.g., on the return value, to
@@ -2584,7 +2622,7 @@ namespace Tpetra {
                            const GlobalOrdinal inds[],
                            const impl_scalar_type newVals[],
                            const LocalOrdinal numElts,
-                           const bool atomic) const
+                           const bool atomic)
   {
     typedef LocalOrdinal LO;
     typedef GlobalOrdinal GO;
@@ -2594,10 +2632,6 @@ namespace Tpetra {
     size_t hint = 0; // guess at the index's relative offset in the row
     LO numValid = 0; // number of valid input column indices
 
-    // NOTE (mfh 11 Oct 2015) This method assumes UVM.  More
-    // accurately, it assumes that the host execution space can
-    // access data in both InputMemorySpace and ValsMemorySpace.
-
     if (graph.isLocallyIndexed ()) {
       // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
       // pointer does NOT change its reference count.  Thus, this
@@ -2612,7 +2646,7 @@ namespace Tpetra {
 
       // Get a view of the column indices in the row.  This amortizes
       // the cost of getting the view over all the entries of inds.
-      auto colInds = graph.getLocalKokkosRowView (rowInfo);
+      auto colInds = graph.getLocalIndsViewHost (rowInfo);
       const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
 
       for (LO j = 0; j < numElts; ++j) {
@@ -2637,7 +2671,7 @@ namespace Tpetra {
     else if (graph.isGloballyIndexed ()) {
       // Get a view of the column indices in the row.  This amortizes
       // the cost of getting the view over all the entries of inds.
-      auto colInds = graph.getGlobalKokkosRowView (rowInfo);
+      auto colInds = graph.getGlobalIndsViewHost (rowInfo);
 
       for (LO j = 0; j < numElts; ++j) {
         const GO gblColInd = inds[j];
@@ -2730,7 +2764,7 @@ namespace Tpetra {
       return numInputEnt;
     }
     else { // input row is in the row Map on the calling process
-      auto curRowVals = this->getRowViewNonConst (rowInfo);
+      auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
       const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
       return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
                                             inputGblColInds, inVals,
@@ -2746,7 +2780,7 @@ namespace Tpetra {
                         const impl_scalar_type inputVals[],
                         const LocalOrdinal inputCols[],
                         std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
-                        const bool atomic) const
+                        const bool atomic)
   {
     using Tpetra::Details::OrdinalTraits;
     typedef LocalOrdinal LO;
@@ -2763,7 +2797,7 @@ namespace Tpetra {
       // allowed to modify its values.
       return static_cast<LO> (0);
     }
-    auto curRowVals = this->getRowViewNonConst (rowInfo);
+    auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
     return this->transformLocalValues (curRowVals.data (), graph,
                                        rowInfo, inputCols, inputVals,
                                        numInputEnt, f, atomic);
@@ -2777,7 +2811,7 @@ namespace Tpetra {
                          const impl_scalar_type inputVals[],
                          const GlobalOrdinal inputCols[],
                          std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
-                         const bool atomic) const
+                         const bool atomic)
   {
     using Tpetra::Details::OrdinalTraits;
     typedef LocalOrdinal LO;
@@ -2794,7 +2828,7 @@ namespace Tpetra {
       // allowed to modify its values.
       return static_cast<LO> (0);
     }
-    auto curRowVals = this->getRowViewNonConst (rowInfo);
+    auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
     return this->transformGlobalValues (curRowVals.data (), graph,
                                         rowInfo, inputCols, inputVals,
                                         numInputEnt, f, atomic);
@@ -2810,7 +2844,7 @@ namespace Tpetra {
                         const impl_scalar_type newVals[],
                         const LocalOrdinal numElts,
                         std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
-                        const bool atomic) const
+                        const bool atomic)
   {
     typedef impl_scalar_type ST;
     typedef LocalOrdinal LO;
@@ -2829,7 +2863,7 @@ namespace Tpetra {
     if (graph.isLocallyIndexed ()) {
       // Get a view of the column indices in the row.  This amortizes
       // the cost of getting the view over all the entries of inds.
-      auto colInds = graph.getLocalKokkosRowView (rowInfo);
+      auto colInds = graph.getLocalIndsViewHost (rowInfo);
 
       for (LO j = 0; j < numElts; ++j) {
         const LO lclColInd = inds[j];
@@ -2871,7 +2905,7 @@ namespace Tpetra {
       const map_type& colMap = * (graph.colMap_);
       // Get a view of the column indices in the row.  This amortizes
       // the cost of getting the view over all the entries of inds.
-      auto colInds = graph.getGlobalKokkosRowView (rowInfo);
+      auto colInds = graph.getGlobalIndsViewHost (rowInfo);
 
       const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
       for (LO j = 0; j < numElts; ++j) {
@@ -2919,7 +2953,7 @@ namespace Tpetra {
                          const impl_scalar_type newVals[],
                          const LocalOrdinal numElts,
                          std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
-                         const bool atomic) const
+                         const bool atomic)
   {
     typedef impl_scalar_type ST;
     typedef LocalOrdinal LO;
@@ -2938,7 +2972,7 @@ namespace Tpetra {
     if (graph.isGloballyIndexed ()) {
       // Get a view of the column indices in the row.  This amortizes
       // the cost of getting the view over all the entries of inds.
-      auto colInds = graph.getGlobalKokkosRowView (rowInfo);
+      auto colInds = graph.getGlobalIndsViewHost (rowInfo);
 
       for (LO j = 0; j < numElts; ++j) {
         const GO gblColInd = inds[j];
@@ -2979,7 +3013,7 @@ namespace Tpetra {
       const map_type& colMap = * (graph.colMap_);
       // Get a view of the column indices in the row.  This amortizes
       // the cost of getting the view over all the entries of inds.
-      auto colInds = graph.getLocalKokkosRowView (rowInfo);
+      auto colInds = graph.getLocalIndsViewHost (rowInfo);
 
       const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
       for (LO j = 0; j < numElts; ++j) {
@@ -3026,7 +3060,7 @@ namespace Tpetra {
                           const LocalOrdinal inds[],
                           const impl_scalar_type newVals[],
                           const LocalOrdinal numElts,
-                          const bool atomic) const
+                          const bool atomic)
   {
     typedef LocalOrdinal LO;
     typedef GlobalOrdinal GO;
@@ -3036,14 +3070,10 @@ namespace Tpetra {
     size_t hint = 0; // Guess for the current index k into rowVals
     LO numValid = 0; // number of valid local column indices
 
-    // NOTE (mfh 11 Oct 2015) This method assumes UVM.  More
-    // accurately, it assumes that the host execution space can
-    // access data in both InputMemorySpace and ValsMemorySpace.
-
     if (graph.isLocallyIndexed ()) {
       // Get a view of the column indices in the row.  This amortizes
       // the cost of getting the view over all the entries of inds.
-      auto colInds = graph.getLocalKokkosRowView (rowInfo);
+      auto colInds = graph.getLocalIndsViewHost (rowInfo);
 
       for (LO j = 0; j < numElts; ++j) {
         const LO lclColInd = inds[j];
@@ -3070,7 +3100,7 @@ namespace Tpetra {
 
       // Get a view of the column indices in the row.  This amortizes
       // the cost of getting the view over all the entries of inds.
-      auto colInds = graph.getGlobalKokkosRowView (rowInfo);
+      auto colInds = graph.getGlobalIndsViewHost (rowInfo);
 
       for (LO j = 0; j < numElts; ++j) {
         const GO gblColInd = colMap.getGlobalElement (inds[j]);
@@ -3109,7 +3139,7 @@ namespace Tpetra {
   sumIntoLocalValues (const LocalOrdinal localRow,
                       const Teuchos::ArrayView<const LocalOrdinal>& indices,
                       const Teuchos::ArrayView<const Scalar>& values,
-                      const bool atomic) const
+                      const bool atomic)
   {
     using LO = local_ordinal_type;
     const LO numInputEnt = static_cast<LO>(indices.size());
@@ -3130,7 +3160,7 @@ namespace Tpetra {
     const local_ordinal_type localRow,
     const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
     const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
-    const bool atomic) const
+    const bool atomic)
   {
     using LO = local_ordinal_type;
     const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
@@ -3150,7 +3180,7 @@ namespace Tpetra {
                       const LocalOrdinal numEnt,
                       const Scalar vals[],
                       const LocalOrdinal cols[],
-                      const bool atomic) const
+                      const bool atomic)
   {
     typedef impl_scalar_type IST;
     typedef LocalOrdinal LO;
@@ -3167,12 +3197,69 @@ namespace Tpetra {
       // allowed to modify its values.
       return static_cast<LO> (0);
     }
-    auto curRowVals = this->getRowViewNonConst (rowInfo);
+    auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
     const IST* const inputVals = reinterpret_cast<const IST*> (vals);
     return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
                                          cols, inputVals, numEnt, atomic);
   }
 
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+                    values_dualv_type::t_host::const_type
+  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  getValuesViewHost (const RowInfo& rowinfo) const
+  { // Read-only host view of one row's slots in the unpacked values.
+    if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
+      return typename values_dualv_type::t_host::const_type (); // empty
+    else // row has slots: request allocSize entries starting at offset1D
+      return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
+                                               rowinfo.allocSize,
+                                               Access::ReadOnly);
+  }
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+                    values_dualv_type::t_host
+  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  getValuesViewHostNonConst (const RowInfo& rowinfo)
+  { // Writable host view of one row's slots in the unpacked values.
+    if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
+      return typename values_dualv_type::t_host (); // empty
+    else // row has slots: request allocSize entries starting at offset1D
+      return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
+                                               rowinfo.allocSize,
+                                               Access::ReadWrite);
+  }
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+                    values_dualv_type::t_dev::const_type
+  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  getValuesViewDevice (const RowInfo& rowinfo) const
+  { // Read-only device view of one row's slots in the unpacked values.
+    if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
+      return typename values_dualv_type::t_dev::const_type (); // empty
+    else // row has slots: request allocSize entries starting at offset1D
+      return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
+                                                 rowinfo.allocSize,
+                                                 Access::ReadOnly);
+  }
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+                    values_dualv_type::t_dev
+  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  getValuesViewDeviceNonConst (const RowInfo& rowinfo)
+  { // Writable device view of one row's slots in the unpacked values.
+    if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
+      return typename values_dualv_type::t_dev (); // empty
+    else // row has slots: request allocSize entries starting at offset1D
+      return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
+                                                 rowinfo.allocSize,
+                                                 Access::ReadWrite);
+  }
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   Teuchos::ArrayView<const typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
@@ -3184,15 +3271,18 @@ namespace Tpetra {
     using ST = impl_scalar_type;
     using range_type = std::pair<size_t, size_t>;
 
-    if (k_values1D_.extent (0) != 0 && rowinfo.allocSize > 0) {
+    if (valuesUnpacked_wdv.extent (0) != 0 && rowinfo.allocSize > 0) {
+
 #ifdef HAVE_TPETRA_DEBUG
       TEUCHOS_TEST_FOR_EXCEPTION(
-        rowinfo.offset1D + rowinfo.allocSize > k_values1D_.extent (0),
+        rowinfo.offset1D + rowinfo.allocSize > valuesUnpacked_wdv.extent (0),
         std::range_error, "Tpetra::CrsMatrix::getView: Invalid access "
         "to 1-D storage of values." << std::endl << "rowinfo.offset1D (" <<
         rowinfo.offset1D << ") + rowinfo.allocSize (" << rowinfo.allocSize <<
-        ") > k_values1D_.extent(0) (" << k_values1D_.extent (0) << ").");
+        ") > valuesUnpacked_wdv.extent(0) (" << valuesUnpacked_wdv.extent (0)
+        << ").");
 #endif // HAVE_TPETRA_DEBUG
+
       range_type range (rowinfo.offset1D, rowinfo.offset1D + rowinfo.allocSize);
       typedef View<const ST*, device_type, MemoryUnmanaged> subview_type;
       // mfh 23 Nov 2015: Don't just create a subview of k_values1D_
@@ -3201,7 +3291,10 @@ namespace Tpetra {
       // reference count, which costs performance in a measurable way.
       // Instead, we create a temporary unmanaged view, then create
       // the subview from that.
-      subview_type sv = Kokkos::subview (subview_type (k_values1D_), range);
+      // KDDKDD UVM REMOVAL Unsafe and deprecated: returns a raw host pointer.
+      auto sv = valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
+                                                  rowinfo.allocSize,
+                                                  Access::ReadOnly);
       const ST* const sv_raw = (rowinfo.allocSize == 0) ? nullptr : sv.data ();
       return ArrayView<const ST> (sv_raw, rowinfo.allocSize);
     }
@@ -3209,135 +3302,63 @@ namespace Tpetra {
       return ArrayView<impl_scalar_type> ();
     }
   }
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
 
 
   template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
-  LocalOrdinal
+  void
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-  getViewRawConst (const impl_scalar_type*& vals,
-                   LocalOrdinal& numEnt,
-                   const RowInfo& rowinfo) const
-  {
-#ifdef HAVE_TPETRA_DEBUG
-    constexpr bool debug = true;
-#else
-    constexpr bool debug = false;
-#endif // HAVE_TPETRA_DEBUG
-
-    if (k_values1D_.extent (0) != 0 && rowinfo.allocSize > 0) {
-      if (debug) {
-        if (rowinfo.offset1D + rowinfo.allocSize > k_values1D_.extent (0)) {
-          vals = nullptr;
-          numEnt = 0;
-          return Teuchos::OrdinalTraits<LocalOrdinal>::invalid ();
-        }
-      }
-      vals = k_values1D_.data () + rowinfo.offset1D;
-      numEnt = rowinfo.allocSize;
-    }
-    else {
-      vals = nullptr;
-      numEnt = 0;
-    }
-
-    return static_cast<LocalOrdinal> (0);
-  }
+    getLocalRowCopy (local_ordinal_type localRow,
+                     nonconst_local_inds_host_view_type &indices,
+                     nonconst_values_host_view_type &values,
+                     size_t& numEntries) const 
+ {
+    using Teuchos::ArrayView;
+    using Teuchos::av_reinterpret_cast;
+    const char tfecfFuncName[] = "getLocalRowCopy: ";
 
-  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
-  LocalOrdinal
-  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-  getViewRaw (impl_scalar_type*& vals,
-              LocalOrdinal& numEnt,
-              const RowInfo& rowinfo) const
-  {
-    const impl_scalar_type* valsConst;
-    const LocalOrdinal err = this->getViewRawConst (valsConst, numEnt, rowinfo);
-    vals = const_cast<impl_scalar_type*> (valsConst);
-    return err;
-  }
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (! this->hasColMap (), std::runtime_error,
+       "The matrix does not have a column Map yet.  This means we don't have "
+       "local indices for columns yet, so it doesn't make sense to call this "
+       "method.  If the matrix doesn't have a column Map yet, you should call "
+       "fillComplete on it first.");
 
-  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
-  Kokkos::View<const typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type*,
-               typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::device_type,
-               Kokkos::MemoryUnmanaged>
-  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-  getRowView (const RowInfo& rowInfo) const
-  {
-    using Kokkos::MemoryUnmanaged;
-    using Kokkos::View;
-    typedef impl_scalar_type ST;
-    typedef View<const ST*, device_type, MemoryUnmanaged> subview_type;
-    typedef std::pair<size_t, size_t> range_type;
+    const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
+    const size_t theNumEntries = rowinfo.numEntries;
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (static_cast<size_t> (indices.size ()) < theNumEntries ||
+       static_cast<size_t> (values.size ()) < theNumEntries,
+       std::runtime_error, "Row with local index " << localRow << " has " <<
+       theNumEntries << " entry/ies, but indices.size() = " <<
+       indices.size () << " and values.size() = " << values.size () << ".");
+    numEntries = theNumEntries; // first side effect
 
-    if (k_values1D_.extent (0) != 0 && rowInfo.allocSize > 0) {
-#ifdef HAVE_TPETRA_DEBUG
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (rowInfo.offset1D + rowInfo.allocSize > this->k_values1D_.extent (0),
-         std::range_error, "Tpetra::CrsMatrix::getRowView: Invalid access "
-         "to 1-D storage of values.  rowInfo.offset1D ("
-         << rowInfo.offset1D << ") + rowInfo.allocSize (" << rowInfo.allocSize
-         << ") > this->k_values1D_.extent(0) ("
-         << this->k_values1D_.extent (0) << ").");
-#endif // HAVE_TPETRA_DEBUG
-      range_type range (rowInfo.offset1D, rowInfo.offset1D + rowInfo.allocSize);
-      // mfh 23 Nov 2015: Don't just create a subview of k_values1D_
-      // directly, because that first creates a _managed_ subview,
-      // then returns an unmanaged version of that.  That touches the
-      // reference count, which costs performance in a measurable way.
-      // Instead, we create a temporary unmanaged view, then create
-      // the subview from that.
-      return Kokkos::subview (subview_type (this->k_values1D_), range);
-    }
-    else {
-      return subview_type ();
-    }
-  }
+    if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
+      if (staticGraph_->isLocallyIndexed ()) {
+        auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
+        auto curVals = getValuesViewHost(rowinfo);
 
-  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
-  Kokkos::View<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type*,
-               typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::device_type,
-               Kokkos::MemoryUnmanaged>
-  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-  getRowViewNonConst (const RowInfo& rowInfo) const
-  {
-    using Kokkos::MemoryUnmanaged;
-    using Kokkos::View;
-    typedef impl_scalar_type ST;
-    typedef View<ST*, device_type, MemoryUnmanaged> subview_type;
-    typedef std::pair<size_t, size_t> range_type;
+        for (size_t j = 0; j < theNumEntries; ++j) {
+          values[j] = curVals[j];
+          indices[j] = curLclInds(j);
+        }
+      }
+      else if (staticGraph_->isGloballyIndexed ()) {
+        // Don't call getColMap(), because it touches RCP's reference count.
+        const map_type& colMap = * (staticGraph_->colMap_);
+        auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
+        auto curVals = getValuesViewHost(rowinfo);
 
-    if (k_values1D_.extent (0) != 0 && rowInfo.allocSize > 0) {
-#ifdef HAVE_TPETRA_DEBUG
-      TEUCHOS_TEST_FOR_EXCEPTION
-        (rowInfo.offset1D + rowInfo.allocSize > this->k_values1D_.extent (0),
-         std::range_error, "Tpetra::CrsMatrix::getRowViewNonConst: Invalid "
-         "access to 1-D storage of values.  rowInfo.offset1D ("
-         << rowInfo.offset1D << ") + rowInfo.allocSize (" << rowInfo.allocSize
-         << ") > this->k_values1D_.extent(0) ("
-         << this->k_values1D_.extent (0) << ").");
-#endif // HAVE_TPETRA_DEBUG
-      range_type range (rowInfo.offset1D, rowInfo.offset1D + rowInfo.allocSize);
-      // mfh 23 Nov 2015: Don't just create a subview of k_values1D_
-      // directly, because that first creates a _managed_ subview,
-      // then returns an unmanaged version of that.  That touches the
-      // reference count, which costs performance in a measurable way.
-      // Instead, we create a temporary unmanaged view, then create
-      // the subview from that.
-      return Kokkos::subview (subview_type (this->k_values1D_), range);
-    }
-    else {
-      return subview_type ();
+        for (size_t j = 0; j < theNumEntries; ++j) {
+          values[j] = curVals[j];
+          indices[j] = colMap.getLocalElement (curGblInds(j));
+        }
+      }
     }
   }
 
-  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
-  Teuchos::ArrayView<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>
-  CrsMatrix<Scalar, LocalOrdinal,GlobalOrdinal, Node>::
-  getViewNonConst (const RowInfo& rowinfo) const
-  {
-    return Teuchos::av_const_cast<impl_scalar_type> (this->getView (rowinfo));
-  }
-
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
@@ -3369,86 +3390,76 @@ namespace Tpetra {
 
     if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
       if (staticGraph_->isLocallyIndexed ()) {
-        const LocalOrdinal* curLclInds;
-        const impl_scalar_type* curVals;
-        LocalOrdinal numSpots; // includes both current entries and extra space
-
-        // If we got this far, rowinfo should be correct and should
-        // refer to a valid local row.  Thus, these error checks are
-        // superfluous, but we retain them in a debug build.
-#ifdef HAVE_TPETRA_DEBUG
-        int err =
-          staticGraph_->getLocalViewRawConst (curLclInds, numSpots, rowinfo);
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (err != static_cast<LocalOrdinal> (0), std::logic_error,
-           "staticGraph_->getLocalViewRawConst returned nonzero error code "
-           << err << ".");
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (static_cast<size_t> (numSpots) < theNumEntries, std::logic_error,
-           "numSpots = " << numSpots << " < theNumEntries = " << theNumEntries
-           << ".");
-        const LocalOrdinal numSpotsBefore = numSpots;
-        err = getViewRawConst (curVals, numSpots, rowinfo);
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (err != static_cast<LocalOrdinal> (0), std::logic_error,
-           "getViewRaw returned nonzero error code " << err << ".");
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (numSpotsBefore != numSpots, std::logic_error,
-           "numSpotsBefore = " << numSpotsBefore << " != numSpots = "
-           << numSpots << ".");
-#else
-        (void) staticGraph_->getLocalViewRawConst (curLclInds, numSpots, rowinfo);
-        (void) getViewRawConst (curVals, numSpots, rowinfo);
-#endif // HAVE_TPETRA_DEBUG
+        auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
+        auto curVals = getValuesViewHost(rowinfo);
 
         for (size_t j = 0; j < theNumEntries; ++j) {
           values[j] = curVals[j];
-          indices[j] = curLclInds[j];
+          indices[j] = curLclInds(j);
         }
       }
       else if (staticGraph_->isGloballyIndexed ()) {
         // Don't call getColMap(), because it touches RCP's reference count.
         const map_type& colMap = * (staticGraph_->colMap_);
-        const GlobalOrdinal* curGblInds;
-        const impl_scalar_type* curVals;
-        LocalOrdinal numSpots; // includes both current entries and extra space
+        auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
+        auto curVals = getValuesViewHost(rowinfo);
 
-        // If we got this far, rowinfo should be correct and should
-        // refer to a valid local row.  Thus, these error checks are
-        // superfluous, but we retain them in a debug build.
-#ifdef HAVE_TPETRA_DEBUG
-        int err =
-          staticGraph_->getGlobalViewRawConst (curGblInds, numSpots, rowinfo);
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (err != static_cast<LocalOrdinal> (0), std::logic_error,
-           "staticGraph_->getGlobalViewRawConst returned nonzero error code "
-           << err << ".");
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (static_cast<size_t> (numSpots) < theNumEntries, std::logic_error,
-           "numSpots = " << numSpots << " < theNumEntries = " << theNumEntries
-           << ".");
-        const LocalOrdinal numSpotsBefore = numSpots;
-        err = getViewRawConst (curVals, numSpots, rowinfo);
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (err != static_cast<LocalOrdinal> (0), std::logic_error,
-           "getViewRawConst returned nonzero error code " << err << ".");
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (numSpotsBefore != numSpots, std::logic_error,
-           "numSpotsBefore = " << numSpotsBefore << " != numSpots = "
-           << numSpots << ".");
-#else
-        (void) staticGraph_->getGlobalViewRawConst (curGblInds, numSpots, rowinfo);
-        (void) getViewRawConst (curVals, numSpots, rowinfo);
-#endif //HAVE_TPETRA_DEBUG
+        for (size_t j = 0; j < theNumEntries; ++j) {
+          values[j] = curVals[j];
+          indices[j] = colMap.getLocalElement (curGblInds(j));
+        }
+      }
+    }
+  }
+#endif
+
+template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+void
+CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+    getGlobalRowCopy (global_ordinal_type globalRow,
+                      nonconst_global_inds_host_view_type &indices,
+                      nonconst_values_host_view_type &values,
+                      size_t& numEntries) const
+  {
+    // Copy one row's global column indices and values into the caller's
+    // host views; numEntries is set to the number of entries in the row.
+    const char tfecfFuncName[] = "getGlobalRowCopy: ";
+    // Both output views must have room for the whole row (else runtime_error).
+    const RowInfo rowinfo =
+      staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
+    const size_t theNumEntries = rowinfo.numEntries;
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
+      static_cast<size_t> (indices.size ()) < theNumEntries ||
+      static_cast<size_t> (values.size ()) < theNumEntries,
+      std::runtime_error, "Row with global index " << globalRow << " has "
+      << theNumEntries << " entry/ies, but indices.size() = " <<
+      indices.size () << " and values.size() = " << values.size () << ".");
+    numEntries = theNumEntries; // first side effect
+    // Nothing further is copied when the row lookup gave an invalid local row.
+    if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
+      if (staticGraph_->isLocallyIndexed ()) {
+        const map_type& colMap = * (staticGraph_->colMap_);
+        auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
+        auto curVals = getValuesViewHost(rowinfo);
 
         for (size_t j = 0; j < theNumEntries; ++j) {
           values[j] = curVals[j];
-          indices[j] = colMap.getLocalElement (curGblInds[j]);
+          indices[j] = colMap.getGlobalElement (curLclInds(j)); // local -> global
+        }
+      }
+      else if (staticGraph_->isGloballyIndexed ()) {
+        auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
+        auto curVals = getValuesViewHost(rowinfo);
+
+        for (size_t j = 0; j < theNumEntries; ++j) {
+          values[j] = curVals[j];
+          indices[j] = curGblInds(j);
         }
       }
     }
   }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
@@ -3475,84 +3486,86 @@ namespace Tpetra {
     if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
       if (staticGraph_->isLocallyIndexed ()) {
         const map_type& colMap = * (staticGraph_->colMap_);
-        const LocalOrdinal* curLclInds;
-        const impl_scalar_type* curVals;
-        LocalOrdinal numSpots; // includes both current entries and extra space
-
-        // If we got this far, rowinfo should be correct and should
-        // refer to a valid local row.  Thus, these error checks are
-        // superfluous, but we retain them in a debug build.
-#ifdef HAVE_TPETRA_DEBUG
-        int err =
-          staticGraph_->getLocalViewRawConst (curLclInds, numSpots, rowinfo);
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (err != static_cast<LocalOrdinal> (0), std::logic_error,
-           "staticGraph_->getLocalViewRawConst returned nonzero error code "
-           << err << ".");
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (static_cast<size_t> (numSpots) < theNumEntries, std::logic_error,
-           "numSpots = " << numSpots << " < theNumEntries = " << theNumEntries
-           << ".");
-        const LocalOrdinal numSpotsBefore = numSpots;
-        err = getViewRawConst (curVals, numSpots, rowinfo);
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (err != static_cast<LocalOrdinal> (0), std::logic_error,
-           "getViewRaw returned nonzero error code " << err << ".");
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (numSpotsBefore != numSpots, std::logic_error,
-           "numSpotsBefore = " << numSpotsBefore << " != numSpots = "
-           << numSpots << ".");
-#else
-        (void) staticGraph_->getLocalViewRawConst (curLclInds, numSpots, rowinfo);
-        (void) getViewRawConst (curVals, numSpots, rowinfo);
-#endif //HAVE_TPETRA_DEBUG
+        auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
+        auto curVals = getValuesViewHost(rowinfo);
 
         for (size_t j = 0; j < theNumEntries; ++j) {
           values[j] = curVals[j];
-          indices[j] = colMap.getGlobalElement (curLclInds[j]);
+          indices[j] = colMap.getGlobalElement (curLclInds(j));
         }
       }
       else if (staticGraph_->isGloballyIndexed ()) {
-        const GlobalOrdinal* curGblInds;
-        const impl_scalar_type* curVals;
-        LocalOrdinal numSpots; // includes both current entries and extra space
-
-        // If we got this far, rowinfo should be correct and should
-        // refer to a valid local row.  Thus, these error checks are
-        // superfluous, but we retain them in a debug build.
-#ifdef HAVE_TPETRA_DEBUG
-        int err =
-          staticGraph_->getGlobalViewRawConst (curGblInds, numSpots, rowinfo);
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (err != static_cast<LocalOrdinal> (0), std::logic_error,
-           "staticGraph_->getGlobalViewRawConst returned nonzero error code "
-           << err << ".");
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (static_cast<size_t> (numSpots) < theNumEntries, std::logic_error,
-           "numSpots = " << numSpots << " < theNumEntries = " << theNumEntries
-           << ".");
-        const LocalOrdinal numSpotsBefore = numSpots;
-        err = getViewRawConst (curVals, numSpots, rowinfo);
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (err != static_cast<LocalOrdinal> (0), std::logic_error,
-           "getViewRawConst returned nonzero error code " << err << ".");
-        TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-          (numSpotsBefore != numSpots, std::logic_error,
-           "numSpotsBefore = " << numSpotsBefore << " != numSpots = "
-           << numSpots << ".");
-#else
-        (void) staticGraph_->getGlobalViewRawConst (curGblInds, numSpots, rowinfo);
-        (void) getViewRawConst (curVals, numSpots, rowinfo);
-#endif //HAVE_TPETRA_DEBUG
+        auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
+        auto curVals = getValuesViewHost(rowinfo);
 
         for (size_t j = 0; j < theNumEntries; ++j) {
           values[j] = curVals[j];
-          indices[j] = curGblInds[j];
+          indices[j] = curGblInds(j);
         }
       }
     }
   }
+#endif
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  void
+  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  getLocalRowView(LocalOrdinal localRow,
+                  local_inds_host_view_type &indices,
+                  values_host_view_type &values) const
+  {
+    const char tfecfFuncName[] = "getLocalRowView: ";
+    // Hand back read-only host views of one row; requires local indexing.
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
+      isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
+      "its indices as global indices, so you cannot get a view with local "
+      "column indices.  If the matrix has a column Map, you may call "
+      "getLocalRowCopy() to get local column indices; otherwise, you may get "
+      "a view with global column indices by calling getGlobalRowView().");
+    // Valid non-empty row: subviews of lclIndsUnpacked_wdv / valuesUnpacked_wdv.
+    const RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
+    if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
+        rowInfo.numEntries > 0) {
+      indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
+                                                         rowInfo.offset1D,
+                                                         rowInfo.numEntries,
+                                                         Access::ReadOnly);
+      values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
+                                                 rowInfo.numEntries,
+                                                 Access::ReadOnly);
+    }
+    else {
+      // This does the right thing (reports an empty row) if the input
+      // row is invalid.
+      indices = local_inds_host_view_type();
+      values = values_host_view_type();
+    }
+    // Debug builds only: cross-check the lengths of the returned views.
+#ifdef HAVE_TPETRA_DEBUG
+    const char suffix[] = ".  This should never happen.  Please report this "
+      "bug to the Tpetra developers.";
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (static_cast<size_t> (indices.size ()) !=
+       static_cast<size_t> (values.size ()), std::logic_error,
+       "At the end of this method, for local row " << localRow << ", "
+       "indices.size() = " << indices.size () << " != values.size () = "
+       << values.size () << suffix);
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (static_cast<size_t> (indices.size ()) !=
+       static_cast<size_t> (rowInfo.numEntries), std::logic_error,
+       "At the end of this method, for local row " << localRow << ", "
+       "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
+       << rowInfo.numEntries << suffix);
+    const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
+       "of this method, for local row " << localRow << ", rowInfo.numEntries = "
+       << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
+       expectedNumEntries << suffix);
+#endif // HAVE_TPETRA_DEBUG
+  }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
@@ -3606,7 +3619,9 @@ namespace Tpetra {
        expectedNumEntries << suffix);
 #endif // HAVE_TPETRA_DEBUG
   }
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   LocalOrdinal
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
@@ -3636,14 +3651,20 @@ namespace Tpetra {
       }
       else {
         numEnt = static_cast<LO> (rowInfo.numEntries);
-        auto lclColInds = staticGraph_->getLocalKokkosRowView (rowInfo);
-        ind = lclColInds.data (); // FIXME (mfh 18 Jul 2016) UVM
-        const LO err = this->getViewRawConst (val, numEnt, rowInfo);
-        return err;
+        auto lclColInds = staticGraph_->getLocalIndsViewHost (rowInfo);
+        // KDDKDD UVM Breaks reference counting; unsafe
+        ind = lclColInds.data (); 
+
+        auto values = getValuesViewHost (rowInfo);
+        // KDDKDD UVM Breaks reference counting; unsafe
+        val = values.data();
+        return values.extent(0);
       }
     }
   }
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   LocalOrdinal
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
@@ -3658,7 +3679,65 @@ namespace Tpetra {
     vals = reinterpret_cast<const Scalar*> (vals_ist);
     return errCode;
   }
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
 
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  void
+  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  getGlobalRowView (GlobalOrdinal globalRow,
+                    global_inds_host_view_type &indices,
+                    values_host_view_type &values) const
+  {
+    const char tfecfFuncName[] = "getGlobalRowView: ";
+    // Hand back read-only host views of one row; requires global indexing.
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
+      isLocallyIndexed (), std::runtime_error,
+      "The matrix is locally indexed, so we cannot return a view of the row "
+      "with global column indices.  Use getGlobalRowCopy() instead.");
+    // Resolve globalRow to its RowInfo (storage offset and entry count).
+    // This does the right thing (reports an empty row) if the input
+    // row is invalid.
+    const RowInfo rowInfo = 
+          staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
+    if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
+        rowInfo.numEntries > 0) {
+      indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
+                                                         rowInfo.numEntries,
+                                                         Access::ReadOnly);
+      values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
+                                                 rowInfo.numEntries,
+                                                 Access::ReadOnly);
+    }
+    else {
+      indices = global_inds_host_view_type();
+      values = values_host_view_type();
+    }
+    // Debug builds only: cross-check the lengths of the returned views.
+#ifdef HAVE_TPETRA_DEBUG
+    const char suffix[] = ".  This should never happen.  Please report this "
+      "bug to the Tpetra developers.";
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (static_cast<size_t> (indices.size ()) !=
+       static_cast<size_t> (values.size ()), std::logic_error,
+       "At the end of this method, for global row " << globalRow << ", "
+       "indices.size() = " << indices.size () << " != values.size () = "
+       << values.size () << suffix);
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (static_cast<size_t> (indices.size ()) !=
+       static_cast<size_t> (rowInfo.numEntries), std::logic_error,
+       "At the end of this method, for global row " << globalRow << ", "
+       "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
+       << rowInfo.numEntries << suffix);
+    const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+      (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
+       "of this method, for global row " << globalRow << ", rowInfo.numEntries "
+       "= " << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
+       " " << expectedNumEntries << suffix);
+#endif // HAVE_TPETRA_DEBUG
+  }
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
@@ -3720,6 +3799,7 @@ namespace Tpetra {
        " " << expectedNumEntries << suffix);
 #endif // HAVE_TPETRA_DEBUG
   }
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
@@ -3742,16 +3822,10 @@ namespace Tpetra {
       // do nothing
     }
     else {
-      auto lclMat = this->getLocalMatrix ();
-
-      const LO lclNumRows = lclMat.numRows ();
-      for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
-        auto row_i = lclMat.row (lclRow);
-        for (LO k = 0; k < row_i.length; ++k) {
-          // FIXME (mfh 02 Jan 2015) This assumes CUDA UVM.
-          row_i.value (k) *= theAlpha;
-        }
-      }
+
+      auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
+      KokkosBlas::scal(vals, theAlpha, vals);
+   
     }
   }
 
@@ -3776,19 +3850,17 @@ namespace Tpetra {
       // do nothing
     }
     else {
-      // FIXME (mfh 24 Dec 2014) Once CrsMatrix implements DualView
-      // semantics, this would be the place to mark memory as
-      // modified.
-      Kokkos::deep_copy (k_values1D_, theAlpha);
+      Kokkos::deep_copy (valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
+                         theAlpha);
     }
   }
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-  setAllValues (const typename local_matrix_type::row_map_type& rowPointers,
-                const typename local_graph_type::entries_type::non_const_type& columnIndices,
-                const typename local_matrix_type::values_type& values)
+  setAllValues (const typename local_graph_device_type::row_map_type& rowPointers,
+                const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
+                const typename local_matrix_device_type::values_type& values)
   {
     const char tfecfFuncName[] = "setAllValues: ";
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
@@ -3810,7 +3882,7 @@ namespace Tpetra {
     // fillComplete yet, so it's important to check.  We don't care
     // whether setAllIndices() did a shallow copy or a deep copy, so a
     // good way to check is to compare dimensions.
-    auto lclGraph = myGraph_->getLocalGraph ();
+    auto lclGraph = myGraph_->getLocalGraphDevice ();
     const size_t numEnt = lclGraph.entries.extent (0);
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
       (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
@@ -3818,16 +3890,13 @@ namespace Tpetra {
        std::logic_error, "myGraph_->setAllIndices() did not correctly create "
        "local graph.  Please report this bug to the Tpetra developers.");
 
-    const size_t numCols = myGraph_->getColMap ()->getNodeNumElements ();
-
-    auto lclMat = std::make_shared<local_matrix_type>
-      ("Tpetra::CrsMatrix::lclMatrix_", numCols, values, lclGraph);
-    lclMatrix_ = std::make_shared<local_multiply_op_type> (lclMat);
+    valuesPacked_wdv = values_wdv_type(values);
+    valuesUnpacked_wdv = valuesPacked_wdv;
 
     // FIXME (22 Jun 2016) I would very much like to get rid of
     // k_values1D_ at some point.  I find it confusing to have all
     // these extra references lying around.
-    k_values1D_ = lclMat->values;
+//    k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
 
     // Storage MUST be packed, since the interface doesn't give any
     // way to indicate any extra space at the end of each row.
@@ -3848,7 +3917,7 @@ namespace Tpetra {
     using Teuchos::av_reinterpret_cast;
     typedef device_type DT;
     typedef impl_scalar_type IST;
-    typedef typename local_matrix_type::row_map_type row_map_type;
+    typedef typename local_graph_device_type::row_map_type row_map_type;
     //typedef typename row_map_type::non_const_value_type row_offset_type;
     const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
 
@@ -3968,7 +4037,7 @@ namespace Tpetra {
       using ::Tpetra::Details::getDiagCopyWithoutOffsets;
       (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
                                         lclColMap,
-                                        lclMatrix_->getLocalMatrix ());
+                                        getLocalMatrixDevice ());
     }
     else {
       using ::Tpetra::Details::getLocalDiagCopyWithoutOffsetsNotFillComplete;
@@ -4010,7 +4079,7 @@ namespace Tpetra {
       Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
 
     KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
-                               lclMatrix_->getLocalMatrix ());
+                               getLocalMatrixDevice ());
   }
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -4055,15 +4124,17 @@ namespace Tpetra {
     const LO myNumRows = static_cast<LO> (this->getNodeNumRows ());
     const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
 
-    local_matrix_type lclMat = lclMatrix_->getLocalMatrix ();
+    auto rowPtrsPackedHost = staticGraph_->rowPtrsPacked_host_;
+    auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
     Kokkos::parallel_for
       ("Tpetra::CrsMatrix::getLocalDiagCopy",
        range_type (0, myNumRows),
        [&, INV, h_offsets] (const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
         lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
         if (h_offsets[lclRow] != INV) {
-          auto curRow = lclMat.rowConst (lclRow);
-          lclVecHost1d(lclRow) = static_cast<IST> (curRow.value(h_offsets[lclRow]));
+          auto curRowOffset = rowPtrsPackedHost (lclRow);
+          lclVecHost1d(lclRow) = 
+            static_cast<IST> (valuesPackedHost(curRowOffset+h_offsets[lclRow]));
         }
       });
     //diag.sync_device ();
@@ -4114,7 +4185,7 @@ namespace Tpetra {
       auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
       auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
       using ::Tpetra::Details::leftScaleLocalCrsMatrix;
-      leftScaleLocalCrsMatrix (lclMatrix_->getLocalMatrix (),
+      leftScaleLocalCrsMatrix (getLocalMatrixDevice (),
                                x_lcl_1d, false, false);
     }
     else {
@@ -4168,7 +4239,7 @@ namespace Tpetra {
       auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
       auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
       using ::Tpetra::Details::rightScaleLocalCrsMatrix;
-      rightScaleLocalCrsMatrix (lclMatrix_->getLocalMatrix (),
+      rightScaleLocalCrsMatrix (getLocalMatrixDevice (),
                                 x_lcl_1d, false, false);
     }
     else {
@@ -4188,7 +4259,6 @@ namespace Tpetra {
     using Teuchos::outArg;
     using Teuchos::REDUCE_SUM;
     using Teuchos::reduceAll;
-    typedef typename Teuchos::ArrayRCP<const impl_scalar_type>::size_type size_type;
 
     // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
     // local part of this computation.  It could make sense to put
@@ -4202,11 +4272,10 @@ namespace Tpetra {
         if (isStorageOptimized ()) {
           // "Optimized" storage is packed storage.  That means we can
           // iterate in one pass through the 1-D values array.
-          const size_type numEntries =
-            static_cast<size_type> (getNodeNumEntries ());
-          for (size_type k = 0; k < numEntries; ++k) {
-            // FIXME (mfh 05 Aug 2014) This assumes UVM.
-            const impl_scalar_type val = k_values1D_(k);
+          const size_t numEntries = getNodeNumEntries ();
+          auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
+          for (size_t k = 0; k < numEntries; ++k) {
+            auto val = values[k];
             // Note (etp 06 Jan 2015) We need abs() here for composite types
             // (in general, if mag_type is on the left-hand-side, we need
             // abs() on the right-hand-side)
@@ -4219,11 +4288,9 @@ namespace Tpetra {
             static_cast<LocalOrdinal> (this->getNodeNumRows ());
           for (LocalOrdinal r = 0; r < numRows; ++r) {
             const RowInfo rowInfo = myGraph_->getRowInfo (r);
-            const size_type numEntries =
-              static_cast<size_type> (rowInfo.numEntries);
-            ArrayView<const impl_scalar_type> A_r =
-              this->getView (rowInfo).view (0, numEntries);
-            for (size_type k = 0; k < numEntries; ++k) {
+            const size_t numEntries = rowInfo.numEntries;
+            auto A_r = this->getValuesViewHost(rowInfo);
+            for (size_t k = 0; k < numEntries; ++k) {
               const impl_scalar_type val = A_r[k];
               const mag_type val_abs = STS::abs (val);
               mySum += val_abs * val_abs;
@@ -4278,16 +4345,19 @@ namespace Tpetra {
 
     crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
     const bool sortGraph = false; // we'll sort graph & matrix together below
+
     theGraph.reindexColumns (newColMap, newImport, sortGraph);
+
     if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
       const LocalOrdinal lclNumRows =
         static_cast<LocalOrdinal> (theGraph.getNodeNumRows ());
+
       for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
+
         const RowInfo rowInfo = theGraph.getRowInfo (row);
-        auto lclColInds = theGraph.getLocalKokkosRowViewNonConst (rowInfo);
-        auto vals = this->getRowViewNonConst (rowInfo);
-        // FIXME (mfh 09 May 2017) This assumes CUDA UVM, at least for
-        // lclColInds, if not also for values.
+        auto lclColInds = theGraph.getLocalIndsViewHostNonConst (rowInfo);
+        auto vals = this->getValuesViewHostNonConst (rowInfo);
+
         sort2 (lclColInds.data (),
                lclColInds.data () + rowInfo.numEntries,
                vals.data ());
@@ -4953,10 +5023,11 @@ namespace Tpetra {
     const char tfecfFuncName[] = "mergeRowIndicesAndValues: ";
 #endif // HAVE_TPETRA_DEBUG
 
-    auto rowValues = this->getRowViewNonConst (rowInfo);
+    auto rowValues = this->getValuesViewHostNonConst (rowInfo);
     typedef typename std::decay<decltype (rowValues[0]) >::type value_type;
     value_type* rowValueIter = rowValues.data ();
-    auto inds_view = graph.getLocalKokkosRowViewNonConst (rowInfo);
+
+    auto inds_view = graph.getLocalIndsViewHostNonConst (rowInfo);
 
     // beg,end define a half-exclusive interval over which to iterate.
     LocalOrdinal* beg = inds_view.data ();
@@ -5037,10 +5108,9 @@ namespace Tpetra {
         [this, &graph, sorted, merged] (const LO& lclRow, size_t& numDups) {
           const RowInfo rowInfo = graph.getRowInfo (lclRow);
           if (! sorted) {
-            auto lclColInds = graph.getLocalKokkosRowViewNonConst (rowInfo);
-            auto vals = this->getRowViewNonConst (rowInfo);
-            // FIXME (mfh 09 May 2017) This assumes CUDA UVM, at least
-            // for lclColInds, if not also for values.
+            auto lclColInds = graph.getLocalIndsViewHostNonConst (rowInfo);
+            auto vals = this->getValuesViewHostNonConst (rowInfo);
+
             sort2 (lclColInds.data (),
                    lclColInds.data () + rowInfo.numEntries,
                    vals.data ());
@@ -5375,13 +5445,11 @@ namespace Tpetra {
 
     auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
     auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
+    auto matrix_lcl = getLocalMultiplyOperator();
 
     const bool debug = ::Tpetra::Details::Behavior::debug ();
     if (debug) {
       const char tfecfFuncName[] = "localApply: ";
-      TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-        (lclMatrix_.get () == nullptr, std::logic_error,
-         "lclMatrix_ not created yet.");
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
          "X.getNumVectors() = " << X.getNumVectors () << " != "
@@ -5434,10 +5502,11 @@ namespace Tpetra {
     LocalOrdinal maxRowImbalance = 0;
     if(nrows != 0)
       maxRowImbalance = getNodeMaxNumRowEntries() - (getNodeNumEntries() / nrows);
+
     if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
-      lclMatrix_->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta);
+      matrix_lcl->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta);
     else
-      lclMatrix_->apply (X_lcl, Y_lcl, mode, alpha, beta);
+      matrix_lcl->apply (X_lcl, Y_lcl, mode, alpha, beta);
   }
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -5504,8 +5573,8 @@ namespace Tpetra {
     // Copy old values into new values.  impl_scalar_type and T may
     // differ, so we can't use Kokkos::deep_copy.
     using ::Tpetra::Details::copyConvert;
-    copyConvert (newMatrix->lclMatrix_->getLocalMatrix ().values,
-                 this->lclMatrix_->getLocalMatrix ().values);
+    copyConvert (newMatrix->getLocalMatrixDevice ().values,
+                 this->getLocalMatrixDevice ().values);
     // Since newmat has a static (const) graph, the graph already has
     // a column Map, and Import and Export objects already exist (if
     // applicable).  Thus, calling fillComplete is cheap.
@@ -5547,7 +5616,7 @@ namespace Tpetra {
         (staticGraph_->indicesAreAllocated () &&
          staticGraph_->getNodeAllocationSize() > 0 &&
          staticGraph_->getNodeNumRows() > 0 &&
-         k_values1D_.extent (0) == 0,
+         valuesUnpacked_wdv.extent (0) == 0,
          std::logic_error, err);
     }
   }
@@ -5783,8 +5852,8 @@ namespace Tpetra {
               << std::setw(width) << nE;
           if (vl == VERB_EXTREME) {
             if (isGloballyIndexed()) {
-              ArrayView<const GlobalOrdinal> rowinds;
-              ArrayView<const Scalar> rowvals;
+              global_inds_host_view_type rowinds;
+              values_host_view_type rowvals;
               getGlobalRowView (gid, rowinds, rowvals);
               for (size_t j = 0; j < nE; ++j) {
                 out << " (" << rowinds[j]
@@ -5793,8 +5862,8 @@ namespace Tpetra {
               }
             }
             else if (isLocallyIndexed()) {
-              ArrayView<const LocalOrdinal> rowinds;
-              ArrayView<const Scalar> rowvals;
+              local_inds_host_view_type rowinds;
+              values_host_view_type rowvals;
               getLocalRowView (r, rowinds, rowvals);
               for (size_t j=0; j < nE; ++j) {
                 out << " (" << getColMap()->getGlobalElement(rowinds[j])
@@ -5843,9 +5912,8 @@ namespace Tpetra {
     using Details::padCrsArrays;
     using std::endl;
     using LO = local_ordinal_type;
-    using execution_space = typename device_type::execution_space;
     using row_ptrs_type =
-      typename local_graph_type::row_map_type::non_const_type;
+      typename local_graph_device_type::row_map_type::non_const_type;
     using range_policy =
       Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
     const char tfecfFuncName[] = "applyCrsPadding";
@@ -5889,21 +5957,21 @@ namespace Tpetra {
     // size needs to increase.  That should be the job of
     // padCrsArrays.
 
-    // Making copies here because k_rowPtrs_ has a const type. Otherwise, we
+    // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
     // would use it directly.
 
     if (verbose) {
       std::ostringstream os;
       os << *prefix << "Allocate row_ptrs_beg: "
-         << myGraph_->k_rowPtrs_.extent(0) << endl;
+         << myGraph_->rowPtrsUnpacked_host_.extent(0) << endl;
       std::cerr << os.str();
     }
     using Kokkos::view_alloc;
     using Kokkos::WithoutInitializing;
     row_ptrs_type row_ptr_beg(
       view_alloc("row_ptr_beg", WithoutInitializing),
-      myGraph_->k_rowPtrs_.extent(0));
-    Kokkos::deep_copy(row_ptr_beg, myGraph_->k_rowPtrs_);
+                 myGraph_->rowPtrsUnpacked_dev_.extent(0));
+    Kokkos::deep_copy(row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
 
     const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) :
       size_t(row_ptr_beg.extent(0) - 1);
@@ -5915,17 +5983,20 @@ namespace Tpetra {
     row_ptrs_type row_ptr_end(
       view_alloc("row_ptr_end", WithoutInitializing), N);
 
+    row_ptrs_type num_row_entries_d;
+
     const bool refill_num_row_entries =
       myGraph_->k_numRowEntries_.extent(0) != 0;
-
+    
     if (refill_num_row_entries) { // unpacked storage
       // We can't assume correct *this capture until C++17, and it's
       // likely more efficient just to capture what we need anyway.
-      auto num_row_entries = myGraph_->k_numRowEntries_;
+      num_row_entries_d = create_mirror_view_and_copy(execution_space(),
+                                                 myGraph_->k_numRowEntries_);
       Kokkos::parallel_for
         ("Fill end row pointers", range_policy(0, N),
          KOKKOS_LAMBDA (const size_t i) {
-          row_ptr_end(i) = row_ptr_beg(i) + num_row_entries(i);
+          row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
         });
     }
     else {
@@ -5940,47 +6011,49 @@ namespace Tpetra {
     }
 
     if (myGraph_->isGloballyIndexed()) {
-      padCrsArrays(row_ptr_beg, row_ptr_end, myGraph_->k_gblInds1D_,
-                   k_values1D_, padding, myRank, verbose);
-      const auto newValuesLen = k_values1D_.extent(0);
-      const auto newColIndsLen = myGraph_->k_gblInds1D_.extent(0);
+      padCrsArrays(row_ptr_beg, row_ptr_end,
+                   myGraph_->gblInds_wdv,
+                   valuesUnpacked_wdv, padding, myRank, verbose);
+      const auto newValuesLen = valuesUnpacked_wdv.extent(0);
+      const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (newValuesLen != newColIndsLen, std::logic_error,
-         ": After padding, k_values1D_.extent(0)=" << newValuesLen
-         << " != myGraph_->k_gblInds1D_.extent(0)=" << newColIndsLen
+         ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
+         << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
          << suffix);
     }
     else {
-      padCrsArrays(row_ptr_beg, row_ptr_end, myGraph_->k_lclInds1D_,
-                   k_values1D_, padding, myRank, verbose);
-      const auto newValuesLen = k_values1D_.extent(0);
-      const auto newColIndsLen = myGraph_->k_lclInds1D_.extent(0);
+      padCrsArrays(row_ptr_beg, row_ptr_end,
+                   myGraph_->lclIndsUnpacked_wdv,
+                   valuesUnpacked_wdv, padding, myRank, verbose);
+      const auto newValuesLen = valuesUnpacked_wdv.extent(0);
+      const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (newValuesLen != newColIndsLen, std::logic_error,
-         ": After padding, k_values1D_.extent(0)=" << newValuesLen
-         << " != myGraph_->k_lclInds1D_.extent(0)=" << newColIndsLen
+         ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
+         << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
          << suffix);
     }
 
     if (refill_num_row_entries) {
-      auto num_row_entries = myGraph_->k_numRowEntries_;
       Kokkos::parallel_for
         ("Fill num entries", range_policy(0, N),
          KOKKOS_LAMBDA (const size_t i) {
-          num_row_entries(i) = row_ptr_end(i) - row_ptr_beg(i);
+          num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
         });
+      Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
     }
 
     if (verbose) {
       std::ostringstream os;
-      os << *prefix << "Assign myGraph_->k_rowPtrs_; "
-         << "old size: " << myGraph_->k_rowPtrs_.extent(0)
+      os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
+         << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
          << ", new size: " << row_ptr_beg.extent(0) << endl;
       std::cerr << os.str();
-      TEUCHOS_ASSERT( myGraph_->k_rowPtrs_.extent(0) ==
+      TEUCHOS_ASSERT( myGraph_->rowPtrsUnpacked_host_.extent(0) ==
                       row_ptr_beg.extent(0) );
     }
-    myGraph_->k_rowPtrs_ = row_ptr_beg;
+    myGraph_->setRowPtrsUnpacked(row_ptr_beg);
   }
 
   template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -6022,8 +6095,8 @@ namespace Tpetra {
     // This involves copying rows corresponding to LIDs [0, numSame-1].
     //
     const map_type& srcRowMap = * (srcMat.getRowMap ());
-    Array<GO> rowInds;
-    Array<Scalar> rowVals;
+    nonconst_global_inds_host_view_type rowInds;
+    nonconst_values_host_view_type rowVals;
     const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
     for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
       // Global ID for the current row index in the source matrix.
@@ -6032,19 +6105,19 @@ namespace Tpetra {
       const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
       const GO targetGID = sourceGID;
 
-      ArrayView<const GO> rowIndsConstView;
+      ArrayView<const GO>rowIndsConstView;
       ArrayView<const Scalar> rowValsConstView;
 
       if (sourceIsLocallyIndexed) {
         const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
         if (rowLength > static_cast<size_t> (rowInds.size())) {
-          rowInds.resize (rowLength);
-          rowVals.resize (rowLength);
+          Kokkos::resize(rowInds,rowLength);
+          Kokkos::resize(rowVals,rowLength);
         }
         // Resizing invalidates an Array's views, so we must make new
         // ones, even if rowLength hasn't changed.
-        ArrayView<GO> rowIndsView = rowInds.view (0, rowLength);
-        ArrayView<Scalar> rowValsView = rowVals.view (0, rowLength);
+        nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
+        nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
 
         // The source matrix is locally indexed, so we have to get a
         // copy.  Really it's the GIDs that have to be copied (because
@@ -6060,12 +6133,37 @@ namespace Tpetra {
              "of " << rowLength << ", but getGlobalRowCopy reports "
              "a row length of " << checkRowLength << "." << suffix);
         }
-        rowIndsConstView = rowIndsView.view (0, rowLength);
-        rowValsConstView = rowValsView.view (0, rowLength);
+
+        // KDDKDD UVM TEMPORARY:  refactor combineGlobalValues to take
+        // KDDKDD UVM TEMPORARY:  Kokkos::View instead of ArrayView
+        // KDDKDD UVM TEMPORARY:  For now, wrap the view in ArrayViews
+        // KDDKDD UVM TEMPORARY:  Should be safe because we hold the KokkosViews
+        rowIndsConstView = Teuchos::ArrayView<const GO> (  // BAD BAD BAD
+                           rowIndsView.data(), rowIndsView.extent(0),
+                           Teuchos::RCP_DISABLE_NODE_LOOKUP);
+        rowValsConstView = Teuchos::ArrayView<const Scalar> (  // BAD BAD BAD
+                           reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
+                           Teuchos::RCP_DISABLE_NODE_LOOKUP);
+        // KDDKDD UVM TEMPORARY:  Add replace, sum, transform methods with
+        // KDDKDD UVM TEMPORARY:  KokkosView interface
       }
       else { // source matrix is globally indexed.
-        srcMat.getGlobalRowView(sourceGID, rowIndsConstView,
-                                rowValsConstView);
+        global_inds_host_view_type rowIndsView;
+        values_host_view_type rowValsView;
+        srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
+        // KDDKDD UVM TEMPORARY:  refactor combineGlobalValues to take
+        // KDDKDD UVM TEMPORARY:  Kokkos::View instead of ArrayView
+        // KDDKDD UVM TEMPORARY:  For now, wrap the view in ArrayViews
+        // KDDKDD UVM TEMPORARY:  Should be safe because we hold the KokkosViews
+        rowIndsConstView = Teuchos::ArrayView<const GO> (  // BAD BAD BAD
+                           rowIndsView.data(), rowIndsView.extent(0),
+                           Teuchos::RCP_DISABLE_NODE_LOOKUP);
+        rowValsConstView = Teuchos::ArrayView<const Scalar> (  // BAD BAD BAD
+                           reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
+                           Teuchos::RCP_DISABLE_NODE_LOOKUP);
+        // KDDKDD UVM TEMPORARY:  Add replace, sum, transform methods with
+        // KDDKDD UVM TEMPORARY:  KokkosView interface
+
       }
 
       // Applying a permutation to a matrix with a static graph
@@ -6091,13 +6189,13 @@ namespace Tpetra {
       if (sourceIsLocallyIndexed) {
         const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
         if (rowLength > static_cast<size_t> (rowInds.size ())) {
-          rowInds.resize (rowLength);
-          rowVals.resize (rowLength);
+          Kokkos::resize(rowInds,rowLength);
+          Kokkos::resize(rowVals,rowLength);
         }
         // Resizing invalidates an Array's views, so we must make new
         // ones, even if rowLength hasn't changed.
-        ArrayView<GO> rowIndsView = rowInds.view (0, rowLength);
-        ArrayView<Scalar> rowValsView = rowVals.view (0, rowLength);
+        nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
+        nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
 
         // The source matrix is locally indexed, so we have to get a
         // copy.  Really it's the GIDs that have to be copied (because
@@ -6113,12 +6211,36 @@ namespace Tpetra {
              rowLength << ", but getGlobalRowCopy a row length of "
              << checkRowLength << "." << suffix);
         }
-        rowIndsConstView = rowIndsView.view (0, rowLength);
-        rowValsConstView = rowValsView.view (0, rowLength);
+
+        // KDDKDD UVM TEMPORARY:  refactor combineGlobalValues to take
+        // KDDKDD UVM TEMPORARY:  Kokkos::View instead of ArrayView
+        // KDDKDD UVM TEMPORARY:  For now, wrap the view in ArrayViews
+        // KDDKDD UVM TEMPORARY:  Should be safe because we hold the KokkosViews
+        rowIndsConstView = Teuchos::ArrayView<const GO> (  // BAD BAD BAD
+                           rowIndsView.data(), rowIndsView.extent(0),
+                           Teuchos::RCP_DISABLE_NODE_LOOKUP);
+        rowValsConstView = Teuchos::ArrayView<const Scalar> (  // BAD BAD BAD
+                           reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
+                           Teuchos::RCP_DISABLE_NODE_LOOKUP);
+        // KDDKDD UVM TEMPORARY:  Add replace, sum, transform methods with
+        // KDDKDD UVM TEMPORARY:  KokkosView interface
       }
       else {
-        srcMat.getGlobalRowView(sourceGID, rowIndsConstView,
-                                rowValsConstView);
+        global_inds_host_view_type rowIndsView;
+        values_host_view_type rowValsView;
+        srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
+        // KDDKDD UVM TEMPORARY:  refactor combineGlobalValues to take
+        // KDDKDD UVM TEMPORARY:  Kokkos::View instead of ArrayView
+        // KDDKDD UVM TEMPORARY:  For now, wrap the view in ArrayViews
+        // KDDKDD UVM TEMPORARY:  Should be safe because we hold the KokkosViews
+        rowIndsConstView = Teuchos::ArrayView<const GO> (  // BAD BAD BAD
+                           rowIndsView.data(), rowIndsView.extent(0),
+                           Teuchos::RCP_DISABLE_NODE_LOOKUP);
+        rowValsConstView = Teuchos::ArrayView<const Scalar> (  // BAD BAD BAD
+                           reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
+                           Teuchos::RCP_DISABLE_NODE_LOOKUP);
+        // KDDKDD UVM TEMPORARY:  Add replace, sum, transform methods with
+        // KDDKDD UVM TEMPORARY:  KokkosView interface
       }
 
       combineGlobalValues(targetGID, rowIndsConstView,
@@ -6179,9 +6301,11 @@ namespace Tpetra {
     // This involves copying rows corresponding to LIDs [0, numSame-1].
     //
     const map_type& srcRowMap = * (srcMat.getRowMap ());
-    Array<GO> rowInds;
-    Array<Scalar> rowVals;
     const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
+    using gids_type = nonconst_global_inds_host_view_type;
+    using vals_type = nonconst_values_host_view_type;
+    gids_type rowInds;
+    vals_type rowVals;
     for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
       // Global ID for the current row index in the source matrix.
       // The first numSameIDs GIDs in the two input lists are the
@@ -6193,15 +6317,16 @@ namespace Tpetra {
       ArrayView<const Scalar> rowValsConstView;
 
       if (sourceIsLocallyIndexed) {
+
         const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
-        if (rowLength > static_cast<size_t> (rowInds.size())) {
-          rowInds.resize (rowLength);
-          rowVals.resize (rowLength);
+        if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
+          Kokkos::resize(rowInds,rowLength);
+          Kokkos::resize(rowVals,rowLength);
         }
         // Resizing invalidates an Array's views, so we must make new
         // ones, even if rowLength hasn't changed.
-        ArrayView<GO> rowIndsView = rowInds.view (0, rowLength);
-        ArrayView<Scalar> rowValsView = rowVals.view (0, rowLength);
+        gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
+        vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
 
         // The source matrix is locally indexed, so we have to get a
         // copy.  Really it's the GIDs that have to be copied (because
@@ -6217,12 +6342,26 @@ namespace Tpetra {
              "of " << rowLength << ", but getGlobalRowCopy reports "
              "a row length of " << checkRowLength << "." << suffix);
         }
-        rowIndsConstView = rowIndsView.view (0, rowLength);
-        rowValsConstView = rowValsView.view (0, rowLength);
+        rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
+        rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
       }
       else { // source matrix is globally indexed.
-        srcMat.getGlobalRowView(sourceGID, rowIndsConstView,
-                                rowValsConstView);
+        global_inds_host_view_type rowIndsView;
+        values_host_view_type rowValsView;
+        srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
+
+        // KDDKDD UVM TEMPORARY:  refactor combineGlobalValues to take
+        // KDDKDD UVM TEMPORARY:  Kokkos::View instead of ArrayView
+        // KDDKDD UVM TEMPORARY:  For now, wrap the view in ArrayViews
+        // KDDKDD UVM TEMPORARY:  Should be safe because we hold the KokkosViews
+        rowIndsConstView = Teuchos::ArrayView<const GO> (  // BAD BAD BAD
+                           rowIndsView.data(), rowIndsView.extent(0),
+                           Teuchos::RCP_DISABLE_NODE_LOOKUP);
+        rowValsConstView = Teuchos::ArrayView<const Scalar> (  // BAD BAD BAD
+                           reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
+                           Teuchos::RCP_DISABLE_NODE_LOOKUP);
+        // KDDKDD UVM TEMPORARY:  Add replace, sum, transform methods with
+        // KDDKDD UVM TEMPORARY:  KokkosView interface
       }
 
       // Combine the data into the target matrix.
@@ -6247,14 +6386,14 @@ namespace Tpetra {
 
       if (sourceIsLocallyIndexed) {
         const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
-        if (rowLength > static_cast<size_t> (rowInds.size ())) {
-          rowInds.resize (rowLength);
-          rowVals.resize (rowLength);
+        if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
+          Kokkos::resize(rowInds,rowLength);
+          Kokkos::resize(rowVals,rowLength);
         }
         // Resizing invalidates an Array's views, so we must make new
         // ones, even if rowLength hasn't changed.
-        ArrayView<GO> rowIndsView = rowInds.view (0, rowLength);
-        ArrayView<Scalar> rowValsView = rowVals.view (0, rowLength);
+        gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
+        vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
 
         // The source matrix is locally indexed, so we have to get a
         // copy.  Really it's the GIDs that have to be copied (because
@@ -6270,12 +6409,26 @@ namespace Tpetra {
              rowLength << ", but getGlobalRowCopy a row length of "
              << checkRowLength << "." << suffix);
         }
-        rowIndsConstView = rowIndsView.view (0, rowLength);
-        rowValsConstView = rowValsView.view (0, rowLength);
+        rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
+        rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
       }
       else {
-        srcMat.getGlobalRowView(sourceGID, rowIndsConstView,
-                                rowValsConstView);
+        global_inds_host_view_type rowIndsView;
+        values_host_view_type rowValsView;
+        srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
+
+        // KDDKDD UVM TEMPORARY:  refactor combineGlobalValues to take
+        // KDDKDD UVM TEMPORARY:  Kokkos::View instead of ArrayView
+        // KDDKDD UVM TEMPORARY:  For now, wrap the view in ArrayViews
+        // KDDKDD UVM TEMPORARY:  Should be safe because we hold the KokkosViews
+        rowIndsConstView = Teuchos::ArrayView<const GO> (  // BAD BAD BAD
+                           rowIndsView.data(), rowIndsView.extent(0),
+                           Teuchos::RCP_DISABLE_NODE_LOOKUP);
+        rowValsConstView = Teuchos::ArrayView<const Scalar> (  // BAD BAD BAD
+                           reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
+                           Teuchos::RCP_DISABLE_NODE_LOOKUP);
+        // KDDKDD UVM TEMPORARY:  Add replace, sum, transform methods with
+        // KDDKDD UVM TEMPORARY:  KokkosView interface
       }
 
       // Combine the data into the target matrix.
@@ -6935,10 +7088,22 @@ namespace Tpetra {
     // Compute the number of "packets" (in this case, bytes) per
     // export LID (in this case, local index of the row to send), and
     // actually pack the data.
+    auto maxRowNumEnt = this->getNodeMaxNumRowEntries();
+
+
+    // Temporary buffer for global column indices.
+    typename global_inds_host_view_type::non_const_type gidsIn_k;
+    if (this->isLocallyIndexed()) { // Need storage for Global IDs
+      gidsIn_k = 
+        typename global_inds_host_view_type::non_const_type("packGids",
+                                                            maxRowNumEnt);
+    }
+
     size_t offset = 0; // current index into 'exports' array.
     for (size_t i = 0; i < numExportLIDs; ++i) {
       const LO lclRow = exportLIDs_h[i];
 
+      size_t numBytes;
       size_t numEnt;
       numEnt = this->getNumEntriesInLocalRow (lclRow);
 
@@ -6951,52 +7116,48 @@ namespace Tpetra {
         continue;
       }
 
-      // Temporary buffer for global column indices.
-      using Details::ScalarViewTraits;
-      View<GO*, HES> gidsIn_k =
-        ScalarViewTraits<GO, HES>::allocateArray (GO (0), numEnt, "gids");
-
-      Teuchos::ArrayView<const Scalar> valsIn;
       if (this->isLocallyIndexed ()) {
+        typename global_inds_host_view_type::non_const_type gidsIn; 
+        values_host_view_type valsIn;
         // If the matrix is locally indexed on the calling process, we
         // have to use its column Map (which it _must_ have in this
         // case) to convert to global indices.
-        Teuchos::ArrayView<const LO> lidsIn;
+        local_inds_host_view_type lidsIn;
         this->getLocalRowView (lclRow, lidsIn, valsIn);
         const map_type& colMap = * (this->getColMap ());
         for (size_t k = 0; k < numEnt; ++k) {
           gidsIn_k[k] = colMap.getGlobalElement (lidsIn[k]);
         }
+        gidsIn = Kokkos::subview(gidsIn_k, Kokkos::make_pair(GO(0),GO(numEnt)));
+
+        const size_t numBytesPerValue =
+          PackTraits<ST>::packValueCount (valsIn[0]);
+        numBytes = this->packRow (exports_h.data (), offset, numEnt,
+                                  gidsIn.data (), valsIn.data (), 
+                                  numBytesPerValue);
       }
       else if (this->isGloballyIndexed ()) {
+        global_inds_host_view_type gidsIn; 
+        values_host_view_type valsIn;
         // If the matrix is globally indexed on the calling process,
         // then we can use the column indices directly.  However, we
         // have to get the global row index.  The calling process must
         // have a row Map, since otherwise it shouldn't be participating
         // in packing operations.
-        Teuchos::ArrayView<const GO> gblIndView;;
         const map_type& rowMap = * (this->getRowMap ());
         const GO gblRow = rowMap.getGlobalElement (lclRow);
-        this->getGlobalRowView (gblRow, gblIndView, valsIn);
-        for (size_t k = 0; k < numEnt; ++k) {
-          gidsIn_k[k] = gblIndView[k];
-        }
+        this->getGlobalRowView (gblRow, gidsIn, valsIn);
+
+        const size_t numBytesPerValue =
+          PackTraits<ST>::packValueCount (valsIn[0]);
+        numBytes = this->packRow (exports_h.data (), offset, numEnt, 
+                                  gidsIn.data (), valsIn.data (),
+                                  numBytesPerValue);
       }
       // mfh 11 Sep 2017: Currently, if the matrix is neither globally
       // nor locally indexed, then it has no entries.  Therefore,
       // there is nothing to pack.  No worries!
 
-      typename HES::device_type outputDevice;
-      auto valsIn_k =
-        create_mirror_view_from_raw_host_array (outputDevice,
-                                                reinterpret_cast<const ST*> (valsIn.getRawPtr ()),
-                                                valsIn.size (),
-                                                true, "valsIn");
-      const size_t numBytesPerValue =
-        PackTraits<ST>::packValueCount (valsIn[0]);
-      const size_t numBytes =
-        this->packRow (exports_h.data (), offset, numEnt, gidsIn_k.data (),
-                       valsIn_k.data (), numBytesPerValue);
       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
         (offset > bufSize || offset + numBytes > bufSize, std::logic_error,
          "First invalid offset into 'exports' pack buffer at index i = " << i
@@ -7875,9 +8036,10 @@ namespace Tpetra {
       os << *prefix << "Compute C = alpha*A + beta*B" << endl;
       std::cerr << os.str ();
     }
-
-    Array<GO> ind;
-    Array<Scalar> val;
+    using gids_type = nonconst_global_inds_host_view_type;
+    using vals_type = nonconst_values_host_view_type;
+    gids_type ind;
+    vals_type val;
 
     if (alpha != ZERO) {
       const LO A_localNumRows = static_cast<LO> (A_rowMap->getNodeNumElements ());
@@ -7885,11 +8047,11 @@ namespace Tpetra {
         size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
         const GO globalRow = A_rowMap->getGlobalElement (localRow);
         if (A_numEntries > static_cast<size_t> (ind.size ())) {
-          ind.resize (A_numEntries);
-          val.resize (A_numEntries);
+          Kokkos::resize(ind,A_numEntries);
+          Kokkos::resize(val,A_numEntries);
         }
-        ArrayView<GO> indView = ind (0, A_numEntries);
-        ArrayView<Scalar> valView = val (0, A_numEntries);
+        gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, A_numEntries));
+        vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, A_numEntries));
         A.getGlobalRowCopy (globalRow, indView, valView, A_numEntries);
 
         if (alpha != ONE) {
@@ -7897,7 +8059,9 @@ namespace Tpetra {
             valView[k] *= alpha;
           }
         }
-        C->insertGlobalValues (globalRow, indView, valView);
+        C->insertGlobalValues (globalRow, A_numEntries,
+                               reinterpret_cast<Scalar *>(valView.data()),
+                               indView.data());
       }
     }
 
@@ -7907,11 +8071,11 @@ namespace Tpetra {
         size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
         const GO globalRow = B_rowMap->getGlobalElement (localRow);
         if (B_numEntries > static_cast<size_t> (ind.size ())) {
-          ind.resize (B_numEntries);
-          val.resize (B_numEntries);
+          Kokkos::resize(ind,B_numEntries);
+          Kokkos::resize(val,B_numEntries);
         }
-        ArrayView<GO> indView = ind (0, B_numEntries);
-        ArrayView<Scalar> valView = val (0, B_numEntries);
+        gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, B_numEntries));
+        vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, B_numEntries));
         B.getGlobalRowCopy (globalRow, indView, valView, B_numEntries);
 
         if (beta != ONE) {
@@ -7919,7 +8083,9 @@ namespace Tpetra {
             valView[k] *= beta;
           }
         }
-        C->insertGlobalValues (globalRow, indView, valView);
+        C->insertGlobalValues (globalRow, B_numEntries,
+                               reinterpret_cast<Scalar *>(valView.data()),
+                               indView.data());
       }
     }
 
diff --git a/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp b/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp
new file mode 100644
index 000000000000..5e335b1b6f5e
--- /dev/null
+++ b/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp
@@ -0,0 +1,364 @@
+// @HEADER
+// ***********************************************************************
+//
+//          Tpetra: Templated Linear Algebra Services Package
+//                 Copyright (2008) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ************************************************************************
+// @HEADER
+
+#ifndef TPETRA_DETAILS_WRAPPEDDUALVIEW_HPP
+#define TPETRA_DETAILS_WRAPPEDDUALVIEW_HPP
+
+#include <Tpetra_Access.hpp>
+#include <Kokkos_DualView.hpp>
+#include "Teuchos_TestForException.hpp"
+#include <sstream>
+
+// #define DEBUG_UVM_REMOVAL  // Works only with gcc > 4.8
+
+#ifdef DEBUG_UVM_REMOVAL
+
+#define DEBUG_UVM_REMOVAL_ARGUMENT ,const char* callerstr = __builtin_FUNCTION()
+
+#define DEBUG_UVM_REMOVAL_PRINT_CALLER(fn) \
+  { \
+  auto envVarSet = std::getenv("TPETRA_UVM_REMOVAL"); \
+  if (envVarSet && (std::strcmp(envVarSet,"1") == 0)) \
+    std::cout << (fn) << " called from " << callerstr \
+              << " host cnt " << dualView.h_view.use_count()  \
+              << " device cnt " << dualView.d_view.use_count()  \
+              << std::endl; \
+  }
+
+#else
+
+#define DEBUG_UVM_REMOVAL_ARGUMENT
+#define DEBUG_UVM_REMOVAL_PRINT_CALLER(fn)
+
+#endif
+
+//! Namespace for Tpetra classes and methods
+namespace Tpetra {
+
+/// \brief Namespace for Tpetra implementation details.
+/// \warning Do NOT rely on the contents of this namespace.
+namespace Details {
+
+namespace impl {
+
+template <typename DualViewType>
+struct hasConstData {
+  using valueType = typename DualViewType::value_type;
+  using constValueType = typename DualViewType::const_value_type;
+  static constexpr bool value = std::is_same<valueType, constValueType>::value;
+};
+
+template <typename DualViewType>
+using enableIfConstData = std::enable_if_t<hasConstData<DualViewType>::value>;
+
+template <typename DualViewType>
+using enableIfNonConstData = std::enable_if_t<!hasConstData<DualViewType>::value>;
+
+template <typename DualViewType>
+enableIfNonConstData<DualViewType>
+sync_host(DualViewType dualView) {
+  dualView.sync_host();
+}
+
+template <typename DualViewType>
+enableIfConstData<DualViewType>
+sync_host(DualViewType dualView) { }
+
+template <typename DualViewType>
+enableIfNonConstData<DualViewType>
+sync_device(DualViewType dualView) {
+  dualView.sync_device();
+}
+
+template <typename DualViewType>
+enableIfConstData<DualViewType>
+sync_device(DualViewType dualView) { }
+
+}
+
+template <typename DualViewType>
+class WrappedDualView {
+public:
+  using HostViewType = typename DualViewType::t_host;
+  using DeviceViewType = typename DualViewType::t_dev;
+
+private:
+  static constexpr bool dualViewHasNonConstData = !impl::hasConstData<DualViewType>::value;
+  // Probe from the default host execution space: Kokkos::Serial only exists when that backend is enabled.
+  static constexpr bool deviceMemoryIsHostAccessible = Kokkos::SpaceAccessibility<Kokkos::DefaultHostExecutionSpace, typename DeviceViewType::memory_space>::accessible;
+
+public:
+  WrappedDualView() {}
+
+  WrappedDualView(DualViewType dualV)
+    : originalDualView(dualV),
+      dualView(originalDualView)
+  { }
+
+  WrappedDualView(const DeviceViewType deviceView) {
+    TEUCHOS_TEST_FOR_EXCEPTION(
+        deviceView.data() != nullptr && deviceView.use_count() == 0,
+        std::invalid_argument,
+        "Tpetra::Details::WrappedDualView: cannot construct with a device view that\n"
+        "does not own its memory (i.e. constructed with a raw pointer and dimensions)\n"
+        "because the WrappedDualView needs to assume ownership of the memory.");
+    //If the provided view is default-constructed (null, 0 extent, 0 use count),
+    //leave the host mirror default-constructed as well in order to have a matching use count of 0.
+    HostViewType hostView;
+    if(deviceView.use_count() != 0)
+    {
+      hostView = Kokkos::create_mirror_view_and_copy(
+          typename HostViewType::memory_space(),
+          deviceView);
+    }
+    originalDualView = DualViewType(deviceView, hostView);
+    dualView = originalDualView;
+  }
+
+  WrappedDualView(const WrappedDualView parent, int offset, int numEntries) {
+    originalDualView = parent.originalDualView;
+    dualView = getSubview(parent.dualView, offset, numEntries);
+  }
+
+  size_t extent(const int i) const {
+    return dualView.extent(i);
+  }
+
+  typename HostViewType::const_type
+  getHostView(Access::ReadOnlyStruct
+    DEBUG_UVM_REMOVAL_ARGUMENT
+  ) const 
+  {
+    DEBUG_UVM_REMOVAL_PRINT_CALLER("getHostViewReadOnly");
+    throwIfDeviceViewAlive();
+    impl::sync_host(originalDualView);
+    return dualView.view_host();
+  }
+
+  HostViewType
+  getHostView(Access::ReadWriteStruct
+    DEBUG_UVM_REMOVAL_ARGUMENT
+  ) 
+  {
+    DEBUG_UVM_REMOVAL_PRINT_CALLER("getHostViewReadWrite");
+    static_assert(dualViewHasNonConstData,
+        "ReadWrite views are not available for DualView with const data");
+    throwIfDeviceViewAlive();
+    impl::sync_host(originalDualView);
+    originalDualView.modify_host();
+    return dualView.view_host();
+  }
+
+  HostViewType
+  getHostView(Access::OverwriteAllStruct
+    DEBUG_UVM_REMOVAL_ARGUMENT
+  ) 
+  {
+    DEBUG_UVM_REMOVAL_PRINT_CALLER("getHostViewOverwriteAll");
+    static_assert(dualViewHasNonConstData,
+        "OverwriteAll views are not available for DualView with const data");
+    if (iAmASubview()) {
+      return getHostView(Access::ReadWrite);
+    }
+    throwIfDeviceViewAlive();
+    if (deviceMemoryIsHostAccessible) Kokkos::fence();
+    dualView.clear_sync_state();
+    dualView.modify_host();
+    return dualView.view_host();
+  }
+
+  typename DeviceViewType::const_type
+  getDeviceView(Access::ReadOnlyStruct
+    DEBUG_UVM_REMOVAL_ARGUMENT
+  ) const 
+  {
+    DEBUG_UVM_REMOVAL_PRINT_CALLER("getDeviceViewReadOnly");
+    throwIfHostViewAlive();
+    impl::sync_device(originalDualView);
+    return dualView.view_device();
+  }
+
+  DeviceViewType
+  getDeviceView(Access::ReadWriteStruct
+    DEBUG_UVM_REMOVAL_ARGUMENT
+  ) 
+  {
+    DEBUG_UVM_REMOVAL_PRINT_CALLER("getDeviceViewReadWrite");
+    static_assert(dualViewHasNonConstData,
+        "ReadWrite views are not available for DualView with const data");
+    throwIfHostViewAlive();
+    impl::sync_device(originalDualView);
+    originalDualView.modify_device();
+    return dualView.view_device();
+  }
+
+  DeviceViewType
+  getDeviceView(Access::OverwriteAllStruct
+    DEBUG_UVM_REMOVAL_ARGUMENT
+  ) 
+  {
+    DEBUG_UVM_REMOVAL_PRINT_CALLER("getDeviceViewOverwriteAll");
+    static_assert(dualViewHasNonConstData,
+        "OverwriteAll views are not available for DualView with const data");
+    if (iAmASubview()) {
+      return getDeviceView(Access::ReadWrite);
+    }
+    throwIfHostViewAlive();
+    dualView.clear_sync_state();
+    dualView.modify_device();
+    return dualView.view_device();
+  }
+
+  typename HostViewType::const_type
+  getHostSubview(int offset, int numEntries, Access::ReadOnlyStruct
+    DEBUG_UVM_REMOVAL_ARGUMENT
+  ) const 
+  {
+    DEBUG_UVM_REMOVAL_PRINT_CALLER("getHostSubviewReadOnly");
+    throwIfDeviceViewAlive();
+    impl::sync_host(originalDualView);
+    return getSubview(dualView.view_host(), offset, numEntries);
+  }
+
+  HostViewType
+  getHostSubview(int offset, int numEntries, Access::ReadWriteStruct
+    DEBUG_UVM_REMOVAL_ARGUMENT
+  ) 
+  {
+    DEBUG_UVM_REMOVAL_PRINT_CALLER("getHostSubviewReadWrite");
+    static_assert(dualViewHasNonConstData,
+        "ReadWrite views are not available for DualView with const data");
+    throwIfDeviceViewAlive();
+    impl::sync_host(originalDualView);
+    originalDualView.modify_host();
+    return getSubview(dualView.view_host(), offset, numEntries);
+  }
+
+  HostViewType
+  getHostSubview(int offset, int numEntries, Access::OverwriteAllStruct
+    DEBUG_UVM_REMOVAL_ARGUMENT
+  ) 
+  {
+    DEBUG_UVM_REMOVAL_PRINT_CALLER("getHostSubviewOverwriteAll");
+    static_assert(dualViewHasNonConstData,
+        "OverwriteAll views are not available for DualView with const data");
+    return getHostSubview(offset, numEntries, Access::ReadWrite);
+  }
+
+  typename DeviceViewType::const_type
+  getDeviceSubview(int offset, int numEntries, Access::ReadOnlyStruct
+    DEBUG_UVM_REMOVAL_ARGUMENT
+  ) const
+  {
+    DEBUG_UVM_REMOVAL_PRINT_CALLER("getDeviceSubviewReadOnly");
+    throwIfHostViewAlive();
+    impl::sync_device(originalDualView);
+    return getSubview(dualView.view_device(), offset, numEntries);
+  }
+
+  DeviceViewType
+  getDeviceSubview(int offset, int numEntries, Access::ReadWriteStruct
+    DEBUG_UVM_REMOVAL_ARGUMENT
+  ) 
+  {
+    DEBUG_UVM_REMOVAL_PRINT_CALLER("getDeviceSubviewReadWrite");
+    static_assert(dualViewHasNonConstData,
+        "ReadWrite views are not available for DualView with const data");
+    throwIfHostViewAlive();
+    impl::sync_device(originalDualView);
+    originalDualView.modify_device();
+    return getSubview(dualView.view_device(), offset, numEntries);
+  }
+
+  DeviceViewType
+  getDeviceSubview(int offset, int numEntries, Access::OverwriteAllStruct
+    DEBUG_UVM_REMOVAL_ARGUMENT
+  ) 
+  {
+    DEBUG_UVM_REMOVAL_PRINT_CALLER("getDeviceSubviewOverwriteAll");
+    static_assert(dualViewHasNonConstData,
+        "OverwriteAll views are not available for DualView with const data");
+    return getDeviceSubview(offset, numEntries, Access::ReadWrite);
+  }
+
+private:
+  template <typename ViewType>
+  ViewType getSubview(ViewType view, int offset, int numEntries) const {
+    return Kokkos::subview(view, Kokkos::pair<int, int>(offset, offset+numEntries));
+  }
+
+  void throwIfHostViewAlive() const {
+    if( deviceMemoryIsHostAccessible && dualView.h_view.data() == dualView.d_view.data()) return;
+
+    if (dualView.h_view.use_count() > dualView.d_view.use_count()) {
+      std::ostringstream msg;
+      msg << "Tpetra::Details::WrappedDualView (name = " << dualView.d_view.label() 
+          << "; host use_count = " << dualView.h_view.use_count()
+          << "; device use_count = " << dualView.d_view.use_count() << "): "
+          << "Cannot access data on device while a host view is alive";
+      throw std::runtime_error(msg.str());
+    }
+  }
+
+  void throwIfDeviceViewAlive() const {
+    if(deviceMemoryIsHostAccessible && dualView.h_view.data() == dualView.d_view.data()) return;
+
+    if (dualView.d_view.use_count() > dualView.h_view.use_count()) {
+      std::ostringstream msg;
+      msg << "Tpetra::Details::WrappedDualView (name = " << dualView.d_view.label()
+          << "; host use_count = " << dualView.h_view.use_count()
+          << "; device use_count = " << dualView.d_view.use_count() << "): "
+          << "Cannot access data on host while a device view is alive";
+      throw std::runtime_error(msg.str());
+    }
+  }
+
+  bool iAmASubview() const {  // pure query: true when dualView's host view differs from the original's
+    return originalDualView.h_view != dualView.h_view;
+  }
+
+  mutable DualViewType originalDualView;
+  mutable DualViewType dualView;
+};
+
+} // namespace Details
+
+} // namespace Tpetra
+
+#endif
diff --git a/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp b/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp
index 7a4e34e7e4e4..8c999ddb005b 100644
--- a/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp
@@ -46,6 +46,7 @@
 #include "Kokkos_Core.hpp"
 #include "Tpetra_Details_Behavior.hpp"
 #include "Tpetra_Details_CrsPadding.hpp"
+#include "Tpetra_Details_WrappedDualView.hpp"
 #include <iostream>
 #include <memory>
 #include <unordered_map>
@@ -151,8 +152,8 @@ pad_crs_arrays(
   const PadCrsAction action,
   const RowPtr& row_ptr_beg,
   const RowPtr& row_ptr_end,
-  Indices& indices,
-  Values& values,
+  Indices& indices_wdv,
+  Values& values_wdv,
   const Padding& padding,
   const int my_rank,
   const bool verbose)
@@ -187,8 +188,8 @@ pad_crs_arrays(
     Kokkos::deep_copy(row_ptr_end_h, row_ptr_end);
     verbosePrintArray(os, row_ptr_end_h, "row_ptr_end before scan",
                       maxNumToPrint);
-    os << ", indices.extent(0): " << indices.extent(0)
-       << ", values.extent(0): " << values.extent(0)
+    os << ", indices.extent(0): " << indices_wdv.extent(0)
+       << ", values.extent(0): " << values_wdv.extent(0)
        << ", padding: ";
     padding.print(os);
     os << endl;
@@ -215,6 +216,7 @@ pad_crs_arrays(
   }
   size_t increase = 0;
   {
+    // Must do on host because padding uses std::map
     auto row_ptr_end_h = create_mirror_view(
       hostSpace, row_ptr_end, verbose, prefix.get());
     Kokkos::deep_copy(row_ptr_end_h, row_ptr_end);
@@ -268,20 +270,23 @@ pad_crs_arrays(
     Kokkos::deep_copy(newAllocPerRow, newAllocPerRow_h);
   }
 
-  using inds_value_type = typename Indices::non_const_value_type;
-  using vals_value_type = typename Values::non_const_value_type;
+  using inds_value_type = 
+        typename Indices::DeviceViewType::non_const_value_type;
+  using vals_value_type = typename Values::DeviceViewType::non_const_value_type;
 
-  const size_t newIndsSize = size_t(indices.size()) + increase;
-  auto indices_new = make_uninitialized_view<Indices>(
+  auto indices_old = indices_wdv.getDeviceView(Access::ReadOnly);
+  const size_t newIndsSize = size_t(indices_old.size()) + increase;
+  auto indices_new = make_uninitialized_view<typename Indices::DeviceViewType>(
     "Tpetra::CrsGraph column indices", newIndsSize, verbose,
     prefix.get());
 
-  Values values_new;
+  typename Values::DeviceViewType values_new;
+  auto values_old = values_wdv.getDeviceView(Access::ReadOnly);
   if (action == PadCrsAction::INDICES_AND_VALUES) {
     const size_t newValsSize = newIndsSize;
     // NOTE (mfh 10 Feb 2020) If we don't initialize values_new here,
     // then the CrsMatrix tests fail.
-    values_new = make_initialized_view<Values>(
+    values_new = make_initialized_view<typename Values::DeviceViewType>(
       "Tpetra::CrsMatrix values", newValsSize, verbose, prefix.get());
   }
 
@@ -290,11 +295,11 @@ pad_crs_arrays(
     os << *prefix << "Repack" << endl;
     std::cerr << os.str();
   }
-  using execution_space = typename Indices::execution_space;
+  using execution_space = typename Indices::DeviceViewType::execution_space;
   using range_type = Kokkos::RangePolicy<execution_space, size_t>;
   Kokkos::parallel_scan(
     "Tpetra::CrsGraph or CrsMatrix repack",
-    range_type(0, lclNumRows+1),
+    range_type(size_t(0), size_t(lclNumRows+1)),
     KOKKOS_LAMBDA (const size_t lclRow, size_t& newRowBeg,
                    const bool finalPass)
     {
@@ -313,15 +318,16 @@ pad_crs_arrays(
             row_beg, row_beg + numEnt);
           const Kokkos::pair<size_t, size_t> newRange(
             newRowBeg, newRowBeg + numEnt);
-          auto oldColInds = subview(indices, oldRange);
-          auto newColInds = subview(indices_new, newRange);
+          auto oldColInds = Kokkos::subview(indices_old, oldRange);
+          auto newColInds = Kokkos::subview(indices_new, newRange);
           // memcpy works fine on device; the next step is to
           // introduce two-level parallelism and use team copy.
           memcpy(newColInds.data(), oldColInds.data(),
                  numEnt * sizeof(inds_value_type));
           if (action == PadCrsAction::INDICES_AND_VALUES) {
-            auto oldVals = subview(values, oldRange);
-            auto newVals = subview(values_new, newRange);
+            auto oldVals = 
+                 Kokkos::subview(values_old, oldRange);
+            auto newVals = Kokkos::subview(values_new, newRange);
             memcpy(newVals.data(), oldVals.data(),
                    numEnt * sizeof(vals_value_type));
           }
@@ -335,7 +341,8 @@ pad_crs_arrays(
       newRowBeg += newRowAllocSize;
     });
 
-  if (verbose) {
+  if (verbose) 
+  {
     std::ostringstream os;
 
     os << *prefix;
@@ -354,21 +361,15 @@ pad_crs_arrays(
                       maxNumToPrint);
     os << endl;
 
-    std::cerr << os.str();
+    std::cout << os.str();
   }
 
-  assign_to_view(indices, indices_new,
-                 "Tpetra::CrsGraph column indices",
-                 verbose, prefix.get());
-  assign_to_view(values, values_new,
-                 "Tpetra::CrsMatrix values",
-                 verbose, prefix.get());
+  indices_wdv = Indices(indices_new);
+  values_wdv = Values(values_new);
 
   if (verbose) {
-    auto indices_h = Kokkos::create_mirror_view(hostSpace, indices);
-    Kokkos::deep_copy(indices_h, indices);
-    auto values_h = Kokkos::create_mirror_view(hostSpace, values);
-    Kokkos::deep_copy(values_h, values);
+    auto indices_h = indices_wdv.getHostView(Access::ReadOnly);
+    auto values_h = values_wdv.getHostView(Access::ReadOnly);
     std::ostringstream os;
     os << "On output: ";
     verbosePrintArray(os, indices_h, "indices", maxNumToPrint);
@@ -500,7 +501,8 @@ find_crs_indices(
   if (new_indices.size() == 0)
     return 0;
 
-  using ordinal = typename Indices1::value_type;
+  using ordinal = 
+        typename std::remove_const<typename Indices1::value_type>::type;
   auto invalid_ordinal = Teuchos::OrdinalTraits<ordinal>::invalid();
 
   const size_t start = static_cast<size_t> (row_ptrs[row]);
@@ -549,17 +551,17 @@ void
 padCrsArrays(
     const RowPtr& rowPtrBeg,
     const RowPtr& rowPtrEnd,
-    Indices& indices,
+    Indices& indices_wdv,
     const Padding& padding,
     const int my_rank,
     const bool verbose)
 {
   using impl::pad_crs_arrays;
   // send empty values array
-  Indices values;
-  pad_crs_arrays<RowPtr, Indices, Indices, Padding>(
+  Indices values_null; 
+  pad_crs_arrays<RowPtr, Indices, Indices, Padding>( 
     impl::PadCrsAction::INDICES_ONLY, rowPtrBeg, rowPtrEnd,
-    indices, values, padding, my_rank, verbose);
+    indices_wdv, values_null, padding, my_rank, verbose);
 }
 
 template<class RowPtr, class Indices, class Values, class Padding>
@@ -567,8 +569,8 @@ void
 padCrsArrays(
     const RowPtr& rowPtrBeg,
     const RowPtr& rowPtrEnd,
-    Indices& indices,
-    Values& values,
+    Indices& indices_wdv,
+    Values& values_wdv,
     const Padding& padding,
     const int my_rank,
     const bool verbose)
@@ -576,7 +578,7 @@ padCrsArrays(
   using impl::pad_crs_arrays;
   pad_crs_arrays<RowPtr, Indices, Values, Padding>(
     impl::PadCrsAction::INDICES_AND_VALUES, rowPtrBeg, rowPtrEnd,
-    indices, values, padding, my_rank, verbose);
+    indices_wdv, values_wdv, padding, my_rank, verbose);
 }
 
 /// \brief Insert new indices in to current list of indices
diff --git a/packages/tpetra/core/src/Tpetra_Details_extractBlockDiagonal.hpp b/packages/tpetra/core/src/Tpetra_Details_extractBlockDiagonal.hpp
index c1e70af8d265..e423bc6082d6 100644
--- a/packages/tpetra/core/src/Tpetra_Details_extractBlockDiagonal.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_extractBlockDiagonal.hpp
@@ -64,7 +64,7 @@ void extractBlockDiagonal(const SparseMatrixType& A, MultiVectorType & diagonal)
   using local_map_type = typename SparseMatrixType::map_type::local_map_type;
   using SC             = typename MultiVectorType::scalar_type;
   using LO             = typename SparseMatrixType::local_ordinal_type;
-  using KCRS           = typename SparseMatrixType::local_matrix_type;
+  using KCRS           = typename SparseMatrixType::local_matrix_device_type;
   using lno_view_t     = typename KCRS::StaticCrsGraphType::row_map_type::const_type;
   using lno_nnz_view_t = typename KCRS::StaticCrsGraphType::entries_type::const_type;
   using scalar_view_t  = typename KCRS::values_type::const_type;
@@ -85,7 +85,7 @@ void extractBlockDiagonal(const SparseMatrixType& A, MultiVectorType & diagonal)
   local_map_type rowmap  = A.getRowMap()->getLocalMap();
   local_map_type colmap  = A.getRowMap()->getLocalMap();
   local_mv_type diag     = diagonal.getLocalViewDevice(Access::OverwriteAll);
-  const KCRS   Amat      = A.getLocalMatrix();
+  const KCRS   Amat      = A.getLocalMatrixDevice();
   lno_view_t Arowptr     = Amat.graph.row_map;
   lno_nnz_view_t Acolind = Amat.graph.entries;
   scalar_view_t Avals    = Amat.values;
diff --git a/packages/tpetra/core/src/Tpetra_Details_getDiagCopyWithoutOffsets_def.hpp b/packages/tpetra/core/src/Tpetra_Details_getDiagCopyWithoutOffsets_def.hpp
index 548f36271d96..9233151fc4fd 100644
--- a/packages/tpetra/core/src/Tpetra_Details_getDiagCopyWithoutOffsets_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_getDiagCopyWithoutOffsets_def.hpp
@@ -152,25 +152,20 @@ class GetLocalDiagCopyWithoutOffsetsNotFillCompleteFunctor {
       errCount++;
     }
     else { // row index is also in the column Map on this process
-      LO numEnt;
-      const LO* lclColInds;
-      const SC* curVals;
-      const LO err = A_.getLocalRowViewRaw (lclRowInd, numEnt, lclColInds, curVals);
-      if (err != 0) {
+      typename row_matrix_type::local_inds_host_view_type lclColInds;
+      typename row_matrix_type::values_host_view_type curVals;
+      A_.getLocalRowView(lclRowInd, lclColInds, curVals);
+      LO numEnt = lclColInds.extent(0);
+      // The search hint is always zero, since we only call this
+      // once per row of the matrix.
+      const LO hint = 0;
+      const LO offset =
+        findRelOffset (lclColInds, numEnt, lclColInd, hint, sorted_);
+      if (offset == numEnt) { // didn't find the diagonal column index
         errCount++;
       }
       else {
-        // The search hint is always zero, since we only call this
-        // once per row of the matrix.
-        const LO hint = 0;
-        const LO offset =
-          findRelOffset (lclColInds, numEnt, lclColInd, hint, sorted_);
-        if (offset == numEnt) { // didn't find the diagonal column index
-          errCount++;
-        }
-        else {
-          D_lcl_1d_(lclRowInd) = curVals[offset];
-        }
+        D_lcl_1d_(lclRowInd) = curVals[offset];
       }
     }
   }
diff --git a/packages/tpetra/core/src/Tpetra_Details_getGraphDiagOffsets_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_getGraphDiagOffsets_decl.hpp
index 4a93834cf6f5..56b830ba6504 100644
--- a/packages/tpetra/core/src/Tpetra_Details_getGraphDiagOffsets_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_getGraphDiagOffsets_decl.hpp
@@ -86,9 +86,9 @@ class GetGraphDiagOffsets {
   typedef ::Kokkos::StaticCrsGraph<LO,
                                    ::Kokkos::LayoutLeft,
                                    device_type,
-                                   void, size_t> local_graph_type;
+                                   void, size_t> local_graph_device_type;
   typedef ::Tpetra::Details::LocalMap<LO, GO, device_type> local_map_type;
-  typedef ::Kokkos::View<const typename local_graph_type::size_type*,
+  typedef ::Kokkos::View<const typename local_graph_device_type::size_type*,
                          ::Kokkos::LayoutLeft,
                          device_type,
                          ::Kokkos::MemoryUnmanaged> row_offsets_type;
diff --git a/packages/tpetra/core/src/Tpetra_Details_getNumDiags.hpp b/packages/tpetra/core/src/Tpetra_Details_getNumDiags.hpp
index 6aace9e2c42b..9d97ef98c9a8 100644
--- a/packages/tpetra/core/src/Tpetra_Details_getNumDiags.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_getNumDiags.hpp
@@ -61,7 +61,7 @@ namespace Impl {
   ///   number of diagonal entries in a sparse graph.
   ///
   /// \tparam LocalGraphType Kokkos::StaticCrsGraph specialization
-  /// \tparam LocalMapType Result of Tpetra::Map::getLocalGraph()
+  /// \tparam LocalMapType Result of Tpetra::Map::getLocalMap()
   template<class LocalGraphType, class LocalMapType>
   class CountLocalNumDiags {
   public:
@@ -116,8 +116,8 @@ namespace Impl {
   {
     using crs_graph_type = ::Tpetra::CrsGraph<LO, GO, NT>;
     using local_map_type = typename crs_graph_type::map_type::local_map_type;
-    using local_graph_type = typename crs_graph_type::local_graph_type;
-    using functor_type = CountLocalNumDiags<local_graph_type, local_map_type>;
+    using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
+    using functor_type = CountLocalNumDiags<local_graph_device_type, local_map_type>;
     using execution_space = typename crs_graph_type::device_type::execution_space;
     using policy_type = Kokkos::RangePolicy<execution_space, LO>;
 
@@ -128,7 +128,7 @@ namespace Impl {
     }
     else {
       LO lclNumDiags {0};
-      functor_type f (G.getLocalGraph (), rowMap->getLocalMap (), colMap->getLocalMap ());
+      functor_type f (G.getLocalGraphDevice (), rowMap->getLocalMap (), colMap->getLocalMap ());
       Kokkos::parallel_reduce (policy_type (0, G.getNodeNumRows ()), f, lclNumDiags);
       return lclNumDiags;
     }
@@ -168,7 +168,8 @@ namespace Impl {
       TEUCHOS_TEST_FOR_EXCEPTION
         (! G.supportsRowViews (), std::logic_error, "Not implemented!");
 
-      Teuchos::ArrayView<const LO> lclColInds;
+      typename ::Tpetra::RowGraph<LO, GO, NT>::local_inds_host_view_type 
+               lclColInds;
       const LO lclNumRows = static_cast<LO> (G.getNodeNumRows ());
 
       LO diagCount = 0;
@@ -208,17 +209,16 @@ namespace Impl {
       return 0; // this process does not participate
     }
     else {
-      Teuchos::Array<LO> lclColIndsBuf;
+      using inds_type = typename ::Tpetra::RowGraph<LO,GO,NT>::nonconst_local_inds_host_view_type;
+      inds_type lclColIndsBuf("lclColIndsBuf",G.getNodeMaxNumRowEntries());
       const LO lclNumRows = static_cast<LO> (G.getNodeNumRows ());
 
       LO diagCount = 0;
       for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
         size_t numEntSizeT = G.getNumEntriesInLocalRow (lclRow);
         const LO numEnt = static_cast<LO> (numEntSizeT);
-        if (static_cast<LO> (lclColIndsBuf.size ()) < numEnt) {
-          lclColIndsBuf.resize (numEnt);
-        }
-        Teuchos::ArrayView<LO> lclColInds = lclColIndsBuf (0, numEnt);
+
+        inds_type lclColInds = Kokkos::subview(lclColIndsBuf,std::make_pair(LO(0),numEnt)); // both bounds must share one integral type
         G.getLocalRowCopy (lclRow, lclColInds, numEntSizeT);
 
         if (numEnt != 0) {
@@ -252,7 +252,8 @@ namespace Impl {
       return 0; // this process does not participate
     }
     else {
-      Teuchos::ArrayView<const GO> gblColInds;
+      typename ::Tpetra::RowGraph<LO,GO,NT>::global_inds_host_view_type 
+               gblColInds;
       const LO lclNumRows = static_cast<LO> (G.getNodeNumRows ());
 
       LO diagCount = 0;
@@ -281,12 +282,13 @@ namespace Impl {
   typename ::Tpetra::RowGraph<LO, GO, NT>::local_ordinal_type
   countLocalNumDiagsInNonFillCompleteGloballyIndexedGraphWithoutRowViews (const ::Tpetra::RowGraph<LO, GO, NT>& G)
   {
+    using gids_type = typename ::Tpetra::RowGraph<LO,GO,NT>::nonconst_global_inds_host_view_type ;
     const auto rowMap = G.getRowMap ();
     if (rowMap.get () == nullptr) {
       return 0; // this process does not participate
     }
     else {
-      Teuchos::Array<GO> gblColIndsBuf;
+      gids_type gblColIndsBuf;
       const LO lclNumRows = static_cast<LO> (G.getNodeNumRows ());
 
       LO diagCount = 0;
@@ -294,9 +296,10 @@ namespace Impl {
         size_t numEntSizeT = G.getNumEntriesInLocalRow (lclRow);
         const LO numEnt = static_cast<LO> (numEntSizeT);
         if (static_cast<LO> (gblColIndsBuf.size ()) < numEnt) {
-          gblColIndsBuf.resize (numEnt);
+          Kokkos::resize(gblColIndsBuf,numEnt);
         }
-        Teuchos::ArrayView<GO> gblColInds = gblColIndsBuf (0, numEnt);
+
+        gids_type gblColInds = Kokkos::subview(gblColIndsBuf,std::make_pair((LO)0, numEnt));
         const GO gblRow = rowMap->getGlobalElement (lclRow);
         G.getGlobalRowCopy (gblRow, gblColInds, numEntSizeT);
 
diff --git a/packages/tpetra/core/src/Tpetra_Details_localDeepCopyRowMatrix_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_localDeepCopyRowMatrix_decl.hpp
index 9e00e267a3e3..c00fffc01467 100644
--- a/packages/tpetra/core/src/Tpetra_Details_localDeepCopyRowMatrix_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_localDeepCopyRowMatrix_decl.hpp
@@ -40,6 +40,8 @@
 #ifndef TPETRA_DETAILS_LOCALDEEPCOPYROWMATRIX_DECL_HPP
 #define TPETRA_DETAILS_LOCALDEEPCOPYROWMATRIX_DECL_HPP
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+
 /// \file Tpetra_Details_localDeepCopyRowMatrix_decl.hpp
 /// \brief Declaration of function for making a deep copy of a
 ///   Tpetra::RowMatrix's local matrix.
@@ -59,6 +61,7 @@ KokkosSparse::CrsMatrix<
     typename NT::device_type,
     void,
     size_t>
+TPETRA_DEPRECATED
 localDeepCopyLocallyIndexedRowMatrix
   (const RowMatrix<SC, LO, GO, NT>& A,
    const char label[]);
@@ -66,4 +69,6 @@ localDeepCopyLocallyIndexedRowMatrix
 } // namespace Details
 } // namespace Tpetra
 
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+
 #endif // TPETRA_DETAILS_LOCALDEEPCOPYROWMATRIX_DECL_HPP
diff --git a/packages/tpetra/core/src/Tpetra_Details_localDeepCopyRowMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_Details_localDeepCopyRowMatrix_def.hpp
index 16f2ae4ee744..b817d4b0e8b7 100644
--- a/packages/tpetra/core/src/Tpetra_Details_localDeepCopyRowMatrix_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_localDeepCopyRowMatrix_def.hpp
@@ -40,6 +40,7 @@
 #ifndef TPETRA_DETAILS_LOCALDEEPCOPYROWMATRIX_DEF_HPP
 #define TPETRA_DETAILS_LOCALDEEPCOPYROWMATRIX_DEF_HPP
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 /// \file Tpetra_Details_localDeepCopyRowMatrix_def.hpp
 /// \brief Definition of function for making a deep copy of a
 ///   Tpetra::RowMatrix's local matrix.
@@ -63,6 +64,7 @@ KokkosSparse::CrsMatrix<
     typename NT::device_type,
     void,
     size_t>
+TPETRA_DEPRECATED
 localDeepCopyLocallyIndexedRowMatrix
 (const RowMatrix<SC, LO, GO, NT>& A,
  const char label[])
@@ -89,32 +91,38 @@ localDeepCopyLocallyIndexedRowMatrix
   using Kokkos::view_alloc;
   using Kokkos::WithoutInitializing;
   using IST = typename Kokkos::ArithTraits<SC>::val_type;
-  using local_matrix_type = KokkosSparse::CrsMatrix<
+  using local_matrix_device_type = KokkosSparse::CrsMatrix<
     IST, LO, typename NT::device_type, void, size_t>;
-  using local_graph_type =
-    typename local_matrix_type::staticcrsgraph_type;
-  using inds_type = typename local_graph_type::entries_type;
+  using local_graph_device_type =
+    typename local_matrix_device_type::staticcrsgraph_type;
+  using inds_type = typename local_graph_device_type::entries_type;
   inds_type ind (view_alloc ("ind", WithoutInitializing), nnz);
   auto ind_h = Kokkos::create_mirror_view (ind);
 
-  using values_type = typename local_matrix_type::values_type;
+  using values_type = typename local_matrix_device_type::values_type;
   values_type val (view_alloc ("val", WithoutInitializing), nnz);
   auto val_h = Kokkos::create_mirror_view (val);
 
   const bool hasViews = A.supportsRowViews ();
+  using row_matrix_type = RowMatrix<SC, LO, GO, NT>;
+  using h_lids_type = typename row_matrix_type::nonconst_local_inds_host_view_type;
+  using h_vals_type = typename row_matrix_type::nonconst_values_host_view_type;
+  using h_lids_type_const = typename row_matrix_type::local_inds_host_view_type;
+  using h_vals_type_const = typename row_matrix_type::values_host_view_type;
 
-  Teuchos::Array<LO> inputIndsBuf;
-  Teuchos::Array<SC> inputValsBuf;
+
+  h_lids_type inputIndsBuf;
+  h_vals_type inputValsBuf;
   if (! hasViews) {
-    inputIndsBuf.resize (maxNumEnt);
-    inputValsBuf.resize (maxNumEnt);
+    Kokkos::resize(inputIndsBuf,maxNumEnt);
+    Kokkos::resize(inputValsBuf,maxNumEnt);
   }
 
   const LO lclNumRows (A.getNodeNumRows ());
   offset_type curPos = 0;
   for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
-    Teuchos::ArrayView<const LO> inputInds_av;
-    Teuchos::ArrayView<const SC> inputVals_av;
+    h_lids_type_const inputInds_av;
+    h_vals_type_const inputVals_av;
     size_t numEnt = 0;
     if (hasViews) {
       A.getLocalRowView (lclRow, inputInds_av,
@@ -122,14 +130,14 @@ localDeepCopyLocallyIndexedRowMatrix
       numEnt = static_cast<size_t> (inputInds_av.size ());
     }
     else {
-      A.getLocalRowCopy (lclRow, inputIndsBuf (),
-                         inputValsBuf (), numEnt);
-      inputInds_av = inputIndsBuf.view (0, numEnt);
-      inputVals_av = inputValsBuf.view (0, numEnt);
+      A.getLocalRowCopy (lclRow, inputIndsBuf,
+                         inputValsBuf, numEnt);
+      inputInds_av = Kokkos::subview(inputIndsBuf,std::make_pair((size_t)0,numEnt));
+      inputVals_av = Kokkos::subview(inputValsBuf,std::make_pair((size_t)0,numEnt));
     }
     const IST* inVals =
-      reinterpret_cast<const IST*> (inputVals_av.getRawPtr ());
-    const LO* inInds = inputInds_av.getRawPtr ();
+      reinterpret_cast<const IST*> (inputVals_av.data());
+    const LO* inInds = inputInds_av.data();
     std::copy (inInds, inInds + numEnt, ind_h.data () + curPos);
     std::copy (inVals, inVals + numEnt, val_h.data () + curPos);
     curPos += offset_type (numEnt);
@@ -137,14 +145,16 @@ localDeepCopyLocallyIndexedRowMatrix
   Kokkos::deep_copy (ind, ind_h);
   Kokkos::deep_copy (val, val_h);
 
-  local_graph_type lclGraph (ind, ptr);
+  local_graph_device_type lclGraph (ind, ptr);
   const size_t numCols = A.getColMap ()->getNodeNumElements ();
-  return local_matrix_type (label, numCols, val, lclGraph);
+  return local_matrix_device_type (label, numCols, val, lclGraph);
 }
 
+
 } // namespace Details
 } // namespace Tpetra
 
+
 //
 // Explicit instantiation macros
 //
@@ -160,4 +170,6 @@ namespace Details { \
      const char label[]); \
 }
 
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+
 #endif // TPETRA_DETAILS_LOCALDEEPCOPYROWMATRIX_DEF_HPP
diff --git a/packages/tpetra/core/src/Tpetra_Details_localRowOffsets_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_localRowOffsets_decl.hpp
index 5ef8116ea2ba..db3792c50b9e 100644
--- a/packages/tpetra/core/src/Tpetra_Details_localRowOffsets_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_localRowOffsets_decl.hpp
@@ -56,13 +56,13 @@ namespace Details {
 template <class NT>
 struct LocalRowOffsetsResult {
 private:
-  using local_graph_type =
+  using local_graph_device_type =
     typename KokkosSparse::CrsMatrix<
       double, int, typename NT::device_type, void, size_t>::
         staticcrsgraph_type;
 public:
   using offsets_type =
-    typename local_graph_type::row_map_type::non_const_type;
+    typename local_graph_device_type::row_map_type::non_const_type;
   using offset_type = typename offsets_type::non_const_value_type;
 
   offsets_type ptr; //!< Local row offsets (Kokkos::View)
diff --git a/packages/tpetra/core/src/Tpetra_Details_localRowOffsets_def.hpp b/packages/tpetra/core/src/Tpetra_Details_localRowOffsets_def.hpp
index d497813e7031..fe044271adfd 100644
--- a/packages/tpetra/core/src/Tpetra_Details_localRowOffsets_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_localRowOffsets_def.hpp
@@ -128,7 +128,7 @@ localRowOffsetsFromFillCompleteCrsGraph (const CrsGraph<LO, GO, NT>& G)
   using offsets_type = typename result_type::offsets_type;
   using offset_type = typename result_type::offset_type;
 
-  auto G_lcl = G.getLocalGraph ();
+  auto G_lcl = G.getLocalGraphDevice ();
   offsets_type ptr (view_alloc ("ptr", WithoutInitializing),
                     G_lcl.row_map.extent (0));
   Kokkos::deep_copy (ptr, G_lcl.row_map);
diff --git a/packages/tpetra/core/src/Tpetra_Details_makeColMap_def.hpp b/packages/tpetra/core/src/Tpetra_Details_makeColMap_def.hpp
index 96b0abd79dda..2ec1a490f796 100644
--- a/packages/tpetra/core/src/Tpetra_Details_makeColMap_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_makeColMap_def.hpp
@@ -434,7 +434,7 @@ makeColMap (Teuchos::RCP<const Tpetra::Map<LO, GO, NT> >& colMap,
       const LO lclNumRows = rowMap.getNodeNumElements ();
       for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
         const GO gblRow = rowMap.getGlobalElement (lclRow);
-        Teuchos::ArrayView<const GO> rowGids;
+        typename RowGraph<LO,GO,NT>::global_inds_host_view_type rowGids;
         graph.getGlobalRowView (gblRow, rowGids);
 
         const LO numEnt = static_cast<LO> (rowGids.size ());
diff --git a/packages/tpetra/core/src/Tpetra_Details_packCrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_Details_packCrsGraph_def.hpp
index 6ed5c71c2a0e..479733fe8c9d 100644
--- a/packages/tpetra/core/src/Tpetra_Details_packCrsGraph_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_packCrsGraph_def.hpp
@@ -686,12 +686,12 @@ packCrsGraph
   using packet_type = typename crs_graph_type::packet_type;
   using buffer_device_type = typename crs_graph_type::buffer_device_type;
   using exports_view_type = Kokkos::DualView<packet_type*, buffer_device_type>;
-  using local_graph_type = typename crs_graph_type::local_graph_type;
+  using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
   using local_map_type = typename Tpetra::Map<LO, GO, NT>::local_map_type;
   const char prefix[] = "Tpetra::Details::packCrsGraph: ";
   constexpr bool debug = false;
 
-  local_graph_type local_graph = sourceGraph.getLocalGraph ();
+  local_graph_device_type local_graph = sourceGraph.getLocalGraphDevice ();
   local_map_type local_col_map = sourceGraph.getColMap ()->getLocalMap ();
 
   // Setting this to zero tells the caller to expect a possibly
@@ -755,7 +755,7 @@ packCrsGraph
 
   exports.modify_device ();
   auto exports_d = exports.view_device ();
-  do_pack<packet_type, local_graph_type, local_map_type, buffer_device_type>
+  do_pack<packet_type, local_graph_device_type, local_map_type, buffer_device_type>
     (local_graph, local_col_map, exports_d, num_packets_per_lid,
      export_lids, export_pids, offsets, pack_pids);
   // If we got this far, we succeeded.
@@ -880,11 +880,11 @@ packCrsGraphNew (const CrsGraph<LO,GO,NT>& sourceGraph,
   using BDT = typename crs_graph_type::buffer_device_type;
   using PT = typename crs_graph_type::packet_type;
   using exports_dual_view_type = Kokkos::DualView<PT*, BDT>;
-  using LGT = typename crs_graph_type::local_graph_type;
+  using LGT = typename crs_graph_type::local_graph_device_type;
   using LMT = typename crs_graph_type::map_type::local_map_type;
   const char prefix[] = "Tpetra::Details::packCrsGraphNew: ";
 
-  const LGT local_graph = sourceGraph.getLocalGraph ();
+  const LGT local_graph = sourceGraph.getLocalGraphDevice ();
   const LMT local_col_map = sourceGraph.getColMap ()->getLocalMap ();
 
   // Setting this to zero tells the caller to expect a possibly
diff --git a/packages/tpetra/core/src/Tpetra_Details_packCrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_Details_packCrsMatrix_def.hpp
index d15031230c63..a0f83397581d 100644
--- a/packages/tpetra/core/src/Tpetra_Details_packCrsMatrix_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_packCrsMatrix_def.hpp
@@ -484,12 +484,12 @@ packCrsMatrixRow (const ColumnMap& col_map,
 
 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
 struct PackCrsMatrixFunctor {
-  typedef LocalMatrix local_matrix_type;
+  typedef LocalMatrix local_matrix_device_type;
   typedef LocalMap local_map_type;
-  typedef typename local_matrix_type::value_type ST;
+  typedef typename local_matrix_device_type::value_type ST;
   typedef typename local_map_type::local_ordinal_type LO;
   typedef typename local_map_type::global_ordinal_type GO;
-  typedef typename local_matrix_type::device_type DT;
+  typedef typename local_matrix_device_type::device_type DT;
 
   typedef Kokkos::View<const size_t*, BufferDeviceType>
     num_packets_per_lid_view_type;
@@ -504,11 +504,11 @@ struct PackCrsMatrixFunctor {
     offset_type;
   typedef Kokkos::pair<int, LO> value_type;
 
-  static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
+  static_assert (std::is_same<LO, typename local_matrix_device_type::ordinal_type>::value,
                  "local_map_type::local_ordinal_type and "
-                 "local_matrix_type::ordinal_type must be the same.");
+                 "local_matrix_device_type::ordinal_type must be the same.");
 
-  local_matrix_type local_matrix;
+  local_matrix_device_type local_matrix;
   local_map_type local_col_map;
   exports_view_type exports;
   num_packets_per_lid_view_type num_packets_per_lid;
@@ -518,7 +518,7 @@ struct PackCrsMatrixFunctor {
   size_t num_bytes_per_value;
   bool pack_pids;
 
-  PackCrsMatrixFunctor (const local_matrix_type& local_matrix_in,
+  PackCrsMatrixFunctor (const local_matrix_device_type& local_matrix_in,
                         const local_map_type& local_col_map_in,
                         const exports_view_type& exports_in,
                         const num_packets_per_lid_view_type& num_packets_per_lid_in,
@@ -744,7 +744,7 @@ packCrsMatrix (const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
   const char prefix[] = "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix: ";
   constexpr bool debug = false;
 
-  auto local_matrix = sourceMatrix.getLocalMatrix ();
+  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
   auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
 
   // Setting this to zero tells the caller to expect a possibly
@@ -843,13 +843,13 @@ packCrsMatrix (const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
      "one matrix entry, but export_pids.extent(0) = 0.");
 
   typedef typename std::decay<decltype (local_matrix)>::type
-    local_matrix_type;
+    local_matrix_device_type;
   typedef typename std::decay<decltype (local_col_map)>::type
     local_map_type;
 
   exports.modify_device ();
   auto exports_d = exports.view_device ();
-  do_pack<local_matrix_type, local_map_type, DT>
+  do_pack<local_matrix_device_type, local_map_type, DT>
     (local_matrix, local_col_map, exports_d, num_packets_per_lid,
      export_lids, export_pids, offsets, num_bytes_per_value,
      pack_pids);
@@ -867,8 +867,8 @@ packCrsMatrix (const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
                size_t& constantNumPackets,
                Distributor& distor)
 {
-  using local_matrix_type = typename CrsMatrix<ST,LO,GO,NT>::local_matrix_type;
-  using device_type = typename local_matrix_type::device_type;
+  using local_matrix_device_type = typename CrsMatrix<ST,LO,GO,NT>::local_matrix_device_type;
+  using device_type = typename local_matrix_device_type::device_type;
   using buffer_device_type = typename DistObject<char, LO, GO, NT>::buffer_device_type;
   using host_exec_space = typename Kokkos::View<size_t*, device_type>::HostMirror::execution_space;
   using host_dev_type = Kokkos::Device<host_exec_space, Kokkos::HostSpace>;
@@ -969,12 +969,12 @@ packCrsMatrixWithOwningPIDs (const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
                              size_t& constantNumPackets,
                              Distributor& distor)
 {
-  typedef typename CrsMatrix<ST,LO,GO,NT>::local_matrix_type local_matrix_type;
+  typedef typename CrsMatrix<ST,LO,GO,NT>::local_matrix_device_type local_matrix_device_type;
   typedef typename DistObject<char, LO, GO, NT>::buffer_device_type buffer_device_type;
   typedef typename Kokkos::DualView<char*, buffer_device_type>::t_host::execution_space host_exec_space;
   typedef Kokkos::Device<host_exec_space, Kokkos::HostSpace> host_dev_type;
 
-  typename local_matrix_type::device_type outputDevice;
+  typename local_matrix_device_type::device_type outputDevice;
 
   const bool verbose = ::Tpetra::Details::Behavior::verbose ();
   std::unique_ptr<std::string> prefix;
diff --git a/packages/tpetra/core/src/Tpetra_Details_residual.hpp b/packages/tpetra/core/src/Tpetra_Details_residual.hpp
index 14a9c4123e87..e95b90bcaead 100644
--- a/packages/tpetra/core/src/Tpetra_Details_residual.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_residual.hpp
@@ -141,11 +141,10 @@ void localResidual(const CrsMatrix<SC,LO,GO,NO> &  A,
   using Teuchos::NO_TRANS;
   ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidual");
 
-  auto A_lcl = A.getLocalMatrix (); 
+  auto A_lcl = A.getLocalMatrixDevice (); 
   auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
   auto B_lcl = B.getLocalViewDevice(Access::ReadOnly);
   auto R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
-  auto lclMatrix_ = A.getLocalMatrix ();
 
   const bool debug = ::Tpetra::Details::Behavior::debug ();
   if (debug) {
diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine_def.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
index 72881aa7385d..a52f4b223a9f 100644
--- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
@@ -874,7 +874,7 @@ unpackAndCombineWithOwningPIDsCount(
   using Kokkos::View;
   using device_type = typename Node::device_type;
   using packet_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type;
-  using local_graph_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_type;
+  using local_graph_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_device_type;
   using buffer_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type;
   const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
 
@@ -893,7 +893,7 @@ unpackAndCombineWithOwningPIDsCount(
      prefix << "importLIDs.size() = " << importLIDs.size() << " != "
      "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
 
-  auto local_graph = sourceGraph.getLocalGraph();
+  auto local_graph = sourceGraph.getLocalGraphDevice();
   auto permute_from_lids_d =
     create_mirror_view_from_raw_host_array(device_type(),
                                            permuteFromLIDs.getRawPtr(),
@@ -911,7 +911,7 @@ unpackAndCombineWithOwningPIDsCount(
                                            "num_packets_per_lid");
 
   return UnpackAndCombineCrsGraphImpl::unpackAndCombineWithOwningPIDsCount<
-    packet_type,local_graph_type,buffer_device_type>(
+    packet_type,local_graph_device_type,buffer_device_type>(
       local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
 }
 
@@ -958,7 +958,7 @@ unpackAndCombineIntoCrsArrays(
   using GO = GlobalOrdinal;
   using crs_graph_type = CrsGraph<LO, GO, Node>;
   using packet_type = typename crs_graph_type::packet_type;
-  using local_graph_type = typename crs_graph_type::local_graph_type;
+  using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
   using buffer_device_type = typename crs_graph_type::buffer_device_type;
   using device_type = typename Node::device_type;
   using size_type = typename Teuchos::ArrayView<const LO>::size_type;
@@ -988,7 +988,7 @@ unpackAndCombineIntoCrsArrays(
   TargetPids.assign(TargetNumNonzeros, -1);
 
   // Grab pointers for sourceGraph
-  auto local_graph = sourceGraph.getLocalGraph();
+  auto local_graph = sourceGraph.getLocalGraphDevice();
   auto local_col_map = sourceGraph.getColMap()->getLocalMap();
 
   // Convert input arrays to Kokkos::View
@@ -1042,7 +1042,7 @@ unpackAndCombineIntoCrsArrays(
 
   using local_map_type = decltype(local_col_map);
   UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays<
-    packet_type,local_graph_type,local_map_type,buffer_device_type>(
+    packet_type,local_graph_device_type,local_map_type,buffer_device_type>(
       local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
       permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
       tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
index 908ca4a10000..7bd5f5d507ea 100644
--- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
@@ -1196,8 +1196,8 @@ unpackCrsMatrixAndCombine(
 {
   using Kokkos::View;
   typedef typename Node::device_type device_type;
-  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_type local_matrix_type;
-  static_assert (std::is_same<device_type, typename local_matrix_type::device_type>::value,
+  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
+  static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
                  "Node::device_type and LocalMatrix::device_type must be the same.");
 
   // Execution space.
@@ -1221,16 +1221,17 @@ unpackCrsMatrixAndCombine(
     create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
         imports.size(), true, "imports");
 
-  auto local_matrix = sourceMatrix.getLocalMatrix();
+  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
   auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
 
-  for (int i=0; i<importLIDs.size(); i++)
-  {
-    auto lclRow = importLIDs[i];
-    Teuchos::ArrayView<const LO> A_indices;
-    Teuchos::ArrayView<const ST> A_values;
-    sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
-  }
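+//KDDKDD TODO: this loop fetched each imported row's view and discarded the result, so it has no visible effect; left disabled until someone confirms it can be deleted.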
+//KDDKDD  for (int i=0; i<importLIDs.size(); i++)
+//KDDKDD  {
+//KDDKDD    auto lclRow = importLIDs[i];
+//KDDKDD    Teuchos::ArrayView<const LO> A_indices;
+//KDDKDD    Teuchos::ArrayView<const ST> A_values;
+//KDDKDD    sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
+//KDDKDD  }
   // Now do the actual unpack!
   UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
       local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
@@ -1256,12 +1257,12 @@ unpackCrsMatrixAndCombineNew(
   using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
   using dist_object_type = DistObject<char, LO, GO, NT>;
   using device_type = typename crs_matrix_type::device_type;
-  using local_matrix_type = typename crs_matrix_type::local_matrix_type;
+  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
   using buffer_device_type = typename dist_object_type::buffer_device_type;
 
   static_assert
-    (std::is_same<device_type, typename local_matrix_type::device_type>::value,
-     "crs_matrix_type::device_type and local_matrix_type::device_type "
+    (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
+     "crs_matrix_type::device_type and local_matrix_device_type::device_type "
      "must be the same.");
 
   if (numPacketsPerLID.need_sync_device()) {
@@ -1277,12 +1278,12 @@ unpackCrsMatrixAndCombineNew(
   }
   auto imports_d = imports.view_device ();
 
-  auto local_matrix = sourceMatrix.getLocalMatrix ();
+  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
   auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
   typedef decltype (local_col_map) local_map_type;
 
   UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
-      local_matrix_type,
+      local_matrix_device_type,
       local_map_type,
       buffer_device_type
     > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
@@ -1379,7 +1380,7 @@ unpackAndCombineWithOwningPIDsCount (
      prefix << "importLIDs.size() = " << importLIDs.size () << " != "
      "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
 
-  auto local_matrix = sourceMatrix.getLocalMatrix ();
+  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
   auto permute_from_lids_d =
     create_mirror_view_from_raw_host_array (DT (),
                                             permuteFromLIDs.getRawPtr (),
@@ -1481,7 +1482,7 @@ unpackAndCombineIntoCrsArrays (
   TargetPids.assign (TargetNumNonzeros, -1);
 
   // Grab pointers for sourceMatrix
-  auto local_matrix = sourceMatrix.getLocalMatrix();
+  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
   auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
 
   // Convert input arrays to Kokkos::View
diff --git a/packages/tpetra/core/src/Tpetra_FECrsGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_FECrsGraph_decl.hpp
index 689c70ff7f33..334578fb370c 100644
--- a/packages/tpetra/core/src/Tpetra_FECrsGraph_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_FECrsGraph_decl.hpp
@@ -145,7 +145,7 @@ namespace Tpetra {
     typedef typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::execution_space execution_space;
 
     //! The type of the part of the sparse graph on each MPI process.
-    typedef  typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_type local_graph_type;
+    typedef  typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_device_type local_graph_device_type;
 
     //! The Map specialization used by this class.
     using map_type = ::Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>;
diff --git a/packages/tpetra/core/src/Tpetra_FECrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_FECrsGraph_def.hpp
index 2232a4de134e..aed24aab7cc6 100644
--- a/packages/tpetra/core/src/Tpetra_FECrsGraph_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_FECrsGraph_def.hpp
@@ -279,7 +279,8 @@ void FECrsGraph<LocalOrdinal, GlobalOrdinal, Node>::doOwnedPlusSharedToOwned(con
     if (debug && checkColGIDsInAtLeastOneOwnedRow) {
       Teuchos::RCP<const map_type> colmap = this->getColMap();
       Teuchos::Array<bool> flag(colmap->getNodeNumElements(),false);
-      Teuchos::Array<LocalOrdinal> indices(this->getNodeMaxNumRowEntries());
+      typename crs_graph_type::nonconst_local_inds_host_view_type indices("indices",this->getNodeMaxNumRowEntries());
+
       for(size_t i=0; i<ownedRowMap->getNodeNumElements(); i++)  {
         size_t NumEntries=0;
         this->getLocalRowCopy(i,indices,NumEntries);
@@ -313,13 +314,7 @@ void FECrsGraph<LocalOrdinal, GlobalOrdinal, Node>::doOwnedPlusSharedToOwned(con
     }
 
     // Time to build an owned localGraph via subviews
-    local_graph_type ownedPlusSharedGraph = this->getLocalGraph();
-    size_t numOwnedRows = ownedRowMap->getNodeNumElements();
-    size_t numOwnedNonZeros = Tpetra::Details::getEntryOnHost(ownedPlusSharedGraph.row_map,numOwnedRows);
-    auto row_ptrs = Kokkos::subview(ownedPlusSharedGraph.row_map,Kokkos::pair<size_t,size_t>(0,numOwnedRows+1));
-    auto col_indices = Kokkos::subview(ownedPlusSharedGraph.entries,Kokkos::pair<size_t,size_t>(0,numOwnedNonZeros));
-
-    inactiveCrsGraph_ = Teuchos::rcp(new crs_graph_type(ownedRowMap,this->getColMap(),row_ptrs,col_indices));
+    inactiveCrsGraph_ = Teuchos::rcp(new crs_graph_type(*this, ownedRowMap));
     inactiveCrsGraph_->fillComplete(ownedDomainMap_,ownedRangeMap_);
   }
 }//end doOverlapToLocal
diff --git a/packages/tpetra/core/src/Tpetra_FECrsMatrix_decl.hpp b/packages/tpetra/core/src/Tpetra_FECrsMatrix_decl.hpp
index 44509dd15d7c..6d3ae433e521 100644
--- a/packages/tpetra/core/src/Tpetra_FECrsMatrix_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_FECrsMatrix_decl.hpp
@@ -117,11 +117,21 @@ class FECrsMatrix :
     typedef typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::crs_graph_type crs_graph_type;
 
     //! The part of the sparse matrix's graph on each MPI process.
-    typedef typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_graph_type local_graph_type;
+    typedef typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_graph_device_type local_graph_device_type;
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     /// \brief The specialization of Kokkos::CrsMatrix that represents
     ///        the part of the sparse matrix on each MPI process.
     typedef typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_type local_matrix_type;
+#endif
+
+    /// \brief The specialization of Kokkos::CrsMatrix that represents
+    ///        the part of the sparse matrix for each MPI process on device.
+    typedef typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type local_matrix_device_type;
+
+    /// \brief The specialization of Kokkos::CrsMatrix that represents
+    ///        the part of the sparse matrix for each MPI process on host.
+    typedef typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type local_matrix_host_type;
 
     /// \brief Parent CrsMatrix type using the same scalars
     typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crs_matrix_type;
diff --git a/packages/tpetra/core/src/Tpetra_FECrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_FECrsMatrix_def.hpp
index c2c138b29e39..10476a1e23b0 100644
--- a/packages/tpetra/core/src/Tpetra_FECrsMatrix_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_FECrsMatrix_def.hpp
@@ -57,7 +57,7 @@ FECrsMatrix(const Teuchos::RCP<const fe_crs_graph_type>& graph,
 
 {
   const char tfecfFuncName[] = "FECrsMatrix(RCP<const FECrsGraph>[, RCP<ParameterList>]): ";
-  typedef typename local_matrix_type::values_type values_type;
+  typedef typename local_matrix_device_type::values_type values_type;
 
   TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
     (graph.is_null (), std::runtime_error, "Input graph is null.");
@@ -78,10 +78,7 @@ FECrsMatrix(const Teuchos::RCP<const fe_crs_graph_type>& graph,
   if(!graph->inactiveCrsGraph_.is_null() ) {
     // We are *requiring* memory aliasing here, so we'll grab the first chunk of the Owned+Shared matrix's values array to make the 
     // guy for the Owned matrix.
-    values_type myvals = this->getLocalMatrix().values;
-
-    size_t numOwnedVals = graph->getLocalGraph().entries.extent(0); // OwnedVals
-    inactiveCrsMatrix_ = Teuchos::rcp(new crs_matrix_type(graph,Kokkos::subview(myvals,Kokkos::pair<size_t,size_t>(0,numOwnedVals))));
+    inactiveCrsMatrix_ = Teuchos::rcp(new crs_matrix_type(*this,graph));
   }
 }
 
diff --git a/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_decl.hpp b/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_decl.hpp
index 491297dc38ab..26cc6850c617 100644
--- a/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_decl.hpp
@@ -72,25 +72,25 @@ namespace Tpetra {
       ::Tpetra::Details::DefaultTypes::local_ordinal_type;
     using execution_space = typename Device::execution_space;
   public:
-    using local_matrix_type =
+    using local_matrix_device_type =
       KokkosSparse::CrsMatrix<matrix_scalar_type,
                               local_ordinal_type,
                               device_type,
                               void,
                               size_t>;
   private:
-    //The type of a matrix with offset=ordinal, but otherwise the same as local_matrix_type
+    //The type of a matrix with offset=ordinal, but otherwise the same as local_matrix_device_type
     using local_cusparse_matrix_type =
       KokkosSparse::CrsMatrix<matrix_scalar_type,
                               local_ordinal_type,
                               device_type,
                               void,
                               local_ordinal_type>;
-    using local_graph_type = typename local_matrix_type::StaticCrsGraphType;
-    using ordinal_view_type = typename local_graph_type::entries_type::non_const_type;
+    using local_graph_device_type = typename local_matrix_device_type::StaticCrsGraphType;
+    using ordinal_view_type = typename local_graph_device_type::entries_type::non_const_type;
 
   public:
-    LocalCrsMatrixOperator (const std::shared_ptr<local_matrix_type>& A);
+    LocalCrsMatrixOperator (const std::shared_ptr<local_matrix_device_type>& A);
     ~LocalCrsMatrixOperator () override = default;
 
     void
@@ -114,10 +114,10 @@ namespace Tpetra {
 
     bool hasTransposeApply () const override;
 
-    const local_matrix_type& getLocalMatrix () const;
+    const local_matrix_device_type& getLocalMatrixDevice () const;
 
   private:
-    std::shared_ptr<local_matrix_type> A_;
+    std::shared_ptr<local_matrix_device_type> A_;
     //If the number of entries in A_ can be represented as ordinal,
     //make a copy of the rowptrs as ordinal. This allows the use of cuSPARSE spmv.
     //If cusparse is not enabled or there would be no benefit from using these,
diff --git a/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_def.hpp b/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_def.hpp
index 1dbd9b7958b9..d7b4587ae67c 100644
--- a/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_def.hpp
@@ -50,7 +50,7 @@ namespace Tpetra {
 
 template<class MultiVectorScalar, class MatrixScalar, class Device>
 LocalCrsMatrixOperator<MultiVectorScalar, MatrixScalar, Device>::
-LocalCrsMatrixOperator (const std::shared_ptr<local_matrix_type>& A)
+LocalCrsMatrixOperator (const std::shared_ptr<local_matrix_device_type>& A)
   : A_ (A)
 {
   const char tfecfFuncName[] = "LocalCrsMatrixOperator: ";
@@ -63,8 +63,8 @@ LocalCrsMatrixOperator (const std::shared_ptr<local_matrix_type>& A)
   //  - The execution space is CUDA
   //  - The local matrix offset and ordinal types are different (otherwise, no reason to enable)
   //  - The number of entries can be represented by the ordinal type.
-  using kk_offset_t = typename std::remove_const<typename local_matrix_type::size_type>::type;
-  using kk_ordinal_t = typename std::remove_const<typename local_matrix_type::ordinal_type>::type;
+  using kk_offset_t = typename std::remove_const<typename local_matrix_device_type::size_type>::type;
+  using kk_ordinal_t = typename std::remove_const<typename local_matrix_device_type::ordinal_type>::type;
   using exec_space = typename Device::execution_space;
   if(std::is_same<exec_space, Kokkos::Cuda>::value &&
       !std::is_same<kk_offset_t, kk_ordinal_t>::value &&
@@ -72,7 +72,7 @@ LocalCrsMatrixOperator (const std::shared_ptr<local_matrix_type>& A)
   {
     A_ordinal_rowptrs = ordinal_view_type(Kokkos::ViewAllocateWithoutInitializing("A_ordinal_rowptrs"), A_->numRows() + 1);
     //This is just like a deep copy, but it implicitly converts each element
-    KokkosKernels::Impl::copy_view<typename local_graph_type::row_map_type, ordinal_view_type, exec_space>
+    KokkosKernels::Impl::copy_view<typename local_graph_device_type::row_map_type, ordinal_view_type, exec_space>
       (A_ordinal_rowptrs.extent(0), A_->graph.row_map, A_ordinal_rowptrs);
     A_cusparse = local_cusparse_matrix_type("A(cusparse)", A_->numRows(), A_->numCols(), A_->nnz(), A_->values, A_ordinal_rowptrs, A_->graph.entries);
   }
@@ -189,9 +189,9 @@ applyImbalancedRows (
 }
 
 template<class MultiVectorScalar, class MatrixScalar, class Device>
-const typename LocalCrsMatrixOperator<MultiVectorScalar, MatrixScalar, Device>::local_matrix_type&
+const typename LocalCrsMatrixOperator<MultiVectorScalar, MatrixScalar, Device>::local_matrix_device_type&
 LocalCrsMatrixOperator<MultiVectorScalar, MatrixScalar, Device>::
-getLocalMatrix () const
+getLocalMatrixDevice () const
 {
   return *A_;
 }
diff --git a/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp b/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp
index 6bddbcdae0d5..bdf596511f46 100644
--- a/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp
@@ -1996,7 +1996,6 @@ namespace Tpetra {
     using Teuchos::RCP;
     // View of all the dot product results.
     typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
-    typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> MV;
     typedef typename dual_view_type::t_dev_const XMV;
     const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
 
@@ -2695,7 +2694,6 @@ namespace Tpetra {
   {
     using Kokkos::ALL;
     using Kokkos::subview;
-    typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> MV;
     const char tfecfFuncName[] = "scale: ";
 
     const size_t lclNumRows = getLocalLength ();
@@ -2742,7 +2740,6 @@ namespace Tpetra {
   MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   reciprocal (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A)
   {
-    using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
     const char tfecfFuncName[] = "reciprocal: ";
 
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
@@ -2782,7 +2779,6 @@ namespace Tpetra {
   MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   abs (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A)
   {
-    using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
     const char tfecfFuncName[] = "abs";
 
     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
@@ -2827,7 +2823,6 @@ namespace Tpetra {
     const char tfecfFuncName[] = "update: ";
     using Kokkos::subview;
     using Kokkos::ALL;
-    using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 
     ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
 
@@ -2881,7 +2876,6 @@ namespace Tpetra {
   {
     using Kokkos::ALL;
     using Kokkos::subview;
-    using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
 
     const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
 
@@ -2945,7 +2939,6 @@ namespace Tpetra {
   getData (size_t j) const
   {
     using Kokkos::ALL;
-    using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
     using IST = impl_scalar_type;
     const char tfecfFuncName[] = "getData: ";
 
@@ -2977,7 +2970,6 @@ namespace Tpetra {
   {
     using Kokkos::ALL;
     using Kokkos::subview;
-    using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
     using IST = impl_scalar_type;
     const char tfecfFuncName[] = "getDataNonConst: ";
 
@@ -3630,7 +3622,6 @@ namespace Tpetra {
       // Since get1dView() is and was always marked const, I have to
       // cast away const here in order not to break backwards
       // compatibility.
-      using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
       auto X_lcl = getLocalViewHost(Access::ReadOnly);
       Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
         Kokkos::Compat::persistingView (X_lcl);
@@ -3828,7 +3819,6 @@ namespace Tpetra {
     // Since get2dView() is and was always marked const, I have to
     // cast away const here in order not to break backwards
     // compatibility.
-    using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
     auto X_lcl = getLocalViewHost(Access::ReadOnly);
 
     // Don't use the row range here on the outside, in order to avoid
@@ -4099,8 +4089,6 @@ namespace Tpetra {
   {
     using Kokkos::ALL;
     using Kokkos::subview;
-    using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
-    using V = Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
     const char tfecfFuncName[] = "elementWiseMultiply: ";
 
     const size_t lclNumRows = this->getLocalLength ();
diff --git a/packages/tpetra/core/src/Tpetra_RowGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_RowGraph_decl.hpp
index 81c3c42935b5..a78074bc77e1 100644
--- a/packages/tpetra/core/src/Tpetra_RowGraph_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_RowGraph_decl.hpp
@@ -73,13 +73,36 @@ namespace Tpetra {
     //! \name Typedefs
     //@{
     //! The type of local indices in the graph.
-    typedef LocalOrdinal  local_ordinal_type;
+    typedef LocalOrdinal local_ordinal_type;
     //! The type of global indices in the graph.
     typedef GlobalOrdinal global_ordinal_type;
     //! The Kokkos Node type.
-    typedef Node          node_type;
+    typedef Node node_type;
     //@}
 
+    typedef typename
+        Kokkos::View<LocalOrdinal *, typename Node::device_type>::const_type
+        local_inds_device_view_type;
+    typedef typename local_inds_device_view_type::HostMirror::const_type
+        local_inds_host_view_type;
+    typedef typename local_inds_device_view_type::HostMirror
+        nonconst_local_inds_host_view_type;
+
+
+    typedef typename
+        Kokkos::View<GlobalOrdinal *, typename Node::device_type>::const_type
+        global_inds_device_view_type;
+    typedef typename global_inds_device_view_type::HostMirror::const_type
+        global_inds_host_view_type;
+    typedef typename global_inds_device_view_type::HostMirror
+        nonconst_global_inds_host_view_type;
+
+    typedef typename 
+        Kokkos::View<const size_t*, typename Node::device_type>::const_type
+        row_ptrs_device_view_type;
+    typedef typename row_ptrs_device_view_type::HostMirror::const_type
+        row_ptrs_host_view_type;
+
     //! Destructor (virtual for memory safety of derived classes).
     virtual ~RowGraph() {};
 
@@ -182,11 +205,17 @@ namespace Tpetra {
     ///
     /// \pre <tt>getRowMap()->isNodeGlobalElement(gblRow)<tt> is <tt>true</tt>.
     /// \pre <tt>gblColInds.size() >= getNumEntriesInGlobalRow(gblRow)</tt> is <tt>true</tt>.
+
+    virtual void
+    getGlobalRowCopy (const GlobalOrdinal gblRow,
+                      nonconst_global_inds_host_view_type& gblColInds,
+                      size_t& numColInds) const = 0;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     virtual void
     getGlobalRowCopy (GlobalOrdinal gblRow,
                       const Teuchos::ArrayView<GlobalOrdinal>& gblColInds,
                       size_t& numColInds) const = 0;
-
+#endif
     /// \brief Get a copy of the local column indices in a given row
     ///   of the graph.
     ///
@@ -203,10 +232,16 @@ namespace Tpetra {
     /// \pre <tt>hasColMap()</tt> is <tt>true</tt>.
     /// \pre <tt>getRowMap()->isNodeLocalElement(lclRow)<tt> is <tt>true</tt>.
     /// \pre <tt>lclColInds.size() >= getNumEntriesInLocalRow(lclRow)</tt> is <tt>true</tt>.
+    virtual void
+    getLocalRowCopy (const LocalOrdinal lclRow,
+                     nonconst_local_inds_host_view_type & lclColInds,
+                     size_t& numColInds) const = 0;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     virtual void
     getLocalRowCopy (LocalOrdinal lclRow,
                      const Teuchos::ArrayView<LocalOrdinal>& lclColInds,
                      size_t& numColInds) const = 0;
+#endif
 
     /// \brief Whether this class implements getLocalRowView() and
     ///   getGlobalRowView().
@@ -245,9 +280,14 @@ namespace Tpetra {
     /// have made this method pure virtual, but that would have broken
     /// backwards compatibility, since we added the method at least
     /// one major release after introducing this class.
+    virtual void
+    getLocalRowView (const LocalOrdinal lclRow,
+                     local_inds_host_view_type & lclColInds) const = 0;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     virtual void
     getLocalRowView (const LocalOrdinal lclRow,
                      Teuchos::ArrayView<const LocalOrdinal>& lclColInds) const;
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
 
     /// \brief Get a const, non-persisting view of the given global
     ///   row's global column indices, as a Teuchos::ArrayView.
@@ -264,9 +304,14 @@ namespace Tpetra {
     /// have made this method pure virtual, but that would have broken
     /// backwards compatibility, since we added the method at least
     /// one major release after introducing this class.
+    virtual void
+    getGlobalRowView (const GlobalOrdinal gblRow,
+                      global_inds_host_view_type& gblColInds) const = 0;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     virtual void
     getGlobalRowView (const GlobalOrdinal gblRow,
                       Teuchos::ArrayView<const GlobalOrdinal>& gblColInds) const;
+#endif
 
     //@}
     //! \name Implementation of Packable interface
diff --git a/packages/tpetra/core/src/Tpetra_RowGraph_def.hpp b/packages/tpetra/core/src/Tpetra_RowGraph_def.hpp
index d874b506b4bd..13c8b22c3877 100644
--- a/packages/tpetra/core/src/Tpetra_RowGraph_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_RowGraph_def.hpp
@@ -74,7 +74,7 @@ namespace Tpetra {
     // packets (that is, entries) owned by this process in all the
     // rows that the caller wants us to send out.
     size_t totalNumPackets = 0;
-    Array<GO> row;
+    nonconst_global_inds_host_view_type row;
     for (LO i = 0; i < exportLIDs.size (); ++i) {
       const GO GID = srcMap.getGlobalElement (exportLIDs[i]);
       size_t row_length = this->getNumEntriesInGlobalRow (GID);
@@ -90,19 +90,18 @@ namespace Tpetra {
     for (LO i = 0; i < exportLIDs.size (); ++i) {
       const GO GID = srcMap.getGlobalElement (exportLIDs[i]);
       size_t row_length = this->getNumEntriesInGlobalRow (GID);
-      row.resize (row_length);
+      Kokkos::resize(row,row_length);
       size_t check_row_length = 0;
-      this->getGlobalRowCopy (GID, row (), check_row_length);
-      typename Array<GO>::const_iterator row_iter = row.begin();
-      typename Array<GO>::const_iterator row_end = row.end();
-      size_t j = 0;
-      for (; row_iter != row_end; ++row_iter, ++j) {
-        exports[exportsOffset+j] = *row_iter;
+      this->getGlobalRowCopy (GID, row, check_row_length);
+
+      for (size_t j=0; j<row_length; ++j) {
+        exports[exportsOffset+j] = row[j];
       }
-      exportsOffset += row.size ();
+      exportsOffset += row.extent(0);
     }
   }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template<class LocalOrdinal, class GlobalOrdinal, class Node>
   void
   RowGraph<LocalOrdinal,GlobalOrdinal,Node>::
@@ -144,6 +143,8 @@ namespace Tpetra {
        prefix << "This object claims to support row views, "
        "but this method is not implemented.");
   }
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+
 } // namespace Tpetra
 
 //
diff --git a/packages/tpetra/core/src/Tpetra_RowMatrixTransposer_def.hpp b/packages/tpetra/core/src/Tpetra_RowMatrixTransposer_def.hpp
index dba7bd4ed142..ae2393a3242f 100644
--- a/packages/tpetra/core/src/Tpetra_RowMatrixTransposer_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_RowMatrixTransposer_def.hpp
@@ -174,16 +174,16 @@ createTransposeLocal (const Teuchos::RCP<Teuchos::ParameterList>& params)
     }
   }
 
-  using local_matrix_type = typename crs_matrix_type::local_matrix_type;
-  using local_graph_type = typename crs_matrix_type::local_graph_type;
-  using offset_type = typename local_graph_type::size_type;
-  using row_map_type = typename local_matrix_type::row_map_type::non_const_type;
-  using index_type = typename local_matrix_type::index_type::non_const_type;
-  using values_type = typename local_matrix_type::values_type::non_const_type;
-  using execution_space = typename local_matrix_type::execution_space;
+  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
+  using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
+  using offset_type = typename local_graph_device_type::size_type;
+  using row_map_type = typename local_matrix_device_type::row_map_type::non_const_type;
+  using index_type = typename local_matrix_device_type::index_type::non_const_type;
+  using values_type = typename local_matrix_device_type::values_type::non_const_type;
+  using execution_space = typename local_matrix_device_type::execution_space;
 
-  local_matrix_type lclMatrix = crsMatrix->getLocalMatrix ();
-  local_graph_type lclGraph = lclMatrix.graph;
+  local_matrix_device_type lclMatrix = crsMatrix->getLocalMatrixDevice ();
+  local_graph_device_type lclGraph = lclMatrix.graph;
 
   // Determine how many nonzeros there are per row in the transpose.
   using DT = typename crs_matrix_type::device_type;
@@ -255,7 +255,7 @@ createTransposeLocal (const Teuchos::RCP<Teuchos::ParameterList>& params)
       });
   }
 
-  local_matrix_type lclTransposeMatrix ("transpose", lclNumCols,
+  local_matrix_device_type lclTransposeMatrix ("transpose", lclNumCols,
                                         lclNumRows, nnz,
                                         t_vals, t_offsets, t_cols);
 
diff --git a/packages/tpetra/core/src/Tpetra_RowMatrix_decl.hpp b/packages/tpetra/core/src/Tpetra_RowMatrix_decl.hpp
index e6c1e4ae52a8..382e11b18472 100644
--- a/packages/tpetra/core/src/Tpetra_RowMatrix_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_RowMatrix_decl.hpp
@@ -99,6 +99,16 @@ namespace Tpetra {
     //! The Kokkos Node type.
     typedef Node          node_type;
 
+    /// \brief The type used internally in place of \c Scalar.
+    ///
+    /// Some \c Scalar types might not work with Kokkos on all
+    /// execution spaces, due to missing CUDA device macros or
+    /// volatile overloads.  The C++ standard type std::complex<T> has
+    /// this problem.  To fix this, we replace std::complex<T> values
+    /// internally with the (usually) bitwise identical type
+    /// Kokkos::complex<T>.  The latter is the \c impl_scalar_type
+    /// corresponding to \c Scalar = std::complex.
+    using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
     /// \brief Type of a norm result.
     ///
     /// This is usually the same as the type of the magnitude
@@ -106,6 +116,39 @@ namespace Tpetra {
     /// certain <tt>Scalar</tt> types.
     using mag_type = typename Kokkos::ArithTraits<Scalar>::mag_type;
 
+    typedef typename 
+        Kokkos::View<impl_scalar_type*, typename Node::device_type>::const_type
+        values_device_view_type;
+    typedef typename values_device_view_type::HostMirror::const_type
+        values_host_view_type;
+    typedef typename values_device_view_type::HostMirror
+        nonconst_values_host_view_type;
+
+    typedef typename
+        Kokkos::View<LocalOrdinal *, typename Node::device_type>::const_type
+        local_inds_device_view_type;
+    typedef typename local_inds_device_view_type::HostMirror::const_type
+        local_inds_host_view_type;
+    typedef typename local_inds_device_view_type::HostMirror
+        nonconst_local_inds_host_view_type;
+
+    typedef typename
+        Kokkos::View<GlobalOrdinal *, typename Node::device_type>::const_type
+        global_inds_device_view_type;
+    typedef typename global_inds_device_view_type::HostMirror::const_type
+        global_inds_host_view_type;
+    typedef typename global_inds_device_view_type::HostMirror
+        nonconst_global_inds_host_view_type;
+
+
+    typedef typename
+        Kokkos::View<const size_t*, typename Node::device_type>::const_type
+        row_ptrs_device_view_type;
+    typedef typename row_ptrs_device_view_type::HostMirror::const_type
+        row_ptrs_host_view_type;
+
+
+
     //@}
     //! @name Destructor
     //@{
@@ -253,12 +296,18 @@ namespace Tpetra {
     /// the calling process, then the method sets NumIndices to
     /// <tt>Teuchos::OrdinalTraits<size_t>::invalid()</tt>, and does
     /// not modify Indices or Values.
+    virtual void
+    getGlobalRowCopy (GlobalOrdinal GlobalRow,
+                      nonconst_global_inds_host_view_type &Indices,
+                      nonconst_values_host_view_type &Values,
+                      size_t& NumEntries) const = 0;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     virtual void
     getGlobalRowCopy (GlobalOrdinal GlobalRow,
                       const Teuchos::ArrayView<GlobalOrdinal> &Indices,
                       const Teuchos::ArrayView<Scalar> &Values,
                       size_t &NumEntries) const = 0;
-
+#endif
     /// \brief Get a copy of the given local row's entries.
     ///
     /// This method only gets the entries in the given row that are
@@ -279,12 +328,18 @@ namespace Tpetra {
     /// the calling process, then the method sets NumIndices to
     /// <tt>Teuchos::OrdinalTraits<size_t>::invalid()</tt>, and does
     /// not modify Indices or Values.
+    virtual void
+    getLocalRowCopy (LocalOrdinal LocalRow,
+                     nonconst_local_inds_host_view_type &Indices,
+                     nonconst_values_host_view_type &Values,
+                     size_t& NumEntries) const = 0;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     virtual void
     getLocalRowCopy (LocalOrdinal LocalRow,
                      const Teuchos::ArrayView<LocalOrdinal> &Indices,
                      const Teuchos::ArrayView<Scalar> &Values,
                      size_t &NumEntries) const = 0;
-
+#endif
     /// \brief Get a constant, nonpersisting, globally indexed view of
     ///   the given row of the matrix.
     ///
@@ -309,10 +364,16 @@ namespace Tpetra {
     ///
     /// If \c GlobalRow does not belong to this node, then \c indices
     /// is set to \c null.
+    virtual void
+    getGlobalRowView (GlobalOrdinal GlobalRow,
+                      global_inds_host_view_type &indices,
+                      values_host_view_type &values) const = 0;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     virtual void
     getGlobalRowView (GlobalOrdinal GlobalRow,
                       Teuchos::ArrayView<const GlobalOrdinal> &indices,
                       Teuchos::ArrayView<const Scalar> &values) const = 0;
+#endif
 
     /// \brief Get a constant, nonpersisting, locally indexed view of
     ///   the given row of the matrix.
@@ -338,6 +399,11 @@ namespace Tpetra {
     ///
     /// If \c LocalRow does not belong to this node, then \c indices
     /// is set to \c null.
+    virtual void
+    getLocalRowView (LocalOrdinal LocalRow,
+                     local_inds_host_view_type & indices,
+                     values_host_view_type & values) const = 0;
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
     virtual void
     getLocalRowView (LocalOrdinal LocalRow,
                      Teuchos::ArrayView<const LocalOrdinal>& indices,
@@ -374,6 +440,7 @@ namespace Tpetra {
                         LocalOrdinal& numEnt,
                         const LocalOrdinal*& lclColInds,
                         const Scalar*& vals) const;
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
 
     /// \brief Get a copy of the diagonal entries, distributed by the row Map.
     ///
diff --git a/packages/tpetra/core/src/Tpetra_RowMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_RowMatrix_def.hpp
index a527290ac2b3..ee6ee36ef063 100644
--- a/packages/tpetra/core/src/Tpetra_RowMatrix_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_RowMatrix_def.hpp
@@ -231,8 +231,10 @@ namespace Tpetra {
     //
     // Compute C = alpha*A + beta*B.
     //
-    Array<GO> ind;
-    Array<Scalar> val;
+    using gids_type = nonconst_global_inds_host_view_type;
+    using vals_type = nonconst_values_host_view_type;
+    gids_type ind;
+    vals_type val;
 
     if (alpha != STS::zero ()) {
       const LO A_localNumRows = static_cast<LO> (A_rowMap->getNodeNumElements ());
@@ -240,11 +242,11 @@ namespace Tpetra {
         size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
         const GO globalRow = A_rowMap->getGlobalElement (localRow);
         if (A_numEntries > static_cast<size_t> (ind.size ())) {
-          ind.resize (A_numEntries);
-          val.resize (A_numEntries);
+          Kokkos::resize(ind,A_numEntries);
+          Kokkos::resize(val,A_numEntries);
         }
-        ArrayView<GO> indView = ind (0, A_numEntries);
-        ArrayView<Scalar> valView = val (0, A_numEntries);
+        gids_type indView = Kokkos::subview(ind, std::make_pair((size_t)0, A_numEntries));
+        vals_type valView = Kokkos::subview(val, std::make_pair((size_t)0, A_numEntries));
         A.getGlobalRowCopy (globalRow, indView, valView, A_numEntries);
 
         if (alpha != STS::one ()) {
@@ -252,7 +254,9 @@ namespace Tpetra {
             valView[k] *= alpha;
           }
         }
-        C->insertGlobalValues (globalRow, indView, valView);
+        C->insertGlobalValues (globalRow, A_numEntries, 
+                               reinterpret_cast<const Scalar*>(valView.data()),
+                               indView.data());
       }
     }
 
@@ -262,11 +266,11 @@ namespace Tpetra {
         size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
         const GO globalRow = B_rowMap->getGlobalElement (localRow);
         if (B_numEntries > static_cast<size_t> (ind.size ())) {
-          ind.resize (B_numEntries);
-          val.resize (B_numEntries);
+          Kokkos::resize(ind,B_numEntries);
+          Kokkos::resize(val,B_numEntries);
         }
-        ArrayView<GO> indView = ind (0, B_numEntries);
-        ArrayView<Scalar> valView = val (0, B_numEntries);
+        gids_type indView = Kokkos::subview(ind, std::make_pair((size_t)0, B_numEntries));
+        vals_type valView = Kokkos::subview(val, std::make_pair((size_t)0, B_numEntries));
         B.getGlobalRowCopy (globalRow, indView, valView, B_numEntries);
 
         if (beta != STS::one ()) {
@@ -274,7 +278,9 @@ namespace Tpetra {
             valView[k] *= beta;
           }
         }
-        C->insertGlobalValues (globalRow, indView, valView);
+        C->insertGlobalValues (globalRow, B_numEntries, 
+                               reinterpret_cast<const Scalar*>(valView.data()),
+                               indView.data());
       }
     }
 
@@ -405,8 +411,8 @@ namespace Tpetra {
         // If the matrix is locally indexed on the calling process, we
         // have to use its column Map (which it _must_ have in this
         // case) to convert to global indices.
-        ArrayView<const LO> indIn;
-        ArrayView<const Scalar> valIn;
+        local_inds_host_view_type indIn;
+        values_host_view_type valIn;
         this->getLocalRowView (lclRow, indIn, valIn);
         const map_type& colMap = * (this->getColMap ());
         // Copy column indices one at a time, so that we don't need
@@ -415,7 +421,7 @@ namespace Tpetra {
           const GO gblIndIn = colMap.getGlobalElement (indIn[k]);
           memcpy (indOut + k * sizeof (GO), &gblIndIn, sizeof (GO));
         }
-        memcpy (valOut, valIn.getRawPtr (), numEnt * sizeof (Scalar));
+        memcpy (valOut, valIn.data (), numEnt * sizeof (Scalar));
       }
       else if (this->isGloballyIndexed ()) {
         // If the matrix is globally indexed on the calling process,
@@ -423,13 +429,13 @@ namespace Tpetra {
         // have to get the global row index.  The calling process must
         // have a row Map, since otherwise it shouldn't be participating
         // in packing operations.
-        ArrayView<const GO> indIn;
-        ArrayView<const Scalar> valIn;
+        global_inds_host_view_type indIn;
+        values_host_view_type valIn;
         const map_type& rowMap = * (this->getRowMap ());
         const GO gblRow = rowMap.getGlobalElement (lclRow);
         this->getGlobalRowView (gblRow, indIn, valIn);
-        memcpy (indOut, indIn.getRawPtr (), numEnt * sizeof (GO));
-        memcpy (valOut, valIn.getRawPtr (), numEnt * sizeof (Scalar));
+        memcpy (indOut, indIn.data (), numEnt * sizeof (GO));
+        memcpy (valOut, valIn.data (), numEnt * sizeof (Scalar));
       }
       else {
         if (numEnt != 0) {
@@ -441,10 +447,10 @@ namespace Tpetra {
       // FIXME (mfh 25 Jan 2015) Pass in valIn and indIn as scratch
       // space, instead of allocating them on each call.
       if (this->isLocallyIndexed ()) {
-        Array<LO> indIn (numEnt);
-        Array<Scalar> valIn (numEnt);
+        nonconst_local_inds_host_view_type indIn("indIn",numEnt);
+        nonconst_values_host_view_type valIn("valIn",numEnt);
         size_t theNumEnt = 0;
-        this->getLocalRowCopy (lclRow, indIn (), valIn (), theNumEnt);
+        this->getLocalRowCopy (lclRow, indIn, valIn, theNumEnt);
         if (theNumEnt != numEnt) {
           return false;
         }
@@ -455,11 +461,11 @@ namespace Tpetra {
           const GO gblIndIn = colMap.getGlobalElement (indIn[k]);
           memcpy (indOut + k * sizeof (GO), &gblIndIn, sizeof (GO));
         }
-        memcpy (valOut, valIn.getRawPtr (), numEnt * sizeof (Scalar));
+        memcpy (valOut, valIn.data(), numEnt * sizeof (Scalar));
       }
       else if (this->isGloballyIndexed ()) {
-        Array<GO> indIn (numEnt);
-        Array<Scalar> valIn (numEnt);
+        nonconst_global_inds_host_view_type indIn("indIn",numEnt);
+        nonconst_values_host_view_type valIn("valIn",numEnt);
         const map_type& rowMap = * (this->getRowMap ());
         const GO gblRow = rowMap.getGlobalElement (lclRow);
         size_t theNumEnt = 0;
@@ -467,8 +473,8 @@ namespace Tpetra {
         if (theNumEnt != numEnt) {
           return false;
         }
-        memcpy (indOut, indIn.getRawPtr (), numEnt * sizeof (GO));
-        memcpy (valOut, valIn.getRawPtr (), numEnt * sizeof (Scalar));
+        memcpy (indOut, indIn.data(), numEnt * sizeof (GO));
+        memcpy (valOut, valIn.data(), numEnt * sizeof (Scalar));
       }
       else {
         if (numEnt != 0) {
@@ -590,6 +596,7 @@ namespace Tpetra {
       << ", numBytes: " << firstBadNumBytes << ".");
   }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   LocalOrdinal
   RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
@@ -617,6 +624,7 @@ namespace Tpetra {
 
     return static_cast<LocalOrdinal> (0);
   }
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
 
 } // namespace Tpetra
 
diff --git a/packages/tpetra/core/src/Tpetra_Util.hpp b/packages/tpetra/core/src/Tpetra_Util.hpp
index 8f17679e01c2..74fc5b47eb70 100644
--- a/packages/tpetra/core/src/Tpetra_Util.hpp
+++ b/packages/tpetra/core/src/Tpetra_Util.hpp
@@ -51,6 +51,7 @@
 
 #include "Tpetra_ConfigDefs.hpp"
 #include "Kokkos_DualView.hpp"
+#include "KokkosCompat_View.hpp"
 #include "Teuchos_Assert.hpp"
 #include "Teuchos_CommHelpers.hpp"
 #include "Teuchos_OrdinalTraits.hpp"
@@ -547,6 +548,71 @@ namespace Tpetra {
   }
 
 
+/**
+   * \brief Sort the first view, and apply the resulting permutation to the second view.
+   *
+   * Sort the first \c size values of the first view
+   * in ascending order.  Apply the
+   * permutation resulting from the sort to the second view.
+   *
+   * @param view1 A host-accessible 1D Kokkos::View
+   *   holding the values to sort.
+   * @param size Number of entries of \c view1 to sort.
+   * @param view2 A host-accessible 1D Kokkos::View
+   *   of the second array.  It must have no fewer than \c size
+   *   entries, because both views are wrapped with the same
+   *   length; the permutation will only be applied to the
+   *   first \c size entries of the second view.
+   */
+  template<class View1, class View2>
+  void sort2(View1 &view1, const size_t &size, View2 &view2) {
+    // NOTE: This assumes both views are host-accessible.
+
+    // Wrap \c size entries of each view as ArrayRCPs (this happens to preserve the reference counting, but that doesn't really matter here)
+    Teuchos::ArrayRCP<typename View1::non_const_value_type> view1_rcp =  Kokkos::Compat::persistingView(view1, 0, size);
+    Teuchos::ArrayRCP<typename View2::non_const_value_type> view2_rcp =  Kokkos::Compat::persistingView(view2, 0, size);
+    // Hand the wrapped ranges to the three-iterator sort2 overload, which does the actual work.
+    sort2(view1_rcp.begin(),view1_rcp.end(),view2_rcp.begin());    
+  }
+
+/**
+   * \brief Convenience wrapper for std::sort for host-accessible views
+   *
+   * Sort \c size values of the view in ascending order.
+   *
+   * @param view A host-accessible 1D Kokkos::View.
+   * @param size Length of the array (or of the portion of it to sort).
+   */
+  template<class View>
+  void sort(View &view, const size_t &size) {
+    // NOTE: This assumes the view is host-accessible.
+
+    // Wrap \c size entries of the view as an ArrayRCP (this happens to preserve the reference counting, but that doesn't really matter here)
+    Teuchos::ArrayRCP<typename View::non_const_value_type> view_rcp =  Kokkos::Compat::persistingView(view, 0, size);
+    // std::sort with the default comparison (operator<) orders the wrapped range ascending.
+    std::sort(view_rcp.begin(),view_rcp.end());    
+  }
+
+  /**
+   * \brief Convenience wrapper for a reversed std::sort for host-accessible views
+   *
+   * Sort the values in the array (of length size) in descending order.
+   *
+   * @param view A host-accessible 1D Kokkos::View.
+   * @param size Number of entries to sort.  NOTE(review): the range is wrapped with the same persistingView(view, 0, size) call as sort(), so "from the *end*" looks doubtful -- confirm.
+   */
+  template<class View>
+  void reverse_sort(View &view, const size_t &size) {
+    // NOTE: This assumes the view is host-accessible.
+    // Wrap \c size entries of the view as an ArrayRCP (this happens to preserve the reference counting, but that doesn't really matter here)
+    Teuchos::ArrayRCP<typename View::non_const_value_type> view_rcp =  Kokkos::Compat::persistingView(view, 0, size);
+    // Sorting through the reverse iterators (rbegin/rend) leaves the wrapped range in descending order.
+    std::sort(view_rcp.rbegin(),view_rcp.rend());    
+  }
+  
+
+
+
   /**
    * \brief Sort the first array, and apply the same permutation to the second
    * and third arrays.
diff --git a/packages/tpetra/core/src/Tpetra_applyDirichletBoundaryCondition.hpp b/packages/tpetra/core/src/Tpetra_applyDirichletBoundaryCondition.hpp
index 6d004abaf150..956ec09fe92b 100644
--- a/packages/tpetra/core/src/Tpetra_applyDirichletBoundaryCondition.hpp
+++ b/packages/tpetra/core/src/Tpetra_applyDirichletBoundaryCondition.hpp
@@ -138,7 +138,7 @@ struct ApplyDirichletBoundaryConditionToLocalMatrixRows {
     TEUCHOS_TEST_FOR_EXCEPTION
       (colMap.get () == nullptr, std::invalid_argument,
        "The matrix must have a column Map.");
-    auto A_lcl = A.getLocalMatrix ();
+    auto A_lcl = A.getLocalMatrixDevice ();
 
     const LO lclNumRows = static_cast<LO> (rowMap->getNodeNumElements ());
     TEUCHOS_TEST_FOR_EXCEPTION
diff --git a/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp b/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp
index a3b79d34e538..60c844740590 100644
--- a/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp
@@ -79,21 +79,25 @@ lclMaxNumEntriesRowMatrix (const Tpetra::RowMatrix<SC, LO, GO, NT>& A)
 
 template<class SC, class LO, class GO, class NT>
 void
-forEachLocalRowMatrixRow (const Tpetra::RowMatrix<SC, LO, GO, NT>& A,
-                          const LO lclNumRows,
-                          const std::size_t maxNumEnt,
-                          std::function<void (const LO lclRow,
-                                              const Teuchos::ArrayView<LO>& /* ind */,
-                                              const Teuchos::ArrayView<SC>& /* val */,
-                                              std::size_t /* numEnt */ )> doForEachRow)
+forEachLocalRowMatrixRow (
+  const Tpetra::RowMatrix<SC, LO, GO, NT>& A,
+  const LO lclNumRows,
+  const std::size_t maxNumEnt,
+  std::function<void (
+       const LO lclRow,
+       const typename Tpetra::RowMatrix<SC, LO, GO, NT>::nonconst_local_inds_host_view_type& /*ind*/,
+       const typename Tpetra::RowMatrix<SC, LO, GO, NT>::nonconst_values_host_view_type& /*val*/,
+       std::size_t /*numEnt*/ )> doForEachRow)
 {
-  Teuchos::Array<LO> indBuf (maxNumEnt);
-  Teuchos::Array<SC> valBuf (maxNumEnt);
+  using lids_type = typename Tpetra::RowMatrix<SC, LO, GO, NT>::nonconst_local_inds_host_view_type;
+  using vals_type = typename Tpetra::RowMatrix<SC, LO, GO, NT>::nonconst_values_host_view_type;
+  lids_type indBuf("indices",maxNumEnt);
+  vals_type valBuf("values",maxNumEnt);
 
   for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
     std::size_t numEnt = A.getNumEntriesInLocalRow (lclRow);
-    Teuchos::ArrayView<LO> ind = indBuf.view (0, numEnt);
-    Teuchos::ArrayView<SC> val = valBuf.view (0, numEnt);
+    lids_type ind = Kokkos::subview(indBuf,std::make_pair((size_t)0, numEnt));
+    vals_type val = Kokkos::subview(valBuf,std::make_pair((size_t)0, numEnt));
     A.getLocalRowCopy (lclRow, ind, val, numEnt);
     doForEachRow (lclRow, ind, val, numEnt);
   }
@@ -101,11 +105,13 @@ forEachLocalRowMatrixRow (const Tpetra::RowMatrix<SC, LO, GO, NT>& A,
 
 template<class SC, class LO, class GO, class NT>
 void
-forEachLocalRowMatrixRow (const Tpetra::RowMatrix<SC, LO, GO, NT>& A,
-                          std::function<void (const LO lclRow,
-                                              const Teuchos::ArrayView<LO>& /* ind */,
-                                              const Teuchos::ArrayView<SC>& /* val */,
-                                              std::size_t /* numEnt */ )> doForEachRow)
+forEachLocalRowMatrixRow (
+  const Tpetra::RowMatrix<SC, LO, GO, NT>& A,
+  std::function<void (
+       const LO lclRow,
+       const typename Tpetra::RowMatrix<SC, LO, GO, NT>::nonconst_local_inds_host_view_type& /*ind*/,
+       const typename Tpetra::RowMatrix<SC, LO, GO, NT>::nonconst_values_host_view_type& /*val*/,
+       std::size_t /*numEnt*/ )> doForEachRow)
 {
   const auto& rowMap = * (A.getRowMap ());
   const LO lclNumRows = static_cast<LO> (rowMap.getNodeNumElements ());
@@ -132,8 +138,8 @@ computeLocalRowScaledColumnNorms_RowMatrix (EquilibrationInfo<typename Kokkos::A
 
   forEachLocalRowMatrixRow<SC, LO, GO, NT> (A,
     [&] (const LO lclRow,
-         const Teuchos::ArrayView<LO>& ind,
-         const Teuchos::ArrayView<SC>& val,
+         const typename Tpetra::RowMatrix<SC, LO, GO, NT>::nonconst_local_inds_host_view_type& ind,
+         const typename Tpetra::RowMatrix<SC, LO, GO, NT>::nonconst_values_host_view_type& val,
          std::size_t numEnt) {
       const mag_type rowNorm = rowNorms_h[lclRow];
       for (std::size_t k = 0; k < numEnt; ++k) {
@@ -169,8 +175,8 @@ computeLocalRowOneNorms_RowMatrix (const Tpetra::RowMatrix<SC, LO, GO, NT>& A)
 
   forEachLocalRowMatrixRow<SC, LO, GO, NT> (A,
     [&] (const LO lclRow,
-         const Teuchos::ArrayView<LO>& ind,
-         const Teuchos::ArrayView<SC>& val,
+         const typename Tpetra::RowMatrix<SC, LO, GO, NT>::nonconst_local_inds_host_view_type& ind,
+         const typename Tpetra::RowMatrix<SC, LO, GO, NT>::nonconst_values_host_view_type& val,
          std::size_t numEnt) {
       mag_type rowNorm {0.0};
       val_type diagVal {0.0};
@@ -238,8 +244,8 @@ computeLocalRowAndColumnOneNorms_RowMatrix (const Tpetra::RowMatrix<SC, LO, GO,
 
   forEachLocalRowMatrixRow<SC, LO, GO, NT> (A,
     [&] (const LO lclRow,
-         const Teuchos::ArrayView<LO>& ind,
-         const Teuchos::ArrayView<SC>& val,
+         const typename Tpetra::RowMatrix<SC, LO, GO, NT>::nonconst_local_inds_host_view_type& ind,
+         const typename Tpetra::RowMatrix<SC, LO, GO, NT>::nonconst_values_host_view_type& val,
          std::size_t numEnt) {
       mag_type rowNorm {0.0};
       val_type diagVal {0.0};
@@ -303,7 +309,7 @@ class ComputeLocalRowScaledColumnNorms {
                                     const crs_matrix_type& A) :
     rowScaledColNorms_ (rowScaledColNorms),
     rowNorms_ (rowNorms),
-    A_lcl_ (A.getLocalMatrix ())
+    A_lcl_ (A.getLocalMatrixDevice ())
   {}
 
   KOKKOS_INLINE_FUNCTION void operator () (const LO lclRow) const {
@@ -340,8 +346,8 @@ class ComputeLocalRowScaledColumnNorms {
   Kokkos::View<mag_type*, device_type> rowScaledColNorms_;
   Kokkos::View<const mag_type*, device_type> rowNorms_;
 
-  using local_matrix_type = typename crs_matrix_type::local_matrix_type;
-  local_matrix_type A_lcl_;
+  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
+  local_matrix_device_type A_lcl_;
 };
 
 template<class SC, class LO, class GO, class NT>
@@ -393,12 +399,12 @@ class ComputeLocalRowOneNorms {
 public:
   using val_type = typename Kokkos::ArithTraits<SC>::val_type;
   using equib_info_type = EquilibrationInfo<val_type, typename NT::device_type>;
-  using local_matrix_type =
-    typename ::Tpetra::CrsMatrix<SC, LO, GO, NT>::local_matrix_type;
+  using local_matrix_device_type =
+    typename ::Tpetra::CrsMatrix<SC, LO, GO, NT>::local_matrix_device_type;
   using local_map_type = typename ::Tpetra::Map<LO, GO, NT>::local_map_type;
 
   ComputeLocalRowOneNorms (const equib_info_type& equib,   // in/out
-                           const local_matrix_type& A_lcl, // in
+                           const local_matrix_device_type& A_lcl, // in
                            const local_map_type& rowMap,   // in
                            const local_map_type& colMap) : // in
     equib_ (equib),
@@ -474,7 +480,7 @@ class ComputeLocalRowOneNorms {
 
 private:
   equib_info_type equib_;
-  local_matrix_type A_lcl_;
+  local_matrix_device_type A_lcl_;
   local_map_type rowMap_;
   local_map_type colMap_;
 };
@@ -486,12 +492,12 @@ class ComputeLocalRowAndColumnOneNorms {
 public:
   using val_type = typename Kokkos::ArithTraits<SC>::val_type;
   using equib_info_type = EquilibrationInfo<val_type, typename NT::device_type>;
-  using local_matrix_type = typename ::Tpetra::CrsMatrix<SC, LO, GO, NT>::local_matrix_type;
+  using local_matrix_device_type = typename ::Tpetra::CrsMatrix<SC, LO, GO, NT>::local_matrix_device_type;
   using local_map_type = typename ::Tpetra::Map<LO, GO, NT>::local_map_type;
 
 public:
   ComputeLocalRowAndColumnOneNorms (const equib_info_type& equib,   // in/out
-                                    const local_matrix_type& A_lcl, // in
+                                    const local_matrix_device_type& A_lcl, // in
                                     const local_map_type& rowMap,   // in
                                     const local_map_type& colMap) : // in
     equib_ (equib),
@@ -580,7 +586,7 @@ class ComputeLocalRowAndColumnOneNorms {
 
 private:
   equib_info_type equib_;
-  local_matrix_type A_lcl_;
+  local_matrix_device_type A_lcl_;
   local_map_type rowMap_;
   local_map_type colMap_;
 };
@@ -603,7 +609,7 @@ computeLocalRowOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A)
   constexpr bool assumeSymmetric = false; // doesn't matter here
   equib_info_type equib (lclNumRows, lclNumCols, assumeSymmetric);
 
-  functor_type functor (equib, A.getLocalMatrix (),
+  functor_type functor (equib, A.getLocalMatrixDevice (),
                         A.getRowMap ()->getLocalMap (),
                         A.getColMap ()->getLocalMap ());
   int result = 0;
@@ -635,7 +641,7 @@ computeLocalRowAndColumnOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO,
   const LO lclNumCols = static_cast<LO> (A.getColMap ()->getNodeNumElements ());
   equib_info_type equib (lclNumRows, lclNumCols, assumeSymmetric);
 
-  functor_type functor (equib, A.getLocalMatrix (),
+  functor_type functor (equib, A.getLocalMatrixDevice (),
                         A.getRowMap ()->getLocalMap (),
                         A.getColMap ()->getLocalMap ());
   int result = 0;
diff --git a/packages/tpetra/core/src/Tpetra_createDeepCopy.hpp b/packages/tpetra/core/src/Tpetra_createDeepCopy.hpp
index d51ca698395a..095530ad229b 100644
--- a/packages/tpetra/core/src/Tpetra_createDeepCopy.hpp
+++ b/packages/tpetra/core/src/Tpetra_createDeepCopy.hpp
@@ -1,13 +1,19 @@
 #ifndef TPETRA_CREATEDEEPCOPY_HPP
 #define TPETRA_CREATEDEEPCOPY_HPP
 
 #include "TpetraCore_config.h"
+// Keep this guard below the include above: the config header is presumably what defines TPETRA_ENABLE_DEPRECATED_CODE and TPETRA_DEPRECATED (TODO confirm).
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
 
 namespace Tpetra {
 
 template<class OutputType, class InputType>
-OutputType createDeepCopy (const InputType& in);
+OutputType
+TPETRA_DEPRECATED
+createDeepCopy (const InputType& in);
 
 } // namespace Tpetra
 
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+
 #endif // TPETRA_CREATEDEEPCOPY_HPP
diff --git a/packages/tpetra/core/src/Tpetra_createDeepCopy_CrsMatrix_decl.hpp b/packages/tpetra/core/src/Tpetra_createDeepCopy_CrsMatrix_decl.hpp
index a12b828a1a4e..c090c7d21c6f 100644
--- a/packages/tpetra/core/src/Tpetra_createDeepCopy_CrsMatrix_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_createDeepCopy_CrsMatrix_decl.hpp
@@ -5,12 +5,17 @@
 #include "Tpetra_CrsMatrix_fwd.hpp"
 #include "Tpetra_RowMatrix_fwd.hpp"
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+
 namespace Tpetra {
 
 template<class SC, class LO, class GO, class NT>
 CrsMatrix<SC, LO, GO, NT>
+TPETRA_DEPRECATED
 createDeepCopy (const RowMatrix<SC, LO, GO, NT>& in);
 
 } // namespace Tpetra
 
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+
 #endif // TPETRA_CREATEDEEPCOPY_CRSMATRIX_DECL_HPP
diff --git a/packages/tpetra/core/src/Tpetra_createDeepCopy_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_createDeepCopy_CrsMatrix_def.hpp
index 48912eb2a122..1e4d91fd4072 100644
--- a/packages/tpetra/core/src/Tpetra_createDeepCopy_CrsMatrix_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_createDeepCopy_CrsMatrix_def.hpp
@@ -1,6 +1,8 @@
 #ifndef TPETRA_CREATEDEEPCOPY_CRSMATRIX_DEF_HPP
 #define TPETRA_CREATEDEEPCOPY_CRSMATRIX_DEF_HPP
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE 
+
 #include "Tpetra_CrsMatrix.hpp"
 #include "Tpetra_Map.hpp"
 #include "Tpetra_RowMatrix.hpp"
@@ -9,12 +11,14 @@
 #include "Teuchos_ArrayView.hpp"
 #include <memory>
 
+
 namespace Tpetra {
 
 namespace { // (anonymous)
 
 template<class SC, class LO, class GO, class NT>
 typename CrsMatrix<SC, LO, GO, NT>::local_matrix_type
+TPETRA_DEPRECATED
 localDeepCopyFillCompleteCrsMatrix (const CrsMatrix<SC, LO, GO, NT>& A)
 {
   using Kokkos::view_alloc;
@@ -22,16 +26,16 @@ localDeepCopyFillCompleteCrsMatrix (const CrsMatrix<SC, LO, GO, NT>& A)
   using crs_matrix_type = CrsMatrix<SC, LO, GO, NT>;
   using local_matrix_type =
     typename crs_matrix_type::local_matrix_type;
-  local_matrix_type A_lcl = A.getLocalMatrix ();
+  local_matrix_type A_lcl = A.getLocalMatrixDevice ();
 
-  using local_graph_type = typename crs_matrix_type::local_graph_type;
-  using inds_type = typename local_graph_type::entries_type;
+  using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
+  using inds_type = typename local_graph_device_type::entries_type;
   inds_type ind (view_alloc ("ind", WithoutInitializing),
                  A_lcl.graph.entries.extent (0));
   Kokkos::deep_copy (ind, A_lcl.graph.entries);
 
   using offsets_type =
-    typename local_graph_type::row_map_type::non_const_type;
+    typename local_graph_device_type::row_map_type::non_const_type;
   offsets_type ptr (view_alloc ("ptr", WithoutInitializing),
                     A_lcl.graph.row_map.extent (0));
   Kokkos::deep_copy (ptr, A_lcl.graph.row_map);
@@ -41,7 +45,7 @@ localDeepCopyFillCompleteCrsMatrix (const CrsMatrix<SC, LO, GO, NT>& A)
                    A_lcl.values.extent (0));
   Kokkos::deep_copy (val, A_lcl.values);
 
-  local_graph_type lclGraph (ind, ptr);
+  local_graph_device_type lclGraph (ind, ptr);
   const size_t numCols = A.getColMap ()->getNodeNumElements ();
   return local_matrix_type (A.getObjectLabel (), numCols, val, lclGraph);
 }
@@ -50,6 +54,7 @@ localDeepCopyFillCompleteCrsMatrix (const CrsMatrix<SC, LO, GO, NT>& A)
 
 template<class SC, class LO, class GO, class NT>
 CrsMatrix<SC, LO, GO, NT>
+TPETRA_DEPRECATED
 createDeepCopy (const RowMatrix<SC, LO, GO, NT>& A)
 {
   using crs_matrix_type = CrsMatrix<SC, LO, GO, NT>;
@@ -89,33 +94,38 @@ createDeepCopy (const RowMatrix<SC, LO, GO, NT>& A)
       crs_matrix_type (A.getRowMap (), entPerRow_av);
 
     const bool hasViews = A.supportsRowViews ();
-
-    Teuchos::Array<GO> inputIndsBuf;
-    Teuchos::Array<SC> inputValsBuf;
+    
+    typename crs_matrix_type::nonconst_global_inds_host_view_type inputIndsBuf;
+    typename crs_matrix_type::nonconst_values_host_view_type inputValsBuf;
     if (! hasViews) {
-      inputIndsBuf.resize (maxNumEnt);
-      inputValsBuf.resize (maxNumEnt);
+      Kokkos::resize(inputIndsBuf,maxNumEnt);
+      Kokkos::resize(inputValsBuf,maxNumEnt);
     }
 
     const auto& rowMap = * (A.getRowMap ());
     for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
       const GO gblRow = rowMap.getGlobalElement (lclRow);
-      Teuchos::ArrayView<const GO> inputInds_av;
-      Teuchos::ArrayView<const SC> inputVals_av;
-      size_t numEnt = 0;
       if (hasViews) {
-        A.getGlobalRowView (gblRow, inputInds_av, inputVals_av);
-        numEnt = static_cast<size_t> (inputInds_av.size ());
+        typename crs_matrix_type::global_inds_host_view_type inputInds;
+        typename crs_matrix_type::values_host_view_type inputVals;
+        A.getGlobalRowView (gblRow, inputInds, inputVals);
+        // BAD BAD BAD
+        // we want a better way than reinterpret casting back and forth between scalar_type and
+        // impl_scalar_type everywhere
+        A_copy.insertGlobalValues (gblRow, inputInds.extent(0),
+                                   reinterpret_cast<const typename crs_matrix_type::scalar_type*>(inputVals.data()),
+                                   inputInds.data());
       }
       else {
         const size_t lclNumEnt = A.getNumEntriesInLocalRow (lclRow);
         TEUCHOS_ASSERT(lclNumEnt <= maxNumEnt);
-        A.getGlobalRowCopy (gblRow, inputIndsBuf (),
-                            inputValsBuf (), numEnt);
-        inputInds_av = inputIndsBuf.view (0, numEnt);
-        inputVals_av = inputValsBuf.view (0, numEnt);
+        size_t numEnt = 0;
+        A.getGlobalRowCopy (gblRow, inputIndsBuf, inputValsBuf, numEnt);
+        A_copy.insertGlobalValues (gblRow, numEnt, 
+                                   reinterpret_cast<const typename crs_matrix_type::scalar_type*>(inputValsBuf.data()),
+                                   inputIndsBuf.data());
+
       }
-      A_copy.insertGlobalValues (gblRow, inputInds_av, inputVals_av);
     }
 
     if (A.isFillComplete ()) {
@@ -162,4 +172,7 @@ createDeepCopy (const RowMatrix<SC, LO, GO, NT>& A)
   template CrsMatrix< SC , LO , GO , NT > \
   createDeepCopy (const RowMatrix<SC, LO, GO, NT>& );
 
+#endif // TPETRA_ENABLE_DEPRECATED_CODE
+
 #endif // TPETRA_CREATEDEEPCOPY_CRSMATRIX_DEF_HPP
+
diff --git a/packages/tpetra/core/src/Tpetra_leftAndOrRightScaleCrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_leftAndOrRightScaleCrsMatrix_def.hpp
index 49f81a0c8891..d480a9ecccd4 100644
--- a/packages/tpetra/core/src/Tpetra_leftAndOrRightScaleCrsMatrix_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_leftAndOrRightScaleCrsMatrix_def.hpp
@@ -83,7 +83,7 @@ leftAndOrRightScaleCrsMatrix (Tpetra::CrsMatrix<SC, LO, GO, NT>& A,
     // never been called on it before.  A never-initialized (and thus
     // invalid) local matrix has zero rows, because it was default
     // constructed.
-    auto A_lcl = A.getLocalMatrix ();
+    auto A_lcl = A.getLocalMatrixDevice ();
     const LO lclNumRows =
       static_cast<LO> (A.getRowMap ()->getNodeNumElements ());
     TEUCHOS_TEST_FOR_EXCEPTION
@@ -100,13 +100,13 @@ leftAndOrRightScaleCrsMatrix (Tpetra::CrsMatrix<SC, LO, GO, NT>& A,
 
   const bool divide = scaling == SCALING_DIVIDE;
   if (leftScale) {
-    Details::leftScaleLocalCrsMatrix (A.getLocalMatrix (),
+    Details::leftScaleLocalCrsMatrix (A.getLocalMatrixDevice (),
                                       rowScalingFactors,
                                       assumeSymmetric,
                                       divide);
   }
   if (rightScale) {
-    Details::rightScaleLocalCrsMatrix (A.getLocalMatrix (),
+    Details::rightScaleLocalCrsMatrix (A.getLocalMatrixDevice (),
                                        colScalingFactors,
                                        assumeSymmetric,
                                        divide);
diff --git a/packages/tpetra/core/test/Block/BlockCrsMatrix.cpp b/packages/tpetra/core/test/Block/BlockCrsMatrix.cpp
index 63148c2047b6..16612e53fbf2 100644
--- a/packages/tpetra/core/test/Block/BlockCrsMatrix.cpp
+++ b/packages/tpetra/core/test/Block/BlockCrsMatrix.cpp
@@ -52,6 +52,7 @@
 
 namespace {
   using Tpetra::TestingUtilities::getDefaultComm;
+  using Tpetra::TestingUtilities::arcp_from_view;
   using Tpetra::Details::gathervPrint;
   using Teuchos::Array;
   using Teuchos::Comm;
@@ -157,10 +158,20 @@ namespace {
     typedef Tpetra::MultiVector<Scalar, LO, GO, Node> mv_type;
     typedef Tpetra::Vector<Scalar, LO, GO, Node> vec_type;
     typedef Tpetra::CrsGraph<LO, GO, Node> graph_type;
+
     typedef Tpetra::Map<LO, GO, Node> map_type;
+
+    using lids_type = typename graph_type::nonconst_local_inds_host_view_type;
+    using gids_type = typename graph_type::nonconst_global_inds_host_view_type;
+    using vals_type = typename BCM::nonconst_values_host_view_type;
+    using local_inds_host_view_type = typename BCM::local_inds_host_view_type;
+    using values_host_view_type = typename BCM::values_host_view_type;
+    using impl_scalar_type = typename BCM::impl_scalar_type;
+
     // The typedef below is also a test.  BlockCrsMatrix must have
     // this typedef, or this test won't compile.
     typedef typename BCM::little_block_type little_block_type;
+    typedef typename BCM::little_block_host_type little_block_host_type;
     typedef Teuchos::ScalarTraits<Scalar> STS;
     typedef typename STS::magnitudeType MT;
 
@@ -200,7 +211,7 @@ namespace {
     graph_type graph (meshRowMapPtr, maxNumEntPerRow, Tpetra::StaticProfile);
 
     // Fill the graph.
-    Teuchos::Array<GO> gblColInds (maxNumEntPerRow);
+    gids_type gblColInds ("gblColIds",maxNumEntPerRow);
     const GO globalNumRows = meshRowMap.getGlobalNumElements ();
     for (LO lclRowInd = meshRowMap.getMinLocalIndex ();
          lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) {
@@ -211,7 +222,7 @@ namespace {
           static_cast<GO> (globalNumRows);
         gblColInds[k] = gblColInd;
       }
-      graph.insertGlobalIndices (gblRowInd, gblColInds ());
+      graph.insertGlobalIndices (gblRowInd, gblColInds.extent(0),gblColInds.data());
     }
     graph.fillComplete ();
 
@@ -257,48 +268,38 @@ namespace {
 
     out << "Test getLocalRowView, getLocalRowCopy, and replaceLocalValues" << endl;
 
-    blockMat.sync_host ();
-    blockMat.modify_host ();
-    {
-      if (! std::is_same<typename Node::device_type::memory_space, Kokkos::HostSpace>::value) {
-        // This is messed up with HIP using HIPHostPinnedSpace as its memory space
-        #ifndef KOKKOS_ENABLE_HIP
-        TEST_ASSERT( blockMat.template need_sync<typename Node::device_type::memory_space> () );
-        TEST_ASSERT( ! blockMat.template need_sync<Kokkos::HostSpace> () );
-        #endif
-        TEST_ASSERT( blockMat.need_sync_device () );
-        TEST_ASSERT( ! blockMat.need_sync_host () );
-      }
-      auto val = blockMat.template getValues<Kokkos::HostSpace> ();
-      // "Host" View may live in CudaUVMSpace, but its execution space
-      // had better be host.  We can tell that by getting the
-      // execution space's default memory space.
-      static_assert (std::is_same<typename decltype (val)::execution_space::memory_space,
-                     Kokkos::HostSpace>::value,
-                     "Host View is not actually a host View.");
-      auto val2 = blockMat.getValuesHost ();
-      static_assert (std::is_same<typename decltype (val2)::execution_space::memory_space,
-                     Kokkos::HostSpace>::value,
-                     "Host View is not actually a host View.");
-    }
+    // KK: not a meaningful test
+    // {
+    //   auto val = blockMat.getValuesHost(); 
+    //   static_assert (std::is_same<typename decltype (val)::execution_space::memory_space,
+    //                  Kokkos::HostSpace>::value,
+    //                  "Host View is not actually a host View.");
+    // }
+    // {
+    //   auto val = blockMat.getValuesHostNonConst ();
+    //   static_assert (std::is_same<typename decltype (val)::execution_space::memory_space,
+    //                  Kokkos::HostSpace>::value,
+    //                  "Host View is not actually a host View.");
+    // }
 
     Array<Scalar> tempBlockSpace (maxNumEntPerRow * entriesPerBlock);
 
     // Test that getLocalRowView returns the right column indices.
-    Array<LO> lclColInds (maxNumEntPerRow);
-    Array<LO> myLclColIndsCopy (maxNumEntPerRow);
-    Array<Scalar> myValsCopy (maxNumEntPerRow*entriesPerBlock);
-    Array<LO> myLclColIndsSorted (maxNumEntPerRow);
+
+    lids_type lclColInds ("lclColInds",maxNumEntPerRow);
+    lids_type myLclColIndsCopy ("myLclColIndsCopy",maxNumEntPerRow);
+    vals_type myValsCopy ("myValsCopy",maxNumEntPerRow*entriesPerBlock);
+    lids_type myLclColIndsSorted ("myLclColIndsSorted",maxNumEntPerRow);
     for (LO lclRowInd = meshRowMap.getMinLocalIndex ();
          lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) {
-      const LO* myLclColInds = NULL;
-      Scalar* myVals = NULL;
+      local_inds_host_view_type myLclColInds;
+      values_host_view_type myVals;
       LO numEnt = 0;
-      LO err = blockMat.getLocalRowView (lclRowInd, myLclColInds, myVals, numEnt);
-      TEST_ASSERT( err == 0 );
+      blockMat.getLocalRowView (lclRowInd, myLclColInds, myVals); numEnt = myLclColInds.extent(0);
+
       TEST_ASSERT( numEnt == static_cast<LO> (maxNumEntPerRow) );
-      TEST_ASSERT( myLclColInds != NULL );
-      TEST_ASSERT( myVals != NULL );
+      TEST_ASSERT( myLclColInds.data() != NULL );
+      TEST_ASSERT( myVals.data() != NULL );
 
       // Compute what the local column indices in this row _should_ be.
       const GO gblRowInd = meshRowMap.getGlobalElement (lclRowInd);
@@ -310,27 +311,27 @@ namespace {
       }
       // CrsGraph doesn't technically need to promise to sort by local
       // column indices, so we sort both arrays before comparing.
-      std::sort (lclColInds.begin (), lclColInds.end ());
-      std::copy (myLclColInds, myLclColInds + 2, myLclColIndsSorted.begin ());
-      std::sort (myLclColIndsSorted.begin (), myLclColIndsSorted.end ());
+      Tpetra::sort (lclColInds, lclColInds.extent(0));
+      std::copy (myLclColInds.data(), myLclColInds.data() + 2, arcp_from_view(myLclColIndsSorted).begin());
+      Tpetra::sort (myLclColIndsSorted, myLclColIndsSorted.extent(0));
       TEST_COMPARE_ARRAYS( lclColInds, myLclColIndsSorted );
 
       // Test that getLocalRowCopy works.
       size_t numEntries;
-      blockMat.getLocalRowCopy (lclRowInd, myLclColIndsCopy(), myValsCopy(), numEntries);
+      blockMat.getLocalRowCopy (lclRowInd, myLclColIndsCopy, myValsCopy, numEntries);
       numEnt = static_cast<LO>(numEntries);
-      TEST_ASSERT( err == 0 );
       TEST_ASSERT( numEnt == static_cast<LO> (maxNumEntPerRow) );
 
       // CrsGraph doesn't technically need to promise to sort by local
       // column indices, so we sort both arrays before comparing.
-      std::copy (myLclColIndsCopy.getRawPtr(), myLclColIndsCopy.getRawPtr() + 2, myLclColIndsSorted.begin ());
-      std::sort (myLclColIndsSorted.begin (), myLclColIndsSorted.end ());
+      Kokkos::deep_copy(myLclColIndsSorted,Kokkos::subview(myLclColIndsCopy,std::make_pair(0,2)));
+      //      std::copy (myLclColIndsCopy.getRawPtr(), myLclColIndsCopy.getRawPtr() + 2, myLclColIndsSorted.begin ());
+      Tpetra::sort (myLclColIndsSorted, myLclColIndsSorted.extent(0));
       TEST_COMPARE_ARRAYS( lclColInds, myLclColIndsSorted );
 
       // Fill the entries in the row with zeros.
       std::fill (tempBlockSpace.begin (), tempBlockSpace.end (), STS::zero ());
-      err = blockMat.replaceLocalValues (lclRowInd, lclColInds.getRawPtr (),
+      int err = blockMat.replaceLocalValues (lclRowInd, lclColInds.data(),
                                          tempBlockSpace.getRawPtr (), numEnt);
       TEST_ASSERT( err == numEnt );
       // Make sure that the input Scalar values didn't change (are
@@ -338,7 +339,7 @@ namespace {
       for (LO k = 0; k < numEnt; ++k) {
         Scalar* const tempBlockPtr = tempBlockSpace.getRawPtr () +
           k * blockSize * blockSize;
-        little_block_type tempBlock ((typename little_block_type::value_type*) tempBlockPtr, blockSize, blockSize);
+        little_block_host_type tempBlock ((typename little_block_host_type::value_type*) tempBlockPtr, blockSize, blockSize);
         for (LO j = 0; j < blockSize; ++j) {
           for (LO i = 0; i < blockSize; ++i) {
             TEST_ASSERT( static_cast<Scalar> (tempBlock(i,j)) == STS::zero () );
@@ -351,29 +352,29 @@ namespace {
       for (LO k = 0; k < numEnt; ++k) {
         Scalar* const tempBlockPtr = tempBlockSpace.getRawPtr () +
           k * blockSize * blockSize;
-        little_block_type tempBlock ((typename little_block_type::value_type*) tempBlockPtr, blockSize, blockSize);
+        little_block_host_type tempBlock ((typename little_block_host_type::value_type*) tempBlockPtr, blockSize, blockSize);
         for (LO j = 0; j < blockSize; ++j) {
           for (LO i = 0; i < blockSize; ++i) {
             tempBlock(i,j) = static_cast<Scalar> (static_cast<MT> (j + i * blockSize));
           }
         }
       } // for each entry in the row
-      err = blockMat.replaceLocalValues (lclRowInd, lclColInds.getRawPtr (),
+      err = blockMat.replaceLocalValues (lclRowInd, lclColInds.data(),
                                          tempBlockSpace.getRawPtr (), numEnt);
       TEST_ASSERT( err == numEnt );
 
       // Get a view of the current row again, and test that the
       // entries were modified as expected.  This tests that the
       // method assumes that the input blocks are row major.
-      err = blockMat.getLocalRowView (lclRowInd, myLclColInds, myVals, numEnt);
-      TEST_ASSERT( err == 0 );
-      TEST_ASSERT( numEnt == static_cast<LO> (maxNumEntPerRow) );
-      TEST_ASSERT( myLclColInds != NULL );
-      TEST_ASSERT( myVals != NULL );
+      blockMat.getLocalRowView (lclRowInd, myLclColInds, myVals); numEnt = static_cast<LO>(myLclColInds.extent(0));
 
+
+      TEST_ASSERT( numEnt == static_cast<LO> (maxNumEntPerRow) );
+      TEST_ASSERT( myLclColInds.data() != NULL );
+      TEST_ASSERT( myVals.data() != NULL );
       for (LO k = 0; k < numEnt; ++k) {
-        Scalar* curBlkPtr = myVals + k * blockSize * blockSize;
-        little_block_type curBlk ((typename little_block_type::value_type*) curBlkPtr, blockSize, blockSize);
+        impl_scalar_type* curBlkPtr = const_cast<impl_scalar_type*>(reinterpret_cast<const impl_scalar_type*>(myVals.data())) + k * blockSize * blockSize;
+        little_block_host_type curBlk ((typename little_block_host_type::value_type*) curBlkPtr, blockSize, blockSize);
 
         for (LO j = 0; j < blockSize; ++j) {
           for (LO i = 0; i < blockSize; ++i) {
@@ -384,25 +385,22 @@ namespace {
       } // for each entry in the row
     } // for each local row
 
-    // We're done modifying data on host.
-    blockMat.template sync<typename Node::device_type::memory_space> ();
-    {
-      TEST_ASSERT( ! blockMat.template need_sync<typename Node::device_type::memory_space> () );
-      TEST_ASSERT( ! blockMat.template need_sync<Kokkos::HostSpace> () );
-      auto val = blockMat.template getValues<typename Node::device_type::memory_space> ();
-      // "Device" View may live in CudaUVMSpace.
-#if defined(KOKKOS_ENABLE_CUDA)
-      constexpr bool testing_cuda =
-        std::is_same<typename Node::device_type::execution_space, Kokkos::Cuda>::value;
-      static_assert (! testing_cuda ||
-                     std::is_same<typename decltype (val)::execution_space, Kokkos::Cuda>::value,
-                     "Device View is not actually a Device View.");
-      auto val2 = blockMat.getValuesDevice ();
-      static_assert (! testing_cuda ||
-                     std::is_same<typename decltype (val2)::execution_space, Kokkos::Cuda>::value,
-                     "Device View is not actually a Device View.");
-#endif // defined(KOKKOS_ENABLE_CUDA)
-    }
+    // KK: not a meaningful test; will be deprecated
+//     {
+//       auto val = blockMat.template getValues<typename Node::device_type::memory_space> ();
+//       // "Device" View may live in CudaUVMSpace.
+// #if defined(KOKKOS_ENABLE_CUDA)
+//       constexpr bool testing_cuda =
+//         std::is_same<typename Node::device_type::execution_space, Kokkos::Cuda>::value;
+//       static_assert (! testing_cuda ||
+//                      std::is_same<typename decltype (val)::execution_space, Kokkos::Cuda>::value,
+//                      "Device View is not actually a Device View.");
+//       auto val2 = blockMat.getValuesDevice ();
+//       static_assert (! testing_cuda ||
+//                      std::is_same<typename decltype (val2)::execution_space, Kokkos::Cuda>::value,
+//                      "Device View is not actually a Device View.");
+// #endif // defined(KOKKOS_ENABLE_CUDA)
+//     }
 
     out << "Test applyBlock for a single vector" << endl;
 
@@ -883,9 +881,13 @@ namespace {
     // The typedef below is also a test.  BlockCrsMatrix must have
     // this typedef, or this test won't compile.
     typedef typename BCM::little_block_type little_block_type;
+    typedef typename BCM::little_block_host_type little_block_host_type;
     typedef Teuchos::ScalarTraits<Scalar> STS;
     typedef typename STS::magnitudeType MT;
 
+    using local_inds_host_view_type = typename BCM::local_inds_host_view_type;
+    using values_host_view_type = typename BCM::values_host_view_type;
+
     out << "Testing output of a Tpetra::BlockCrsMatrix" << endl;
     Teuchos::OSTab tab0 (out);
 
@@ -949,10 +951,11 @@ namespace {
     Array<LO> myLclColIndsCopy (maxNumEntPerRow);
     for (LO lclRowInd = meshRowMap.getMinLocalIndex ();
          lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) {
-      const LO* myLclColInds = NULL;
-      Scalar* myVals = NULL;
+
+      local_inds_host_view_type myLclColInds;
+      values_host_view_type myVals;
       LO numEnt = 0;
-      blockMat.getLocalRowView (lclRowInd, myLclColInds, myVals, numEnt);
+      blockMat.getLocalRowView (lclRowInd, myLclColInds, myVals); numEnt = myLclColInds.extent(0);
 
       // Compute what the local column indices in this row _should_ be.
       const GO gblRowInd = meshRowMap.getGlobalElement (lclRowInd);
@@ -965,7 +968,7 @@ namespace {
       // CrsGraph doesn't technically need to promise to sort by local
       // column indices, so we sort both arrays before comparing.
       std::sort (lclColInds.begin (), lclColInds.end ());
-      std::copy (myLclColInds, myLclColInds + 2, myLclColIndsCopy.begin ());
+      std::copy (myLclColInds.data(), myLclColInds.data() + 2, myLclColIndsCopy.begin ());
       std::sort (myLclColIndsCopy.begin (), myLclColIndsCopy.end ());
       TEST_COMPARE_ARRAYS( lclColInds, myLclColIndsCopy );
 
@@ -979,7 +982,7 @@ namespace {
       for (LO k = 0; k < numEnt; ++k) {
         Scalar* const tempBlockPtr = tempBlockSpace.getRawPtr () +
           k * blockSize * blockSize;
-        little_block_type tempBlock ((typename little_block_type::value_type*) tempBlockPtr, blockSize, blockSize);
+        little_block_host_type tempBlock ((typename little_block_host_type::value_type*) tempBlockPtr, blockSize, blockSize);
         for (LO j = 0; j < blockSize; ++j) {
           for (LO i = 0; i < blockSize; ++i) {
             tempBlock(i,j) = static_cast<Scalar> (static_cast<MT> (j + i * blockSize) + 0.0123);
@@ -1014,8 +1017,12 @@ namespace {
     // The typedef below is also a test.  BlockCrsMatrix must have
     // this typedef, or this test won't compile.
     typedef typename BCM::little_block_type little_block_type;
+    typedef typename BCM::little_block_host_type little_block_host_type;
     typedef Teuchos::ScalarTraits<Scalar> STS;
 
+    using local_inds_host_view_type = typename BCM::local_inds_host_view_type;
+    using values_host_view_type = typename BCM::values_host_view_type;
+
     int lclSuccess = 1;
     int gblSuccess = 1;
 
@@ -1091,6 +1098,7 @@ namespace {
       std::cerr << os.str ();
     }
     Kokkos::fence ();
+    auto diagMeshOffsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), diagMeshOffsets);
 
     lclSuccess = success ? 1 : 0;
     reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
@@ -1115,7 +1123,7 @@ namespace {
     }
     else {
       TEST_ASSERT( diagMeshOffsets.extent (0) != 0 );
-      auto localGraph = graph.getLocalGraph ();
+      auto localGraph = graph.getLocalGraphDevice ();
       const auto& colMap = * (graph.getColMap ());
 
       TEST_EQUALITY( static_cast<size_t> (numLclMeshPoints + 1),
@@ -1129,13 +1137,12 @@ namespace {
           const GO gblColInd = gblRowInd;
           bool diagOffsetCorrect = false;
 
-          const LO* lclColInds = NULL;
-          Scalar* lclVals = NULL;
+          local_inds_host_view_type lclColInds;
+          values_host_view_type lclVals;
           LO numEnt = 0;
-          LO err = blockMat.getLocalRowView (lclRowInd, lclColInds, lclVals, numEnt);
-          TEST_ASSERT( err == 0 );
-          if (err == 0) {
-            const size_t offset = diagMeshOffsets[lclRowInd];
+          blockMat.getLocalRowView (lclRowInd, lclColInds, lclVals); numEnt = lclColInds.extent(0);
+          {
+            const size_t offset = diagMeshOffsetsHost[lclRowInd];
             if (offset >= static_cast<size_t> (numEnt)) {
               diagOffsetCorrect = false;
             }
@@ -1175,10 +1182,10 @@ namespace {
     for (LO lclRowInd = meshRowMap.getMinLocalIndex ();
          lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) {
       const GO gblRowInd = meshRowMap.getGlobalElement (lclRowInd);
-      const LO* lclColInds = NULL;
-      Scalar* myVals = NULL;
+      local_inds_host_view_type lclColInds;
+      values_host_view_type myVals;
       LO numEnt = 0;
-      blockMat.getLocalRowView (lclRowInd, lclColInds, myVals, numEnt);
+      blockMat.getLocalRowView (lclRowInd, lclColInds, myVals); numEnt = lclColInds.extent(0);
 
       // Fill the diagonal block D such that D(i,j) = (lclRowInd+1) *
       // (1 + i + j*blockSize).  Fill the off-diagonal block with -1.
@@ -1186,8 +1193,8 @@ namespace {
       // that we copied them in the correct order.
       for (LO k = 0; k < numEnt; ++k) {
         const LO offset = blockSize * blockSize * k;
-        little_block_type curBlock (reinterpret_cast<IST*> (myVals) + offset,
-                                    blockSize, blockSize); // row major
+        little_block_host_type curBlock (const_cast<IST*> (myVals.data()) + offset,
+                                         blockSize, blockSize); // row major
         const GO gblColInd = meshColMap.getGlobalElement (lclColInds[k]);
         if (gblColInd == gblRowInd) { // the diagonal block
           IST curVal = STS::one ();
@@ -1212,24 +1219,25 @@ namespace {
     Kokkos::fence ();
     blockMat.getLocalDiagCopy (diagBlocks, diagMeshOffsets);
     Kokkos::fence ();
+    auto diagBlocksHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), diagBlocks);
 
     bool allBlocksGood = true;
     for (LO lclRowInd = 0; lclRowInd < static_cast<LO> (numLclMeshPoints); ++lclRowInd) {
       const GO gblRowInd = meshRowMap.getGlobalElement (lclRowInd);
-      const LO* lclColInds = NULL;
-      Scalar* myVals = NULL;
+      local_inds_host_view_type lclColInds;
+      values_host_view_type myVals;
       LO numEnt = 0;
-      blockMat.getLocalRowView (lclRowInd, lclColInds, myVals, numEnt);
+      blockMat.getLocalRowView (lclRowInd, lclColInds, myVals); numEnt = lclColInds.extent(0);
 
       // Make sure that the diagonal blocks from getLocalDiagCopy
       // match those in the matrix.
       for (LO k = 0; k < numEnt; ++k) {
         const LO offset = blockSize * blockSize * k;
-        little_block_type curBlock (reinterpret_cast<IST*> (myVals) + offset,
+        little_block_host_type curBlock (const_cast<IST*> (myVals.data()) + offset,
                                     blockSize, blockSize); // row major
         const GO gblColInd = meshColMap.getGlobalElement (lclColInds[k]);
         if (gblColInd == gblRowInd) { // the diagonal block
-          auto diagBlock = subview (diagBlocks, lclRowInd, ALL (), ALL ());
+          auto diagBlock = subview (diagBlocksHost, lclRowInd, ALL (), ALL ());
           for (LO j = 0; j < blockSize; ++j) {
             for (LO i = 0; i < blockSize; ++i) {
               if (curBlock(i,j) != diagBlock(i,j)) {
diff --git a/packages/tpetra/core/test/BugTests/Bug5072.cpp b/packages/tpetra/core/test/BugTests/Bug5072.cpp
index 3dbabaa3dd0e..626138743d01 100644
--- a/packages/tpetra/core/test/BugTests/Bug5072.cpp
+++ b/packages/tpetra/core/test/BugTests/Bug5072.cpp
@@ -127,8 +127,8 @@ namespace {
     TEST_EQUALITY( testMatrix->getNodeNumCols(), readMatrix->getNodeNumCols() );
     TEST_EQUALITY( testMatrix->getNodeNumEntries(), readMatrix->getNodeNumEntries() );
     if (success) {
-      Teuchos::ArrayView<const LO>    rowinds1, rowinds2;
-      Teuchos::ArrayView<const SC> rowvals1, rowvals2;
+      typename crs_matrix_type::local_inds_host_view_type rowinds1, rowinds2;
+      typename crs_matrix_type::values_host_view_type rowvals1, rowvals2;
 
       const LO lclNumRows = testMatrix->getNodeNumRows ();
       for (LO r = 0; r < lclNumRows; ++r) {
diff --git a/packages/tpetra/core/test/CrsGraph/CrsGraph_Issue601.cpp b/packages/tpetra/core/test/CrsGraph/CrsGraph_Issue601.cpp
index b7766d2011b8..b640b3b65f38 100644
--- a/packages/tpetra/core/test/CrsGraph/CrsGraph_Issue601.cpp
+++ b/packages/tpetra/core/test/CrsGraph/CrsGraph_Issue601.cpp
@@ -159,37 +159,45 @@ namespace { // (anonymous)
 
         // Test gblRow0
         {
-          Teuchos::ArrayView<const GO> gblInds;
+          typename crs_graph_type::global_inds_host_view_type gblInds;
           G.getGlobalRowView (gblRow0, gblInds);
 
           const LO expectedNumEnt = static_cast<LO> (maxNumEntPerRow);
           TEST_EQUALITY( static_cast<LO> (gblInds.size ()), expectedNumEnt );
           if (static_cast<LO> (gblInds.size ()) == expectedNumEnt) {
             if (insertLocalEntry) {
-              auto lclEntIter = std::find (gblInds.begin (), gblInds.end (), gblRow0);
-              TEST_ASSERT( lclEntIter != gblInds.end () );
+              auto lclEntIter = std::find (gblInds.data(), 
+                                           gblInds.data() + gblInds.extent(0),
+                                           gblRow0);
+              TEST_ASSERT( lclEntIter != gblInds.data() + gblInds.extent(0));
             }
             const GO gblCol0 = gblRow0 + static_cast<GO> (numProcs);
-            auto nonlclEntIter = std::find (gblInds.begin (), gblInds.end (), gblCol0);
-            TEST_ASSERT( nonlclEntIter != gblInds.end () );
+            auto nonlclEntIter = std::find (gblInds.data(), 
+                                            gblInds.data() + gblInds.extent(0),
+                                            gblCol0);
+            TEST_ASSERT( nonlclEntIter != gblInds.data() + gblInds.extent(0));
           }
         }
 
         // Test gblRow1
         {
-          Teuchos::ArrayView<const GO> gblInds;
+          typename crs_graph_type::global_inds_host_view_type gblInds;
           G.getGlobalRowView (gblRow1, gblInds);
 
           const LO expectedNumEnt = static_cast<LO> (maxNumEntPerRow);
           TEST_EQUALITY( static_cast<LO> (gblInds.size ()), expectedNumEnt );
           if (static_cast<LO> (gblInds.size ()) == expectedNumEnt) {
             if (insertLocalEntry) {
-              auto lclEntIter = std::find (gblInds.begin (), gblInds.end (), gblRow1);
-              TEST_ASSERT( lclEntIter != gblInds.end () );
+              auto lclEntIter = std::find (gblInds.data(), 
+                                           gblInds.data() + gblInds.extent(0),
+                                           gblRow1);
+              TEST_ASSERT( lclEntIter != gblInds.data() + gblInds.extent(0) );
             }
             const GO gblCol1 = gblRow1 + static_cast<GO> (numProcs);
-            auto nonlclEntIter = std::find (gblInds.begin (), gblInds.end (), gblCol1);
-            TEST_ASSERT( nonlclEntIter != gblInds.end () );
+            auto nonlclEntIter = std::find (gblInds.data(), 
+                                            gblInds.data() + gblInds.extent(0),
+                                            gblCol1);
+            TEST_ASSERT( nonlclEntIter != gblInds.data() + gblInds.extent(0) );
           }
         }
 
diff --git a/packages/tpetra/core/test/CrsGraph/CrsGraph_ReindexColumns.cpp b/packages/tpetra/core/test/CrsGraph/CrsGraph_ReindexColumns.cpp
index dc2e1755f1c2..a678a86ef8cf 100644
--- a/packages/tpetra/core/test/CrsGraph/CrsGraph_ReindexColumns.cpp
+++ b/packages/tpetra/core/test/CrsGraph/CrsGraph_ReindexColumns.cpp
@@ -76,6 +76,7 @@ namespace {
     typedef Tpetra::CrsGraph<LO, GO, Node> graph_type;
     typedef Tpetra::Import<LO, GO, Node> import_type;
     typedef Tpetra::Map<LO, GO, Node> map_type;
+    using lids_type = typename graph_type::nonconst_local_inds_host_view_type;
 
     const GST INVALID = Teuchos::OrdinalTraits<GST>::invalid ();
     int gblSuccess = 0;
@@ -360,10 +361,10 @@ namespace {
         // Get the "new" local column indices that resulted from the
         // call to reindexColumns.  Get by copy, not by view, so we
         // can sort it.
-        Array<LO> newLclColInds (numEnt);
+        lids_type newLclColInds ("newLclColIds",numEnt);
         {
           size_t actualNumEnt = 0;
-          graph.getLocalRowCopy (lclRowInd, newLclColInds (), actualNumEnt);
+          graph.getLocalRowCopy (lclRowInd, newLclColInds, actualNumEnt);
           if (static_cast<size_t> (numEnt) != actualNumEnt) {
             os << ", graph.getLocalRowCopy(...) reported different # entries"
                << endl;
@@ -371,7 +372,7 @@ namespace {
             continue; // don't even bother with the rest
           }
         }
-        os << ", newLclInds: " << Teuchos::toString (newLclColInds);
+        //        os << ", newLclInds: " << Teuchos::toString (newLclColInds);
 
         // Use the new column Map to convert them to global indices.
         Array<GO> gblColInds (numEnt);
@@ -408,10 +409,10 @@ namespace {
         os << ", oldLclInds: " << Teuchos::toString (oldLclColInds);
 
         // Get the original local indices from the original graph.
-        Array<LO> origLclColInds (numEnt);
+        lids_type origLclColInds ("origLclColIds",numEnt);
         {
           size_t actualNumEnt = 0;
-          graph2->getLocalRowCopy (lclRowInd, origLclColInds (), actualNumEnt);
+          graph2->getLocalRowCopy (lclRowInd, origLclColInds, actualNumEnt);
           if (static_cast<size_t> (numEnt) != actualNumEnt) {
             os << ", graph2.getLocalRowCopy(...) reported different # entries"
                << endl;
@@ -419,16 +420,16 @@ namespace {
             continue; // don't even bother with the rest
           }
         }
-        os << ", origLclInds: " << Teuchos::toString (origLclColInds);
+        //        os << ", origLclInds: " << Teuchos::toString (origLclColInds);
 
         // The indices in both graphs don't need to be in the same
         // order; they just need to be the same indices.
-        std::sort (origLclColInds.begin (), origLclColInds.end ());
+        Tpetra::sort (origLclColInds, origLclColInds.extent(0));
         std::sort (oldLclColInds.begin (), oldLclColInds.end ());
 
         // Compare the two sets of indices.
         bool arraysSame = true;
-        if (oldLclColInds.size () != origLclColInds.size ()) {
+        if ((size_t)oldLclColInds.size() != (size_t)origLclColInds.extent(0)) {
           arraysSame = false;
         } else {
           for (size_type k = 0; k < oldLclColInds.size (); ++k) {
@@ -534,6 +535,7 @@ namespace {
     typedef Tpetra::CrsGraph<LO, GO, Node> graph_type;
     typedef Tpetra::Import<LO, GO, Node> import_type;
     typedef Tpetra::Map<LO, GO, Node> map_type;
+    using lids_type = typename graph_type::nonconst_local_inds_host_view_type;
 
     const GST INVALID = Teuchos::OrdinalTraits<GST>::invalid ();
     int gblSuccess = 0;
@@ -822,10 +824,10 @@ namespace {
         // Get the "new" local column indices that resulted from the
         // call to reindexColumns.  Get by copy, not by view, so we
         // can sort it.
-        Array<LO> newLclColInds (numEnt);
+        lids_type newLclColInds ("newLclColInds",numEnt);
         {
           size_t actualNumEnt = 0;
-          graph.getLocalRowCopy (lclRowInd, newLclColInds (), actualNumEnt);
+          graph.getLocalRowCopy (lclRowInd, newLclColInds, actualNumEnt);
           if (static_cast<size_t> (numEnt) != actualNumEnt) {
             os << ", graph.getLocalRowCopy(...) reported different # entries"
                << endl;
@@ -833,7 +835,7 @@ namespace {
             continue; // don't even bother with the rest
           }
         }
-        os << ", newLclInds: " << Teuchos::toString (newLclColInds);
+        //        os << ", newLclInds: " << Teuchos::toString (newLclColInds);
 
         // Use the new column Map to convert them to global indices.
         Array<GO> gblColInds (numEnt);
@@ -870,10 +872,10 @@ namespace {
         os << ", oldLclInds: " << Teuchos::toString (oldLclColInds);
 
         // Get the original local indices from the original graph.
-        Array<LO> origLclColInds (numEnt);
+        lids_type origLclColInds("origLclColIds",numEnt);
         {
           size_t actualNumEnt = 0;
-          graph2->getLocalRowCopy (lclRowInd, origLclColInds (), actualNumEnt);
+          graph2->getLocalRowCopy (lclRowInd, origLclColInds, actualNumEnt);
           if (static_cast<size_t> (numEnt) != actualNumEnt) {
             os << ", graph2.getLocalRowCopy(...) reported different # entries"
                << endl;
@@ -881,16 +883,16 @@ namespace {
             continue; // don't even bother with the rest
           }
         }
-        os << ", origLclInds: " << Teuchos::toString (origLclColInds);
+        //os << ", origLclInds: " << Teuchos::toString (origLclColInds);
 
         // The indices in both graphs don't need to be in the same
         // order; they just need to be the same indices.
-        std::sort (origLclColInds.begin (), origLclColInds.end ());
+        Tpetra::sort (origLclColInds, origLclColInds.extent(0));
         std::sort (oldLclColInds.begin (), oldLclColInds.end ());
 
         // Compare the two sets of indices.
         bool arraysSame = true;
-        if (oldLclColInds.size () != origLclColInds.size ()) {
+        if ((size_t)oldLclColInds.size() != (size_t)origLclColInds.extent(0)) {
           arraysSame = false;
         } else {
           for (size_type k = 0; k < oldLclColInds.size (); ++k) {
diff --git a/packages/tpetra/core/test/CrsGraph/CrsGraph_UnitTests0.cpp b/packages/tpetra/core/test/CrsGraph/CrsGraph_UnitTests0.cpp
index 6d7b46ff0062..f769d3387e06 100644
--- a/packages/tpetra/core/test/CrsGraph/CrsGraph_UnitTests0.cpp
+++ b/packages/tpetra/core/test/CrsGraph/CrsGraph_UnitTests0.cpp
@@ -47,6 +47,7 @@ namespace { // (anonymous)
 
   using Tpetra::ProfileType;
   using Tpetra::StaticProfile;
+  using Tpetra::TestingUtilities::arcp_from_view;
   using Teuchos::arcp;
   using Teuchos::arcpClone;
   using Teuchos::Array;
@@ -360,16 +361,20 @@ namespace { // (anonymous)
       ProfileType pftype = StaticProfile;
       params->set("Optimize Storage",((T & 2) == 2));
       GRAPH trigraph(rmap,cmap, ginds.size(),pftype);   // only allocate as much room as necessary
-      Array<GO> GCopy(4); Array<LO> LCopy(4);
-      ArrayView<const GO> GView;
-      ArrayView<const LO> LView;
       size_t numindices;
-      // at this point, there are no global or local indices, but views and copies should succeed
-      trigraph.getLocalRowCopy(0,LCopy,numindices);
-      trigraph.getLocalRowView(0,LView);
-      trigraph.getGlobalRowCopy(myrowind,GCopy,numindices);
-      trigraph.getGlobalRowView(myrowind,GView);
-      // use multiple inserts: this illustrated an overwrite bug for column-map-specified graphs
+      {
+
+        typename GRAPH::global_inds_host_view_type GView;
+        typename GRAPH::local_inds_host_view_type LView;
+        typename GRAPH::nonconst_global_inds_host_view_type GCopy("gcopy",4);
+        typename GRAPH::nonconst_local_inds_host_view_type LCopy("lcopy",4);
+        // at this point, there are no global or local indices, but views and copies should succeed
+        TEST_NOTHROW(trigraph.getLocalRowCopy(0,LCopy,numindices));
+        TEST_NOTHROW(trigraph.getLocalRowView(0,LView));
+        TEST_NOTHROW(trigraph.getGlobalRowCopy(myrowind,GCopy,numindices));
+        TEST_NOTHROW(trigraph.getGlobalRowView(myrowind,GView));
+      }
+      // use multiple inserts: this illustrated an overwrite bug for column-map-specified graphs
       typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
       for (size_type j=0; j < ginds.size(); ++j) {
         trigraph.insertGlobalIndices(myrowind,ginds(j,1));
@@ -387,16 +392,22 @@ namespace { // (anonymous)
         Array<GO> zero(0);
         TEST_THROW( trigraph.insertGlobalIndices(0,zero()), std::runtime_error );
       }
-      // check for throws and no-throws/values
-      TEST_THROW( trigraph.getGlobalRowView(myrowind,GView), std::runtime_error );
-      TEST_THROW( trigraph.getLocalRowCopy(    0       ,LCopy(0,1),numindices), std::runtime_error );
-      TEST_THROW( trigraph.getGlobalRowCopy(myrowind,GCopy(0,1),numindices), std::runtime_error );
-      TEST_NOTHROW( trigraph.getLocalRowView(0,LView) );
-      TEST_COMPARE_ARRAYS( LView, linds );
-      TEST_NOTHROW( trigraph.getLocalRowCopy(0,LCopy,numindices) );
-      TEST_COMPARE_ARRAYS( LCopy(0,numindices), linds );
-      TEST_NOTHROW( trigraph.getGlobalRowCopy(myrowind,GCopy,numindices) );
-      TEST_COMPARE_ARRAYS( GCopy(0,numindices), ginds );
+      {
+        // check for throws and no-throws/values
+        typename GRAPH::global_inds_host_view_type GView;
+        typename GRAPH::local_inds_host_view_type LView;
+        typename GRAPH::nonconst_global_inds_host_view_type GCopy_short("gshort",1), GCopy("gcopy",4);
+        typename GRAPH::nonconst_local_inds_host_view_type LCopy_short("lshort",1), LCopy("lcopy",4);
+        TEST_THROW( trigraph.getGlobalRowView(myrowind,GView), std::runtime_error );
+        TEST_THROW( trigraph.getLocalRowCopy(0, LCopy_short,numindices), std::runtime_error );
+        TEST_THROW( trigraph.getGlobalRowCopy(myrowind,GCopy_short,numindices), std::runtime_error );
+        TEST_NOTHROW( trigraph.getLocalRowView(0,LView) );
+        TEST_COMPARE_ARRAYS( LView, linds );
+        TEST_NOTHROW( trigraph.getLocalRowCopy(0,LCopy,numindices) );
+        TEST_COMPARE_ARRAYS( arcp_from_view(LCopy,numindices), linds );
+        TEST_NOTHROW( trigraph.getGlobalRowCopy(myrowind,GCopy,numindices) );
+        TEST_COMPARE_ARRAYS( arcp_from_view(GCopy,numindices), ginds );
+      }
       STD_TESTS(trigraph);
       
       // All procs fail if any node fails
@@ -622,11 +633,13 @@ namespace { // (anonymous)
       ArrayRCP<size_t> toalloc = arcpClone<size_t>( tuple<size_t>(0,1,0) );
       GRAPH ddgraph(map, toalloc (), pftype);
       ddgraph.insertGlobalIndices(mymiddle, tuple<GO>(mymiddle));
-      // before globalAssemble(), there should be one local entry on middle, none on the others
-      ArrayView<const GO> myrow_gbl;
-      ddgraph.getGlobalRowView(mymiddle-1,myrow_gbl); TEST_EQUALITY( myrow_gbl.size(), 0 );
-      ddgraph.getGlobalRowView(mymiddle  ,myrow_gbl); TEST_COMPARE_ARRAYS( myrow_gbl, tuple<GO>(mymiddle) );
-      ddgraph.getGlobalRowView(mymiddle+1,myrow_gbl); TEST_EQUALITY( myrow_gbl.size(), 0 );
+      {
+        // before globalAssemble(), there should be one local entry on middle, none on the others
+        typename GRAPH::global_inds_host_view_type myrow_gbl;
+        ddgraph.getGlobalRowView(mymiddle-1,myrow_gbl); TEST_EQUALITY( myrow_gbl.size(), 0 );
+        ddgraph.getGlobalRowView(mymiddle  ,myrow_gbl); TEST_COMPARE_ARRAYS( myrow_gbl, tuple<GO>(mymiddle) );
+        ddgraph.getGlobalRowView(mymiddle+1,myrow_gbl); TEST_EQUALITY( myrow_gbl.size(), 0 );
+      }
       if (pftype == StaticProfile) { // no room for more, on any row
         TEST_THROW( ddgraph.insertGlobalIndices(mymiddle-1,tuple<GO>(mymiddle+1)), std::runtime_error );
         TEST_THROW( ddgraph.insertGlobalIndices(mymiddle  ,tuple<GO>(mymiddle+1)), std::runtime_error );
@@ -634,13 +647,15 @@ namespace { // (anonymous)
       }
       ddgraph.fillComplete(params);
       // after fillComplete(), there should be a single entry on my middle, corresponding to the diagonal, none on the others
-      ArrayView<const LO> myrow_lcl;
-      TEST_EQUALITY_CONST( ddgraph.getNumEntriesInLocalRow(0), 0 );
-      TEST_EQUALITY_CONST( ddgraph.getNumEntriesInLocalRow(2), 0 );
-      ddgraph.getLocalRowView(1,myrow_lcl);
-      TEST_EQUALITY_CONST( myrow_lcl.size(), 1 );
-      if (myrow_lcl.size() == 1) {
-        TEST_EQUALITY( ddgraph.getColMap()->getGlobalElement(myrow_lcl[0]), mymiddle );
+      {
+        typename GRAPH::local_inds_host_view_type myrow_lcl;
+        TEST_EQUALITY_CONST( ddgraph.getNumEntriesInLocalRow(0), 0 );
+        TEST_EQUALITY_CONST( ddgraph.getNumEntriesInLocalRow(2), 0 );
+        ddgraph.getLocalRowView(1,myrow_lcl);
+        TEST_EQUALITY_CONST( myrow_lcl.size(), 1 );
+        if (myrow_lcl.size() == 1) {
+          TEST_EQUALITY( ddgraph.getColMap()->getGlobalElement(myrow_lcl[0]), mymiddle );
+        }
       }
       // also, the row map and column map should be equivalent
       TEST_EQUALITY( ddgraph.getGlobalNumCols(), static_cast<GST> (3*numProcs) );
@@ -704,23 +719,32 @@ namespace { // (anonymous)
             grow = 0;
           }
           diaggraph.insertGlobalIndices (grow, tuple<GO> (grow));
-          // before globalAssemble(), there should be no local entries if numProcs > 1
-          ArrayView<const GO> myrow_gbl;
-          diaggraph.getGlobalRowView (myrowind, myrow_gbl);
-          TEST_EQUALITY( myrow_gbl.size (), (numProcs == 1 ? 1 : 0) );
+          // before globalAssemble(), there should be no local entries if 
+          // numProcs > 1
+          {
+            typename GRAPH::global_inds_host_view_type myrow_gbl;
+            diaggraph.getGlobalRowView (myrowind, myrow_gbl);
+            TEST_EQUALITY( myrow_gbl.size (), (numProcs == 1 ? 1 : 0) );
+          }
           diaggraph.globalAssemble ();
           // after globalAssemble(), there should be one local entry per
           // row, corresponding to the diagonal
-          diaggraph.getGlobalRowView (myrowind, myrow_gbl);
-          TEST_COMPARE_ARRAYS( myrow_gbl, tuple<GO> (myrowind) );
+          {
+            typename GRAPH::global_inds_host_view_type myrow_gbl;
+            diaggraph.getGlobalRowView (myrowind, myrow_gbl);
+            TEST_COMPARE_ARRAYS( myrow_gbl, tuple<GO> (myrowind) );
+          }
 
           if (pftype == StaticProfile) { // no room for more
-            out << "Attempt to insert global column index " << (myrowind+1) << " into"
-              " global row " << myrowind << "; it should throw, because the graph"
-              " is StaticProfile, has an upper bound of one entry per row, and "
-              "already has a different column index " << grow << " in this row."
+            out << "Attempt to insert global column index " << (myrowind+1) 
+                << " into global row " << myrowind 
+                << "; it should throw, because the graph"
+                << " is StaticProfile, has an upper bound of one entry "
+                << "per row, and already has a different column index " 
+                << grow << " in this row."
                 << endl;
-            TEST_THROW( diaggraph.insertGlobalIndices(myrowind,tuple<GO>(myrowind+1)),
+            TEST_THROW( diaggraph.insertGlobalIndices(myrowind,
+                                                      tuple<GO>(myrowind+1)),
                         std::runtime_error );
           }
 
@@ -728,15 +752,19 @@ namespace { // (anonymous)
 
           // after fillComplete(), there should be a single entry on my
           // row, corresponding to the diagonal
-          ArrayView<const LO> myrow_lcl;
-          diaggraph.getLocalRowView (0, myrow_lcl);
-          TEST_EQUALITY_CONST( myrow_lcl.size (), 1 );
-          if (myrow_lcl.size() == 1) {
-            TEST_EQUALITY( diaggraph.getColMap ()->getGlobalElement (myrow_lcl[0]),
-                           myrowind );
+          {
+            typename GRAPH::local_inds_host_view_type myrow_lcl;
+            diaggraph.getLocalRowView (0, myrow_lcl);
+            TEST_EQUALITY_CONST( myrow_lcl.size (), 1 );
+            if (myrow_lcl.size() == 1) {
+              TEST_EQUALITY( 
+                   diaggraph.getColMap()->getGlobalElement(myrow_lcl[0]),
+                   myrowind );
+            }
           }
           // also, the row map and column map should be equivalent
-          TEST_EQUALITY_CONST( diaggraph.getRowMap()->isSameAs(*diaggraph.getColMap()), true );
+          TEST_EQUALITY_CONST(
+               diaggraph.getRowMap()->isSameAs(*diaggraph.getColMap()), true );
 
           STD_TESTS(diaggraph);
         }
@@ -763,11 +791,6 @@ namespace { // (anonymous)
           ngraph.insertGlobalIndices (grows[1], tuple<GO> (myRank));
           ngraph.insertGlobalIndices (grows[2], tuple<GO> (myRank));
 
-          // before globalAssemble(), there should be a single local
-          // entry on parallel runs, three on serial runs
-          ArrayView<const GO> myrow_gbl;
-          ngraph.getGlobalRowView (myrowind, myrow_gbl);
-
           // after globalAssemble(), storage should be maxed out
           out << "Calling globalAssemble()" << endl;
           ngraph.globalAssemble();
@@ -777,14 +800,11 @@ namespace { // (anonymous)
           out << "Calling fillComplete(params)" << endl;
           ngraph.fillComplete (params);
 
-          // after fillComplete(), there should be entries for me and my
-          // neighbors on my row
-          ArrayView<const LO> myrow_lcl;
-          ngraph.getLocalRowView (0, myrow_lcl);
-          out << "Returned view of column indices on Proc 0: "
-              << Teuchos::toString (myrow_lcl) << endl;
-
           {
+            // after fillComplete(), there should be entries for me and my
+            // neighbors on my row
+            typename GRAPH::local_inds_host_view_type myrow_lcl;
+            ngraph.getLocalRowView (0, myrow_lcl);
             // check indices on my row
             typename Array<GO>::iterator glast;
             std::sort (grows.begin (), grows.end ());
@@ -795,20 +815,23 @@ namespace { // (anonymous)
             TEST_EQUALITY_CONST( (size_t)myrow_lcl.size(), numunique );
             if ((size_t)myrow_lcl.size() == numunique) {
               size_t numinds;
-              Array<GO> inds(numunique+1);
+              typename GRAPH::nonconst_global_inds_host_view_type inds("right",numunique),inds_short("short",numunique-1),inds_long("long",numunique+1);
               TEST_THROW(
-                         ngraph.getGlobalRowCopy (myrowind, inds (0, numunique-1), numinds),
-                         std::runtime_error );
+                   ngraph.getGlobalRowCopy (myrowind, inds_short,
+                                            numinds),
+                   std::runtime_error );
+              TEST_NOTHROW(
+                   ngraph.getGlobalRowCopy (myrowind, inds_long,
+                                            numinds) );
               TEST_NOTHROW(
-                           ngraph.getGlobalRowCopy (myrowind, inds (0, numunique), numinds) );
-              TEST_NOTHROW( ngraph.getGlobalRowCopy (myrowind,inds (), numinds) );
-              std::sort (inds.begin (), inds.begin () + numinds);
-              TEST_COMPARE_ARRAYS( inds (0, numinds), grows (0, numunique) );
+                   ngraph.getGlobalRowCopy (myrowind,inds, numinds) );
+              Tpetra::sort(inds, numinds);
+              TEST_COMPARE_ARRAYS( arcp_from_view(inds,numinds), grows (0, numunique) );
 
               out << "On Proc 0:" << endl;
               Teuchos::OSTab tab5 (out);
               out << "numinds: " << numinds << endl
-                  << "inds(0,numinds): " << inds (0, numinds) << endl
+                  << "inds(0,numinds): " << arcp_from_view(inds, numinds) << endl
                   << "numunique: " << numunique << endl
                   << "grows(0,numunique): " << grows (0, numunique) << endl;
             }
diff --git a/packages/tpetra/core/test/CrsGraph/CrsGraph_UnitTests1.cpp b/packages/tpetra/core/test/CrsGraph/CrsGraph_UnitTests1.cpp
index 465c7c6a3ad9..b668f308b039 100644
--- a/packages/tpetra/core/test/CrsGraph/CrsGraph_UnitTests1.cpp
+++ b/packages/tpetra/core/test/CrsGraph/CrsGraph_UnitTests1.cpp
@@ -191,7 +191,7 @@ namespace { // (anonymous)
     {
       bool sortingCheck = true;
       for (LO i=map->getMinLocalIndex(); i <= map->getMaxLocalIndex(); ++i) {
-        ArrayView<const LO> inds;
+        typename GRAPH::local_inds_host_view_type inds;
         graph.getLocalRowView(i,inds);
         for (int j=1; j < (int)inds.size(); ++j) {
           if (inds[j-1] > inds[j]) {sortingCheck = false; break;}
@@ -205,7 +205,7 @@ namespace { // (anonymous)
     {
       bool sortingCheck = true;
       for (LO i=map->getMinLocalIndex(); i <= map->getMaxLocalIndex(); ++i) {
-        ArrayView<const LO> inds;
+        typename GRAPH::local_inds_host_view_type inds;
         graph.getLocalRowView(i,inds);
         for (int j=1; j < (int)inds.size(); ++j) {
           if (inds[j-1] > inds[j]) {sortingCheck = false; break;}
@@ -222,7 +222,7 @@ namespace { // (anonymous)
     {
       bool sortingCheck = true;
       for (LO i=map->getMinLocalIndex(); i <= map->getMaxLocalIndex(); ++i) {
-        ArrayView<const LO> inds;
+        typename GRAPH::local_inds_host_view_type inds;
         graph.getLocalRowView(i,inds);
         for (int j=1; j < (int)inds.size(); ++j) {
           if (inds[j-1] > inds[j]) {sortingCheck = false; break;}
diff --git a/packages/tpetra/core/test/CrsGraph/CrsGraph_UnpackIntoStaticGraph.cpp b/packages/tpetra/core/test/CrsGraph/CrsGraph_UnpackIntoStaticGraph.cpp
index ed27dfdc6b54..93bce1345f00 100644
--- a/packages/tpetra/core/test/CrsGraph/CrsGraph_UnpackIntoStaticGraph.cpp
+++ b/packages/tpetra/core/test/CrsGraph/CrsGraph_UnpackIntoStaticGraph.cpp
@@ -201,6 +201,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(CrsGraph, PackThenUnpackAndCombine, LO, GO, NT
   A->fillComplete();
   using device_type = typename NT::device_type;
   using execution_space = typename device_type::execution_space;
+  using gids_type = typename graph_type::nonconst_global_inds_host_view_type;
   execution_space().fence ();
 
   auto loc_num_errs = 0;
@@ -211,9 +212,9 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(CrsGraph, PackThenUnpackAndCombine, LO, GO, NT
     for (LO loc_row=0; loc_row<num_loc_rows; ++loc_row) {
       const auto gbl_row = map1->getGlobalElement(loc_row);
       size_t num_entries = 3;
-      Array<GO> A_indices(num_entries);
-      A->getGlobalRowCopy(gbl_row, A_indices(), num_entries);
-      std::sort(A_indices.begin(), A_indices.begin()+num_entries);
+      gids_type A_indices(num_entries);
+      A->getGlobalRowCopy(gbl_row, A_indices, num_entries);
+      Tpetra::sort(A_indices, num_entries);
 
       auto errors = 0; // Herb Sutter loves you :)
       if (gbl_row == 0) {
diff --git a/packages/tpetra/core/test/CrsGraph/CrsGraph_insertGlobalIndicesFiltered.cpp b/packages/tpetra/core/test/CrsGraph/CrsGraph_insertGlobalIndicesFiltered.cpp
index c7040a541a8f..f569d81bb624 100644
--- a/packages/tpetra/core/test/CrsGraph/CrsGraph_insertGlobalIndicesFiltered.cpp
+++ b/packages/tpetra/core/test/CrsGraph/CrsGraph_insertGlobalIndicesFiltered.cpp
@@ -74,6 +74,7 @@ namespace { // (anonymous)
     typedef Tpetra::CrsGraph<LO, GO, NODE_TYPE> crs_graph_type;
     typedef Tpetra::Map<LO, GO, NODE_TYPE> map_type;
     typedef Tpetra::Export<LO, GO, NODE_TYPE> export_type;
+    using gids_type = typename crs_graph_type::nonconst_global_inds_host_view_type;
     int lclSuccess = 1; // to set below
     int gblSuccess = 0; // to set below
     const GST INVALID = Teuchos::OrdinalTraits<GST>::invalid ();
@@ -131,7 +132,7 @@ namespace { // (anonymous)
     const size_t maxNumEntPerRow = static_cast<size_t> (lclNumColMapInds_src);
 
     // Buffer for storing output of getGlobalRowCopy.
-    Teuchos::Array<GO> gblColIndsBuf (maxNumEntPerRow);
+    gids_type gblColIndsBuf("gcids",maxNumEntPerRow);
 
     const Tpetra::ProfileType profileTypes[1] = {Tpetra::StaticProfile};
     for (auto profileType_src : profileTypes) {
@@ -158,7 +159,7 @@ namespace { // (anonymous)
         for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
           const GO gblRow = rowMap_tgt->getGlobalElement (lclRow);
           size_t numEnt = 0; // output argument
-          graph_tgt.getGlobalRowCopy (gblRow, gblColIndsBuf (), numEnt);
+          graph_tgt.getGlobalRowCopy (gblRow, gblColIndsBuf, numEnt);
           TEST_EQUALITY( numEnt, static_cast<size_t> (lclNumColMapInds_tgt) );
           if (numEnt == static_cast<size_t> (lclNumColMapInds_tgt)) {
             for (LO k = 0; k < static_cast<LO> (numEnt); ++k) {
diff --git a/packages/tpetra/core/test/CrsGraph/UnpackMerge.cpp b/packages/tpetra/core/test/CrsGraph/UnpackMerge.cpp
index 2931c549b262..cb3c4ecc9a00 100644
--- a/packages/tpetra/core/test/CrsGraph/UnpackMerge.cpp
+++ b/packages/tpetra/core/test/CrsGraph/UnpackMerge.cpp
@@ -162,7 +162,7 @@ namespace { // (anonymous)
 
       Kokkos::fence(); // since we're accessing data on host now
 
-      Teuchos::ArrayView<const LO> lclColInds;
+      typename crs_graph_type::local_inds_host_view_type lclColInds;
       const LO lclRowToTest (0);
       A_tgt.getLocalRowView(lclRowToTest, lclColInds);
 
@@ -191,6 +191,7 @@ namespace { // (anonymous)
     using crs_graph_type = Tpetra::CrsGraph<LO, GO, Node>;
     using import_type = Tpetra::Import<LO, GO, Node>;
     using map_type = Tpetra::Map<LO, GO, Node>;
+    using gids_type = typename crs_graph_type::nonconst_global_inds_host_view_type;
     int lclSuccess = 1;
     int gblSuccess = 0;
 
@@ -321,12 +322,12 @@ namespace { // (anonymous)
     if (myRank == 0) {
       const GO gblRowToTest = tgtRowMap->getMinGlobalIndex();
       size_t numEnt = A_tgt.getNumEntriesInGlobalRow(gblRowToTest);
-      Teuchos::Array<GO> gblColInds(numEnt);
-      A_tgt.getGlobalRowCopy(gblRowToTest, gblColInds(), numEnt);
+      gids_type gblColInds("gids",numEnt);
+      A_tgt.getGlobalRowCopy(gblRowToTest, gblColInds, numEnt);
 
       const LO expectedNumEnt(unionGblColInds.size());
       TEST_EQUALITY( size_t(numEnt), size_t(expectedNumEnt) );
-      TEST_EQUALITY( size_t(gblColInds.size()),
+      TEST_EQUALITY( size_t(gblColInds.extent(0)),
                      size_t(expectedNumEnt) );
 
       if (success) {
diff --git a/packages/tpetra/core/test/CrsMatrix/Bug8447.cpp b/packages/tpetra/core/test/CrsMatrix/Bug8447.cpp
index 3403df5ee6e0..8868426699ec 100644
--- a/packages/tpetra/core/test/CrsMatrix/Bug8447.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/Bug8447.cpp
@@ -72,7 +72,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CrsMatrix, Bug8447, SC, LO, GO, NT)
   typedef Tpetra::Import<LO,GO,NT> ImportType;
   typedef Tpetra::CrsMatrix<SC,LO,GO,NT> CrsMatrixType;
   typedef typename CrsMatrixType::impl_scalar_type implScalarType;
-  typedef typename CrsMatrixType::local_matrix_type lclMatrixType;
+  typedef typename CrsMatrixType::local_matrix_device_type lclMatrixType;
 
   RCP<const Comm<int> > comm = getDefaultComm();
   TEUCHOS_TEST_FOR_EXCEPTION(
diff --git a/packages/tpetra/core/test/CrsMatrix/Bug8794.cpp b/packages/tpetra/core/test/CrsMatrix/Bug8794.cpp
index cda61ce39c55..22904c00ec70 100644
--- a/packages/tpetra/core/test/CrsMatrix/Bug8794.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/Bug8794.cpp
@@ -147,9 +147,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug8794, InsertDenseRows,
               << "  maxPerRow " << Amat.getNodeMaxNumRowEntries() << "\n"
               << "  norm      " << Amat.getFrobeniusNorm() << "\n"
               << std::endl;
-
-//    Teuchos::FancyOStream foo(Teuchos::rcp(&std::cout,false));
-//    Amat.describe(foo, Teuchos::VERB_EXTREME);
   }
 
   // Initialize domain vector for SpMV
diff --git a/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt b/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt
index 20b906ec738a..f64990731af1 100644
--- a/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt
+++ b/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt
@@ -394,14 +394,16 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   STANDARD_PASS_OUTPUT
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  CrsMatrix_createDeepCopy
-  SOURCES
+IF (Tpetra_ENABLE_DEPRECATED_CODE)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
     CrsMatrix_createDeepCopy
-  COMM serial mpi
-  NUM_MPI_PROCS 1-4
-  STANDARD_PASS_OUTPUT
-  )
+    SOURCES
+      CrsMatrix_createDeepCopy
+    COMM serial mpi
+    NUM_MPI_PROCS 1-4
+    STANDARD_PASS_OUTPUT
+    )
+ENDIF()
 
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
   CrsMatrix_UnpackMerge
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_GetRowCopy.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_GetRowCopy.cpp
index 5a29e292f356..1bd18d60947e 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_GetRowCopy.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_GetRowCopy.cpp
@@ -71,6 +71,8 @@ namespace {
     typedef Tpetra::global_size_t GST;
     typedef Teuchos::ScalarTraits<Scalar> STS;
     typedef typename STS::magnitudeType MT;
+    typedef typename crs_matrix_type::nonconst_global_inds_host_view_type g_indices_type;
+    typedef typename crs_matrix_type::nonconst_values_host_view_type values_type;
 
     RCP<const Comm<int> > comm = getDefaultComm ();
     const int numProcs = comm->getSize ();
@@ -115,13 +117,13 @@ namespace {
 
     // Make the arrays bigger than necessary, just to make sure that
     // the methods behave correctly.
-    Teuchos::Array<GO> curGblColInds (5);
-    Teuchos::Array<Scalar> curVals (5);
+    g_indices_type curGblColInds ("indices",5);
+    values_type curVals ("values",5);
     for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
       const GO gblRow = rowMap->getGlobalElement (lclRow);
 
       size_t numEnt = 0;
-      TEST_NOTHROW( A.getGlobalRowCopy (gblRow, curGblColInds (), curVals (), numEnt) );
+      TEST_NOTHROW( A.getGlobalRowCopy (gblRow, curGblColInds, curVals, numEnt) );
       TEST_EQUALITY( numEnt, static_cast<size_t> (2) );
       if (numEnt != static_cast<size_t> (2)) {
         break; // avoid segfault on error
@@ -168,6 +170,8 @@ namespace {
     typedef Tpetra::global_size_t GST;
     typedef Teuchos::ScalarTraits<Scalar> STS;
     typedef typename STS::magnitudeType MT;
+    typedef typename crs_matrix_type::nonconst_local_inds_host_view_type l_indices_type;
+    typedef typename crs_matrix_type::nonconst_values_host_view_type values_type;
 
     RCP<const Comm<int> > comm = getDefaultComm ();
     const int numProcs = comm->getSize ();
@@ -217,11 +221,11 @@ namespace {
 
     // Make the arrays bigger than necessary, just to make sure that
     // the methods behave correctly.
-    Teuchos::Array<LO> curLclColInds (5);
-    Teuchos::Array<Scalar> curVals (5);
+    l_indices_type curLclColInds ("indices",5);
+    values_type curVals ("values",5);
     for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
       size_t numEnt = 0;
-      TEST_NOTHROW( A.getLocalRowCopy (lclRow, curLclColInds (), curVals (), numEnt) );
+      TEST_NOTHROW( A.getLocalRowCopy (lclRow, curLclColInds, curVals, numEnt) );
       TEST_EQUALITY( numEnt, static_cast<size_t> (2) );
       if (numEnt != static_cast<size_t> (2)) {
         break; // avoid segfault on error
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_LeftRightScale.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_LeftRightScale.cpp
index ae8acd25deb8..3a37aaf01436 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_LeftRightScale.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_LeftRightScale.cpp
@@ -120,6 +120,7 @@ namespace {
     typedef typename STS::magnitudeType MT;
     typedef Teuchos::ScalarTraits<MT> STM;
     typedef typename ArrayView<const LO>::size_type size_type;
+    typedef CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> MAT;
 
     MT mySum = STM::zero ();
     Array<LO> inds (matrix->getNodeMaxNumRowEntries ());
@@ -129,8 +130,8 @@ namespace {
     for (size_t i = 0; i < myNumRows; ++i) {
       const LO myRow = as<LO> (i);
       const size_t numRowEnts = matrix->getNumEntriesInLocalRow (myRow);
-      ArrayView<const LO> indsView = inds.view (0, as<size_type> (numRowEnts));
-      ArrayView<const ST> valsView = vals.view (0, as<size_type> (numRowEnts));
+      typename MAT::local_inds_host_view_type indsView;
+      typename MAT::values_host_view_type valsView;
       matrix->getLocalRowView (myRow, indsView, valsView);
       for (size_t j = 0; j < numRowEnts; ++j) {
         const ST curVal = valsView[j];
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_MultipleFillCompletes.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_MultipleFillCompletes.cpp
index cb99fd258c9f..9cfd55d8118f 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_MultipleFillCompletes.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_MultipleFillCompletes.cpp
@@ -148,22 +148,12 @@ namespace {
       TEST_EQUALITY( matrix.getGlobalNumEntries(), numLocal*numImages );
       TEST_EQUALITY( matrix.getNodeNumEntries(), numLocal );
       for (LO r = 0; r < static_cast<LO> (numLocal); ++r) {
-        ArrayView<const LO> inds;
-        ArrayView<const Scalar> vals;
+        typename MAT::local_inds_host_view_type inds;
+        typename MAT::values_host_view_type vals;
         TEST_NOTHROW( matrix.getLocalRowView(r,inds,vals) );
         TEST_COMPARE_ARRAYS( inds, tuple<LO> (r) );
         TEST_COMPARE_ARRAYS( vals, tuple<Scalar> (static_cast<Scalar> (3.0)) );
 
-        LO rawNumEnt = 0;
-        const Scalar* rawVals = NULL;
-        const LO* rawInds = NULL;
-        TEST_NOTHROW( matrix.getLocalRowView (r, rawNumEnt, rawVals, rawInds) );
-        TEST_EQUALITY( rawNumEnt, static_cast<LO> (1) );
-        TEST_ASSERT( rawVals != NULL && rawInds != NULL );
-        if (rawVals != NULL && rawInds != NULL) {
-          TEST_EQUALITY( rawInds[0], r );
-          TEST_EQUALITY( rawVals[0], static_cast<Scalar> (3.0) );
-        }
       }
     }
 
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_NonlocalSumInto.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_NonlocalSumInto.cpp
index ba2028669e13..8e6674b7025b 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_NonlocalSumInto.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_NonlocalSumInto.cpp
@@ -43,6 +43,7 @@
 
 #include "Teuchos_UnitTestHarness.hpp"
 
+#include "Tpetra_TestingUtilities.hpp"
 #include "Tpetra_CrsGraph.hpp"
 #include "Tpetra_CrsMatrix.hpp"
 #include "Tpetra_Core.hpp"
@@ -96,6 +97,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, NonlocalSumInto, LocalOrdinalType,
   using Teuchos::ScalarTraits;
   using Teuchos::tuple;
   using Teuchos::TypeNameTraits;
+  using namespace Tpetra::TestingUtilities;
   using std::endl;
 
 #if 0
@@ -221,17 +223,15 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, NonlocalSumInto, LocalOrdinalType,
   bool localGraphSuccess = true;
   std::ostringstream graphFailMsg;
   {
-    Array<GO> ind (2); // upper bound
+    using indices_type = typename crs_graph_type::nonconst_global_inds_host_view_type;
+    indices_type indView("indices",2);//upper bound
 
     for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) {
       size_t numEntries = 0; // output argument of below line.
-      graph->getGlobalRowCopy (globalRow, ind (), numEntries);
-
-      // Revise view based on numEntries.
-      ArrayView<GO> indView = ind.view (0, numEntries);
+      graph->getGlobalRowCopy (globalRow, indView, numEntries);
 
       // Sort the view.
-      std::sort (indView.begin (), indView.end ());
+      Tpetra::sort(indView, numEntries);
 
       if (globalRow == globalMinRow && globalRow > rowMap->getMinAllGlobalIndex ()) {
         if (numEntries != static_cast<size_t> (2)) {
@@ -339,19 +339,16 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, NonlocalSumInto, LocalOrdinalType,
   bool localSuccess = true;
   std::ostringstream failMsg;
   {
-    Array<GO> ind (2); // upper bound
-    Array<ST> val (2); // upper bound
+    using indices_type = typename CrsMatrixType::nonconst_global_inds_host_view_type;
+    using values_type = typename CrsMatrixType::nonconst_values_host_view_type;
+    indices_type indView("indices",2);//upper bound
+    values_type valView("values",2);//upper bound
 
     for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) {
       size_t numEntries = 0; // output argument of below line.
-      matrix->getGlobalRowCopy (globalRow, ind (), val (), numEntries);
-
-      // Revise views based on numEntries.
-      ArrayView<GO> indView = ind.view (0, numEntries);
-      ArrayView<ST> valView = val.view (0, numEntries);
+      matrix->getGlobalRowCopy (globalRow, indView, valView, numEntries);
 
-      // Sort the views jointly by column index.
-      Tpetra::sort2 (indView.begin (), indView.end (), valView.begin ());
+      Tpetra::sort2(indView, numEntries, valView);
 
       if (globalRow == globalMinRow && globalRow > rowMap->getMinAllGlobalIndex ()) {
         if (numEntries != static_cast<size_t> (2)) {
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_NonlocalSumInto_Ignore.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_NonlocalSumInto_Ignore.cpp
index 3f907bee6e85..c96d0db780fd 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_NonlocalSumInto_Ignore.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_NonlocalSumInto_Ignore.cpp
@@ -48,6 +48,7 @@
 #include <Teuchos_UnitTestHarness.hpp>
 #include <Tpetra_ConfigDefs.hpp>
 #include <TpetraCore_ETIHelperMacros.h>
+#include <Tpetra_TestingUtilities.hpp>
 
 #include <Tpetra_CrsGraph.hpp>
 #include <Tpetra_CrsMatrix.hpp>
@@ -92,6 +93,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, NonlocalSumInto_Ignore, LocalOrdin
   using Tpetra::createContigMapWithNode;
   using Tpetra::global_size_t;
   using Tpetra::Map;
+  using namespace Tpetra::TestingUtilities;
   using Teuchos::Array;
   using Teuchos::ArrayView;
   using Teuchos::as;
@@ -227,17 +229,15 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, NonlocalSumInto_Ignore, LocalOrdin
   bool localGraphSuccess = true;
   std::ostringstream graphFailMsg;
   {
-    Array<GO> ind (2); // upper bound
+    using indices_type = typename crs_graph_type::nonconst_global_inds_host_view_type;
+    indices_type indView("indices",2);//upper bound    
 
     for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) {
       size_t numEntries = 0; // output argument of below line.
-      graph->getGlobalRowCopy (globalRow, ind (), numEntries);
-
-      // Revise view based on numEntries.
-      ArrayView<GO> indView = ind.view (0, numEntries);
+      graph->getGlobalRowCopy (globalRow, indView, numEntries);
 
       // Sort the view.
-      std::sort (indView.begin (), indView.end ());
+      Tpetra::sort(indView,numEntries);
 
       if (numEntries != as<size_t> (1)) {
         localGraphSuccess = false;
@@ -336,19 +336,17 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, NonlocalSumInto_Ignore, LocalOrdin
   bool localSuccess = true;
   std::ostringstream failMsg;
   {
-    Array<GO> ind (2); // upper bound
-    Array<ST> val (2); // upper bound
+    using indices_type = typename CrsMatrixType::nonconst_global_inds_host_view_type;
+    using values_type = typename CrsMatrixType::nonconst_values_host_view_type;
+    indices_type indView("indices",2);//upper bound
+    values_type valView("values",2);//upper bound
 
     for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) {
       size_t numEntries = 0; // output argument of below line.
-      matrix->getGlobalRowCopy (globalRow, ind (), val (), numEntries);
-
-      // Revise views based on numEntries.
-      ArrayView<GO> indView = ind.view (0, numEntries);
-      ArrayView<ST> valView = val.view (0, numEntries);
+      matrix->getGlobalRowCopy (globalRow, indView, valView, numEntries);
 
       // Sort the views jointly by column index.
-      Tpetra::sort2 (indView.begin (), indView.end (), valView.begin ());
+      Tpetra::sort2 (indView, numEntries, valView);
 
       if (numEntries != as<size_t> (1)) {
         localSuccess = false;
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_PackUnpack.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_PackUnpack.cpp
index 6121f66f01fe..5331c90da1f8 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_PackUnpack.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_PackUnpack.cpp
@@ -254,12 +254,12 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CrsMatrix, PackThenUnpackAndCombine, SC, LO, G
     std::ostringstream errStrm;
     int lclNumErrors = 0;
     for (LO lclRow=0; lclRow<num_loc_rows; ++lclRow) {
-      ArrayView<const LO> A_indices;
-      ArrayView<const SC> A_values;
+      typename crs_matrix_type::local_inds_host_view_type A_indices;
+      typename crs_matrix_type::values_host_view_type A_values;
       A->getLocalRowView(lclRow, A_indices, A_values);
 
-      ArrayView<const LO> B_indices;
-      ArrayView<const SC> B_values;
+      typename crs_matrix_type::local_inds_host_view_type B_indices;
+      typename crs_matrix_type::values_host_view_type B_values;
       B->getLocalRowView(lclRow, B_indices, B_values);
 
       TEST_EQUALITY( A_indices.size (), B_indices.size () );
@@ -348,12 +348,12 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CrsMatrix, PackThenUnpackAndCombine, SC, LO, G
     std::ostringstream errStrm;
     int lclNumErrors = 0;
     for (LO loc_row=0; loc_row<num_loc_rows; ++loc_row) {
-      ArrayView<const LO> A_indices;
-      ArrayView<const SC> A_values;
+      typename crs_matrix_type::local_inds_host_view_type A_indices;
+      typename crs_matrix_type::values_host_view_type A_values;
       A->getLocalRowView(loc_row, A_indices, A_values);
 
-      ArrayView<const LO> B_indices;
-      ArrayView<const SC> B_values;
+      typename crs_matrix_type::local_inds_host_view_type B_indices;
+      typename crs_matrix_type::values_host_view_type B_values;
       B->getLocalRowView(loc_row, B_indices, B_values);
 //      std::cout << "A_values: " << A_values << "\n";
 //      std::cout << "B_values: " << B_values << "\n";
@@ -630,12 +630,12 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(CrsMatrix, PackPartial, SC, LO, GO, NT)
     std::ostringstream errStrm;
     int lclNumErrors = 0;
     for (LO lclRow=0; lclRow<num_loc_rows; ++lclRow) {
-      ArrayView<const LO> A_indices;
-      ArrayView<const SC> A_values;
+      typename crs_matrix_type::local_inds_host_view_type A_indices;
+      typename crs_matrix_type::values_host_view_type A_values;
       A->getLocalRowView(lclRow, A_indices, A_values);
 
-      ArrayView<const LO> B_indices;
-      ArrayView<const SC> B_values;
+      typename crs_matrix_type::local_inds_host_view_type B_indices;
+      typename crs_matrix_type::values_host_view_type B_values;
       B->getLocalRowView(lclRow, B_indices, B_values);
 
       TEST_EQUALITY( A_indices.size (), B_indices.size () );
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ReindexColumns.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ReindexColumns.cpp
index 37c40699ea60..cd82eea8fbfe 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ReindexColumns.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ReindexColumns.cpp
@@ -433,10 +433,10 @@ namespace {
         // Get the "new" local column indices that resulted from the
         // call to reindexColumns.  Get by copy, not by view, so we
         // can sort it.
-        Array<LO> newLclColInds (numEnt);
+        typename graph_type::nonconst_local_inds_host_view_type newLclColInds ("colind",numEnt);
         {
           size_t actualNumEnt = 0;
-          graph.getLocalRowCopy (lclRowInd, newLclColInds (), actualNumEnt);
+          graph.getLocalRowCopy (lclRowInd, newLclColInds, actualNumEnt);
           if (static_cast<size_t> (numEnt) != actualNumEnt) {
             os << ", graph.getLocalRowCopy(...) reported different # entries"
                << endl;
@@ -444,7 +444,7 @@ namespace {
             continue; // don't even bother with the rest
           }
         }
-        os << ", newLclInds: " << Teuchos::toString (newLclColInds);
+        //os << ", newLclInds: " << Teuchos::toString (newLclColInds);
 
         // Use the new column Map to convert them to global indices.
         Array<GO> gblColInds (numEnt);
@@ -457,7 +457,7 @@ namespace {
             success = false;
           }
         }
-        os << ", gblInds: " << Teuchos::toString (gblColInds ());
+        //os << ", gblInds: " << Teuchos::toString (gblColInds ());
 
         // Convert those global indices to the original column Map's
         // local indices.  Those should match the original local
@@ -481,10 +481,10 @@ namespace {
         os << ", oldLclInds: " << Teuchos::toString (oldLclColInds);
 
         // Get the original local indices from the original graph.
-        Array<LO> origLclColInds (numEnt);
+        typename graph_type::nonconst_local_inds_host_view_type origLclColInds("colind",numEnt);
         {
           size_t actualNumEnt = 0;
-          graph2->getLocalRowCopy (lclRowInd, origLclColInds (), actualNumEnt);
+          graph2->getLocalRowCopy (lclRowInd, origLclColInds, actualNumEnt);
           if (static_cast<size_t> (numEnt) != actualNumEnt) {
             os << ", graph2.getLocalRowCopy(...) reported different # entries"
                << endl;
@@ -492,16 +492,16 @@ namespace {
             continue; // don't even bother with the rest
           }
         }
-        os << ", origLclInds: " << Teuchos::toString (origLclColInds);
+        //        os << ", origLclInds: " << Teuchos::toString (origLclColInds);
 
         // The indices in both graphs don't need to be in the same
         // order; they just need to be the same indices.
-        std::sort (origLclColInds.begin (), origLclColInds.end ());
-        std::sort (oldLclColInds.begin (), oldLclColInds.end ());
+        Tpetra::sort (origLclColInds, origLclColInds.extent(0));
+        std::sort (oldLclColInds.begin(), oldLclColInds.end());
 
         // Compare the two sets of indices.
         bool arraysSame = true;
-        if (oldLclColInds.size () != origLclColInds.size ()) {
+        if ((size_t)oldLclColInds.size () != (size_t)origLclColInds.size ()) {
           arraysSame = false;
         } else {
           for (size_type k = 0; k < oldLclColInds.size (); ++k) {
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ReplaceDiagonal.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ReplaceDiagonal.cpp
index 373533e482b0..aafb3a1f4f20 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ReplaceDiagonal.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ReplaceDiagonal.cpp
@@ -184,8 +184,8 @@ namespace { // (anonymous)
          */
         
         for (size_t i = 0; i < matrix->getRowMap()->getNodeNumElements(); i++) {
-          Teuchos::ArrayView<const LO> lcols;
-          Teuchos::ArrayView<const Scalar> lvals;
+          typename crs_matrix_type::local_inds_host_view_type lcols;
+          typename crs_matrix_type::values_host_view_type lvals;
           matrix->getLocalRowView(static_cast<LO>(i), lcols, lvals);
           GO gI = matrix->getRowMap()->getGlobalElement(i);
           auto j = lcols.size();
@@ -373,8 +373,8 @@ namespace { // (anonymous)
         
 	using impl_scalar_type = typename crs_matrix_type::impl_scalar_type;
         for (size_t i = 0; i < matrix->getRowMap()->getNodeNumElements(); i++) {
-          Teuchos::ArrayView<const LO> lcols;
-          Teuchos::ArrayView<const Scalar> lvals;
+          typename crs_matrix_type::local_inds_host_view_type lcols;
+          typename crs_matrix_type::values_host_view_type lvals;
           matrix->getLocalRowView(static_cast<LO>(i), lcols, lvals);
           GO gI = matrix->getRowMap()->getGlobalElement(i);
           auto j = lcols.size();
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ReplaceLocalValues.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ReplaceLocalValues.cpp
index be6bfa2f97a9..fb96473f6955 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ReplaceLocalValues.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ReplaceLocalValues.cpp
@@ -72,6 +72,8 @@ namespace { // (anonymous)
     using std::endl;
     typedef Tpetra::Map<LO, GO, Node> map_type;
     typedef Tpetra::CrsMatrix<Scalar, LO, GO, Node> crs_matrix_type;
+    using lids_type = typename crs_matrix_type::nonconst_local_inds_host_view_type;
+    using vals_type = typename crs_matrix_type::nonconst_values_host_view_type;
     typedef Tpetra::Vector<Scalar, LO, GO, Node> vec_type;
     typedef Tpetra::MatrixMarket::Writer<crs_matrix_type> writer_type;
     typedef typename Teuchos::Array<LO>::size_type size_type;
@@ -225,10 +227,10 @@ namespace { // (anonymous)
       TEST_EQUALITY_CONST( numEnt, static_cast<size_t> (1) );
 
       if (numEnt == static_cast<size_t> (1)) {
-        Teuchos::Array<LO> ind (numEnt);
-        Teuchos::Array<Scalar> val (numEnt);
+        lids_type ind ("ind",numEnt);
+        vals_type val ("val",numEnt);
         size_t numEntOut = 0;
-        matrix->getLocalRowCopy (0, ind (), val (), numEntOut);
+        matrix->getLocalRowCopy (0, ind, val, numEntOut);
         TEST_EQUALITY( numEnt, numEntOut );
 
         if (numEntOut == static_cast<size_t> (1)) {
@@ -274,10 +276,10 @@ namespace { // (anonymous)
       TEST_EQUALITY_CONST( numEnt, static_cast<size_t> (1) );
 
       if (numEnt == static_cast<size_t> (1)) {
-        Teuchos::Array<LO> ind (numEnt);
-        Teuchos::Array<Scalar> val (numEnt);
+        lids_type ind ("ind",numEnt);
+        vals_type val ("val",numEnt);
         size_t numEntOut = 0;
-        matrix->getLocalRowCopy (0, ind (), val (), numEntOut);
+        matrix->getLocalRowCopy (0, ind, val, numEntOut);
         TEST_EQUALITY( numEnt, numEntOut );
 
         if (numEntOut == static_cast<size_t> (1)) {
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_TransformValues.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_TransformValues.cpp
index 3f131a0c77b9..53fbac871f5e 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_TransformValues.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_TransformValues.cpp
@@ -141,33 +141,13 @@ namespace { // (anonymous)
         const GO gblRow = rowMap->getGlobalElement (lclRow);
         const LO lclCol = colMap->getLocalElement (gblRow);
 
-        Teuchos::ArrayView<const LO> lclIndsT;
-        Teuchos::ArrayView<const Scalar> valsT;
+        typename crs_matrix_type::local_inds_host_view_type lclIndsT;
+        typename crs_matrix_type::values_host_view_type valsT;
 
         matrix.getLocalRowView (lclRow, lclIndsT, valsT);
         TEST_EQUALITY( lclIndsT[0], lclCol );
         TEST_EQUALITY( valsT[0], SIX );
 
-        LO rawNumEnt = 0;
-        const Scalar* rawValsT = NULL;
-        const LO* rawLclIndsT = NULL;
-        const LO err = matrix.getLocalRowView (lclRow, rawNumEnt, rawValsT, rawLclIndsT);
-        TEST_EQUALITY( err, static_cast<LO> (0) );
-        if (err == 0) {
-          TEST_EQUALITY( rawNumEnt, static_cast<LO> (lclIndsT.size ()) );
-          if (rawNumEnt == static_cast<LO> (lclIndsT.size ())) {
-            TEST_ASSERT( rawLclIndsT != NULL );
-            if (rawLclIndsT != NULL) {
-              TEST_EQUALITY( rawLclIndsT[0], lclIndsT[0] );
-            }
-          }
-          if (rawNumEnt == static_cast<LO> (valsT.size ())) {
-            TEST_ASSERT( rawValsT != NULL );
-            if (rawValsT != NULL) {
-              TEST_EQUALITY( rawValsT[0], valsT[0] );
-            }
-          }
-        }
       }
     }
 
@@ -195,8 +175,8 @@ namespace { // (anonymous)
         const GO gblRow = rowMap->getGlobalElement (lclRow);
         const LO lclCol = colMap->getLocalElement (gblRow);
 
-        Teuchos::ArrayView<const LO> lclIndsT;
-        Teuchos::ArrayView<const Scalar> valsT;
+        typename crs_matrix_type::local_inds_host_view_type lclIndsT;
+        typename crs_matrix_type::values_host_view_type valsT;
 
         matrix.getLocalRowView (lclRow, lclIndsT, valsT);
         TEST_EQUALITY( lclIndsT[0], lclCol );
@@ -308,8 +288,8 @@ namespace { // (anonymous)
         const GO gblRow = rowMap->getGlobalElement (lclRow);
         const LO gblCol = gblRow;
 
-        Teuchos::ArrayView<const GO> gblIndsT;
-        Teuchos::ArrayView<const Scalar> valsT;
+        typename crs_matrix_type::global_inds_host_view_type gblIndsT;
+        typename crs_matrix_type::values_host_view_type valsT;
 
         matrix.getGlobalRowView (gblRow, gblIndsT, valsT);
         TEST_EQUALITY( gblIndsT[0], gblCol );
@@ -369,8 +349,8 @@ namespace { // (anonymous)
         const GO gblRow = rowMap->getGlobalElement (lclRow);
         const LO lclCol = colMap->getLocalElement (gblRow);
 
-        Teuchos::ArrayView<const LO> lclIndsT;
-        Teuchos::ArrayView<const Scalar> valsT;
+        typename crs_matrix_type::local_inds_host_view_type lclIndsT;
+        typename crs_matrix_type::values_host_view_type valsT;
 
         matrix.getLocalRowView (lclRow, lclIndsT, valsT);
         TEST_EQUALITY( lclIndsT[0], lclCol );
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests.cpp
index d7289c80eab5..9ece51e37250 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests.cpp
@@ -74,8 +74,8 @@ namespace { // (anonymous)
     using Teuchos::outArg; \
     RCP<const Comm<int> > STCOMM = matrix.getComm(); \
     ArrayView<const GO> STMYGIDS = matrix.getRowMap()->getNodeElementList(); \
-    ArrayView<const LO> loview; \
-    ArrayView<const Scalar> sview; \
+    typename MAT::local_inds_host_view_type loview; \
+    typename MAT::values_host_view_type sview; \
     size_t STMAX = 0; \
     for (size_t STR=0; STR < matrix.getNodeNumRows(); ++STR) { \
       const size_t numEntries = matrix.getNumEntriesInLocalRow(STR); \
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests2.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests2.cpp
index a1a97c11abc6..ffddcfc6574a 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests2.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests2.cpp
@@ -427,7 +427,7 @@ inline void tupleToArray(Array<T> &arr, const tuple &tup)
     }
 
     // test the constructors based on 4 maps + local matri_crsx
-    RCP<MAT> tri_crs_2 = rcp(new MAT(tri_crs->getLocalMatrix(), tri_crs->getRowMap(),
+    RCP<MAT> tri_crs_2 = rcp(new MAT(tri_crs->getLocalMatrixDevice(), tri_crs->getRowMap(),
                                      tri_crs->getColMap(), tri_crs->getDomainMap(), tri_crs->getRangeMap()));
     TEST_EQUALITY(tri_crs_2->isFillComplete(), true);
     auto exporter = tri_crs_2->getGraph()->getExporter();
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests3.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests3.cpp
index c9e543460a6f..e798fbc6da9d 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests3.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests3.cpp
@@ -143,6 +143,7 @@ namespace {
   using Tpetra::DoNotOptimizeStorage;
   using Tpetra::GloballyDistributed;
   using Tpetra::INSERT;
+  using namespace Tpetra::TestingUtilities;
 
 
   double errorTolSlack = 1e+1;
@@ -159,8 +160,8 @@ inline void tupleToArray(Array<T> &arr, const tuple &tup)
     using Teuchos::outArg; \
     RCP<const Comm<int> > STCOMM = matrix.getComm(); \
     ArrayView<const GO> STMYGIDS = matrix.getRowMap()->getNodeElementList(); \
-    ArrayView<const LO> loview; \
-    ArrayView<const Scalar> sview; \
+    typename MAT::local_inds_host_view_type loview; \
+    typename MAT::values_host_view_type sview; \
     size_t STMAX = 0; \
     for (size_t STR=0; STR < matrix.getNodeNumRows(); ++STR) { \
       const size_t numEntries = matrix.getNumEntriesInLocalRow(STR); \
@@ -268,14 +269,27 @@ inline void tupleToArray(Array<T> &arr, const tuple &tup)
       params->set("Optimize Storage",((T & 2) == 2));
       MAT matrix(rmap,cmap, ginds.size(), pftype);   // only allocate as much room as necessary
       RowMatrix<Scalar,LO,GO,Node> &rowmatrix = matrix;
-      Array<GO> GCopy(4); Array<LO> LCopy(4); Array<Scalar> SCopy(4);
-      ArrayView<const GO> CGView; ArrayView<const LO> CLView; ArrayView<const Scalar> CSView;
+
+      typename MAT::nonconst_global_inds_host_view_type GCopy("gids",4);
+      typename MAT::nonconst_local_inds_host_view_type LCopy("lids",4);
+      typename MAT::nonconst_values_host_view_type SCopy("vals",4);
+
+      typename MAT::nonconst_global_inds_host_view_type GCopy_toshort("gids",1);
+      typename MAT::nonconst_local_inds_host_view_type LCopy_toshort("lids",1);
+      typename MAT::nonconst_values_host_view_type SCopy_toshort("vals",1);
+
+      typename MAT::global_inds_host_view_type CGView; 
+      typename MAT::local_inds_host_view_type CLView; 
+      typename MAT::values_host_view_type CSView;
+
       size_t numentries;
       // at this point, the graph has not allocated data as global or local, so we can do views/copies for either local or global
       matrix.getLocalRowCopy(0,LCopy,SCopy,numentries);
       matrix.getLocalRowView(0,CLView,CSView);
+
       matrix.getGlobalRowCopy(myrowind,GCopy,SCopy,numentries);
       matrix.getGlobalRowView(myrowind,CGView,CSView);
+
       // use multiple inserts: this illustrated an overwrite bug for column-map-specified graphs
       for (size_t j=0; j<(size_t)ginds.size(); ++j) {
         matrix.insertGlobalValues(myrowind,ginds(j,1),tuple(ST::one()));
@@ -295,20 +309,21 @@ inline void tupleToArray(Array<T> &arr, const tuple &tup)
       matrix.fillComplete(params);
       // check for throws and no-throws/values
       TEST_THROW( matrix.getGlobalRowView(myrowind,CGView,CSView), std::runtime_error );
-      TEST_THROW( matrix.getLocalRowCopy(    0       ,LCopy(0,1),SCopy(0,1),numentries), std::runtime_error );
-      TEST_THROW( matrix.getGlobalRowCopy(myrowind,GCopy(0,1),SCopy(0,1),numentries), std::runtime_error );
+      TEST_THROW( matrix.getLocalRowCopy(0,LCopy_toshort,SCopy_toshort,numentries), std::runtime_error );
+      TEST_THROW( matrix.getGlobalRowCopy(myrowind,GCopy_toshort,SCopy_toshort,numentries), std::runtime_error );
+
       //
       TEST_NOTHROW( matrix.getLocalRowView(0,CLView,CSView) );
       TEST_COMPARE_ARRAYS( CLView, linds );
       TEST_COMPARE_ARRAYS( CSView, vals  );
       //
       TEST_NOTHROW( matrix.getLocalRowCopy(0,LCopy,SCopy,numentries) );
-      TEST_COMPARE_ARRAYS( LCopy(0,numentries), linds );
-      TEST_COMPARE_ARRAYS( SCopy(0,numentries), vals  );
+      TEST_COMPARE_ARRAYS( arcp_from_view(LCopy,numentries), linds );
+      TEST_COMPARE_ARRAYS( arcp_from_view(SCopy,numentries), vals );
       //
       TEST_NOTHROW( matrix.getGlobalRowCopy(myrowind,GCopy,SCopy,numentries) );
-      TEST_COMPARE_ARRAYS( GCopy(0,numentries), ginds );
-      TEST_COMPARE_ARRAYS( SCopy(0,numentries), vals  );
+      TEST_COMPARE_ARRAYS( arcp_from_view(GCopy,numentries), ginds );
+      TEST_COMPARE_ARRAYS( arcp_from_view(SCopy,numentries), vals  );
       //
       STD_TESTS(rowmatrix);
     }
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests4.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests4.cpp
index 620e221aab2a..75c34fce05ff 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests4.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests4.cpp
@@ -832,16 +832,18 @@ inline void tupleToArray(Array<T> &arr, const tuple &tup)
     // Make some vectors
     RCP<MV> toScale2 = rcp(new MV(map,1));
     RCP<MV> toScale4 = rcp(new MV(map,1)); 
-    auto v2 = toScale2->getDataNonConst(0);
-    auto v4 = toScale4->getDataNonConst(0);
-    for(size_t i=0; i<numLocal; i++){
-      if(i%2 == 0) {
-        v2[i] = SC_one;
-        v4[i] = SC_one;
-      }
-      else {        
-        v2[i] = SC_one+SC_one;
-        v4[i] = SC_one+SC_one;
+    {
+      auto v2 = toScale2->getDataNonConst(0);
+      auto v4 = toScale4->getDataNonConst(0);
+      for(size_t i=0; i<numLocal; i++){
+        if(i%2 == 0) {
+          v2[i] = SC_one;
+          v4[i] = SC_one;
+        }
+        else {        
+          v2[i] = SC_one+SC_one;
+          v4[i] = SC_one+SC_one;
+        }
       }
     }    
     
@@ -929,18 +931,20 @@ inline void tupleToArray(Array<T> &arr, const tuple &tup)
     // Make some vectors
     RCP<MV> toScale2 = rcp(new MV(map,1));
     RCP<MV> toScale4 = rcp(new MV(map,1)); 
-    auto v2 = toScale2->getDataNonConst(0);
-    auto v4 = toScale4->getDataNonConst(0);
-    for(size_t i=0; i<numLocal; i++){
-      if(i%2 == 0) {
-        v2[i] = SC_one;
-        v4[i] = SC_one;
-      }
-      else {        
-        v2[i] = SC_one+SC_one;
-        v4[i] = SC_one+SC_one;
-      }
-    }    
+    {
+      auto v2 = toScale2->getDataNonConst(0);
+      auto v4 = toScale4->getDataNonConst(0);
+      for(size_t i=0; i<numLocal; i++){
+        if(i%2 == 0) {
+          v2[i] = SC_one;
+          v4[i] = SC_one;
+        }
+        else {        
+          v2[i] = SC_one+SC_one;
+          v4[i] = SC_one+SC_one;
+        }
+      }    
+    }
 
     // Now, let's rescale some vectors
     Tpetra::Details::inverseScaleBlockDiagonal(*diag2,true,*toScale2);
@@ -976,7 +980,7 @@ inline void tupleToArray(Array<T> &arr, const tuple &tup)
     typedef MultiVector<Scalar,LO,GO,Node> MV;
     typedef Map<LO,GO,Node> MAP;
     typedef typename ST::magnitudeType Mag;
-    using values_type = typename MAT::local_matrix_type::values_type;
+    using values_type = typename MAT::local_matrix_device_type::values_type;
     using range_policy = Kokkos::RangePolicy<typename Node::device_type::execution_space>;
 
     RCP<const Comm<int> > comm = Tpetra::getDefaultComm();
@@ -992,13 +996,14 @@ inline void tupleToArray(Array<T> &arr, const tuple &tup)
     RCP<const MAP> map = A1->getRowMap();
 
     /* Now take the same thing, but multiply the entries by 2*/
-    values_type A1_values = A1->getLocalMatrix().values;
+    auto lclMtx1 = A1->getLocalMatrixDevice();
+    values_type A1_values = lclMtx1.values;
     values_type A2_values("values",A1_values.extent(0));
     Kokkos::parallel_for( range_policy(0,A1_values.extent(0)),KOKKOS_LAMBDA(const int i){
         A2_values[i] = A1_values[i] + A1_values[i];
       });
 
-    RCP<MAT> A2 = rcp(new MAT(map,A1->getColMap(),A1->getLocalMatrix().graph.row_map,A1->getLocalMatrix().graph.entries,A2_values));
+    RCP<MAT> A2 = rcp(new MAT(map,A1->getColMap(),lclMtx1.graph.row_map,lclMtx1.graph.entries,A2_values));
     A2->expertStaticFillComplete(A1->getDomainMap(),A1->getRangeMap(),A1->getGraph()->getImporter(),A1->getGraph()->getExporter());
 
     /* Allocate multivectors */
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests_Swap.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests_Swap.cpp
index 90dace398e52..f961cb8d1ffc 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests_Swap.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests_Swap.cpp
@@ -698,14 +698,17 @@ class crsMatrix_Swap_Tester
             output = false;
         }
 
-        auto rowptr1 = matrix1.getLocalMatrix().graph.row_map;
-        auto rowptr2 = matrix2.getLocalMatrix().graph.row_map;
+        auto lclmtx1 = matrix1.getLocalMatrixHost();
+        auto lclmtx2 = matrix2.getLocalMatrixHost();
 
-        auto colind1 = matrix1.getLocalMatrix().graph.entries;
-        auto colind2 = matrix2.getLocalMatrix().graph.entries;
+        auto rowptr1 = lclmtx1.graph.row_map;
+        auto rowptr2 = lclmtx2.graph.row_map;
 
-        auto rbo1 = matrix1.getLocalMatrix().graph.row_block_offsets;
-        auto rbo2 = matrix2.getLocalMatrix().graph.row_block_offsets;
+        auto colind1 = lclmtx1.graph.entries;
+        auto colind2 = lclmtx2.graph.entries;
+
+        auto rbo1 = lclmtx1.graph.row_block_offsets;
+        auto rbo2 = lclmtx2.graph.row_block_offsets;
 
         if(rowptr1.extent(0) != rowptr2.extent(0))
         {
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_createDeepCopy.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_createDeepCopy.cpp
index 7e4d2fea9d62..349977191014 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_createDeepCopy.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_createDeepCopy.cpp
@@ -41,6 +41,7 @@
 // @HEADER
 */
 
+
 #include "Tpetra_TestingUtilities.hpp"
 #include "Tpetra_CrsMatrix.hpp"
 #include "Tpetra_Map.hpp"
@@ -48,9 +49,36 @@
 #include "Tpetra_RowMatrix.hpp"
 #include "Tpetra_createDeepCopy_CrsMatrix.hpp"
 #include "Tpetra_Util.hpp"
+#include "Teuchos_ArrayRCP.hpp"
+
 
 namespace { // (anonymous)
 
+// Return true if and only if a1 and a2 hold the same number of entries and
+// every pair of corresponding entries compares equal under operator==.
+// Array1 and Array2 can be any types offering size() and a const
+// operator[]; this file calls it on the Kokkos host Views that serve as
+// row-copy buffers.
+template<class Array1, class Array2>
+bool array_equal (const Array1& a1, const Array2& a2)
+{
+  // Compare sizes as size_t so that neither one is narrowed to int.
+  const size_t n = static_cast<size_t> (a1.size ());
+  if (static_cast<size_t> (a2.size ()) != n) {
+    return false;
+  }
+
+  // Stop at the first mismatch; nothing later can change the answer.
+  for (size_t i = 0; i < n; ++i) {
+    // Only the const operator[] of both types is exercised here.
+    if (! (a1[i] == a2[i])) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
 // Test interfaces that need a RowGraph which is not just a CrsGraph.
 template <class LO, class GO, class NT>
 class MyRowGraph : public Tpetra::RowGraph<LO, GO, NT> {
@@ -162,7 +190,7 @@ class MyRowGraph : public Tpetra::RowGraph<LO, GO, NT> {
     return G_->isFillComplete ();
   }
 
-
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   void
   getGlobalRowCopy (GO gblRow,
                     const Teuchos::ArrayView<GO>& gblColInds,
@@ -170,7 +198,17 @@ class MyRowGraph : public Tpetra::RowGraph<LO, GO, NT> {
   {
     G_->getGlobalRowCopy (gblRow, gblColInds, numColInds);
   }
+#endif
 
+  // Host-view overload; passes the call straight through to G_.
+  void getGlobalRowCopy (GO gblRow,
+                         typename base_type::nonconst_global_inds_host_view_type & gblColInds,
+                         size_t& numColInds) const override
+  {
+    G_->getGlobalRowCopy (gblRow, gblColInds, numColInds);
+  }
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   void
   getLocalRowCopy (LO lclRow,
                    const Teuchos::ArrayView<LO>& lclColInds,
@@ -178,6 +216,16 @@ class MyRowGraph : public Tpetra::RowGraph<LO, GO, NT> {
   {
     G_->getLocalRowCopy (lclRow, lclColInds, numColInds);
   }
+#endif
+
+  // Host-view overload; passes the call straight through to G_.
+  void getLocalRowCopy (LO lclRow,
+                        typename base_type::nonconst_local_inds_host_view_type & lclColInds,
+                        size_t& numColInds) const override
+  {
+    G_->getLocalRowCopy (lclRow, lclColInds, numColInds);
+  }
+
 
   bool supportsRowViews () const override {
     return supportsRowViews_;
@@ -185,14 +233,29 @@ class MyRowGraph : public Tpetra::RowGraph<LO, GO, NT> {
 
   void
   getLocalRowView (const LO lclRow,
-                   Teuchos::ArrayView<const LO>& lclColInds) const override
+                   Teuchos::ArrayView<const LO> & lclColInds) const override
+  {
+    G_->getLocalRowView (lclRow, lclColInds);
+  }
+
+  void
+  getLocalRowView (const LO lclRow,
+                   typename base_type::local_inds_host_view_type & lclColInds) const override
   {
     G_->getLocalRowView (lclRow, lclColInds);
   }
 
+
   void
   getGlobalRowView (const GO gblRow,
-                    Teuchos::ArrayView<const GO>& gblColInds) const override
+                    Teuchos::ArrayView<const GO> & gblColInds) const override
+  {
+    G_->getGlobalRowView (gblRow, gblColInds);
+  }
+
+  void
+  getGlobalRowView (const GO gblRow,
+                    typename base_type::global_inds_host_view_type & gblColInds) const override
   {
     G_->getGlobalRowView (gblRow, gblColInds);
   }
@@ -318,7 +381,15 @@ class MyRowMatrix : public Tpetra::RowMatrix<SC, LO, GO, NT> {
     return supportsRowViews_;
   }
 
-
+  // Host-view overload; passes the call straight through to A_.
+  void getGlobalRowCopy (GO gblRow,
+                         typename base_type::nonconst_global_inds_host_view_type &gblColInds,
+                         typename base_type::nonconst_values_host_view_type &values,
+                         size_t& numColInds) const override
+  {
+    A_->getGlobalRowCopy (gblRow, gblColInds, values, numColInds);
+  }
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   void
   getGlobalRowCopy (GO gblRow,
                     const Teuchos::ArrayView<GO>& gblColInds,
@@ -327,7 +398,18 @@ class MyRowMatrix : public Tpetra::RowMatrix<SC, LO, GO, NT> {
   {
     A_->getGlobalRowCopy (gblRow, gblColInds, values, numColInds);
   }
+#endif
 
+  // Host-view overload; passes the call straight through to A_.
+  void getLocalRowCopy (LO lclRow,
+                        typename base_type::nonconst_local_inds_host_view_type &lclColInds,
+                        typename base_type::nonconst_values_host_view_type &values,
+                        size_t& numColInds) const override
+  {
+    A_->getLocalRowCopy (lclRow, lclColInds, values, numColInds);
+  }
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
   void
   getLocalRowCopy (LO lclRow,
                    const Teuchos::ArrayView<LO>& lclColInds,
@@ -336,19 +418,40 @@ class MyRowMatrix : public Tpetra::RowMatrix<SC, LO, GO, NT> {
   {
     A_->getLocalRowCopy (lclRow, lclColInds, values, numColInds);
   }
+#endif
+
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+  // Teuchos::ArrayView overload, built only with TPETRA_ENABLE_DEPRECATED_CODE.
+  void getGlobalRowView (const GO gblRow,
+                         Teuchos::ArrayView<const GO> & gblColInds,
+                         Teuchos::ArrayView<const SC> & values) const override
+  {
+    A_->getGlobalRowView (gblRow, gblColInds, values);
+  }
+#endif
 
   void
   getGlobalRowView (const GO gblRow,
-                    Teuchos::ArrayView<const GO>& gblColInds,
-                    Teuchos::ArrayView<const SC>& values) const override
+                    typename base_type::global_inds_host_view_type & gblColInds,
+                    typename base_type::values_host_view_type & values) const override
   {
     A_->getGlobalRowView (gblRow, gblColInds, values);
   }
 
+#ifdef TPETRA_ENABLE_DEPRECATED_CODE
+  // Teuchos::ArrayView overload, built only with TPETRA_ENABLE_DEPRECATED_CODE.
+  void getLocalRowView (const LO lclRow,
+                        Teuchos::ArrayView<const LO> & lclColInds,
+                        Teuchos::ArrayView<const SC> &values) const override
+  {
+    A_->getLocalRowView (lclRow, lclColInds, values);
+  }
+#endif
+
   void
   getLocalRowView (const LO lclRow,
-                   Teuchos::ArrayView<const LO>& lclColInds,
-                   Teuchos::ArrayView<const SC>& values) const override
+                   typename base_type::local_inds_host_view_type & lclColInds,
+                   typename base_type::values_host_view_type &values) const override
   {
     A_->getLocalRowView (lclRow, lclColInds, values);
   }
@@ -422,6 +525,8 @@ class MyRowMatrix : public Tpetra::RowMatrix<SC, LO, GO, NT> {
   bool supportsRowViews_ = false;
 };
 
+
+
 template<class SC, class LO, class GO, class NT>
 bool
 crsMatrixInstancesEqual (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A,
@@ -430,6 +535,9 @@ crsMatrixInstancesEqual (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A,
   using Teuchos::outArg;
   using Teuchos::REDUCE_MIN;
   using Teuchos::reduceAll;
+  using values_view  = typename Tpetra::CrsMatrix<SC, LO, GO, NT>::nonconst_values_host_view_type;
+  using l_indices_view = typename Tpetra::CrsMatrix<SC, LO, GO, NT>::nonconst_local_inds_host_view_type;
+  using g_indices_view = typename Tpetra::CrsMatrix<SC, LO, GO, NT>::nonconst_global_inds_host_view_type;                      
 
   const Teuchos::Comm<int>& comm = * (A.getMap ()->getComm ());
   int lclSuccess = 1;
@@ -490,63 +598,52 @@ crsMatrixInstancesEqual (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A,
   const auto& rowMap = * (A.getRowMap ());
   const LO lclNumRows = A.getNodeNumRows ();
 
-  Teuchos::Array<SC> A_valsBuf;
-  Teuchos::Array<SC> B_valsBuf;
+  values_view A_vals;
+  values_view B_vals;
 
   if (A.isLocallyIndexed ()) {
-    Teuchos::Array<LO> A_lclColIndsBuf;
-    Teuchos::Array<LO> B_lclColIndsBuf;
+    l_indices_view A_lclColInds;
+    l_indices_view B_lclColInds;
 
     for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
       size_t A_numEnt = 0;
       size_t B_numEnt = 0;
 
       const size_t A_numEnt2 = A.getNumEntriesInLocalRow (lclRow);
-      if (A_numEnt2 > size_t (A_valsBuf.size ())) {
-        A_valsBuf.resize (A_numEnt2);
+      if (A_numEnt2 > size_t (A_vals.size ())) {
+        Kokkos::resize(A_vals,A_numEnt2);
       }
-      if (A_numEnt2 > size_t (A_lclColIndsBuf.size ())) {
-        A_lclColIndsBuf.resize (A_numEnt2);
+      if (A_numEnt2 > size_t (A_lclColInds.size ())) {
+        Kokkos::resize(A_lclColInds,A_numEnt2);
       }
-      A.getLocalRowCopy (lclRow, A_lclColIndsBuf (), A_valsBuf (), A_numEnt);
+      A.getLocalRowCopy (lclRow, A_lclColInds , A_vals, A_numEnt);
 
       const size_t B_numEnt2 = B.getNumEntriesInLocalRow (lclRow);
-      if (B_numEnt2 > size_t (B_valsBuf.size ())) {
-        B_valsBuf.resize (B_numEnt2);
+      if (B_numEnt2 > size_t (B_vals.size ())) {
+        Kokkos::resize(B_vals,B_numEnt2);
       }
-      if (B_numEnt2 > size_t (B_lclColIndsBuf.size ())) {
-        B_lclColIndsBuf.resize (B_numEnt2);
+      if (B_numEnt2 > size_t (B_lclColInds.size ())) {
+        Kokkos::resize(B_lclColInds,B_numEnt2);
       }
-      B.getLocalRowCopy (lclRow, B_lclColIndsBuf (), B_valsBuf (), B_numEnt);
+      B.getLocalRowCopy (lclRow, B_lclColInds, B_vals, B_numEnt);
 
       if (A_numEnt != B_numEnt) {
         lclSuccess = 0;
         break;
       }
 
-      Teuchos::ArrayView<LO> A_lclColInds = A_lclColIndsBuf.view (0, A_numEnt);
-      Teuchos::ArrayView<SC> A_vals = A_valsBuf.view (0, A_numEnt);
-      Tpetra::sort2 (A_lclColInds.begin (), A_lclColInds.end (), A_vals.begin ());
-
-      Teuchos::ArrayView<LO> B_lclColInds = B_lclColIndsBuf.view (0, B_numEnt);
-      Teuchos::ArrayView<SC> B_vals = B_valsBuf.view (0, B_numEnt);
-      Tpetra::sort2 (B_lclColInds.begin (), B_lclColInds.end (), B_vals.begin ());
-
-      if (! std::equal (A_lclColInds.begin (), A_lclColInds.end (),
-                        B_lclColInds.begin ())) {
-        lclSuccess = 0;
-        break;
-      }
-      if (! std::equal (A_vals.begin (), A_vals.end (),
-                        B_vals.begin ())) {
-        lclSuccess = 0;
-        break;
-      }
+      // Buffers only grow, so use just the A_numEnt (== B_numEnt) leading entries.
+      Tpetra::sort2 (A_lclColInds, A_numEnt, A_vals);
+      Tpetra::sort2 (B_lclColInds, B_numEnt, B_vals);
+      const Kokkos::pair<size_t, size_t> row (0, A_numEnt);
+      lclSuccess = array_equal (Kokkos::subview (A_lclColInds, row), Kokkos::subview (B_lclColInds, row))
+                && array_equal (Kokkos::subview (A_vals, row), Kokkos::subview (B_vals, row));
+      if(!lclSuccess) break;
     }
   }
   else if (A.isGloballyIndexed ()) {
-    Teuchos::Array<GO> A_gblColIndsBuf;
-    Teuchos::Array<GO> B_gblColIndsBuf;
+    g_indices_view A_gblColInds;
+    g_indices_view B_gblColInds;
 
     for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
       const GO gblRow = rowMap.getGlobalElement (lclRow);
@@ -554,46 +651,35 @@ crsMatrixInstancesEqual (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A,
       size_t B_numEnt = 0;
 
       const size_t A_numEnt2 = A.getNumEntriesInGlobalRow (gblRow);
-      if (A_numEnt2 > size_t (A_valsBuf.size ())) {
-        A_valsBuf.resize (A_numEnt2);
+      if (A_numEnt2 > size_t (A_vals.size ())) {
+        Kokkos::resize(A_vals,A_numEnt2);
       }
-      if (A_numEnt2 > size_t (A_gblColIndsBuf.size ())) {
-        A_gblColIndsBuf.resize (A_numEnt2);
+      if (A_numEnt2 > size_t (A_gblColInds.size ())) {
+        Kokkos::resize(A_gblColInds,A_numEnt2);
       }
-      A.getGlobalRowCopy (gblRow, A_gblColIndsBuf (), A_valsBuf (), A_numEnt);
+      A.getGlobalRowCopy (gblRow, A_gblColInds , A_vals , A_numEnt);
 
       const size_t B_numEnt2 = B.getNumEntriesInGlobalRow (gblRow);
-      if (B_numEnt2 > size_t (B_valsBuf.size ())) {
-        B_valsBuf.resize (B_numEnt2);
-      }
-      if (B_numEnt2 > size_t (B_gblColIndsBuf.size ())) {
-        B_gblColIndsBuf.resize (B_numEnt2);
+      if (B_numEnt2 > size_t (B_vals.size ())) {
+        Kokkos::resize(B_vals,B_numEnt2);
       }
-      B.getGlobalRowCopy (gblRow, B_gblColIndsBuf (), B_valsBuf (), B_numEnt);
-
-      if (A_numEnt != B_numEnt) {
-        lclSuccess = 0;
-        break;
+      if (B_numEnt2 > size_t (B_gblColInds.size ())) {
+        Kokkos::resize(B_gblColInds,B_numEnt2);
       }
+      B.getGlobalRowCopy (gblRow, B_gblColInds, B_vals, B_numEnt);
 
-      Teuchos::ArrayView<GO> A_gblColInds = A_gblColIndsBuf.view (0, A_numEnt);
-      Teuchos::ArrayView<SC> A_vals = A_valsBuf.view (0, A_numEnt);
-      Tpetra::sort2 (A_gblColInds.begin (), A_gblColInds.end (), A_vals.begin ());
-
-      Teuchos::ArrayView<GO> B_gblColInds = B_gblColIndsBuf.view (0, B_numEnt);
-      Teuchos::ArrayView<SC> B_vals = B_valsBuf.view (0, B_numEnt);
-      Tpetra::sort2 (B_gblColInds.begin (), B_gblColInds.end (), B_vals.begin ());
-
-      if (! std::equal (A_gblColInds.begin (), A_gblColInds.end (),
-                        B_gblColInds.begin ())) {
-        gblSuccess = 0;
-        break;
-      }
-      if (! std::equal (A_vals.begin (), A_vals.end (),
-                        B_vals.begin ())) {
-        gblSuccess = 0;
-        break;
-      }
+      if (A_numEnt != B_numEnt) {
+        lclSuccess = 0;
+        break;
+      }
+
+      // Buffers only grow, so use just the A_numEnt (== B_numEnt) leading entries.
+      Tpetra::sort2 (A_gblColInds, A_numEnt, A_vals);
+      Tpetra::sort2 (B_gblColInds, B_numEnt, B_vals);
+      const Kokkos::pair<size_t, size_t> row (0, A_numEnt);
+      lclSuccess = array_equal (Kokkos::subview (A_gblColInds, row), Kokkos::subview (B_gblColInds, row))
+                && array_equal (Kokkos::subview (A_vals, row), Kokkos::subview (B_vals, row));
+      if(!lclSuccess) break;
     }
   }
 
@@ -744,3 +830,4 @@ main (int argc, char* argv[])
     Teuchos::UnitTestRepository::runUnitTestsFromMain (argc, argv);
   return errCode;
 }
+
diff --git a/packages/tpetra/core/test/CrsMatrix/Equilibration.cpp b/packages/tpetra/core/test/CrsMatrix/Equilibration.cpp
index b2aec882251f..cdcba5905354 100644
--- a/packages/tpetra/core/test/CrsMatrix/Equilibration.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/Equilibration.cpp
@@ -148,8 +148,8 @@ deepCopyFillCompleteCrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A)
     (! A.isFillComplete (), std::invalid_argument,
      "deepCopyFillCompleteCrsMatrix: Input matrix A must be fillComplete.");
   RCP<crs_matrix_type> A_copy (new crs_matrix_type (A.getCrsGraph ()));
-  auto A_copy_lcl = A_copy->getLocalMatrix ();
-  auto A_lcl = A.getLocalMatrix ();
+  auto A_copy_lcl = A_copy->getLocalMatrixDevice ();
+  auto A_lcl = A.getLocalMatrixDevice ();
   Kokkos::deep_copy (A_copy_lcl.values, A_lcl.values);
   A_copy->fillComplete (A.getDomainMap (), A.getRangeMap ());
   return A_copy;
@@ -166,15 +166,12 @@ testCrsMatrixEquality (bool& success,
   using mag_type = typename Kokkos::ArithTraits<SC>::mag_type;
   const mag_type toleranceFactor = 10.0; // factor of eps
 
-  auto A_expected_lcl = A_expected.getLocalMatrix ();
-  auto ptr_h = Kokkos::create_mirror_view (A_expected_lcl.graph.row_map);
-  Kokkos::deep_copy (ptr_h, A_expected_lcl.graph.row_map);
+  auto A_expected_lcl = A_expected.getLocalMatrixHost ();
+  auto ptr_h = A_expected_lcl.graph.row_map;
+  auto expected_val_h = A_expected_lcl.values;
 
-  auto expected_val_h = Kokkos::create_mirror_view (A_expected_lcl.values);
-  Kokkos::deep_copy (expected_val_h, A_expected_lcl.values);
-
-  auto A_actual_lcl = A_actual.getLocalMatrix ();
-  auto actual_val_h = Kokkos::create_mirror_view (A_actual_lcl.values);
+  auto A_actual_lcl = A_actual.getLocalMatrixHost ();
+  auto actual_val_h = A_actual_lcl.values;
   Kokkos::deep_copy (actual_val_h, A_actual_lcl.values);
 
   using size_type = typename decltype (A_actual_lcl.graph)::size_type;
@@ -1074,7 +1071,7 @@ makeSymmetricPositiveDefiniteTridiagonalMatrixTest (Teuchos::FancyOStream& out,
     Teuchos::RCP<crs_matrix_type> A_copy = deepCopyFillCompleteCrsMatrix (*A);
     A_copy->resumeFill ();
 
-    auto A_lcl = A_copy->getLocalMatrix ();
+    auto A_lcl = A_copy->getLocalMatrixDevice ();
     auto val_h = Kokkos::create_mirror_view (A_lcl.values);
     Kokkos::deep_copy (val_h, A_lcl.values);
     auto ptr_h = Kokkos::create_mirror_view (A_lcl.graph.row_map);
@@ -1237,7 +1234,7 @@ makeSymmetricPositiveDefiniteTridiagonalMatrixTest (Teuchos::FancyOStream& out,
     Teuchos::RCP<crs_matrix_type> A_copy = deepCopyFillCompleteCrsMatrix (*A);
     A_copy->resumeFill ();
 
-    auto A_lcl = A_copy->getLocalMatrix ();
+    auto A_lcl = A_copy->getLocalMatrixDevice ();
     auto val_h = Kokkos::create_mirror_view (A_lcl.values);
     Kokkos::deep_copy (val_h, A_lcl.values);
     auto ptr_h = Kokkos::create_mirror_view (A_lcl.graph.row_map);
@@ -1388,7 +1385,7 @@ makeMatrixTestWithExplicitZeroDiag (Teuchos::FancyOStream& out,
     Teuchos::RCP<crs_matrix_type> A_copy = deepCopyFillCompleteCrsMatrix (*A);
     A_copy->resumeFill ();
 
-    auto A_lcl = A_copy->getLocalMatrix ();
+    auto A_lcl = A_copy->getLocalMatrixDevice ();
     auto val_h = Kokkos::create_mirror_view (A_lcl.values);
     Kokkos::deep_copy (val_h, A_lcl.values);
     auto ptr_h = Kokkos::create_mirror_view (A_lcl.graph.row_map);
@@ -1475,7 +1472,7 @@ makeMatrixTestWithExplicitZeroDiag (Teuchos::FancyOStream& out,
     Teuchos::RCP<crs_matrix_type> A_copy = deepCopyFillCompleteCrsMatrix (*A);
     A_copy->resumeFill ();
 
-    auto A_lcl = A_copy->getLocalMatrix ();
+    auto A_lcl = A_copy->getLocalMatrixDevice ();
     auto val_h = Kokkos::create_mirror_view (A_lcl.values);
     Kokkos::deep_copy (val_h, A_lcl.values);
     auto ptr_h = Kokkos::create_mirror_view (A_lcl.graph.row_map);
@@ -1626,7 +1623,7 @@ makeMatrixTestWithImplicitZeroDiag (Teuchos::FancyOStream& out,
     Teuchos::RCP<crs_matrix_type> A_copy = deepCopyFillCompleteCrsMatrix (*A);
     A_copy->resumeFill ();
 
-    auto A_lcl = A_copy->getLocalMatrix ();
+    auto A_lcl = A_copy->getLocalMatrixDevice ();
     auto val_h = Kokkos::create_mirror_view (A_lcl.values);
     Kokkos::deep_copy (val_h, A_lcl.values);
     auto ptr_h = Kokkos::create_mirror_view (A_lcl.graph.row_map);
@@ -1713,7 +1710,7 @@ makeMatrixTestWithImplicitZeroDiag (Teuchos::FancyOStream& out,
     Teuchos::RCP<crs_matrix_type> A_copy = deepCopyFillCompleteCrsMatrix (*A);
     A_copy->resumeFill ();
 
-    auto A_lcl = A_copy->getLocalMatrix ();
+    auto A_lcl = A_copy->getLocalMatrixDevice ();
     auto val_h = Kokkos::create_mirror_view (A_lcl.values);
     Kokkos::deep_copy (val_h, A_lcl.values);
     auto ptr_h = Kokkos::create_mirror_view (A_lcl.graph.row_map);
@@ -1876,7 +1873,7 @@ makeMatrixTestWithExplicitInfAndNan (Teuchos::FancyOStream& out,
     Teuchos::RCP<crs_matrix_type> A_copy = deepCopyFillCompleteCrsMatrix (*A);
     A_copy->resumeFill ();
 
-    auto A_lcl = A_copy->getLocalMatrix ();
+    auto A_lcl = A_copy->getLocalMatrixDevice ();
     auto val_h = Kokkos::create_mirror_view (A_lcl.values);
     Kokkos::deep_copy (val_h, A_lcl.values);
     auto ptr_h = Kokkos::create_mirror_view (A_lcl.graph.row_map);
@@ -1998,7 +1995,7 @@ makeMatrixTestWithExplicitInfAndNan (Teuchos::FancyOStream& out,
     Teuchos::RCP<crs_matrix_type> A_copy = deepCopyFillCompleteCrsMatrix (*A);
     A_copy->resumeFill ();
 
-    auto A_lcl = A_copy->getLocalMatrix ();
+    auto A_lcl = A_copy->getLocalMatrixDevice ();
     auto val_h = Kokkos::create_mirror_view (A_lcl.values);
     Kokkos::deep_copy (val_h, A_lcl.values);
     auto ptr_h = Kokkos::create_mirror_view (A_lcl.graph.row_map);
diff --git a/packages/tpetra/core/test/CrsMatrix/Regression/Albany182.cpp b/packages/tpetra/core/test/CrsMatrix/Regression/Albany182.cpp
index 266a275ddbcd..131c304570ef 100644
--- a/packages/tpetra/core/test/CrsMatrix/Regression/Albany182.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/Regression/Albany182.cpp
@@ -200,20 +200,20 @@ namespace { // (anonymous)
     const LO lclNumRows = static_cast<LO> (rowMap.getNodeNumElements ());
     if (lclNumRows != 0) {
       if (A.isLocallyIndexed ()) {
-        const map_type& colMap = * (A.getColMap ());
         Teuchos::Array<GO> gblColInds;
+        const map_type& colMap = * (A.getColMap ());
         for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
           const GO gblRow = rowMap.getGlobalElement (lclRow);
           out << "gblRow: " << gblRow;
-          Teuchos::ArrayView<const LO> lclColInds;
-          Teuchos::ArrayView<const ST> vals;
+          typename CrsMatrixType::local_inds_host_view_type lclColInds;
+          typename CrsMatrixType::values_host_view_type vals;
           A.getLocalRowView (lclRow, lclColInds, vals);
           out << ": {lclCols: ";
           printArray (out, lclColInds);
-          if (gblColInds.size () < lclColInds.size ()) {
+          if (size_t(gblColInds.size ()) < lclColInds.size ()) {
             gblColInds.resize (lclColInds.size ());
           }
-          for (ptrdiff_t k = 0; k < lclColInds.size (); ++k) {
+          for (size_t k = 0; k < lclColInds.size (); ++k) {
             gblColInds[k] = colMap.getGlobalElement (lclColInds[k]);
           }
           out << ", gblCols: ";
@@ -230,8 +230,8 @@ namespace { // (anonymous)
         for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
           const GO gblRow = rowMap.getGlobalElement (lclRow);
           out << "gblRow: " << gblRow;
-          Teuchos::ArrayView<const GO> gblColInds;
-          Teuchos::ArrayView<const ST> vals;
+          typename CrsMatrixType::global_inds_host_view_type gblColInds;
+          typename CrsMatrixType::values_host_view_type vals;
           A.getGlobalRowView (gblRow, gblColInds, vals);
           out << ": {gblCols: ";
           printArray (out, gblColInds);
@@ -562,10 +562,11 @@ namespace { // (anonymous)
         Teuchos::OSTab tab2 (out);
 
         const GO lclRow = rowMap.getLocalElement (gblRow);
-        LO numEnt = 0;
-        const LO* lclInds = NULL;
-        const ST* vals = NULL;
-        A.getLocalRowViewRaw (lclRow, numEnt, lclInds, vals);
+        typename CrsMatrixType::crs_graph_type::local_inds_host_view_type
+                 lclInds;
+        typename CrsMatrixType::values_host_view_type vals;
+        A.getLocalRowView(lclRow, lclInds, vals);
+        LO numEnt = static_cast<LO> (lclInds.extent(0)); // extent is size_t; narrow explicitly
 
         TEST_EQUALITY( numEnt, numEnt_expected );
         if (numEnt == numEnt_expected) {
@@ -580,7 +581,8 @@ namespace { // (anonymous)
           // Sort the global column indices, since we don't want to
           // rely on a promise of ordering.  Apply the same
           // permutation to the values, so we can compare.
-          std::vector<ST> vals_got (vals, vals + numEnt);
+          std::vector<ST> vals_got(numEnt);
+          for (LO k = 0; k < numEnt; ++k) vals_got[k] = vals[k];
           Tpetra::sort2 (gblInds_got.begin (), gblInds_got.end (),
                          vals_got.begin ());
           testArrayEquality (out, success,
@@ -599,8 +601,8 @@ namespace { // (anonymous)
         out << "Overlapping CrsMatrix is globally indexed" << endl;
         Teuchos::OSTab tab2 (out);
 
-        Teuchos::ArrayView<const GO> gblInds;
-        Teuchos::ArrayView<const ST> vals;
+        typename CrsMatrixType::global_inds_host_view_type gblInds;
+        typename CrsMatrixType::values_host_view_type vals;
         A.getGlobalRowView (gblRow, gblInds, vals);
         const LO numEnt = static_cast<LO> (gblInds.size ());
 
@@ -610,8 +612,12 @@ namespace { // (anonymous)
           // indices, since we don't want to rely on a promise of
           // ordering.  Apply the same permutation to the values, so
           // we can compare.
-          std::vector<GO> gblInds_got (gblInds.begin (), gblInds.end ());
-          std::vector<ST> vals_got (vals.begin (), vals.end ());
+          std::vector<GO> gblInds_got(numEnt);
+          std::vector<ST> vals_got(numEnt);
+          for (LO k = 0; k < numEnt; ++k) {
+            gblInds_got[k] = gblInds[k];
+            vals_got[k] = vals[k];
+          }
           Tpetra::sort2 (gblInds_got.begin (), gblInds_got.end (),
                          vals_got.begin ());
           testArrayEquality (out, success, gblInds_got.data (),
@@ -839,28 +845,28 @@ namespace { // (anonymous)
       const GO gblRow = (myRank == 0) ? GO (0) : GO (1);
       const LO lclRow =
         A_nonoverlapping.getRowMap ()->getLocalElement (gblRow);
-      LO numEnt = 0;
-      const LO* lclColInds = NULL;
-      const double* vals = NULL;
-      const LO errCode = A_nonoverlapping.getLocalRowViewRaw (lclRow, numEnt,
-                                                              lclColInds, vals);
-      TEST_EQUALITY_CONST( errCode, LO (0) );
-      if (errCode == LO (0)) {
-        // Sort the input matrix's row data, so that this test does
-        // not depend on any assumption of sorted rows.
-        std::vector<double> vals_got (vals, vals + numEnt);
-        std::vector<LO> lclColInds_got (lclColInds, lclColInds + numEnt);
-        Tpetra::sort2 (lclColInds_got.begin (), lclColInds_got.end (),
-                       vals_got.begin ());
-        testArrayEquality (out, success, lclColInds_got.data (),
-                           lclColInds_got.size (), "lclColInds_got",
-                           lclColInds_expected.data (),
-                           lclColInds_expected.size (),
-                           "lclColInds_expected");
-        testArrayEquality (out, success, vals_got.data (), vals_got.size (),
-                           "vals_got", vals_expected.data (),
-                           vals_expected.size (), "vals_expected");
+      typename CrsMatrixType::local_inds_host_view_type lclColInds;
+      typename CrsMatrixType::values_host_view_type vals;
+      A_nonoverlapping.getLocalRowView(lclRow, lclColInds, vals);
+      LO numEnt = static_cast<LO> (lclColInds.extent(0)); // extent is size_t; narrow explicitly
+      // Sort the input matrix's row data, so that this test does
+      // not depend on any assumption of sorted rows.
+      std::vector<LO> lclColInds_got (numEnt);
+      std::vector<double> vals_got(numEnt);
+      for (LO k = 0; k < numEnt; ++k) {
+        lclColInds_got[k] = lclColInds[k];
+        vals_got[k] = vals[k];
       }
+      Tpetra::sort2 (lclColInds_got.begin (), lclColInds_got.end (),
+                     vals_got.begin ());
+      testArrayEquality (out, success, lclColInds_got.data (),
+                         lclColInds_got.size (), "lclColInds_got",
+                         lclColInds_expected.data (),
+                         lclColInds_expected.size (),
+                         "lclColInds_expected");
+      testArrayEquality (out, success, vals_got.data (), vals_got.size (),
+                         "vals_got", vals_expected.data (),
+                         vals_expected.size (), "vals_expected");
     }
     else {
       const LO lclNumRows =
diff --git a/packages/tpetra/core/test/CrsMatrix/Tpetra_Test_CrsMatrix_WithGraph.hpp b/packages/tpetra/core/test/CrsMatrix/Tpetra_Test_CrsMatrix_WithGraph.hpp
index b991d08604a2..91aac2dfa621 100644
--- a/packages/tpetra/core/test/CrsMatrix/Tpetra_Test_CrsMatrix_WithGraph.hpp
+++ b/packages/tpetra/core/test/CrsMatrix/Tpetra_Test_CrsMatrix_WithGraph.hpp
@@ -361,8 +361,8 @@ inline void tupleToArray(Array<T> &arr, const tuple &tup)
 
 
       out << "Create a CrsMatrix with the diagonal CrsGraph and Kokkos view" << endl;
-      size_t numEnt = diaggraph.getLocalGraph().entries.extent(0);
-      typename MAT::local_matrix_type::values_type val ("Tpetra::CrsMatrix::val", numEnt);
+      size_t numEnt = diaggraph.getLocalGraphDevice().entries.extent(0);
+      typename MAT::local_matrix_device_type::values_type val ("Tpetra::CrsMatrix::val", numEnt);
       MAT matrix(rcpFromRef(diaggraph),val);
 
       out << "Call setAllToScalar on the CrsMatrix; it should not throw" << endl;
diff --git a/packages/tpetra/core/test/CrsMatrix/UnpackMerge.cpp b/packages/tpetra/core/test/CrsMatrix/UnpackMerge.cpp
index 7a6ad471bad3..a8da524ba769 100644
--- a/packages/tpetra/core/test/CrsMatrix/UnpackMerge.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/UnpackMerge.cpp
@@ -189,8 +189,8 @@ namespace { // (anonymous)
 
       Kokkos::fence(); // since we're accessing data on host now
 
-      Teuchos::ArrayView<const LO> lclColInds;
-      Teuchos::ArrayView<const Scalar> vals;
+      typename crs_matrix_type::local_inds_host_view_type lclColInds;
+      typename crs_matrix_type::values_host_view_type vals;
       const LO lclRowToTest (0);
       A_tgt.getLocalRowView(lclRowToTest, lclColInds, vals);
 
@@ -224,6 +224,8 @@ namespace { // (anonymous)
     using crs_matrix_type = Tpetra::CrsMatrix<Scalar, LO, GO, Node>;
     using import_type = Tpetra::Import<LO, GO, Node>;
     using map_type = Tpetra::Map<LO, GO, Node>;
+    using gids_type = typename crs_matrix_type::nonconst_global_inds_host_view_type;
+    using vals_type = typename crs_matrix_type::nonconst_values_host_view_type;
     int lclSuccess = 1;
     int gblSuccess = 0;
 
@@ -387,16 +389,16 @@ namespace { // (anonymous)
     if (myRank == 0) {
       const GO gblRowToTest = tgtRowMap->getMinGlobalIndex();
       size_t numEnt = A_tgt.getNumEntriesInGlobalRow(gblRowToTest);
-      Teuchos::Array<GO> gblColInds(numEnt);
-      Teuchos::Array<Scalar> vals(numEnt);
-      A_tgt.getGlobalRowCopy(gblRowToTest, gblColInds(),
-                             vals(), numEnt);
+      gids_type gblColInds("gids",numEnt);
+      vals_type vals("vals",numEnt);
+      A_tgt.getGlobalRowCopy(gblRowToTest, gblColInds,
+                             vals, numEnt);
 
       const LO expectedNumEnt(expectedTgtVals.size());
       TEST_EQUALITY( size_t(numEnt), size_t(expectedNumEnt) );
-      TEST_EQUALITY( size_t(gblColInds.size()),
+      TEST_EQUALITY( size_t(gblColInds.extent(0)),
                      size_t(expectedNumEnt) );
-      TEST_EQUALITY( size_t(vals.size()), size_t(expectedNumEnt) );
+      TEST_EQUALITY( size_t(vals.extent(0)), size_t(expectedNumEnt) );
 
       if (success) {
         for (LO k = 0; k < expectedNumEnt; ++k) {
diff --git a/packages/tpetra/core/test/CrsMatrix/applyDirichlet.cpp b/packages/tpetra/core/test/CrsMatrix/applyDirichlet.cpp
index f36500d478d7..8ddbcb5768dd 100644
--- a/packages/tpetra/core/test/CrsMatrix/applyDirichlet.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/applyDirichlet.cpp
@@ -88,8 +88,10 @@ namespace { // (anonymous)
     using GST = Tpetra::global_size_t;
     using STS = Teuchos::ScalarTraits<SC>;
     using KAT = Kokkos::ArithTraits<IST>;    
-    using local_matrix_type = typename crs_matrix_type::local_matrix_type;
-    using local_graph_type = typename local_matrix_type::staticcrsgraph_type;
+    using local_matrix_device_type = 
+          typename crs_matrix_type::local_matrix_device_type;
+    using local_graph_device_type = 
+          typename local_matrix_device_type::staticcrsgraph_type;
     using device_type = typename crs_matrix_type::device_type;
     using execution_space = typename crs_matrix_type::execution_space;
     using range_type = Kokkos::RangePolicy<execution_space, LO>;
@@ -111,9 +113,9 @@ namespace { // (anonymous)
     vec2.putScalar (STS::zero ());
 
     using row_offsets_type =
-      typename local_graph_type::row_map_type::non_const_type;
+      typename local_graph_device_type::row_map_type::non_const_type;
     using lcl_col_inds_type =
-      typename local_graph_type::entries_type::non_const_type;
+      typename local_graph_device_type::entries_type::non_const_type;
     row_offsets_type rowOffsets ("rowOffsets", lclNumRows+1);    
     lcl_col_inds_type lclColInds ("lclColInds", lclNumRows);
     Kokkos::View<IST*, device_type> values ("values", lclNumRows);
@@ -145,8 +147,8 @@ namespace { // (anonymous)
 	  values(lclRow) = KAT::one () + KAT::one ();
 	});
       
-      local_graph_type G_lcl (lclColInds, rowOffsets);
-      local_matrix_type A_lcl ("A_lcl", G_lcl);
+      local_graph_device_type G_lcl (lclColInds, rowOffsets);
+      local_matrix_device_type A_lcl ("A_lcl", G_lcl);
       crs_matrix_type eye (A_lcl, rowMap, colMap, domMap, ranMap);
       TEST_ASSERT( eye.isFillComplete () );
 
@@ -169,8 +171,8 @@ namespace { // (anonymous)
 	  values(lclRow) = KAT::one () + KAT::one ();
 	});
       
-      local_graph_type G_lcl (lclColInds, rowOffsets);
-      local_matrix_type A_lcl ("A_lcl", G_lcl);
+      local_graph_device_type G_lcl (lclColInds, rowOffsets);
+      local_matrix_device_type A_lcl ("A_lcl", G_lcl);
       crs_matrix_type eye (A_lcl, rowMap, colMap, domMap, ranMap);
       TEST_ASSERT( eye.isFillComplete () );
 
@@ -194,8 +196,8 @@ namespace { // (anonymous)
 	  values(lclRow) = KAT::one () + KAT::one ();
 	});
       
-      local_graph_type G_lcl (lclColInds, rowOffsets);
-      local_matrix_type A_lcl ("A_lcl", G_lcl);
+      local_graph_device_type G_lcl (lclColInds, rowOffsets);
+      local_matrix_device_type A_lcl ("A_lcl", G_lcl);
       crs_matrix_type eye (A_lcl, rowMap, colMap, domMap, ranMap);
       TEST_ASSERT( eye.isFillComplete () );
 
diff --git a/packages/tpetra/core/test/CrsMatrix/sumIntoStaticProfileExtraSpace.cpp b/packages/tpetra/core/test/CrsMatrix/sumIntoStaticProfileExtraSpace.cpp
index 1f77705e4cd7..4ffe85c8a1c7 100644
--- a/packages/tpetra/core/test/CrsMatrix/sumIntoStaticProfileExtraSpace.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/sumIntoStaticProfileExtraSpace.cpp
@@ -112,8 +112,8 @@ TEUCHOS_UNIT_TEST( CrsMatrix, sumIntoStaticProfileExtraSpace )
       const size_t newNumEnt = A.getNumEntriesInLocalRow (lclRow);
       TEST_ASSERT( newNumEnt == 2 );
 
-      Teuchos::ArrayView<const LO> inds_av;
-      Teuchos::ArrayView<const double> vals_av;
+      typename crs_matrix_type::local_inds_host_view_type inds_av;
+      typename crs_matrix_type::values_host_view_type vals_av;
       A.getLocalRowView (lclRow, inds_av, vals_av);
       TEST_ASSERT( inds_av.size () == ptrdiff_t (2) );
       TEST_ASSERT( vals_av.size () == ptrdiff_t (2) );
diff --git a/packages/tpetra/core/test/Distributor/Issue1752.cpp b/packages/tpetra/core/test/Distributor/Issue1752.cpp
index d008e8101b33..d721a7444656 100644
--- a/packages/tpetra/core/test/Distributor/Issue1752.cpp
+++ b/packages/tpetra/core/test/Distributor/Issue1752.cpp
@@ -124,7 +124,6 @@ std::pair<int, std::string> check_matrix(CrsMatrixType& matrix)
   typedef typename CrsMatrixType::scalar_type ST;
   typedef typename CrsMatrixType::local_ordinal_type LO;
   typedef typename CrsMatrixType::global_ordinal_type GO;
-  typedef typename ArrayView<LO>::size_type size_type;
   typedef Tpetra::global_size_t GST;
 
   int ierr = 0;
@@ -140,8 +139,8 @@ std::pair<int, std::string> check_matrix(CrsMatrixType& matrix)
 
   for (LO i=0; i<static_cast<LO>(my_num_rows); i++) {
     auto gbl_row = map->getGlobalElement(i);
-    ArrayView<const LO> cols;
-    ArrayView<const ST> vals;
+    typename CrsMatrixType::local_inds_host_view_type cols;
+    typename CrsMatrixType::values_host_view_type vals;
     matrix.getLocalRowView(i, cols, vals);
 
     std::map<GO,ST> expected;
@@ -162,7 +161,7 @@ std::pair<int, std::string> check_matrix(CrsMatrixType& matrix)
       expected[gbl_row+1] = neg_one;
     }
 
-    if (static_cast<size_type>(expected.size()) != cols.size()) {
+    if (expected.size() != cols.size()) {
       ierr++;
       os << " Error: expected row " << gbl_row
          << " to have " << expected.size()
@@ -171,7 +170,7 @@ std::pair<int, std::string> check_matrix(CrsMatrixType& matrix)
       continue;
     }
 
-    for (typename ArrayView<const ST>::size_type j=0; j<cols.size(); j++) {
+    for (size_t j=0; j<cols.size(); j++) {
       auto gbl_col = matrix.getColMap()->getGlobalElement(cols[j]);
       if (vals[j] != expected[gbl_col]) {
         ierr++;
diff --git a/packages/tpetra/core/test/Distributor/createfromsendsandrecvs.cpp b/packages/tpetra/core/test/Distributor/createfromsendsandrecvs.cpp
index 1bf32184838f..cc8da7481de2 100644
--- a/packages/tpetra/core/test/Distributor/createfromsendsandrecvs.cpp
+++ b/packages/tpetra/core/test/Distributor/createfromsendsandrecvs.cpp
@@ -144,7 +144,7 @@ namespace { // (anonymous)
 
     // Fill in the block sparse matrix.
     for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) { // for each of my rows
-      Teuchos::ArrayView<const LO> lclColInds;
+      typename matrix_type::local_inds_host_view_type lclColInds;
       graph->getLocalRowView (lclRow, lclColInds);
 
       // Put some entries in the matrix.
diff --git a/packages/tpetra/core/test/FECrsGraph/FECrsGraph_UnitTests.cpp b/packages/tpetra/core/test/FECrsGraph/FECrsGraph_UnitTests.cpp
index 7d9ee16bc0b2..5a5d500cb01c 100644
--- a/packages/tpetra/core/test/FECrsGraph/FECrsGraph_UnitTests.cpp
+++ b/packages/tpetra/core/test/FECrsGraph/FECrsGraph_UnitTests.cpp
@@ -88,11 +88,11 @@ bool compare_final_graph_structure(Teuchos::FancyOStream &out,Tpetra::CrsGraph<L
   if (!g1.getDomainMap()->isSameAs(*g2.getDomainMap())) {out<<"Compare: DomainMap failed"<<endl;return false;}
   if (!g1.getColMap()->isSameAs(*g2.getColMap())) {out<<"Compare: ColMap failed"<<endl;g1.describe(out,Teuchos::VERB_EXTREME);g2.describe(out,Teuchos::VERB_EXTREME);return false;}
 
-  auto rowptr1 = g1.getLocalGraph().row_map;
-  auto rowptr2 = g2.getLocalGraph().row_map;
+  auto rowptr1 = g1.getLocalGraphHost().row_map;
+  auto rowptr2 = g2.getLocalGraphHost().row_map;
 
-  auto colind1 = g1.getLocalGraph().entries;
-  auto colind2 = g2.getLocalGraph().entries;
+  auto colind1 = g1.getLocalGraphHost().entries;
+  auto colind2 = g2.getLocalGraphHost().entries;
 
   if (rowptr1.extent(0) != rowptr2.extent(0)) {out<<"Compare: rowptr extent failed"<<endl;return false;}      
   if (colind1.extent(0) != colind2.extent(0)) {out<<"Compare: colind extent failed: "<<colind1.extent(0)<<" vs "<<colind2.extent(0)<<endl;
@@ -148,12 +148,15 @@ bool compare_final_graph_structure_relaxed(Teuchos::FancyOStream &out,
     return false;
   }
 
-  auto hasLID = [](const Teuchos::ArrayView<const LO>& lids, const LO lid) -> bool {
-    auto it = std::find(lids.begin(),lids.end(),lid);
-    return it!=lids.end();
+  typedef typename Tpetra::CrsGraph<LO,GO,Node>::local_inds_host_view_type
+                   lcl_ind_type;
+
+  auto hasLID = [](const lcl_ind_type & lids, const LO lid) -> bool {
+    auto it = std::find(lids.data(),lids.data()+lids.extent(0),lid);
+    return it!=lids.data()+lids.extent(0);
   };
 
-  Teuchos::ArrayView<const LO> cols1, cols2;
+  lcl_ind_type cols1, cols2;
   const LO invLO = Teuchos::OrdinalTraits<LO>::invalid();
   const auto& colMap1 = *g1.getColMap();
   const auto& colMap2 = *g2.getColMap();
diff --git a/packages/tpetra/core/test/FECrsMatrix/FECrsMatrix_UnitTests.cpp b/packages/tpetra/core/test/FECrsMatrix/FECrsMatrix_UnitTests.cpp
index de50cfe6a031..39d8d37a032a 100644
--- a/packages/tpetra/core/test/FECrsMatrix/FECrsMatrix_UnitTests.cpp
+++ b/packages/tpetra/core/test/FECrsMatrix/FECrsMatrix_UnitTests.cpp
@@ -79,14 +79,17 @@ bool compare_final_matrix_structure_impl(Teuchos::FancyOStream &out,Tpetra::CrsM
   if (!g1.getColMap()->isSameAs(*g2.getColMap())) {out<<"Compare: ColMap failed"<<endl;return false;}
   if (!g1.getDomainMap()->isSameAs(*g2.getDomainMap())) {out<<"Compare: DomainMap failed"<<endl;return false;}
 
-  auto rowptr1 = g1.getLocalMatrix().graph.row_map;
-  auto rowptr2 = g2.getLocalMatrix().graph.row_map;
+  auto lclMtx1 = g1.getLocalMatrixHost();
+  auto lclMtx2 = g2.getLocalMatrixHost();
 
-  auto colind1 = g1.getLocalMatrix().graph.entries;
-  auto colind2 = g2.getLocalMatrix().graph.entries;
+  auto rowptr1 = lclMtx1.graph.row_map;
+  auto rowptr2 = lclMtx2.graph.row_map;
 
-  auto values1 = g1.getLocalMatrix().values;
-  auto values2 = g2.getLocalMatrix().values;
+  auto colind1 = lclMtx1.graph.entries;
+  auto colind2 = lclMtx2.graph.entries;
+
+  auto values1 = lclMtx1.values;
+  auto values2 = lclMtx2.values;
 
   if (rowptr1.extent(0) != rowptr2.extent(0)) {out<<"Compare: rowptr extent failed"<<endl;return false;}      
   if (colind1.extent(0) != colind2.extent(0)) {out<<"Compare: colind extent failed"<<endl;return false;}      
@@ -346,25 +349,26 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( FECrsMatrix, Assemble1D_Kokkos, LO, GO, Scala
   FEMAT mat1(graph); // Here we use graph as a FECrsGraph
   CMAT mat2(graph);  // Here we use graph as a CrsGraph in OWNED mode
   mat1.beginFill();
-
-  auto k_e2n = pack.k_element2node;
-  auto localMat = mat1.getLocalMatrix();
-  auto localMap = pack.overlapMap->getLocalMap();
-  //get local map too
-
-  Kokkos::parallel_for("assemble_1d",
-		       range_type (0,k_e2n.extent(0)), 
-		       KOKKOS_LAMBDA(const size_t i) {
-    size_t extent = k_e2n.extent(1);
-    for(size_t j=0; j < extent; j++) {
-      LO lid_j = localMap.getLocalElement(k_e2n(i, j));
-      for(size_t k=0; k < extent; k++) {
-        LO lid_k = localMap.getLocalElement(k_e2n(i, k));
-	ImplScalarType tmp = kokkosValues(j, k);
-	localMat.sumIntoValues(lid_j, &lid_k, 1, &tmp, true, true);
+  {
+    auto k_e2n = pack.k_element2node;
+    auto localMat = mat1.getLocalMatrixDevice();
+    auto localMap = pack.overlapMap->getLocalMap();
+    //get local map too
+
+    Kokkos::parallel_for("assemble_1d",
+		         range_type (0,k_e2n.extent(0)), 
+		         KOKKOS_LAMBDA(const size_t i) {
+      size_t extent = k_e2n.extent(1);
+      for(size_t j=0; j < extent; j++) {
+        LO lid_j = localMap.getLocalElement(k_e2n(i, j));
+        for(size_t k=0; k < extent; k++) {
+          LO lid_k = localMap.getLocalElement(k_e2n(i, k));
+	  ImplScalarType tmp = kokkosValues(j, k);
+	  localMat.sumIntoValues(lid_j, &lid_k, 1, &tmp, true, true);
+        }
       }
-    }
-  });
+    });
+  }
   mat1.endFill();
   
   for(size_t i=0; i<(size_t)pack.element2node.size(); i++) {
@@ -486,22 +490,24 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( FECrsMatrix, Assemble1D_LocalIndex_Kokkos, LO
   FEMAT mat1(graph); // Here we use graph as a FECrsGraph
   CMAT mat2(graph);  // Here we use graph as a CrsGraph in OWNED mode
   mat1.beginFill();
-  auto k_e2n = pack.k_element2node;
-  auto localMat = mat1.getLocalMatrix();
-  auto localMap = pack.overlapMap->getLocalMap();
-
-  Kokkos::parallel_for("assemble_1d_local_index", 
-		       range_type (0, k_e2n.extent(0)),
-		       KOKKOS_LAMBDA(const size_t i) {
-    for(size_t j=0; j<k_e2n.extent(1); j++) {
-      LO lid_j = localMap.getLocalElement(k_e2n(i, j));
-      for(size_t k=0; k<k_e2n.extent(1); k++) {
-        LO lid_k = localMap.getLocalElement(k_e2n(i, k));
-	ImplScalarType tmp = kokkosValues(j, k);
-	localMat.sumIntoValues(lid_j, &lid_k, 1, &tmp, true, true);
+  {
+    auto k_e2n = pack.k_element2node;
+    auto localMat = mat1.getLocalMatrixDevice();
+    auto localMap = pack.overlapMap->getLocalMap();
+
+    Kokkos::parallel_for("assemble_1d_local_index", 
+		         range_type (0, k_e2n.extent(0)),
+		         KOKKOS_LAMBDA(const size_t i) {
+      for(size_t j=0; j<k_e2n.extent(1); j++) {
+        LO lid_j = localMap.getLocalElement(k_e2n(i, j));
+        for(size_t k=0; k<k_e2n.extent(1); k++) {
+          LO lid_k = localMap.getLocalElement(k_e2n(i, k));
+	  ImplScalarType tmp = kokkosValues(j, k);
+	  localMat.sumIntoValues(lid_j, &lid_k, 1, &tmp, true, true);
+        }
       }
-    }
-  });
+    });
+  }
   mat1.endFill();
 
   for(size_t i=0; i<(size_t)pack.element2node.size(); i++) {
@@ -573,7 +579,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( FECrsMatrix, Assemble1D_LocalIndex_Kokkos_Mul
     mat1.beginFill();
     mat1.setAllToScalar(SC_ZERO);
     auto k_e2n = pack.k_element2node;
-    auto localMat = mat1.getLocalMatrix();
+    auto localMat = mat1.getLocalMatrixDevice();
     auto localMap = pack.overlapMap->getLocalMap();
     
     Kokkos::parallel_for("assemble_1d_local_index", 
diff --git a/packages/tpetra/core/test/ImportExport/ExportToStaticGraphCrsMatrix.cpp b/packages/tpetra/core/test/ImportExport/ExportToStaticGraphCrsMatrix.cpp
index ce30821b4503..0f30119b1e96 100644
--- a/packages/tpetra/core/test/ImportExport/ExportToStaticGraphCrsMatrix.cpp
+++ b/packages/tpetra/core/test/ImportExport/ExportToStaticGraphCrsMatrix.cpp
@@ -393,8 +393,8 @@ namespace {
 
       // const size_t srcMaxNumRowEntries = A_src->getNodeMaxNumRowEntries ();
       // const size_t tgtMaxNumRowEntries = A_tgt->getNodeMaxNumRowEntries ();
-      ArrayView<const LO> srcIndView, tgtIndView;
-      ArrayView<const ST> srcValView, tgtValView;
+      typename matrix_type::local_inds_host_view_type srcIndView, tgtIndView;
+      typename matrix_type::values_host_view_type srcValView, tgtValView;
 
       // We assume that the row Maps of A_src and A_tgt share the same
       // GIDs, except that the row Map of A_src has overlap, and the
@@ -429,14 +429,14 @@ namespace {
         A_tgt->getLocalRowView (tgtLocalRow, tgtIndView, tgtValView);
 
         // Assume for now that the entries are sorted by column index.
-        if (! std::equal (srcIndView.begin(), srcIndView.end(), tgtIndView.begin())) {
+        if (! std::equal (srcIndView.data(), srcIndView.data()+srcIndView.extent(0), tgtIndView.data())) {
           allRowsAgree = false;
           disagreeingRows.push_back (globalRow);
           continue;
         }
         // FIXME (mfh 15 Mar 2012) Should we include a small error
         // tolerance here for roundoff?
-        if (! std::equal (srcValView.begin(), srcValView.end(), tgtValView.begin())) {
+        if (! std::equal (srcValView.data(), srcValView.data()+srcValView.extent(0), tgtValView.data())) {
           allRowsAgree = false;
           disagreeingRows.push_back (globalRow);
           continue;
diff --git a/packages/tpetra/core/test/ImportExport/ImportExport_ImportConstructExpert.cpp b/packages/tpetra/core/test/ImportExport/ImportExport_ImportConstructExpert.cpp
index 86efff1fa91f..ea09dc501bf5 100644
--- a/packages/tpetra/core/test/ImportExport/ImportExport_ImportConstructExpert.cpp
+++ b/packages/tpetra/core/test/ImportExport/ImportExport_ImportConstructExpert.cpp
@@ -148,7 +148,7 @@ namespace {
 
     // Fill in the block sparse matrix.
     for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) { // for each of my rows
-      Teuchos::ArrayView<const LO> lclColInds;
+      typename graph_type::local_inds_host_view_type lclColInds;
       G->getLocalRowView (lclRow, lclColInds);
 
       // Put some entries in the matrix.
diff --git a/packages/tpetra/core/test/ImportExport2/ImportExport2_CrsSortingUtils.cpp b/packages/tpetra/core/test/ImportExport2/ImportExport2_CrsSortingUtils.cpp
index 4cffccdca55c..f7f6c225f636 100644
--- a/packages/tpetra/core/test/ImportExport2/ImportExport2_CrsSortingUtils.cpp
+++ b/packages/tpetra/core/test/ImportExport2/ImportExport2_CrsSortingUtils.cpp
@@ -55,10 +55,12 @@
 namespace {
 
 template<class DeviceViewType>
-struct create_views {
+struct create_device_views {
   DeviceViewType d;
   typename DeviceViewType::HostMirror h;
-  create_views(const std::vector<typename DeviceViewType::value_type>& x)
+  create_device_views(
+    const std::vector<typename DeviceViewType::value_type>& x
+  )
   {
     d = DeviceViewType("x_d", x.size());
     h = Kokkos::create_mirror_view(d);
@@ -319,11 +321,13 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( Import_Util, SortCrsEntriesKokkos, Scalar, LO
 {
   using Tpetra::Import_Util::sortCrsEntries;
 
-  typedef typename Tpetra::CrsMatrix<Scalar,LO,GO,NT>::local_matrix_type local_matrix_type;
-  typedef typename local_matrix_type::StaticCrsGraphType graph_type;
+  typedef typename Tpetra::CrsMatrix<Scalar,LO,GO,NT>::local_matrix_device_type
+                   local_matrix_device_type;
+  typedef typename local_matrix_device_type::StaticCrsGraphType graph_type;
   typedef typename graph_type::row_map_type::non_const_type rowptr_view_type;
   typedef typename graph_type::entries_type::non_const_type colind_view_type;
-  typedef typename local_matrix_type::values_type::non_const_type vals_view_type;
+  typedef typename local_matrix_device_type::values_type::non_const_type 
+                   vals_view_type;
 
   typedef typename rowptr_view_type::value_type index_type;
   typedef typename colind_view_type::value_type ordinal_type;
@@ -348,7 +352,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( Import_Util, SortCrsEntriesKokkos, Scalar, LO
   vals_type vals, vals2;
 
   generate_crs_entries<scalar_type,ordinal_type,index_type>(
-      rowptr, rowptr2, colind, colind2, vals, vals2, max_num_entries_per_row, num_cols);
+      rowptr, rowptr2, colind, colind2, vals, vals2, 
+      max_num_entries_per_row, num_cols);
 
   {
     //
@@ -364,10 +369,14 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( Import_Util, SortCrsEntriesKokkos, Scalar, LO
         colind_rand, vals_rand, rowptr, colind, vals);
 
     // Create mirror views of the CRS entries
-    auto rowptr_views = create_views<rowptr_view_type>(rowptr);
-    auto colind_rand_views = create_views<colind_view_type>(colind_rand);
-    auto colind_rand_copy_views = create_views<colind_view_type>(colind_rand);
-    auto vals_rand_views = create_views<vals_view_type>(vals_rand);
+    auto rowptr_views = 
+         create_device_views<rowptr_view_type>(rowptr);
+    auto colind_rand_views = 
+         create_device_views<colind_view_type>(colind_rand);
+    auto colind_rand_copy_views = 
+         create_device_views<colind_view_type>(colind_rand);
+    auto vals_rand_views = 
+         create_device_views<vals_view_type>(vals_rand);
 
     //
     // Sort the GIDs and associated values
diff --git a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp
index 0705713bf006..0451932d45d0 100644
--- a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp
+++ b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp
@@ -285,7 +285,7 @@ namespace {
       }
       row = 0;
       for (size_t i = 0; i < tgt_map->getNodeNumElements (); ++i, ++row) {
-        ArrayView<const LO> rowview;
+        typename CrsGraph<LO,GO>::local_inds_host_view_type rowview;
         tgt_graph->getLocalRowView( row, rowview );
         TEST_EQUALITY(rowview.size(), 1);
         TEST_EQUALITY(rowview[0], row);
@@ -358,9 +358,9 @@ namespace {
              globalrow <= tgt_map->getMaxGlobalIndex ();
              ++globalrow) {
           LO localrow = tgt_map->getLocalElement (globalrow);
-          ArrayView<const LO> rowview;
+          typename CrsGraph<LO,GO>::local_inds_host_view_type rowview;
           tgt_graph->getLocalRowView (localrow, rowview);
-          TEST_EQUALITY(rowview.size(), globalrow+1);
+          TEST_EQUALITY((size_t)rowview.size(), (size_t)globalrow+1);
 
           // The target graph doesn't necessarily promise sorted
           // order.  Thus, we copy out the local row view, convert to
@@ -396,6 +396,7 @@ namespace {
   {
     typedef Tpetra::global_size_t GST;
     typedef Map<LO, GO> map_type;
+    typedef CrsMatrix<Scalar, LO, GO> crs_type;
 
     out << "(CrsMatrixImportExport,doImport) test" << endl;
     OSTab tab1 (out); // Add one tab level
@@ -475,8 +476,8 @@ namespace {
              ++gblRow) {
           const LO lclRow = tgt_map->getLocalElement (gblRow);
 
-          ArrayView<const LO> lclInds;
-          ArrayView<const Scalar> lclVals;
+          typename crs_type::local_inds_host_view_type lclInds;
+          typename crs_type::values_host_view_type lclVals;
           tgt_mat->getLocalRowView (lclRow, lclInds, lclVals);
           TEST_EQUALITY_CONST(lclInds.size(), 1);
           TEST_EQUALITY_CONST(lclVals.size(), 1);
@@ -523,11 +524,13 @@ namespace {
       // together, so we make a rough guess.
       const magnitude_type tol =
           as<magnitude_type> (10) * ScalarTraits<magnitude_type>::eps ();
-
-      Array<LO> tgtRowInds;
-      Array<Scalar>  tgtRowVals;
-      Array<LO> tgt2RowInds;
-      Array<Scalar>  tgt2RowVals;
+      typedef typename CrsMatrix<Scalar, LO, GO>::nonconst_local_inds_host_view_type lids_type;
+      typedef typename CrsMatrix<Scalar,LO,GO>::nonconst_values_host_view_type vals_type;
+ 
+      lids_type tgtRowInds;
+      vals_type tgtRowVals;
+      lids_type tgt2RowInds;
+      vals_type tgt2RowVals;
       for (LO localrow = tgt_map->getMinLocalIndex();
            localrow <= tgt_map->getMaxLocalIndex();
            ++localrow)
@@ -539,21 +542,21 @@ namespace {
         TEST_EQUALITY(tgtNumEntries, tgt2NumEntries);
 
         if (tgtNumEntries > as<size_t> (tgtRowInds.size ())) {
-          tgtRowInds.resize (tgtNumEntries);
-          tgtRowVals.resize (tgtNumEntries);
+          Kokkos::resize(tgtRowInds,tgtNumEntries);
+          Kokkos::resize(tgtRowVals,tgtNumEntries);
         }
         if (tgt2NumEntries > as<size_t> (tgt2RowInds.size ())) {
-          tgt2RowInds.resize (tgt2NumEntries);
-          tgt2RowVals.resize (tgt2NumEntries);
+          Kokkos::resize(tgt2RowInds,tgt2NumEntries);
+          Kokkos::resize(tgt2RowVals,tgt2NumEntries);
         }
-        tgt_mat->getLocalRowCopy (localrow, tgtRowInds(), tgtRowVals(), tgtNumEntries);
-        A_tgt2->getLocalRowCopy (localrow, tgt2RowInds(), tgt2RowVals(), tgt2NumEntries);
+        tgt_mat->getLocalRowCopy (localrow, tgtRowInds, tgtRowVals, tgtNumEntries);
+        A_tgt2->getLocalRowCopy (localrow, tgt2RowInds, tgt2RowVals, tgt2NumEntries);
 
         // Entries should be sorted, but let's sort them by column
         // index just in case.  This is why we got a row copy instead
         // of a row view.
-        Tpetra::sort2 (tgtRowInds.begin(), tgtRowInds.end(), tgtRowVals.begin());
-        Tpetra::sort2 (tgt2RowInds.begin(), tgt2RowInds.end(), tgt2RowVals.begin());
+        Tpetra::sort2 (tgtRowInds, tgtNumEntries, tgtRowVals);    // sort only the entries just copied; views may be longer
+        Tpetra::sort2 (tgt2RowInds, tgt2NumEntries, tgt2RowVals); // (they are grown but never shrunk across rows)
 
         // Now that the entries are sorted, compare to make sure they
         // have the same column indices and values.  In the fully
@@ -634,11 +637,12 @@ namespace {
         for (GO globalrow=tgt_map->getMinGlobalIndex();
              globalrow<=tgt_map->getMaxGlobalIndex(); ++globalrow) {
           LO localrow = tgt_map->getLocalElement(globalrow);
-          ArrayView<const LO> rowinds;
-          ArrayView<const Scalar> rowvals;
+          typename CrsMatrix<Scalar,LO,GO>::local_inds_host_view_type rowinds;
+          typename CrsMatrix<Scalar,LO,GO>::values_host_view_type rowvals;
+
           tgt_mat->getLocalRowView(localrow, rowinds, rowvals);
-          TEST_EQUALITY(rowinds.size(), globalrow);
-          TEST_EQUALITY(rowvals.size(), globalrow);
+          TEST_EQUALITY(rowinds.extent(0), (size_t)globalrow);
+          TEST_EQUALITY(rowvals.extent(0), (size_t)globalrow);
 
           // The target graph doesn't necessarily promise sorted
           // order.  Thus, we copy out the local row view, convert to
@@ -721,7 +725,7 @@ bool graphs_are_same(const RCP<Graph>& G1, const RCP<const Graph>& G2)
   if (errors != 0) return false;
 
   for (LO i=0; i<static_cast<LO>(G1->getNodeNumRows()); i++) {
-    ArrayView<const LO> V1, V2;
+    typename Graph::local_inds_host_view_type V1, V2;
     G1->getLocalRowView(i, V1);
     G2->getLocalRowView(i, V2);
     if (V1.size() != V2.size()) {
@@ -731,7 +735,7 @@ bool graphs_are_same(const RCP<Graph>& G1, const RCP<const Graph>& G2)
       continue;
     }
     int jerr = 0;
-    for (LO j=0; static_cast<LO>(j<V1.size()); j++) {
+    for (LO j=0; j<static_cast<LO>(V1.size()); j++) {
       if (V1[j] != V2[j])
         jerr++;
     }
@@ -2516,11 +2520,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_2_DECL( Import, AdvancedConstructors, LO, GO )  {
     Tpetra::RowMatrixTransposer<Scalar, LO, GO, Node> transposer(P);
     R = transposer.createTranspose();
 
-    ArrayRCP<const size_t> rowptr;
-    ArrayRCP<const LO> colind;
-    ArrayRCP<const Scalar> vals;
-    R->getAllValues(rowptr,colind,vals);
-
     // Form AP
     AP = rcp (new CrsMatrixType(A->getRowMap(),0));
     Tpetra::MatrixMatrix::Multiply(*A,false,*P,false,*AP);
diff --git a/packages/tpetra/core/test/MatrixMatrix/FECrs_MatrixMatrix_UnitTests.cpp b/packages/tpetra/core/test/MatrixMatrix/FECrs_MatrixMatrix_UnitTests.cpp
index ac17eebe9134..78ed6208bece 100644
--- a/packages/tpetra/core/test/MatrixMatrix/FECrs_MatrixMatrix_UnitTests.cpp
+++ b/packages/tpetra/core/test/MatrixMatrix/FECrs_MatrixMatrix_UnitTests.cpp
@@ -264,17 +264,20 @@ bool compare_matrices (const Tpetra::CrsMatrix<ST,LO,GO,NT>& A,
     return false;
   }
 
-  auto findLID = [](const Teuchos::ArrayView<const LO>& lids, const LO lid) -> int {
-    auto it = std::find(lids.begin(),lids.end(),lid);
-    if (it==lids.end()) {
+  typedef typename Tpetra::CrsMatrix<ST,LO,GO,NT> crs_matrix_type;
+  auto findLID = [](
+       const typename crs_matrix_type::local_inds_host_view_type& lids,
+       const LO lid) -> int {
+    auto it = std::find(lids.data(),lids.data()+lids.extent(0),lid);
+    if (it==lids.data()+lids.extent(0)) {
       return -1;
     } else {
-      return std::distance(lids.begin(),it);
+      return std::distance(lids.data(),it);
     }
   };
 
-  Teuchos::ArrayView<const ST> valsA, valsB;
-  Teuchos::ArrayView<const LO> colsA, colsB;
+  typename crs_matrix_type::values_host_view_type  valsA, valsB;
+  typename crs_matrix_type::local_inds_host_view_type colsA, colsB;
   const LO invLO = Teuchos::OrdinalTraits<LO>::invalid();
   const auto& colMapA = *gA.getColMap();
   const auto& colMapB = *gB.getColMap();
@@ -293,7 +296,7 @@ bool compare_matrices (const Tpetra::CrsMatrix<ST,LO,GO,NT>& A,
     }
 
     // Loop over rows entries
-    for (int j=0; j<numEntries; ++j) {
+    for (size_t j=0; j<numEntries; ++j) {
       const LO lidA = colsA[j];
       const GO gid = colMapA.getGlobalElement(lidA);
       const LO lidB = colMapB.getLocalElement(gid);
diff --git a/packages/tpetra/core/test/MatrixMatrix/MatrixMatrixKernels_UnitTests.cpp b/packages/tpetra/core/test/MatrixMatrix/MatrixMatrixKernels_UnitTests.cpp
index d4e59e5dc838..174e2c605a27 100644
--- a/packages/tpetra/core/test/MatrixMatrix/MatrixMatrixKernels_UnitTests.cpp
+++ b/packages/tpetra/core/test/MatrixMatrix/MatrixMatrixKernels_UnitTests.cpp
@@ -405,7 +405,8 @@ mult_test_results multiply_test_kernel(
   }
 
   // Extract Kokkos CrsMatrices
-  typedef typename Tpetra::CrsMatrix<SC,LO,GO,NO>::local_matrix_type KCRS;
+  typedef typename Tpetra::CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type 
+                   KCRS;
   typedef typename KCRS::device_type device_t;
   typedef typename KCRS::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
@@ -415,8 +416,8 @@ mult_test_results multiply_test_kernel(
           typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
 
   // Grab the  Kokkos::SparseCrsMatrix-es
-  const KCRS & Ak = Aeff->getLocalMatrix();
-  const KCRS & Bk = Beff->getLocalMatrix();
+  const KCRS & Ak = Aeff->getLocalMatrixDevice();
+  const KCRS & Bk = Beff->getLocalMatrixDevice();
 
   // Setup
   // As a note "SPGEMM_MKL" will *NOT* pass all of these tests
diff --git a/packages/tpetra/core/test/MatrixMatrix/MatrixMatrix_UnitTests.cpp b/packages/tpetra/core/test/MatrixMatrix/MatrixMatrix_UnitTests.cpp
index caa7279ea8b2..0256eafe3f78 100644
--- a/packages/tpetra/core/test/MatrixMatrix/MatrixMatrix_UnitTests.cpp
+++ b/packages/tpetra/core/test/MatrixMatrix/MatrixMatrix_UnitTests.cpp
@@ -1527,7 +1527,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Tpetra_MatMat, threaded_add_sorted, SC, LO, GO
   size_t nrows = 1000;
   size_t nnzPerRow = 20;
   using crs_matrix_type = Tpetra::CrsMatrix<SC, LO, GO, NT>;
-  using KCRS = typename crs_matrix_type::local_matrix_type;
+  using KCRS = typename crs_matrix_type::local_matrix_device_type;
   using ISC = typename crs_matrix_type::impl_scalar_type;
   using ValuesType = typename KCRS::values_type::non_const_type;
   using RowptrsType = typename KCRS::row_map_type::non_const_type;
@@ -1539,10 +1539,16 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Tpetra_MatMat, threaded_add_sorted, SC, LO, GO
   RowptrsType rowptrsCRS[3];
   ColindsType colindsCRS[3];
   //Populate A and B
+  typename ValuesType::HostMirror vals[3];
+     vals[0] = typename ValuesType::HostMirror("vals0", nrows*nnzPerRow);
+     vals[1] = typename ValuesType::HostMirror("vals1", nrows*nnzPerRow);
+  typename RowptrsType::HostMirror rowptrs[3];
+     rowptrs[0] = typename RowptrsType::HostMirror("rowptr0", nrows+1);
+     rowptrs[1] = typename RowptrsType::HostMirror("rowptr1", nrows+1);
+  typename ColindsType::HostMirror colinds[3]; 
+     colinds[0] = typename ColindsType::HostMirror("colind0", nrows*nnzPerRow);
+     colinds[1] = typename ColindsType::HostMirror("colind1", nrows*nnzPerRow);
   {
-    ISC* vals[2] = {new ISC[nrows * nnzPerRow], new ISC[nrows * nnzPerRow]};
-    LO* rowptrs[2] = {new LO[nrows * nnzPerRow], new LO[nrows * nnzPerRow]};
-    LO* colinds[2] = {new LO[nrows * nnzPerRow], new LO[nrows * nnzPerRow]};
     //want consistent test results
     srand(12);
     for(LO m = 0; m < 2; m++)
@@ -1577,34 +1583,37 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Tpetra_MatMat, threaded_add_sorted, SC, LO, GO
       {
         rowptrs[m][row] = row * nnzPerRow;
       }
-      valsCRS[m] = ValuesType("Values", nrows * nnzPerRow);
-      rowptrsCRS[m] = RowptrsType("RowPtrs", nrows + 1);
-      colindsCRS[m] = ColindsType("ColInds", nrows * nnzPerRow);
-      for(size_t i = 0; i < nrows + 1; i++)
-      {
-        rowptrsCRS[m](i) = rowptrs[m][i];
-      }
-      for(size_t i = 0; i < nrows * nnzPerRow; i++)
-      {
-        valsCRS[m](i) = vals[m][i];
-        colindsCRS[m](i) = colinds[m][i];
-      }
-    }
-    for(int i = 0; i < 2; i++)
-    {
-      delete[] vals[i];
-      delete[] rowptrs[i];
-      delete[] colinds[i];
+      valsCRS[m] = Kokkos::create_mirror_view_and_copy(
+                                      typename NT::memory_space(), vals[m]);
+      rowptrsCRS[m] = Kokkos::create_mirror_view_and_copy(
+                                      typename NT::memory_space(), rowptrs[m]);
+      colindsCRS[m] = Kokkos::create_mirror_view_and_copy(
+                                      typename NT::memory_space(), colinds[m]);
     }
   }
   //now run the threaded addition on mats[0] and mats[1]
   ISC zero(0);
   ISC one(1);
-  Tpetra::MMdetails::AddKernels<SC, LO, GO, NT>::addSorted(valsCRS[0], rowptrsCRS[0], colindsCRS[0], one, valsCRS[1], rowptrsCRS[1], colindsCRS[1], one, valsCRS[2], rowptrsCRS[2], colindsCRS[2]);
-  //the above function is an unfenced kernel launch, and the verification below relies on UVM, so fence here.
+  Tpetra::MMdetails::AddKernels<SC, LO, GO, NT>::addSorted(
+                                valsCRS[0], rowptrsCRS[0], colindsCRS[0], one, 
+                                valsCRS[1], rowptrsCRS[1], colindsCRS[1], one, 
+                                valsCRS[2], rowptrsCRS[2], colindsCRS[2]);
+
   ExecSpace().fence();
+
+  vals[2] = Kokkos::create_mirror_view_and_copy(
+                           typename ValuesType::HostMirror::memory_space(),
+                           valsCRS[2]);
+  rowptrs[2] = Kokkos::create_mirror_view_and_copy(
+                              typename RowptrsType::HostMirror::memory_space(),
+                              rowptrsCRS[2]);
+  colinds[2] = Kokkos::create_mirror_view_and_copy(
+                              typename ColindsType::HostMirror::memory_space(),
+                              colindsCRS[2]);
+  
+  //addSorted is an unfenced kernel launch; it was fenced above and C was copied into host mirrors, so the checks below do not rely on UVM.
   //now scan through C's rows and entries to check they are correct
-  TEST_ASSERT(rowptrsCRS[0].extent(0) == rowptrsCRS[2].extent(0));
+  TEST_ASSERT(rowptrs[0].extent(0) == rowptrs[2].extent(0));
   for(size_t i = 0; i < nrows; i++)
   {
     //also compute what C's row should be (as dense values)
@@ -1612,11 +1621,11 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Tpetra_MatMat, threaded_add_sorted, SC, LO, GO
     std::vector<bool> correctEntries(nrows, false);
     for(size_t j = 0; j < nnzPerRow; j++)
     {
-      int col1 = colindsCRS[0](i * nnzPerRow + j);
-      int col2 = colindsCRS[1](i * nnzPerRow + j);
-      correctVals[col1] += valsCRS[0](i * nnzPerRow + j);
+      int col1 = colinds[0](i * nnzPerRow + j);
+      int col2 = colinds[1](i * nnzPerRow + j);
+      correctVals[col1] += vals[0](i * nnzPerRow + j);
       correctEntries[col1] = true;
-      correctVals[col2] += valsCRS[1](i * nnzPerRow + j);
+      correctVals[col2] += vals[1](i * nnzPerRow + j);
       correctEntries[col2] = true;
     }
     size_t actualNNZ = 0;
@@ -1625,8 +1634,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Tpetra_MatMat, threaded_add_sorted, SC, LO, GO
       if(correctEntries[j])
         actualNNZ++;
     }
-    size_t Crowstart = rowptrsCRS[2](i);
-    size_t Crowlen = rowptrsCRS[2](i + 1) - Crowstart;
+    size_t Crowstart = rowptrs[2](i);
+    size_t Crowlen = rowptrs[2](i + 1) - Crowstart;
     TEST_ASSERT(Crowlen == actualNNZ);
     for(size_t j = 0; j < Crowlen; j++)
     {
@@ -1634,9 +1643,9 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Tpetra_MatMat, threaded_add_sorted, SC, LO, GO
       if(j > 0)
       {
         //Check entries are sorted
-        TEST_ASSERT(colindsCRS[2](Coffset - 1) <= colindsCRS[2](Coffset));
+        TEST_ASSERT(colinds[2](Coffset - 1) <= colinds[2](Coffset));
       }
-      TEST_FLOATING_EQUALITY(valsCRS[2](Coffset), correctVals[colindsCRS[2](Coffset)], 1e-12);
+      TEST_FLOATING_EQUALITY(vals[2](Coffset), correctVals[colinds[2](Coffset)], 1e-12);
     }
   }
 }
@@ -1672,7 +1681,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Tpetra_MatMat, threaded_add_unsorted, SC, LO,
   size_t nrows = 1000;
   size_t nnzPerRow = 20;
   typedef Tpetra::CrsMatrix<SC, LO, GO, NT> crs_matrix_type;
-  typedef typename crs_matrix_type::local_matrix_type KCRS;
+  typedef typename crs_matrix_type::local_matrix_device_type KCRS;
   typedef typename crs_matrix_type::impl_scalar_type ISC;
   typedef typename KCRS::values_type::non_const_type ValuesType;
   typedef typename KCRS::row_map_type::non_const_type RowptrsType;
@@ -1682,10 +1691,19 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Tpetra_MatMat, threaded_add_unsorted, SC, LO,
   RowptrsType rowptrsCRS[3];
   ColindsType colindsCRS[3];
   //Populate A and B
+  typename ValuesType::HostMirror vals[3];
+     vals[0] = typename ValuesType::HostMirror("vals0", nrows*nnzPerRow);
+     vals[1] = typename ValuesType::HostMirror("vals1", nrows*nnzPerRow);
+     vals[2] = typename ValuesType::HostMirror("vals2", nrows*nnzPerRow);
+  typename RowptrsType::HostMirror rowptrs[3];
+     rowptrs[0] = typename RowptrsType::HostMirror("rowptr0", nrows+1);
+     rowptrs[1] = typename RowptrsType::HostMirror("rowptr1", nrows+1);
+     rowptrs[2] = typename RowptrsType::HostMirror("rowptr2", nrows+1);
+  typename ColindsType::HostMirror colinds[3]; 
+     colinds[0] = typename ColindsType::HostMirror("colind0", nrows*nnzPerRow);
+     colinds[1] = typename ColindsType::HostMirror("colind1", nrows*nnzPerRow);
+     colinds[2] = typename ColindsType::HostMirror("colind2", nrows*nnzPerRow);
   {
-    ISC* vals[2] = {new ISC[nrows * nnzPerRow], new ISC[nrows * nnzPerRow]};
-    LO* rowptrs[2] = {new LO[nrows * nnzPerRow], new LO[nrows * nnzPerRow]};
-    LO* colinds[2] = {new LO[nrows * nnzPerRow], new LO[nrows * nnzPerRow]};
     //want consistent test results
     srand(12);
     for(LO m = 0; m < 2; m++)
@@ -1719,32 +1737,35 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Tpetra_MatMat, threaded_add_unsorted, SC, LO,
       {
         rowptrs[m][row] = row * nnzPerRow;
       }
-      valsCRS[m] = ValuesType("Values", nrows * nnzPerRow);
-      rowptrsCRS[m] = RowptrsType("RowPtrs", nrows + 1);
-      colindsCRS[m] = ColindsType("ColInds", nrows * nnzPerRow);
-      for(size_t i = 0; i < nrows + 1; i++)
-      {
-        rowptrsCRS[m](i) = rowptrs[m][i];
-      }
-      for(size_t i = 0; i < nrows * nnzPerRow; i++)
-      {
-        valsCRS[m](i) = vals[m][i];
-        colindsCRS[m](i) = colinds[m][i];
-      }
-    }
-    for(int i = 0; i < 2; i++)
-    {
-      delete[] vals[i];
-      delete[] rowptrs[i];
-      delete[] colinds[i];
+      valsCRS[m] = Kokkos::create_mirror_view_and_copy(
+                                      typename NT::memory_space(), vals[m]);
+      rowptrsCRS[m] = Kokkos::create_mirror_view_and_copy(
+                                      typename NT::memory_space(), rowptrs[m]);
+      colindsCRS[m] = Kokkos::create_mirror_view_and_copy(
+                                      typename NT::memory_space(), colinds[m]);
     }
   }
   //now run the threaded addition on mats[0] and mats[1]
   ISC zero(0);
   ISC one(1);
-  Tpetra::MMdetails::AddKernels<SC, LO, GO, NT>::addUnsorted(valsCRS[0], rowptrsCRS[0], colindsCRS[0], one, valsCRS[1], rowptrsCRS[1], colindsCRS[1], one, nrows, valsCRS[2], rowptrsCRS[2], colindsCRS[2]);
+  Tpetra::MMdetails::AddKernels<SC, LO, GO, NT>::addUnsorted(
+                               valsCRS[0], rowptrsCRS[0], colindsCRS[0], one, 
+                               valsCRS[1], rowptrsCRS[1], colindsCRS[1], one, 
+                               nrows, valsCRS[2], rowptrsCRS[2], colindsCRS[2]);
+
   //now scan through C's rows and entries to check they are correct
   TEST_ASSERT(rowptrsCRS[0].extent(0) == rowptrsCRS[2].extent(0));
+
+  vals[2] = Kokkos::create_mirror_view_and_copy(
+                           typename ValuesType::HostMirror::memory_space(),
+                           valsCRS[2]);
+  rowptrs[2] = Kokkos::create_mirror_view_and_copy(
+                              typename RowptrsType::HostMirror::memory_space(),
+                              rowptrsCRS[2]);
+  colinds[2] = Kokkos::create_mirror_view_and_copy(
+                              typename ColindsType::HostMirror::memory_space(),
+                              colindsCRS[2]);
+
   for(size_t i = 0; i < nrows; i++)
   {
     //also compute what C's row should be (as dense values)
@@ -1752,11 +1773,11 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Tpetra_MatMat, threaded_add_unsorted, SC, LO,
     std::vector<bool> correctEntries(nrows, false);
     for(size_t j = 0; j < nnzPerRow; j++)
     {
-      int col1 = colindsCRS[0][i * nnzPerRow + j];
-      int col2 = colindsCRS[1][i * nnzPerRow + j];
-      correctVals[col1] += valsCRS[0](i * nnzPerRow + j);
+      int col1 = colinds[0][i * nnzPerRow + j];
+      int col2 = colinds[1][i * nnzPerRow + j];
+      correctVals[col1] += vals[0](i * nnzPerRow + j);
       correctEntries[col1] = true;
-      correctVals[col2] += valsCRS[1](i * nnzPerRow + j);
+      correctVals[col2] += vals[1](i * nnzPerRow + j);
       correctEntries[col2] = true;
     }
     size_t actualNNZ = 0;
@@ -1765,8 +1786,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Tpetra_MatMat, threaded_add_unsorted, SC, LO,
       if(correctEntries[j])
         actualNNZ++;
     }
-    size_t Crowstart = rowptrsCRS[2](i);
-    size_t Crowlen = rowptrsCRS[2](i + 1) - Crowstart;
+    size_t Crowstart = rowptrs[2](i);
+    size_t Crowlen = rowptrs[2](i + 1) - Crowstart;
     TEST_ASSERT(Crowlen == actualNNZ);
     for(size_t j = 0; j < Crowlen; j++)
     {
@@ -1774,9 +1795,9 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Tpetra_MatMat, threaded_add_unsorted, SC, LO,
       if(j > 0)
       {
         //Check entries are sorted
-        TEST_ASSERT(colindsCRS[2](Coffset - 1) <= colindsCRS[2](Coffset));
+        TEST_ASSERT(colinds[2](Coffset - 1) <= colinds[2](Coffset));
       }
-      TEST_FLOATING_EQUALITY(valsCRS[2](Coffset), correctVals[colindsCRS[2](Coffset)], 1e-12);
+      TEST_FLOATING_EQUALITY(vals[2](Coffset), correctVals[colinds[2](Coffset)], 1e-12);
     }
   }
 }
@@ -1792,7 +1813,7 @@ bool checkLocallySorted(const CrsMat& A)
   using LO = typename CrsMat::local_ordinal_type;
   using Teuchos::reduceAll;
   using Teuchos::outArg;
-  auto graph = A.getLocalMatrix().graph;
+  auto graph = A.getLocalMatrixHost().graph;
   LO numLocalRows = A.getNodeNumRows();
   int allSorted = 1;
   for(int i = 0; i < numLocalRows; i++)
@@ -1822,37 +1843,40 @@ bool verifySum(const CrsMat& A, const CrsMat& B, const CrsMat& C)
   using GO = typename CrsMat::global_ordinal_type;
   using KAT = Kokkos::Details::ArithTraits<SC>;
   using Teuchos::Array;
+  typedef typename CrsMat::nonconst_global_inds_host_view_type gids_type;
+  typedef typename CrsMat::nonconst_values_host_view_type vals_type;
+
   auto rowMap = A.getRowMap();
   LO numLocalRows = rowMap->getNodeNumElements();
   GO Amax = A.getGlobalMaxNumRowEntries();
   GO Bmax = B.getGlobalMaxNumRowEntries();
   GO Cmax = C.getGlobalMaxNumRowEntries();
-  Array<GO> Ainds(Amax);
-  Array<SC> Avals(Amax);
-  Array<GO> Binds(Bmax);
-  Array<SC> Bvals(Bmax);
-  Array<GO> Cinds(Cmax);
-  Array<SC> Cvals(Cmax);
+  gids_type Ainds("Ainds",Amax);
+  vals_type Avals("Avals",Amax);
+  gids_type Binds("Binds",Bmax);
+  vals_type Bvals("Bvals",Bmax);
+  gids_type Cinds("Cinds",Cmax);
+  vals_type Cvals("Cvals",Cmax);
   for(LO i = 0; i < numLocalRows; i++)
   {
     GO gid = rowMap->getGlobalElement(i);
     size_t Aentries;
     size_t Bentries;
     size_t Centries;
-    A.getGlobalRowCopy(gid, Ainds(), Avals(), Aentries);
-    B.getGlobalRowCopy(gid, Binds(), Bvals(), Bentries);
-    C.getGlobalRowCopy(gid, Cinds(), Cvals(), Centries);
-    Tpetra::sort2(Ainds.begin(), Ainds.begin() + Aentries, Avals.begin());
-    Tpetra::sort2(Binds.begin(), Binds.begin() + Bentries, Bvals.begin());
-    Tpetra::sort2(Cinds.begin(), Cinds.begin() + Centries, Cvals.begin());
+    A.getGlobalRowCopy(gid, Ainds, Avals, Aentries);
+    B.getGlobalRowCopy(gid, Binds, Bvals, Bentries);
+    C.getGlobalRowCopy(gid, Cinds, Cvals, Centries);
+    Tpetra::sort2(Ainds, Aentries, Avals);
+    Tpetra::sort2(Binds, Bentries, Bvals);
+    Tpetra::sort2(Cinds, Centries, Cvals);
     //Now, scan through the row to make sure C's entries match
     size_t Ait = 0;
     size_t Bit = 0;
     for(size_t Cit = 0; Cit < Centries; Cit++)
     {
       GO col = Cinds[Cit];
-      SC val = Cvals[Cit];
-      SC goldVal = 0;
+      typename vals_type::value_type val = Cvals[Cit];
+      typename vals_type::value_type goldVal = 0;
       if(Ait < Aentries && Ainds[Ait] == col)
         goldVal += Avals[Ait++];
       if(Bit < Bentries && Binds[Bit] == col)
@@ -1946,7 +1970,7 @@ RCP<Tpetra::CrsMatrix<SC, LO, GO, NT>> getUnsortedTestMatrix(
   using Teuchos::RCP;
   using Teuchos::ParameterList;
   using crs_matrix_type = Tpetra::CrsMatrix<SC, LO, GO, NT>;
-  using KCRS = typename crs_matrix_type::local_matrix_type;
+  using KCRS = typename crs_matrix_type::local_matrix_device_type;
   using size_type = typename KCRS::row_map_type::non_const_value_type;
   using lno_t =     typename KCRS::index_type::non_const_value_type;
   using kk_scalar_t =  typename KCRS::values_type::non_const_value_type;
diff --git a/packages/tpetra/core/test/PerformanceCGSolve/cg_solve_file.cpp b/packages/tpetra/core/test/PerformanceCGSolve/cg_solve_file.cpp
index 9b42156dd958..f39dcfa0cff5 100644
--- a/packages/tpetra/core/test/PerformanceCGSolve/cg_solve_file.cpp
+++ b/packages/tpetra/core/test/PerformanceCGSolve/cg_solve_file.cpp
@@ -69,6 +69,7 @@ bool cg_solve (Teuchos::RCP<CrsMatrix> A, Teuchos::RCP<Vector> b, Teuchos::RCP<V
   std::string addTimerName = "CG: axpby";
   std::string matvecTimerName = "CG: spmv";
   std::string dotTimerName = "CG: dot";
+  std::string replaceValsName = "CG: replaceLocalValues";
   static_assert (std::is_same<typename CrsMatrix::scalar_type, typename Vector::scalar_type>::value,
                  "The CrsMatrix and Vector template parameters must have the same scalar_type.");
 
@@ -82,11 +83,14 @@ bool cg_solve (Teuchos::RCP<CrsMatrix> A, Teuchos::RCP<Vector> b, Teuchos::RCP<V
   p = Tpetra::createVector<ScalarType>(A->getRangeMap());
   Ap = Tpetra::createVector<ScalarType>(A->getRangeMap());
 
-  int length = r->getLocalLength();
-  for(int i = 0;i<length;i++) {
-    x->replaceLocalValue(i,0);
-    r->replaceLocalValue(i,1);
-    Ap->replaceLocalValue(i,1);
+  {
+    TimeMonitor t(*TimeMonitor::getNewTimer(replaceValsName));
+    int length = r->getLocalLength();
+    for(int i = 0;i<length;i++) {
+      x->replaceLocalValue(i,0);
+      r->replaceLocalValue(i,1);
+      Ap->replaceLocalValue(i,1);
+    }
   }
 
   magnitude_type normr = 0;
diff --git a/packages/tpetra/core/test/RowMatrixTransposer/main.cpp b/packages/tpetra/core/test/RowMatrixTransposer/main.cpp
index 28a74475f23d..f4c5d36bcb96 100644
--- a/packages/tpetra/core/test/RowMatrixTransposer/main.cpp
+++ b/packages/tpetra/core/test/RowMatrixTransposer/main.cpp
@@ -52,12 +52,10 @@ typename CrsMatrix_t::scalar_type getNorm(CrsMatrix_t& matrix){
   typedef typename CrsMatrix_t::scalar_type Scalar;
   Scalar mySum = 0;
 
-  Teuchos::Array<LO> inds(matrix.getNodeMaxNumRowEntries());
-  Teuchos::Array<Scalar> vals(matrix.getNodeMaxNumRowEntries());
   for(int i =0; ((size_t)i)<matrix.getNodeNumRows(); ++i){
     size_t numRowEnts = matrix.getNumEntriesInLocalRow(i);
-    Teuchos::ArrayView<const LO> indsView = inds();
-    Teuchos::ArrayView<const Scalar> valsView = vals();
+    typename CrsMatrix_t::local_inds_host_view_type indsView;
+    typename CrsMatrix_t::values_host_view_type valsView;
     matrix.getLocalRowView(i, indsView, valsView);
     for(size_t j=0; ((size_t)j)<numRowEnts; ++j){
       mySum += valsView[j]*valsView[j];
diff --git a/packages/tpetra/core/test/RowMatrixTransposer/sorted.cpp b/packages/tpetra/core/test/RowMatrixTransposer/sorted.cpp
index 006cf15aa4e8..2ad0a00b2555 100644
--- a/packages/tpetra/core/test/RowMatrixTransposer/sorted.cpp
+++ b/packages/tpetra/core/test/RowMatrixTransposer/sorted.cpp
@@ -228,8 +228,8 @@ testTranspose (bool& success,
       std::vector<LO> lclColIndsBuf;
       std::vector<ST> valsBuf;
       for (LO lclRow = 0; lclRow < lclNumRows_at; ++lclRow) {
-        Teuchos::ArrayView<const LO> lclColInds;
-        Teuchos::ArrayView<const ST> vals;
+        typename crs_matrix_type::local_inds_host_view_type lclColInds;
+        typename crs_matrix_type::values_host_view_type vals;
         AT_unsorted->getLocalRowView (lclRow, lclColInds, vals);
 
         const GO gblNumRows = GO (lclNumRows) * GO (comm->getSize ());
@@ -240,9 +240,9 @@ testTranspose (bool& success,
           // Result's rows may not be sorted.
           lclColIndsBuf.resize (lclColInds.size ());
           valsBuf.resize (vals.size ());
-          std::copy (lclColInds.begin (), lclColInds.end (),
+          std::copy (lclColInds.data(), lclColInds.data()+lclColInds.extent(0),
                      lclColIndsBuf.begin ());
-          std::copy (vals.begin (), vals.end (), valsBuf.begin ());
+          std::copy (vals.data(), vals.data()+vals.extent(0), valsBuf.begin ());
           Tpetra::sort2 (lclColIndsBuf.begin (), lclColIndsBuf.end (),
                          valsBuf.begin ());
 
@@ -298,8 +298,8 @@ testTranspose (bool& success,
     const LO lclNumRows_at (rowMap_at->getNodeNumElements ());
     if (lclNumRows_at != 0) {
       for (LO lclRow = 0; lclRow < lclNumRows_at; ++lclRow) {
-        Teuchos::ArrayView<const LO> lclColInds;
-        Teuchos::ArrayView<const ST> vals;
+        typename crs_matrix_type::local_inds_host_view_type lclColInds;
+        typename crs_matrix_type::values_host_view_type vals;
         AT_sorted->getLocalRowView (lclRow, lclColInds, vals);
 
         const GO gblNumRows = GO (lclNumRows) * GO (comm->getSize ());
diff --git a/packages/tpetra/core/test/Tpetra_TestingUtilities.hpp b/packages/tpetra/core/test/Tpetra_TestingUtilities.hpp
index d6e4f52b9a3a..3c0b2234cc43 100644
--- a/packages/tpetra/core/test/Tpetra_TestingUtilities.hpp
+++ b/packages/tpetra/core/test/Tpetra_TestingUtilities.hpp
@@ -52,6 +52,9 @@
 ///   or on any contents of this file.
 
 #include "Teuchos_UnitTestHarness.hpp"
+#include "Teuchos_ArrayRCP.hpp"
+#include "Kokkos_View.hpp"
+#include "KokkosCompat_View.hpp"
 #include "Tpetra_Core.hpp"
 #include "TpetraCore_ETIHelperMacros.h"
 #include "Teuchos_DefaultSerialComm.hpp"
@@ -116,6 +119,24 @@ namespace Tpetra {
         return serialComm_;
       }
     }
+    
+    // Wrap the first `size` entries of a Kokkos::View in an ArrayRCP; size == -1 selects all of extent(0)
+    template<class ViewType>
+    Teuchos::ArrayRCP<typename ViewType::value_type> arcp_from_view(ViewType &view, int size=-1) {
+      const int numEntries = (size == -1) ? static_cast<int>(view.extent(0)) : size;
+      return Kokkos::Compat::persistingView<ViewType>(view, 0, numEntries);
+    }
+
+    // Deep-copy the first `size` entries of a Teuchos::Array into a new host Kokkos::View (size == -1: whole array)
+    template<class T>
+    Kokkos::View<T*,Kokkos::LayoutLeft,Kokkos::HostSpace> copy_view_from_array(const Teuchos::Array<T> &v_in, int size=-1) {
+      const int numEntries = (size == -1) ? static_cast<int>(v_in.size()) : size;
+      Kokkos::View<const T*,Kokkos::LayoutLeft,Kokkos::HostSpace> v_in_wrap(v_in.data(),numEntries); // view over the array's own storage
+      Kokkos::View<T*,Kokkos::LayoutLeft,Kokkos::HostSpace> v_out("array_copy",numEntries); // freshly allocated destination
+      Kokkos::deep_copy(v_out,v_in_wrap);
+      return v_out;
+    }
+
 
 
   } // namespace TestingUtilities
diff --git a/packages/tpetra/core/test/Utils/CMakeLists.txt b/packages/tpetra/core/test/Utils/CMakeLists.txt
index 9a1881746cde..f193e85288fe 100644
--- a/packages/tpetra/core/test/Utils/CMakeLists.txt
+++ b/packages/tpetra/core/test/Utils/CMakeLists.txt
@@ -1,4 +1,22 @@
 
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  TpetraUtils_WrappedDualView
+  SOURCES
+    TpetraUtils_WrappedDualView.cpp  # explicit extension; do not rely on CMake guessing it (CMP0115)
+  ARGS ${ARGS}
+  COMM serial mpi
+  STANDARD_PASS_OUTPUT
+  )
+
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  TpetraUtils_WrappedDualViewNull
+  SOURCES
+    TpetraUtils_WrappedDualViewNull.cpp  # explicit extension; do not rely on CMake guessing it (CMP0115)
+  ARGS ${ARGS}
+  COMM serial mpi
+  STANDARD_PASS_OUTPUT
+  )
+
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
   TpetraUtils_UnitTests
   SOURCES
diff --git a/packages/tpetra/core/test/Utils/TpetraUtils_WrappedDualView.cpp b/packages/tpetra/core/test/Utils/TpetraUtils_WrappedDualView.cpp
new file mode 100644
index 000000000000..c4955c928310
--- /dev/null
+++ b/packages/tpetra/core/test/Utils/TpetraUtils_WrappedDualView.cpp
@@ -0,0 +1,1362 @@
+/*
+// @HEADER
+// ***********************************************************************
+//
+//          Tpetra: Templated Linear Algebra Services Package
+//                 Copyright (2008) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ************************************************************************
+// @HEADER
+*/
+
+#include "Kokkos_StaticCrsGraph.hpp"
+
+#include <Tpetra_Details_WrappedDualView.hpp>
+#include <Tpetra_Map.hpp>
+#include <Tpetra_Access.hpp>
+#include <Tpetra_Core.hpp>
+
+#include <Kokkos_DualView.hpp>
+
+#include <Teuchos_UnitTestHarness.hpp>
+#include <Teuchos_DefaultComm.hpp>
+
+namespace {
+
+using DeviceType = Tpetra::Map<>::device_type;  // device type of the default-instantiated Tpetra::Map
+
+using DualViewType = Kokkos::DualView<int*, DeviceType>;  // rank-1 DualView of int on DeviceType
+using WrappedDualViewType = Tpetra::Details::WrappedDualView<DualViewType>;  // wrapper type under test (mutable elements)
+
+using HostViewType = typename DualViewType::t_host;  // host-side view type of the DualView
+using DeviceViewType = typename DualViewType::t_dev;  // device-side view type of the DualView
+using ConstDeviceViewType = typename DualViewType::t_dev::const_type;  // device-side view type with const elements
+
+using ConstDualViewType = Kokkos::DualView<const int*, DeviceType>;  // DualView whose elements are const int
+using WrappedConstDualViewType = Tpetra::Details::WrappedDualView<ConstDualViewType>;  // wrapper over the const-element DualView
+
+class WrappedDualViewFixture {  // Test fixture: owns a 16-entry int DualView plus helpers to fill, scale and verify views on host and device.
+public:
+  static constexpr bool deviceMemoryIsHostAccessible = Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename DeviceType::memory_space>::accessible;  // queried via HostSpace, which exists in every build (Kokkos::Serial may be disabled)
+
+  WrappedDualViewFixture()  // zero-fills the host view, then marks host modified and syncs to device
+    : viewSize(16),
+      dualView("dualView", viewSize)
+  {
+    for (int i=0; i<viewSize; i++) {
+      dualView.view_host()(i) = 0;
+    }
+    dualView.modify_host();
+    dualView.sync_device();
+  }
+
+  DualViewType getDualView() {  // returns a copy of the owned DualView handle
+    return dualView;
+  }
+
+  ConstDualViewType getConstDualView() {  // same DualView, converted to the const-element type
+    return dualView;
+  }
+
+  void fillDualViewOnHost() {  // writes 0..viewSize-1 on host and marks host modified (no sync)
+    auto hostView = dualView.view_host();
+    fillViewOnHost(hostView);
+    dualView.modify_host();
+  }
+
+  void fillDualViewOnDevice() {  // writes 0..viewSize-1 on device and marks device modified (no sync)
+    auto deviceView = dualView.view_device();
+    fillViewOnDevice(deviceView);
+    dualView.modify_device();
+  }
+
+  void fillDualViewOnHostDevice() {  // fills on host, then syncs to device
+    fillDualViewOnHost();
+    dualView.sync_device();
+  }
+
+  bool valuesInitializedToZero() {  // true when both sides hold all zeros (multiplier 0 makes every expected value 0)
+    auto hostView = dualView.view_host();
+    auto deviceView = dualView.view_device();
+    return valuesCorrectOnHost(hostView, 0) && valuesCorrectOnDevice(deviceView, 0);
+  }
+
+  template <typename ViewType>
+  void fillViewOnHost(ViewType view) {  // full-length fill: view(i) = i for i in [0, viewSize)
+    fillViewOnHost(view, 0, viewSize);
+  }
+
+  template <typename ViewType>
+  void fillViewOnHost(ViewType view, int startIndex, int length) {  // view(i) = i + startIndex for i in [0, length)
+    for (int i=0; i<length; i++) {
+      int value = i + startIndex;
+      view(i) = value;
+    }
+  }
+
+  template <typename ViewType>
+  void multiplyOnHost(ViewType view, int multiplier) {  // scales every entry of view in place, in a host loop
+    for (unsigned i=0; i<view.size(); i++) {
+      view(i) = multiplier*view(i);
+    }
+  }
+
+  template <typename ViewType>
+  bool valuesCorrectOnHost(ViewType view, int multiplier = 1) {  // full-length check against multiplier*i
+    return valuesCorrectOnHost(view, 0, viewSize, multiplier);
+  }
+
+  template <typename ViewType>
+  bool valuesCorrectOnHost(ViewType view, int startIndex, int length, int multiplier = 1) {  // true iff size == length and view(i) == multiplier*(i+startIndex)
+    bool result = (static_cast<int>(view.size()) == length);
+    for (int i=0; i<length && result; i++) {  // stops at the first mismatch
+      int value = multiplier*(i + startIndex);
+      result &= (view(i) == value);
+    }
+    return result;
+  }
+
+  template <typename ViewType>
+  void fillViewOnDevice(ViewType view) {  // full-length fill via a parallel kernel
+    fillViewOnDevice(view, 0, viewSize);
+  }
+
+  template <typename ViewType>
+  void fillViewOnDevice(ViewType view, int startIndex, int length) {  // view(i) = i + startIndex for i in [0, length), via parallel_for
+    Kokkos::parallel_for("fill on device", length, KOKKOS_LAMBDA(const int& i) {
+          int value = i + startIndex;
+          view(i) = value;
+        });
+  }
+
+  template <typename ViewType>
+  void multiplyOnDevice(ViewType view, int multiplier) {  // scales every entry of view in place, via parallel_for
+    Kokkos::parallel_for("multiply on device", view.size(), KOKKOS_LAMBDA(const int& i) {
+          view(i) = multiplier*view(i);
+        });
+  }
+
+  template <typename ViewType>
+  bool valuesCorrectOnDevice(ViewType view, int multiplier = 1) {  // full-length check against multiplier*i
+    return valuesCorrectOnDevice(view, 0, viewSize, multiplier);
+  }
+
+  template <typename ViewType>
+  bool valuesCorrectOnDevice(ViewType view, int startIndex, int length, int multiplier = 1) {  // true iff size == length and no entry differs from multiplier*(i+startIndex)
+    int result = 0;  // mismatch count; a size mismatch also counts as one
+    if (static_cast<int>(view.size()) != length) {
+      result++;
+    }
+    else {
+      Kokkos::parallel_reduce("check on device", length,
+          KOKKOS_LAMBDA(const int& i, int& localResult) {
+            int value = multiplier*(i + startIndex);
+            localResult += (view(i) == value) ? 0 : 1;  // accumulate: plain '=' would let a later match erase an earlier mismatch
+          }, result);
+    }
+    return (result == 0);
+  }
+
+  template <typename ViewType>
+  bool extentCorrect(ViewType view) {  // compares extent(0) of the argument with that of the owned DualView
+    return (view.extent(0) == dualView.extent(0));
+  }
+
+  int getViewSize() {  // number of entries in the owned DualView (16)
+    return viewSize;
+  }
+
+private:
+  int viewSize;  // declared before dualView, so it is initialized first and can size it
+  DualViewType dualView;
+};
+
+TEUCHOS_UNIT_TEST(WrappedDualView, defaultConstructorAvailable) {  // A const WrappedDualView must be default-constructible; no assertions beyond that.
+  const WrappedDualViewType wrappedView;
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, deviceViewConstructor) {  // A wrapper built from a plain device view must show the device-written values on host.
+  WrappedDualViewFixture fixture;
+  WrappedDualViewType wrappedView;
+
+  {
+    DeviceViewType deviceView("device view", fixture.getViewSize());
+    fixture.fillViewOnDevice(deviceView);
+
+    wrappedView = WrappedDualViewType(deviceView);
+  }  // local deviceView handle goes out of scope before the host access below
+
+  auto hostView = wrappedView.getHostView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostView));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, extent) {  // The wrapper's extent(0) must equal that of the DualView it wraps.
+  WrappedDualViewFixture fixture;
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  TEST_ASSERT(fixture.extentCorrect(wrappedView));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostReadOnly_constData) {  // Read-only host access through a wrapper over const data returns the filled values.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHostDevice();  // both sides already hold the data
+
+  const WrappedConstDualViewType wrappedView(fixture.getConstDualView());
+
+  auto hostView = wrappedView.getHostView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostView));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceReadOnly_constData) {  // Read-only device access through a wrapper over const data returns the filled values.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHostDevice();  // both sides already hold the data
+
+  const WrappedConstDualViewType wrappedView(fixture.getConstDualView());
+
+  auto deviceView = wrappedView.getDeviceView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceView));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostSubviewReadOnly_constData) {  // Read-only host subview [4, 4+6) of const data must hold values 4..9.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHostDevice();
+
+  const WrappedConstDualViewType wrappedView(fixture.getConstDualView());
+
+  int startIndex = 4;
+  int length = 6;
+  auto hostSubview = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubview, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceSubviewReadOnly_constData) {  // Read-only device subview [4, 4+6) of const data must hold values 4..9.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHostDevice();
+
+  const WrappedConstDualViewType wrappedView(fixture.getConstDualView());
+
+  int startIndex = 4;
+  int length = 6;
+  auto deviceSubview = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubview, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostReadOnly) {  // Read-only host view of host-filled data returns the filled values.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHost();
+
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto hostView = wrappedView.getHostView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostView));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostReadWrite) {  // A read-write host view can be scaled in place; the doubled values are then visible through it.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHost();
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto hostView = wrappedView.getHostView(Tpetra::Access::ReadWrite);
+  fixture.multiplyOnHost(hostView, 2);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostView, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostOverwriteAll) {  // An overwrite-all host view can be filled from scratch and read back.
+  WrappedDualViewFixture fixture;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto hostView = wrappedView.getHostView(Tpetra::Access::OverwriteAll);
+  fixture.fillViewOnHost(hostView);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostView));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceReadOnly) {  // Read-only device view of device-filled data returns the filled values.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnDevice();
+
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto deviceView = wrappedView.getDeviceView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceView));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceReadWrite) {  // A read-write device view can be scaled in place; the doubled values are then visible through it.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnDevice();
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto deviceView = wrappedView.getDeviceView(Tpetra::Access::ReadWrite);
+  fixture.multiplyOnDevice(deviceView, 2);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceView, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceOverwriteAll) {  // An overwrite-all device view can be filled from scratch and read back.
+  WrappedDualViewFixture fixture;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto deviceView = wrappedView.getDeviceView(Tpetra::Access::OverwriteAll);
+  fixture.fillViewOnDevice(deviceView);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceView));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, hostThrowsIfDeviceViewAlive_ReadOnly) {  // While a read-only device view is alive, getHostView must throw unless device memory is host accessible.
+  WrappedDualViewFixture fixture;
+
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto deviceView = wrappedView.getDeviceView(Tpetra::Access::ReadOnly);  // kept alive for the rest of the test
+  if (fixture.deviceMemoryIsHostAccessible) {
+    TEST_NOTHROW(wrappedView.getHostView(Tpetra::Access::ReadOnly));
+  }
+  else {
+    TEST_THROW(wrappedView.getHostView(Tpetra::Access::ReadOnly), std::runtime_error);
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, hostThrowsIfDeviceViewAlive_ReadWrite) {  // While a read-write device view is alive, getHostView must throw unless device memory is host accessible.
+  WrappedDualViewFixture fixture;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto deviceView = wrappedView.getDeviceView(Tpetra::Access::ReadWrite);  // kept alive for the rest of the test
+  if (fixture.deviceMemoryIsHostAccessible) {
+    TEST_NOTHROW(wrappedView.getHostView(Tpetra::Access::ReadWrite));
+  }
+  else {
+    TEST_THROW(wrappedView.getHostView(Tpetra::Access::ReadWrite), std::runtime_error);
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, hostThrowsIfDeviceViewAlive_OverwriteAll) {  // While an overwrite-all device view is alive, getHostView must throw unless device memory is host accessible.
+  WrappedDualViewFixture fixture;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto deviceView = wrappedView.getDeviceView(Tpetra::Access::OverwriteAll);  // kept alive for the rest of the test
+  if (fixture.deviceMemoryIsHostAccessible) {
+    TEST_NOTHROW(wrappedView.getHostView(Tpetra::Access::OverwriteAll));
+  }
+  else {
+    TEST_THROW(wrappedView.getHostView(Tpetra::Access::OverwriteAll), std::runtime_error);
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, deviceThrowsIfHostViewAlive_ReadOnly) {  // While a read-only host view is alive, getDeviceView must throw unless device memory is host accessible.
+  WrappedDualViewFixture fixture;
+
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto hostView = wrappedView.getHostView(Tpetra::Access::ReadOnly);  // kept alive for the rest of the test
+  if (fixture.deviceMemoryIsHostAccessible) {
+    TEST_NOTHROW(wrappedView.getDeviceView(Tpetra::Access::ReadOnly));
+  }
+  else {
+    TEST_THROW(wrappedView.getDeviceView(Tpetra::Access::ReadOnly), std::runtime_error);
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, deviceThrowsIfHostViewAlive_ReadWrite) {  // While a read-write host view is alive, getDeviceView must throw unless device memory is host accessible.
+  WrappedDualViewFixture fixture;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto hostView = wrappedView.getHostView(Tpetra::Access::ReadWrite);  // kept alive for the rest of the test
+  if (fixture.deviceMemoryIsHostAccessible) {
+    TEST_NOTHROW(wrappedView.getDeviceView(Tpetra::Access::ReadWrite));
+  }
+  else {
+    TEST_THROW(wrappedView.getDeviceView(Tpetra::Access::ReadWrite), std::runtime_error);
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, deviceThrowsIfHostViewAlive_OverwriteAll) {  // While an overwrite-all host view is alive, getDeviceView must throw unless device memory is host accessible.
+  WrappedDualViewFixture fixture;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto hostView = wrappedView.getHostView(Tpetra::Access::OverwriteAll);  // kept alive for the rest of the test
+  if (fixture.deviceMemoryIsHostAccessible) {
+    TEST_NOTHROW(wrappedView.getDeviceView(Tpetra::Access::OverwriteAll));
+  }
+  else {
+    TEST_THROW(wrappedView.getDeviceView(Tpetra::Access::OverwriteAll), std::runtime_error);
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostReadOnly_syncToHost) {  // Data written only on device must be visible through a read-only host view.
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  fixture.fillDualViewOnDevice();  // device modified, host still zero
+
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto hostView = wrappedView.getHostView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostView));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostReadWrite_syncToHost_modifyOnHost) {  // Device-written data reaches a read-write host view; host edits then reach a later device view.
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  fixture.fillDualViewOnDevice();
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto hostView = wrappedView.getHostView(Tpetra::Access::ReadWrite);
+    TEST_ASSERT(fixture.valuesCorrectOnHost(hostView));
+    fixture.multiplyOnHost(hostView, 2);
+  }  // host view released before the device access below
+
+  auto deviceView = wrappedView.getDeviceView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceView, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostOverwriteAll_clearSyncState_modifyOnHost) {  // A host overwrite-all after a device write must win: the device later sees the host values, not the doubled ones.
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto deviceView = wrappedView.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnDevice(deviceView);
+    fixture.multiplyOnDevice(deviceView, 2);  // device now holds 2*i
+  }
+
+  {
+    auto hostView = wrappedView.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnHost(hostView);  // host now holds i
+  }
+
+  auto deviceView = wrappedView.getDeviceView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceView));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceReadOnly_syncToDevice) {  // Data written only on host must be visible through a read-only device view.
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  fixture.fillDualViewOnHost();  // host modified, device still zero
+
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  auto deviceView = wrappedView.getDeviceView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceView));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceReadWrite_syncToDevice_modifyOnDevice) {  // Host-written data reaches a read-write device view; device edits then reach a later host view.
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  fixture.fillDualViewOnHost();
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto deviceView = wrappedView.getDeviceView(Tpetra::Access::ReadWrite);
+    TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceView));
+    fixture.multiplyOnDevice(deviceView, 2);
+  }  // device view released before the host access below
+
+  auto hostView = wrappedView.getHostView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostView, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceOverwriteAll_clearSyncState_modifyOnDevice) {  // A device overwrite-all after a host write must win: the host later sees the device values, not the doubled ones.
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto hostView = wrappedView.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnHost(hostView);
+    fixture.multiplyOnHost(hostView, 2);  // host now holds 2*i
+  }
+
+  {
+    auto deviceView = wrappedView.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnDevice(deviceView);  // device now holds i
+  }
+
+  auto hostView = wrappedView.getHostView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostView));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostSubviewReadOnly) {  // Read-only host subview [3, 3+4) of host-filled data must hold values 3..6.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHost();
+
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndex = 3;
+  int length = 4;
+  auto hostSubview = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubview, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostSubviewReadWrite) {  // A read-write host subview [2, 2+5) can be doubled in place and read back.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHost();
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndex = 2;
+  int length = 5;
+  auto hostSubview = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::ReadWrite);
+  fixture.multiplyOnHost(hostSubview, 2);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubview, startIndex, length, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostSubviewOverwriteAll) {  // An overwrite-all host subview [5, 5+5) can be filled and read back.
+  WrappedDualViewFixture fixture;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndex = 5;
+  int length = 5;
+  auto hostSubview = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::OverwriteAll);
+  fixture.fillViewOnHost(hostSubview, startIndex, length);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubview, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceSubviewReadOnly) {  // Read-only device subview [3, 3+4) of device-filled data must hold values 3..6.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnDevice();
+
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndex = 3;
+  int length = 4;
+  auto deviceSubview = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubview, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceSubviewReadWrite) {  // A read-write device subview [2, 2+5) can be doubled in place and read back.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnDevice();
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndex = 2;
+  int length = 5;
+  auto deviceSubview = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::ReadWrite);
+  fixture.multiplyOnDevice(deviceSubview, 2);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubview, startIndex, length, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceSubviewOverwriteAll) {  // An overwrite-all device subview [5, 5+5) can be filled and read back.
+  WrappedDualViewFixture fixture;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndex = 5;
+  int length = 5;
+  auto deviceSubview = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::OverwriteAll);
+  fixture.fillViewOnDevice(deviceSubview, startIndex, length);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubview, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostSubviewReadOnly_syncToHost) {  // Data written only on device must be visible through a read-only host subview [4, 4+3).
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  fixture.fillDualViewOnDevice();
+
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndex = 4;
+  int length = 3;
+  auto hostSubview = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubview, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostSubviewReadWrite_syncToHost_modifyOnHost) {  // Device data reaches a read-write host subview; its host edits then reach the matching device subview.
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  fixture.fillDualViewOnDevice();
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndex = 5;
+  int length = 2;
+  {
+    auto hostSubview = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::ReadWrite);
+    TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubview, startIndex, length));
+    fixture.multiplyOnHost(hostSubview, 2);
+  }  // host subview released before the device access below
+
+  auto deviceSubview = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubview, startIndex, length, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostSubviewOverwriteAll_syncToHost_modifyOnHost) {  // Overwriting only a host subview must keep the device-written values outside it, on both host and device.
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto deviceView = wrappedView.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnDevice(deviceView);
+    fixture.multiplyOnDevice(deviceView, 2);  // whole device view holds 2*i
+  }
+
+  int startIndex = 0;
+  int length = 4;
+  {
+    auto hostSubview = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnHost(hostSubview, startIndex, length);  // entries [0,4) rewritten to i on host
+  }
+
+  int startIndexUnchanged = length;  // the untouched remainder starts where the overwritten subview ends
+  int lengthUnchanged = fixture.getViewSize() - length;
+
+  {
+    auto hostSubviewFromDevice = wrappedView.getHostSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewFromDevice, startIndexUnchanged, lengthUnchanged, 2));
+  }
+
+  auto deviceSubviewUnchanged = wrappedView.getDeviceSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewUnchanged, startIndexUnchanged, lengthUnchanged, 2));
+
+  auto deviceSubviewChanged = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewChanged, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceSubviewReadOnly_syncToDevice) {  // Data written only on host must be visible through a read-only device subview [4, 4+3).
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  fixture.fillDualViewOnHost();
+
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndex = 4;
+  int length = 3;
+  auto deviceSubview = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubview, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceSubviewReadWrite_syncToDevice_modifyOnDevice) {  // Host data reaches a read-write device subview; its device edits then reach the matching host subview.
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  fixture.fillDualViewOnHost();
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndex = 5;
+  int length = 2;
+  {
+    auto deviceSubview = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::ReadWrite);
+    TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubview, startIndex, length));
+    fixture.multiplyOnDevice(deviceSubview, 2);
+  }  // device subview released before the host access below
+
+  auto hostSubview = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::ReadWrite);  // NOTE(review): the mirrored host-side test verifies with ReadOnly; ReadWrite here may be unintentional - confirm
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubview, startIndex, length, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceSubviewOverwriteAll_syncToDevice_modifyOnDevice) {  // Overwriting only a device subview must keep the host-written values outside it, on both device and host.
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto hostView = wrappedView.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnHost(hostView);
+    fixture.multiplyOnHost(hostView, 2);  // whole host view holds 2*i
+  }
+
+  int startIndex = 0;
+  int length = 4;
+  {
+    auto deviceSubview = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnDevice(deviceSubview, startIndex, length);  // entries [0,4) rewritten to i on device
+  }
+
+  int startIndexUnchanged = length;  // the untouched remainder starts where the overwritten subview ends
+  int lengthUnchanged = fixture.getViewSize() - length;
+
+  {
+    auto deviceSubviewFromHost = wrappedView.getDeviceSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewFromHost, startIndexUnchanged, lengthUnchanged, 2));
+  }
+
+  auto hostSubviewUnchanged = wrappedView.getHostSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewUnchanged, startIndexUnchanged, lengthUnchanged, 2));
+
+  auto hostSubviewChanged = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewChanged, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, hostSubviewThrowsIfNonOverlappingDeviceViewAlive_ReadOnly) {  // A live device subview [0,4) must block a host subview [8,12) even though the ranges are disjoint, unless device memory is host accessible.
+  WrappedDualViewFixture fixture;
+
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndexA = 0;
+  int lengthA = 4;
+
+  int startIndexB = 8;
+  int lengthB = 4;
+
+  auto deviceView = wrappedView.getDeviceSubview(startIndexA, lengthA, Tpetra::Access::ReadOnly);  // kept alive for the rest of the test
+  if (fixture.deviceMemoryIsHostAccessible) {
+    TEST_NOTHROW(wrappedView.getHostSubview(startIndexB, lengthB, Tpetra::Access::ReadOnly));
+  }
+  else {
+    TEST_THROW(wrappedView.getHostSubview(startIndexB, lengthB, Tpetra::Access::ReadOnly), std::runtime_error);
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, hostSubviewThrowsIfNonOverlappingDeviceViewAlive_ReadWrite) {  // Read-write variant: a live device subview [0,4) must block a disjoint host subview [8,12) unless device memory is host accessible.
+  WrappedDualViewFixture fixture;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndexA = 0;
+  int lengthA = 4;
+
+  int startIndexB = 8;
+  int lengthB = 4;
+
+  auto deviceView = wrappedView.getDeviceSubview(startIndexA, lengthA, Tpetra::Access::ReadWrite);  // kept alive for the rest of the test
+  if (fixture.deviceMemoryIsHostAccessible) {
+    TEST_NOTHROW(wrappedView.getHostSubview(startIndexB, lengthB, Tpetra::Access::ReadWrite));
+  }
+  else {
+    TEST_THROW(wrappedView.getHostSubview(startIndexB, lengthB, Tpetra::Access::ReadWrite), std::runtime_error);
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, hostSubviewThrowsIfNonOverlappingDeviceViewAlive_OverwriteAll) {  // Overwrite-all variant: a live device subview [0,4) must block a disjoint host subview [8,12) unless device memory is host accessible.
+  WrappedDualViewFixture fixture;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndexA = 0;
+  int lengthA = 4;
+
+  int startIndexB = 8;
+  int lengthB = 4;
+
+  auto deviceView = wrappedView.getDeviceSubview(startIndexA, lengthA, Tpetra::Access::OverwriteAll);  // kept alive for the rest of the test
+  if (fixture.deviceMemoryIsHostAccessible) {
+    TEST_NOTHROW(wrappedView.getHostSubview(startIndexB, lengthB, Tpetra::Access::OverwriteAll));
+  }
+  else {
+    TEST_THROW(wrappedView.getHostSubview(startIndexB, lengthB, Tpetra::Access::OverwriteAll), std::runtime_error);
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, deviceSubviewThrowsIfNonOverlappingHostViewAlive_ReadOnly) {  // A live host subview [0,4) must block a device subview [8,12) even though the ranges are disjoint, unless device memory is host accessible.
+  WrappedDualViewFixture fixture;
+
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndexA = 0;
+  int lengthA = 4;
+
+  int startIndexB = 8;
+  int lengthB = 4;
+
+  auto hostView = wrappedView.getHostSubview(startIndexA, lengthA, Tpetra::Access::ReadOnly);  // kept alive for the rest of the test
+  if (fixture.deviceMemoryIsHostAccessible) {
+    TEST_NOTHROW(wrappedView.getDeviceSubview(startIndexB, lengthB, Tpetra::Access::ReadOnly));
+  }
+  else {
+    TEST_THROW(wrappedView.getDeviceSubview(startIndexB, lengthB, Tpetra::Access::ReadOnly), std::runtime_error);
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, deviceSubviewThrowsIfNonOverlappingHostViewAlive_ReadWrite) {  // Read-write variant: a live host subview [0,4) must block a disjoint device subview [8,12) unless device memory is host accessible.
+  WrappedDualViewFixture fixture;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndexA = 0;
+  int lengthA = 4;
+
+  int startIndexB = 8;
+  int lengthB = 4;
+
+  auto hostView = wrappedView.getHostSubview(startIndexA, lengthA, Tpetra::Access::ReadWrite);  // kept alive for the rest of the test
+  if (fixture.deviceMemoryIsHostAccessible) {
+    TEST_NOTHROW(wrappedView.getDeviceSubview(startIndexB, lengthB, Tpetra::Access::ReadWrite));
+  }
+  else {
+    TEST_THROW(wrappedView.getDeviceSubview(startIndexB, lengthB, Tpetra::Access::ReadWrite), std::runtime_error);
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, deviceSubviewThrowsIfNonOverlappingHostViewAlive_OverwriteAll) {  // Overwrite-all variant: a live host subview [0,4) must block a disjoint device subview [8,12) unless device memory is host accessible.
+  WrappedDualViewFixture fixture;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndexA = 0;
+  int lengthA = 4;
+
+  int startIndexB = 8;
+  int lengthB = 4;
+
+  auto hostView = wrappedView.getHostSubview(startIndexA, lengthA, Tpetra::Access::OverwriteAll);  // kept alive for the rest of the test
+  if (fixture.deviceMemoryIsHostAccessible) {
+    TEST_NOTHROW(wrappedView.getDeviceSubview(startIndexB, lengthB, Tpetra::Access::OverwriteAll));
+  }
+  else {
+    TEST_THROW(wrappedView.getDeviceSubview(startIndexB, lengthB, Tpetra::Access::OverwriteAll), std::runtime_error);
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, aliasedSubviewConstructor) {  // A wrapper built as subrange [4, 4+6) of another wrapper has extent 6 and holds values 4..9 on host and device.
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHostDevice();
+  const WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startIndex = 4;
+  int length = 6;
+  const WrappedDualViewType wrappedSubview(wrappedView, startIndex, length);
+
+  TEST_EQUALITY(wrappedSubview.extent(0), static_cast<size_t>(length));
+  {  // each access is scoped so the host view is gone before the device view is requested
+    auto hostSubview = wrappedSubview.getHostView(Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubview, startIndex, length));
+  }
+  {
+    auto deviceSubview = wrappedSubview.getDeviceView(Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubview, startIndex, length));
+  }
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostViewWrappedSubviewOverwriteAll_syncToHost_modifyOnHost) {  // Host overwrite-all through a sub-wrapper must be seen via both the sub-wrapper and the parent, while entries outside it keep the device-written values.
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto deviceView = wrappedView.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnDevice(deviceView);
+    fixture.multiplyOnDevice(deviceView, 2);  // whole device view holds 2*i
+  }
+
+  int startIndex = 0;
+  int length = 4;
+  WrappedDualViewType wrappedSubview(wrappedView, startIndex, length);
+  {
+    auto hostSubview = wrappedSubview.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnHost(hostSubview, startIndex, length);  // entries [0,4) rewritten to i on host
+  }
+
+  int startIndexUnchanged = length;  // the untouched remainder starts where the sub-wrapper ends
+  int lengthUnchanged = fixture.getViewSize() - length;
+
+  {
+    auto hostSubviewFromDevice = wrappedView.getHostSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewFromDevice, startIndexUnchanged, lengthUnchanged, 2));
+  }
+
+  auto deviceSubviewUnchanged = wrappedView.getDeviceSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewUnchanged, startIndexUnchanged, lengthUnchanged, 2));
+
+  auto deviceSubviewChanged = wrappedSubview.getDeviceView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewChanged, startIndex, length));
+
+  auto deviceSubviewChangedOriginal = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewChangedOriginal, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceViewWrappedSubviewOverwriteAll_syncToDevice_modifyOnDevice) {  // Device overwrite-all through a sub-wrapper must be seen via both the sub-wrapper and the parent, while entries outside it keep the host-written values.
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto hostView = wrappedView.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnHost(hostView);
+    fixture.multiplyOnHost(hostView, 2);  // whole host view holds 2*i
+  }
+
+  int startIndex = 0;
+  int length = 4;
+  WrappedDualViewType wrappedSubview(wrappedView, startIndex, length);
+  {
+    auto deviceSubview = wrappedSubview.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnDevice(deviceSubview, startIndex, length);  // entries [0,4) rewritten to i on device
+  }
+
+  int startIndexUnchanged = length;  // the untouched remainder starts where the sub-wrapper ends
+  int lengthUnchanged = fixture.getViewSize() - length;
+
+  {
+    auto deviceSubviewFromHost = wrappedView.getDeviceSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewFromHost, startIndexUnchanged, lengthUnchanged, 2));
+  }
+
+  auto hostSubviewUnchanged = wrappedView.getHostSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewUnchanged, startIndexUnchanged, lengthUnchanged, 2));
+
+  auto hostSubviewChanged = wrappedSubview.getHostView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewChanged, startIndex, length));
+
+  auto hostSubviewChangedOriginal = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewChangedOriginal, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostViewIntermediateWrappedSubviewOverwriteAll_syncToHost_modifyOnHost) {
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  // Wrap the fixture's DualView; the scoped block below fills it on device and multiplies by 2.
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto deviceView = wrappedView.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnDevice(deviceView);
+    fixture.multiplyOnDevice(deviceView, 2);
+  }
+  // Nested wrappers: a subview of wrappedView, then a subview of that one, refilled through the host side.
+  WrappedDualViewType intermediateWrappedSubview(wrappedView, 0, 8);
+
+  int startIndex = 0;
+  int length = 4;
+  WrappedDualViewType wrappedSubview(intermediateWrappedSubview, startIndex, length);
+  {
+    auto hostSubview = wrappedSubview.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnHost(hostSubview, startIndex, length);
+  }
+  // Range after the innermost subview, which the host refill above did not touch.
+  int startIndexUnchanged = length;
+  int lengthUnchanged = fixture.getViewSize() - length;
+  // Checks on that range pass 2, presumably the factor expected from multiplyOnDevice.
+  {
+    auto hostSubviewFromDevice = wrappedView.getHostSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewFromDevice, startIndexUnchanged, lengthUnchanged, 2));
+  }
+
+  auto deviceSubviewUnchanged = wrappedView.getDeviceSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewUnchanged, startIndexUnchanged, lengthUnchanged, 2));
+  // The refilled range is checked with no factor, via the innermost subview and via the outermost wrapper.
+  auto deviceSubviewChanged = wrappedSubview.getDeviceView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewChanged, startIndex, length));
+
+  auto deviceSubviewChangedOriginal = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewChangedOriginal, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceViewIntermediateWrappedSubviewOverwriteAll_syncToDevice_modifyOnDevice) {
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  // Wrap the fixture's DualView; the scoped block below fills it on host and multiplies by 2.
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto hostView = wrappedView.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnHost(hostView);
+    fixture.multiplyOnHost(hostView, 2);
+  }
+  // Nested wrappers: a subview of wrappedView, then a subview of that one, refilled through the device side.
+  WrappedDualViewType intermediateWrappedSubview(wrappedView, 0, 8);
+
+  int startIndex = 0;
+  int length = 4;
+  WrappedDualViewType wrappedSubview(intermediateWrappedSubview, startIndex, length);
+  {
+    auto deviceSubview = wrappedSubview.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnDevice(deviceSubview, startIndex, length);
+  }
+  // Range after the innermost subview, which the device refill above did not touch.
+  int startIndexUnchanged = length;
+  int lengthUnchanged = fixture.getViewSize() - length;
+  // Checks on that range pass 2, presumably the factor expected from multiplyOnHost.
+  {
+    auto deviceSubviewFromHost = wrappedView.getDeviceSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewFromHost, startIndexUnchanged, lengthUnchanged, 2));
+  }
+
+  auto hostSubviewUnchanged = wrappedView.getHostSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewUnchanged, startIndexUnchanged, lengthUnchanged, 2));
+  // The refilled range is checked with no factor, via the innermost subview and via the outermost wrapper.
+  auto hostSubviewChanged = wrappedSubview.getHostView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewChanged, startIndex, length));
+
+  auto hostSubviewChangedOriginal = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewChangedOriginal, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostSubviewIntermediateWrappedSubviewOverwriteAll_syncToHost_modifyOnHost) {
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  // Wrap the fixture's DualView; the scoped block below fills it on device and multiplies by 2.
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto deviceView = wrappedView.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnDevice(deviceView);
+    fixture.multiplyOnDevice(deviceView, 2);
+  }
+  // Wrap a subview of wrappedView, then refill part of it through getHostSubview with OverwriteAll.
+  WrappedDualViewType intermediateWrappedSubview(wrappedView, 0, 8);
+  int startIndex = 0;
+  int length = 4;
+  {
+    auto hostSubview = intermediateWrappedSubview.getHostSubview(startIndex, length, Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnHost(hostSubview, startIndex, length);
+  }
+  // Range after the refilled entries, which the host refill above did not touch.
+  int startIndexUnchanged = length;
+  int lengthUnchanged = fixture.getViewSize() - length;
+  // Checks on that range pass 2, presumably the factor expected from multiplyOnDevice.
+  {
+    auto hostSubviewFromDevice = wrappedView.getHostSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewFromDevice, startIndexUnchanged, lengthUnchanged, 2));
+  }
+
+  auto deviceSubviewUnchanged = wrappedView.getDeviceSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewUnchanged, startIndexUnchanged, lengthUnchanged, 2));
+  // The refilled range is checked with no factor, via the intermediate wrapper and via the parent wrapper.
+  auto deviceSubviewChanged = intermediateWrappedSubview.getDeviceSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewChanged, startIndex, length));
+
+  auto deviceSubviewChangedOriginal = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewChangedOriginal, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceSubviewIntermediateWrappedSubviewOverwriteAll_syncToDevice_modifyOnDevice) {
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  // Wrap the fixture's DualView; the scoped block below fills it on host and multiplies by 2.
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto hostView = wrappedView.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnHost(hostView);
+    fixture.multiplyOnHost(hostView, 2);
+  }
+  // Wrap a subview of wrappedView, then refill part of it through getDeviceSubview with OverwriteAll.
+  WrappedDualViewType intermediateWrappedSubview(wrappedView, 0, 8);
+  int startIndex = 0;
+  int length = 4;
+  {
+    auto deviceSubview = intermediateWrappedSubview.getDeviceSubview(startIndex, length, Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnDevice(deviceSubview, startIndex, length);
+  }
+  // Range after the refilled entries, which the device refill above did not touch.
+  int startIndexUnchanged = length;
+  int lengthUnchanged = fixture.getViewSize() - length;
+  // Checks on that range pass 2, presumably the factor expected from multiplyOnHost.
+  {
+    auto deviceSubviewFromHost = wrappedView.getDeviceSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewFromHost, startIndexUnchanged, lengthUnchanged, 2));
+  }
+
+  auto hostSubviewUnchanged = wrappedView.getHostSubview(startIndexUnchanged, lengthUnchanged, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewUnchanged, startIndexUnchanged, lengthUnchanged, 2));
+  // The refilled range is checked with no factor, via the intermediate wrapper and via the parent wrapper.
+  auto hostSubviewChanged = intermediateWrappedSubview.getHostSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewChanged, startIndex, length));
+
+  auto hostSubviewChangedOriginal = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewChangedOriginal, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostViewWrappedSubviewInMiddleOverwriteAll_syncToHost_modifyOnHost) {
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  // Named like the other accessHostView tests: fill on device first, then overwrite a subview on host.
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto deviceView = wrappedView.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnDevice(deviceView);
+    fixture.multiplyOnDevice(deviceView, 2);
+  }
+  // Wrap a subview in the middle of the view and refill it through the host side.
+  int startIndex = 3;
+  int length = 3;
+  WrappedDualViewType wrappedSubview(wrappedView, startIndex, length);
+  {
+    auto hostSubview = wrappedSubview.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnHost(hostSubview, startIndex, length);
+  }
+  // Ranges before and after the subview, which the host refill above did not touch.
+  int startIndexFront = 0;
+  int lengthFront = startIndex;
+
+  int startIndexBack = startIndex + length;
+  int lengthBack = fixture.getViewSize() - startIndexBack;
+
+  {
+    auto hostSubviewFromDeviceFront = wrappedView.getHostSubview(startIndexFront, lengthFront, Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewFromDeviceFront, startIndexFront, lengthFront, 2));
+
+    auto hostSubviewFromDeviceBack = wrappedView.getHostSubview(startIndexBack, lengthBack, Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewFromDeviceBack, startIndexBack, lengthBack, 2));
+  }
+
+  auto deviceSubviewFront = wrappedView.getDeviceSubview(startIndexFront, lengthFront, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewFront, startIndexFront, lengthFront, 2));
+
+  auto deviceSubviewBack = wrappedView.getDeviceSubview(startIndexBack, lengthBack, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewBack, startIndexBack, lengthBack, 2));
+  // The refilled middle range is checked with no factor, via the subview and via the parent wrapper.
+  auto deviceSubviewChanged = wrappedSubview.getDeviceView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewChanged, startIndex, length));
+
+  auto deviceSubviewChangedOriginal = wrappedView.getDeviceSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewChangedOriginal, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceViewWrappedSubviewInMiddleOverwriteAll_syncToDevice_modifyOnDevice) {
+  WrappedDualViewFixture fixture;
+  TEST_ASSERT(fixture.valuesInitializedToZero());
+  // Named like the other accessDeviceView tests: fill on host first, then overwrite a subview on device.
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  {
+    auto hostView = wrappedView.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnHost(hostView);
+    fixture.multiplyOnHost(hostView, 2);
+  }
+  // Wrap a subview in the middle of the view and refill it through the device side.
+  int startIndex = 3;
+  int length = 3;
+  WrappedDualViewType wrappedSubview(wrappedView, startIndex, length);
+  {
+    auto deviceSubview = wrappedSubview.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.fillViewOnDevice(deviceSubview, startIndex, length);
+  }
+  // Ranges before and after the subview, which the device refill above did not touch.
+  int startIndexFront = 0;
+  int lengthFront = startIndex;
+
+  int startIndexBack = startIndex + length;
+  int lengthBack = fixture.getViewSize() - startIndexBack;
+
+  {
+    auto deviceSubviewFromHostFront = wrappedView.getDeviceSubview(startIndexFront, lengthFront, Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewFromHostFront, startIndexFront, lengthFront, 2));
+
+    auto deviceSubviewFromHostBack = wrappedView.getDeviceSubview(startIndexBack, lengthBack, Tpetra::Access::ReadOnly);
+    TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewFromHostBack, startIndexBack, lengthBack, 2));
+  }
+
+  auto hostSubviewFront = wrappedView.getHostSubview(startIndexFront, lengthFront, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewFront, startIndexFront, lengthFront, 2));
+
+  auto hostSubviewBack = wrappedView.getHostSubview(startIndexBack, lengthBack, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewBack, startIndexBack, lengthBack, 2));
+  // The refilled middle range is checked with no factor, via the subview and via the parent wrapper.
+  auto hostSubviewChanged = wrappedSubview.getHostView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewChanged, startIndex, length));
+
+  auto hostSubviewChangedOriginal = wrappedView.getHostSubview(startIndex, length, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewChangedOriginal, startIndex, length));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostTwoSubviews_ReadOnly) {
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHostDevice();
+  // After the fixture fill, the whole view is multiplied by 2 on device under OverwriteAll access.
+  WrappedDualViewType wrappedView(fixture.getDualView());
+  {
+    auto deviceView = wrappedView.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.multiplyOnDevice(deviceView, 2);
+  }
+  // Split the view into two halves by entry count.
+  int startFirstHalf = 0;
+  int lengthHalf = fixture.getViewSize()/2;
+  int startSecondHalf = lengthHalf;
+
+  const WrappedDualViewType wrappedSubview(wrappedView, startFirstHalf, lengthHalf);
+  auto hostSubviewFirstHalf = wrappedSubview.getHostView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewFirstHalf, startFirstHalf, lengthHalf, 2));
+  // Second half is read from the parent wrapper while the first-half host view is still alive.
+  auto hostSubviewSecondHalf = wrappedView.getHostSubview(startSecondHalf, lengthHalf, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewSecondHalf, startSecondHalf, lengthHalf, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceTwoSubviews_ReadOnly) {
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHostDevice();
+  // After the fixture fill, the whole view is multiplied by 2 on host under OverwriteAll access.
+  WrappedDualViewType wrappedView(fixture.getDualView());
+  {
+    auto hostView = wrappedView.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.multiplyOnHost(hostView, 2);
+  }
+  // Split the view into two halves by entry count.
+  int startFirstHalf = 0;
+  int lengthHalf = fixture.getViewSize()/2;
+  int startSecondHalf = lengthHalf;
+
+  const WrappedDualViewType wrappedSubview(wrappedView, startFirstHalf, lengthHalf);
+  auto deviceSubviewFirstHalf = wrappedSubview.getDeviceView(Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewFirstHalf, startFirstHalf, lengthHalf, 2));
+  // Second half is read from the parent wrapper while the first-half device view is still alive.
+  auto deviceSubviewSecondHalf = wrappedView.getDeviceSubview(startSecondHalf, lengthHalf, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewSecondHalf, startSecondHalf, lengthHalf, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostSubviewOfSubviewAndSubview_ReadOnly) {
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHostDevice();
+  // Half and quarter sizes are derived from the fixture's view size.
+  int startFirstHalf = 0;
+  int lengthHalf = fixture.getViewSize()/2;
+  int startSecondHalf = lengthHalf;
+  int lengthQuarter = lengthHalf/2;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+  WrappedDualViewType wrappedSubview(wrappedView, startFirstHalf, lengthHalf);
+  {
+    auto deviceView = wrappedView.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.multiplyOnDevice(deviceView, 2);
+  }
+  // Despite its name, this view spans lengthQuarter entries of the wrapped first-half subview.
+  auto hostSubviewFirstHalf = wrappedSubview.getHostSubview(startFirstHalf, lengthQuarter, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewFirstHalf, startFirstHalf, lengthQuarter, 2));
+  // The second half is read through the parent wrapper.
+  auto hostSubviewSecondHalf = wrappedView.getHostSubview(startSecondHalf, lengthHalf, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewSecondHalf, startSecondHalf, lengthHalf, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceSubviewOfSubviewAndSubview_ReadOnly) {
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHostDevice();
+  // Half and quarter sizes are derived from the fixture's view size.
+  int startFirstHalf = 0;
+  int lengthHalf = fixture.getViewSize()/2;
+  int startSecondHalf = lengthHalf;
+  int lengthQuarter = lengthHalf/2;
+
+  WrappedDualViewType wrappedView(fixture.getDualView());
+  WrappedDualViewType wrappedSubview(wrappedView, startFirstHalf, lengthHalf);
+  {
+    auto hostView = wrappedView.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.multiplyOnHost(hostView, 2);
+  }
+  // Despite its name, this view spans lengthQuarter entries of the wrapped first-half subview.
+  auto deviceSubviewFirstHalf = wrappedSubview.getDeviceSubview(startFirstHalf, lengthQuarter, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewFirstHalf, startFirstHalf, lengthQuarter, 2));
+  // The second half is read through the parent wrapper.
+  auto deviceSubviewSecondHalf = wrappedView.getDeviceSubview(startSecondHalf, lengthHalf, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewSecondHalf, startSecondHalf, lengthHalf, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessHostTwoSubviewsOfSubview_ReadOnly) {
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHostDevice();
+  // Wrap the fixture's DualView, then a subview of it that is multiplied by 2 on device.
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startSubview = 4;
+  int lengthSubview = 8;
+  WrappedDualViewType wrappedSubview(wrappedView, startSubview, lengthSubview);
+  {
+    auto deviceView = wrappedSubview.getDeviceView(Tpetra::Access::OverwriteAll);
+    fixture.multiplyOnDevice(deviceView, 2);
+    TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceView, startSubview, lengthSubview, 2));
+  }
+  // The halves below are indexed relative to the wrapped subview.
+  int startFirstHalf = 0;
+  int lengthHalf = lengthSubview/2;
+  int startSecondHalf = lengthHalf;
+  // The checks add startSubview to the subview-relative start when addressing expected values.
+  auto hostSubviewFirstHalf = wrappedSubview.getHostSubview(startFirstHalf, lengthHalf, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewFirstHalf, startFirstHalf+startSubview, lengthHalf, 2));
+
+  auto hostSubviewSecondHalf = wrappedSubview.getHostSubview(startSecondHalf, lengthHalf, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnHost(hostSubviewSecondHalf, startSecondHalf+startSubview, lengthHalf, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, accessDeviceTwoSubviewsOfSubview_ReadOnly) {
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHostDevice();
+  // Wrap the fixture's DualView, then a subview of it that is multiplied by 2 on host.
+  WrappedDualViewType wrappedView(fixture.getDualView());
+
+  int startSubview = 4;
+  int lengthSubview = 8;
+  WrappedDualViewType wrappedSubview(wrappedView, startSubview, lengthSubview);
+  {
+    auto hostView = wrappedSubview.getHostView(Tpetra::Access::OverwriteAll);
+    fixture.multiplyOnHost(hostView, 2);
+    TEST_ASSERT(fixture.valuesCorrectOnHost(hostView, startSubview, lengthSubview, 2));
+  }
+  // The halves below are indexed relative to the wrapped subview.
+  int startFirstHalf = 0;
+  int lengthHalf = lengthSubview/2;
+  int startSecondHalf = lengthHalf;
+  // The checks add startSubview to the subview-relative start when addressing expected values.
+  auto deviceSubviewFirstHalf = wrappedSubview.getDeviceSubview(startFirstHalf, lengthHalf, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewFirstHalf, startFirstHalf+startSubview, lengthHalf, 2));
+
+  auto deviceSubviewSecondHalf = wrappedSubview.getDeviceSubview(startSecondHalf, lengthHalf, Tpetra::Access::ReadOnly);
+  TEST_ASSERT(fixture.valuesCorrectOnDevice(deviceSubviewSecondHalf, startSecondHalf+startSubview, lengthHalf, 2));
+}
+
+TEUCHOS_UNIT_TEST(WrappedDualView, attemptConstructUnmanaged) {
+  WrappedDualViewFixture fixture;
+  fixture.fillDualViewOnHostDevice();
+  WrappedDualViewType ownerWrapper(fixture.getDualView());
+  auto managedView = ownerWrapper.getDeviceView(Tpetra::Access::ReadWrite);
+  static_assert(decltype(managedView)::rank == 1,
+      "This test requires WrappedDualViewType to be rank 1. If this breaks, use a custom type here.");
+
+  // A view built from a raw pointer and an extent is not reference counted
+  // (use_count() == 0), even though its type carries no Unmanaged memory trait.
+  typename WrappedDualViewType::DeviceViewType rawPointerView(managedView.data(), managedView.extent(0));
+  // WrappedDualView has to take ownership of the device memory it is handed;
+  // this view owns nothing, so the construction below is expected to throw.
+  try
+  {
+    WrappedDualViewType mustNotConstruct(rawPointerView);
+    TEST_ASSERT(false);
+  }
+  catch(std::exception&)
+  {}
+}
+
+}
+
+int main(int argc, char* argv[]) {
+  Tpetra::ScopeGuard tpetraScope(&argc, &argv);  // stays alive until main returns
+  const int testStatus = Teuchos::UnitTestRepository::runUnitTestsFromMain(argc, argv);
+  return testStatus;
+}
diff --git a/packages/tpetra/core/test/Utils/TpetraUtils_WrappedDualViewNull.cpp b/packages/tpetra/core/test/Utils/TpetraUtils_WrappedDualViewNull.cpp
new file mode 100644
index 000000000000..8e4fba59cf67
--- /dev/null
+++ b/packages/tpetra/core/test/Utils/TpetraUtils_WrappedDualViewNull.cpp
@@ -0,0 +1,151 @@
+/*
+// @HEADER
+// ***********************************************************************
+//
+//          Tpetra: Templated Linear Algebra Services Package
+//                 Copyright (2008) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ************************************************************************
+// @HEADER
+*/
+
+#include "Tpetra_TestingUtilities.hpp"
+#include "Tpetra_Details_WrappedDualView.hpp"
+#include "Tpetra_Access.hpp"
+#include "Tpetra_Core.hpp"
+#include "Kokkos_DualView.hpp"
+#include "Teuchos_UnitTestHarness.hpp"
+
+namespace {
+
+TEUCHOS_UNIT_TEST(WrappedDualView, InitFromNull) {
+
+  // Checks that the host and device views handed out by a WrappedDualView
+  // report the same use_count, whatever view the wrapper was built from.
+  // A DualView made from a null Kokkos device view and its host mirror
+  // can report use_count = 2 on host and 0 on device.
+
+  std::cout << "\n" << std::endl;
+
+  using DeviceType = Kokkos::Device<Kokkos::DefaultExecutionSpace,
+                                    Kokkos::DefaultExecutionSpace::memory_space>;
+
+  using DeviceViewType = Kokkos::View<double*, DeviceType>;
+  using HostMirrorType = typename DeviceViewType::HostMirror;
+
+  using DualViewType = Kokkos::DualView<double*, DeviceType>;
+  using WrappedType = Tpetra::Details::WrappedDualView<DualViewType>;
+
+  {
+    // Motivation for the special handling of a NULL input device view
+    // in the WrappedDualView constructor: a plain DualView built from one.
+
+    DeviceViewType nullDevice;
+    HostMirrorType nullHost =
+       create_mirror_view_and_copy(typename DeviceViewType::host_mirror_space(),
+                                   nullDevice);
+
+    DualViewType nullDual(nullDevice, nullHost);
+    const size_t hostCount = nullDual.h_view.use_count();
+    const size_t deviceCount = nullDual.d_view.use_count();
+
+    std::cout << "Null DualView:     "
+              << "host.use_count = " << hostCount << ";  "
+              << "device.use_count = " << deviceCount
+              << std::endl;
+    // In UVM or serial builds both counts are 0.
+    // In non-UVM CUDA builds the host count is 2 while the device count is 0.
+    // WrappedDualView cannot work with that mismatch.
+    // Its constructor therefore has to test for a null device view
+    // before it creates the HostMirror.
+  }
+
+  {
+    // Expected to work: non-null device view with four entries
+    WrappedType wrapper;
+    {
+      DeviceViewType source("viewFour", 4);
+      wrapper = WrappedType(source);
+    }
+    size_t use_h = wrapper.getHostView(Tpetra::Access::ReadOnly).use_count();
+    size_t use_d = wrapper.getDeviceView(Tpetra::Access::ReadOnly).use_count();
+    std::cout << "Wrapped "
+              << wrapper.getDeviceView(Tpetra::Access::ReadOnly).label()
+              << ":  host use_count = " << use_h
+              << ";  device use_count = " << use_d << std::endl;
+    TEST_EQUALITY(use_h, use_d);
+  }
+
+  {
+    // Expected to work: device view is non-null although it has zero entries
+    WrappedType wrapper;
+    {
+      DeviceViewType source("viewZero", 0);
+      wrapper = WrappedType(source);
+    }
+    size_t use_h = wrapper.getHostView(Tpetra::Access::ReadOnly).use_count();
+    size_t use_d = wrapper.getDeviceView(Tpetra::Access::ReadOnly).use_count();
+    std::cout << "Wrapped "
+              << wrapper.getDeviceView(Tpetra::Access::ReadOnly).label()
+              << ":  host use_count = " << use_h
+              << ";  device use_count = " << use_d << std::endl;
+    TEST_EQUALITY(use_h, use_d);
+  }
+
+  {
+    // Problem case: a null device view does not combine with a HostMirror
+    //               in the DualView built inside the WrappedDualView constructor
+    WrappedType wrapper;
+    {
+      DeviceViewType source;
+      wrapper = WrappedType(source);
+    }
+    size_t use_h = wrapper.getHostView(Tpetra::Access::ReadOnly).use_count();
+    size_t use_d = wrapper.getDeviceView(Tpetra::Access::ReadOnly).use_count();
+    std::cout << "Wrapped nullview"
+              << wrapper.getDeviceView(Tpetra::Access::ReadOnly).label()
+              << ":  host use_count = " << use_h
+              << ";  device use_count = " << use_d << std::endl;
+    TEST_EQUALITY(use_h, use_d);
+  }
+}
+
+} // namespace
+
+int main(int argc, char* argv[]) {
+  Tpetra::ScopeGuard tpetraGuard(&argc, &argv);  // stays alive until main returns
+  const int exitStatus = Teuchos::UnitTestRepository::runUnitTestsFromMain(argc, argv);
+  return exitStatus;
+}
diff --git a/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsGraph_FileTest.cpp b/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsGraph_FileTest.cpp
index 39e4e5b71d96..0285f7757672 100644
--- a/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsGraph_FileTest.cpp
+++ b/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsGraph_FileTest.cpp
@@ -134,6 +134,7 @@ compareCrsGraph (const CrsGraphType& A_orig, const CrsGraphType& A, Teuchos::Fan
 {
   typedef typename CrsGraphType::global_ordinal_type GO;
   typedef typename ArrayView<const GO>::size_type size_type;
+  typedef typename CrsGraphType::nonconst_global_inds_host_view_type gids_type;
 
   OSTab tab (out);
   int localEqual = 1;
@@ -141,7 +142,7 @@ compareCrsGraph (const CrsGraphType& A_orig, const CrsGraphType& A, Teuchos::Fan
   //
   // Are my local graphs equal?
   //
-  Array<GO> indOrig, ind;
+  gids_type indOrig, ind;
   size_t numEntriesOrig = 0;
   size_t numEntries = 0;
 
@@ -156,15 +157,15 @@ compareCrsGraph (const CrsGraphType& A_orig, const CrsGraphType& A, Teuchos::Fan
       localEqual = 0;
       break;
     }
-    indOrig.resize (numEntriesOrig);
-    A_orig.getGlobalRowCopy (globalRow, indOrig (), numEntriesOrig);
-    ind.resize (numEntries);
-    A.getGlobalRowCopy (globalRow, ind (), numEntries);
+    Kokkos::resize(indOrig,numEntriesOrig);
+    A_orig.getGlobalRowCopy (globalRow, indOrig, numEntriesOrig);
+    Kokkos::resize(ind,numEntries);
+    A.getGlobalRowCopy (globalRow, ind, numEntries);
 
     // Global row entries are not necessarily sorted.  Sort them so
     // we can compare them.
-    std::sort(indOrig.begin (), indOrig.end ());
-    std::sort(ind.begin (), ind.end ());
+    Tpetra::sort(indOrig, indOrig.extent(0));
+    Tpetra::sort(ind, ind.extent(0));
 
     for (size_t k = 0; k < numEntries; ++k) {
       // Values should be _exactly_ equal.
diff --git a/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsGraph_InOutTest.cpp b/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsGraph_InOutTest.cpp
index 8831295d2410..58d60709979d 100644
--- a/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsGraph_InOutTest.cpp
+++ b/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsGraph_InOutTest.cpp
@@ -260,6 +260,7 @@ compareCrsGraph (const CrsGraphType& A_orig, const CrsGraphType& A, Teuchos::Fan
   using Teuchos::REDUCE_MIN;
   typedef typename CrsGraphType::global_ordinal_type GO;
   typedef typename ArrayView<const GO>::size_type size_type;
+  typedef typename CrsGraphType::nonconst_global_inds_host_view_type gids_type;
 
   Teuchos::OSTab tab (Teuchos::rcpFromRef (out));
   int localEqual = 1;
@@ -267,7 +268,7 @@ compareCrsGraph (const CrsGraphType& A_orig, const CrsGraphType& A, Teuchos::Fan
   //
   // Are my local matrices equal?
   //
-  Array<GO> indOrig, ind;
+  gids_type indOrig, ind;
   size_t numEntriesOrig = 0;
   size_t numEntries = 0;
 
@@ -282,15 +283,15 @@ compareCrsGraph (const CrsGraphType& A_orig, const CrsGraphType& A, Teuchos::Fan
       localEqual = 0;
       break;
     }
-    indOrig.resize (numEntriesOrig);
-    A_orig.getGlobalRowCopy (globalRow, indOrig (), numEntriesOrig);
-    ind.resize (numEntries);
-    A.getGlobalRowCopy (globalRow, ind (), numEntries);
+    Kokkos::resize(indOrig,numEntriesOrig);
+    A_orig.getGlobalRowCopy (globalRow, indOrig, numEntriesOrig);
+    Kokkos::resize(ind,numEntries);
+    A.getGlobalRowCopy (globalRow, ind, numEntries);
 
     // Global row entries are not necessarily sorted.  Sort them so
     // we can compare them.
-    std::sort (indOrig.begin (), indOrig.end ());
-    std::sort (ind.begin (), ind.end ());
+    Tpetra::sort (indOrig, indOrig.extent(0));
+    Tpetra::sort (ind, ind.extent(0));
 
     for (size_t k = 0; k < numEntries; ++k) {
       // Indices should be _exactly_ equal.
diff --git a/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsMatrix_Dist_Binary.cpp b/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsMatrix_Dist_Binary.cpp
index 923b56850481..4e0ef02b2a83 100644
--- a/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsMatrix_Dist_Binary.cpp
+++ b/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsMatrix_Dist_Binary.cpp
@@ -100,6 +100,7 @@ class TestReader {
   using matrix_t = Tpetra::CrsMatrix<scalar_t>;
   using vector_t = Tpetra::Vector<scalar_t>;
   using reader_t = Tpetra::MatrixMarket::Reader<matrix_t>;
+  using indices_type = typename matrix_t::nonconst_global_inds_host_view_type;
 
   //////////////////////////////
   // Constructor
@@ -429,7 +430,7 @@ class TestReader {
     // Get the CrsGraph because we do not need the values
     auto graph = AmatWrite->getCrsGraph();	
     auto rowMap = graph->getRowMap();
-    Teuchos::Array<gno_t> gblColInds;
+    indices_type gblColInds;
     size_t numEntries = 0;
 
     // Write the nonzeros
@@ -442,8 +443,8 @@ class TestReader {
       
       // Get the copy of the row with global column indices
       numEntries = graph->getNumEntriesInGlobalRow(gblRow);
-      gblColInds.resize(numEntries);
-      graph->getGlobalRowCopy(gblRow, gblColInds(), numEntries);
+      Kokkos::resize(gblColInds,numEntries);
+      graph->getGlobalRowCopy(gblRow, gblColInds, numEntries);
       
       // Write the entries in the row in COO format (i.e., in "rowId colId" pairs)
       for(size_t c = 0; c < numEntries; c++) {
diff --git a/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsMatrix_FileTest.cpp b/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsMatrix_FileTest.cpp
index fab82b836dce..f66c28eccf92 100644
--- a/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsMatrix_FileTest.cpp
+++ b/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsMatrix_FileTest.cpp
@@ -43,10 +43,13 @@
 #include <Tpetra_CrsMatrix.hpp>
 #include <Tpetra_Core.hpp>
 #include <Tpetra_Util.hpp> // sort2
+#include <Tpetra_TestingUtilities.hpp> 
 #include <Teuchos_UnitTestHarness.hpp>
 
 using Tpetra::global_size_t;
+using Tpetra::TestingUtilities::arcp_from_view;
 using Teuchos::Array;
+using Teuchos::ArrayRCP;
 using Teuchos::ArrayView;
 using Teuchos::as;
 using Teuchos::Comm;
@@ -135,6 +138,8 @@ compareCrsMatrix (const CrsMatrixType& A_orig, const CrsMatrixType& A, Teuchos::
   typedef typename CrsMatrixType::scalar_type ST;
   typedef typename CrsMatrixType::global_ordinal_type GO;
   typedef typename ArrayView<const GO>::size_type size_type;
+  typedef typename CrsMatrixType::nonconst_global_inds_host_view_type gids_type;
+  typedef typename CrsMatrixType::nonconst_values_host_view_type vals_type;
 
   OSTab tab (out);
   int localEqual = 1;
@@ -142,8 +147,8 @@ compareCrsMatrix (const CrsMatrixType& A_orig, const CrsMatrixType& A, Teuchos::
   //
   // Are my local matrices equal?
   //
-  Array<GO> indOrig, ind;
-  Array<ST> valOrig, val;
+  gids_type indOrig, ind;
+  vals_type valOrig, val;
   size_t numEntriesOrig = 0;
   size_t numEntries = 0;
 
@@ -158,17 +163,17 @@ compareCrsMatrix (const CrsMatrixType& A_orig, const CrsMatrixType& A, Teuchos::
       localEqual = 0;
       break;
     }
-    indOrig.resize (numEntriesOrig);
-    valOrig.resize (numEntriesOrig);
-    A_orig.getGlobalRowCopy (globalRow, indOrig (), valOrig (), numEntriesOrig);
-    ind.resize (numEntries);
-    val.resize (numEntries);
-    A.getGlobalRowCopy (globalRow, ind (), val (), numEntries);
+    Kokkos::resize(indOrig,numEntriesOrig);
+    Kokkos::resize(valOrig,numEntriesOrig);
+    A_orig.getGlobalRowCopy (globalRow, indOrig, valOrig, numEntriesOrig);
+    Kokkos::resize(ind,numEntries);
+    Kokkos::resize(val,numEntries);
+    A.getGlobalRowCopy (globalRow, ind, val, numEntries);
 
     // Global row entries are not necessarily sorted.  Sort them so
     // we can compare them.
-    Tpetra::sort2 (indOrig.begin (), indOrig.end (), valOrig.begin ());
-    Tpetra::sort2 (ind.begin (), ind.end (), val.begin ());
+    Tpetra::sort2 (indOrig, indOrig.extent(0), valOrig);
+    Tpetra::sort2 (ind, ind.extent(0), val);
 
     for (size_t k = 0; k < numEntries; ++k) {
       // Values should be _exactly_ equal.
@@ -199,13 +204,16 @@ compareCrsMatrixValues (const CrsMatrixType& A_orig,
   typedef Teuchos::ScalarTraits<ST> STS;
   typedef typename STS::magnitudeType MT;
   typedef Teuchos::ScalarTraits<MT> STM;
+  typedef typename CrsMatrixType::nonconst_global_inds_host_view_type gids_type;
+  typedef typename CrsMatrixType::nonconst_values_host_view_type vals_type;
 
   OSTab tab (out);
   //
   // Are my local matrices equal?
   //
-  Array<GO> indOrig, ind;
-  Array<ST> valOrig, val;
+
+  gids_type indOrig_v, ind_v;
+  vals_type valOrig_v, val_v;
   size_t numEntriesOrig = 0;
   size_t numEntries = 0;
 
@@ -217,31 +225,35 @@ compareCrsMatrixValues (const CrsMatrixType& A_orig,
     numEntriesOrig = A_orig.getNumEntriesInGlobalRow (globalRow);
     numEntries = A.getNumEntriesInGlobalRow (globalRow);
 
-    indOrig.resize (numEntriesOrig);
-    valOrig.resize (numEntriesOrig);
-    A_orig.getGlobalRowCopy (globalRow, indOrig (), valOrig (), numEntriesOrig);
-    ind.resize (numEntries);
-    val.resize (numEntries);
-    A.getGlobalRowCopy (globalRow, ind (), val (), numEntries);
+    Kokkos::resize(indOrig_v,numEntriesOrig);
+    Kokkos::resize(valOrig_v,numEntriesOrig);
+    A_orig.getGlobalRowCopy (globalRow, indOrig_v, valOrig_v, numEntriesOrig);
+    Kokkos::resize(ind_v,numEntries);
+    Kokkos::resize(val_v,numEntries);
+    A.getGlobalRowCopy (globalRow, ind_v, val_v, numEntries);
 
     // Global row entries are not necessarily sorted.  Sort them
     // (and their values with them) so we can merge their values.
-    Tpetra::sort2 (indOrig.begin (), indOrig.end (), valOrig.begin ());
-    Tpetra::sort2 (ind.begin (), ind.end (), val.begin ());
+    Tpetra::sort2 (indOrig_v, indOrig_v.extent(0), valOrig_v);
+    Tpetra::sort2 (ind_v, ind_v.extent(0), val_v);
+    auto indOrig = arcp_from_view(indOrig_v);
+    auto ind = arcp_from_view(ind_v);
+    auto valOrig = arcp_from_view(valOrig_v);
+    auto val = arcp_from_view(val_v);
 
     //
     // Merge repeated values in each set of indices and values.
     //
-    typename Array<GO>::iterator indOrigIter = indOrig.begin ();
-    typename Array<ST>::iterator valOrigIter = valOrig.begin ();
-    typename Array<GO>::iterator indOrigEnd = indOrig.end ();
-    typename Array<ST>::iterator valOrigEnd = valOrig.end ();
+    typename ArrayRCP<GO>::iterator indOrigIter = indOrig.begin ();
+    typename ArrayRCP<ST>::iterator valOrigIter = valOrig.begin ();
+    typename ArrayRCP<GO>::iterator indOrigEnd = indOrig.end ();
+    typename ArrayRCP<ST>::iterator valOrigEnd = valOrig.end ();
     Tpetra::merge2 (indOrigEnd, valOrigEnd, indOrigIter, indOrigEnd, valOrigIter, valOrigEnd);
 
-    typename Array<GO>::iterator indIter = ind.begin ();
-    typename Array<ST>::iterator valIter = val.begin ();
-    typename Array<GO>::iterator indEnd = ind.end ();
-    typename Array<ST>::iterator valEnd = val.end ();
+    typename ArrayRCP<GO>::iterator indIter = ind.begin ();
+    typename ArrayRCP<ST>::iterator valIter = val.begin ();
+    typename ArrayRCP<GO>::iterator indEnd = ind.end ();
+    typename ArrayRCP<ST>::iterator valEnd = val.end ();
     Tpetra::merge2 (indEnd, valEnd, indIter, indEnd, valIter, valEnd);
 
     //
diff --git a/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsMatrix_InOutTest.cpp b/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsMatrix_InOutTest.cpp
index c5559fbc9c02..3050ea624ffb 100644
--- a/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsMatrix_InOutTest.cpp
+++ b/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_CrsMatrix_InOutTest.cpp
@@ -42,12 +42,14 @@
 #include <MatrixMarket_Tpetra.hpp>
 #include <Tpetra_Core.hpp>
 #include <Tpetra_Util.hpp> // sort2, merge2
+#include <Tpetra_TestingUtilities.hpp>
 #include <Teuchos_UnitTestHarness.hpp>
 #include "TpetraCore_ETIHelperMacros.h"
 
 namespace { // anonymous
 
 using Tpetra::global_size_t;
+using Tpetra::TestingUtilities::arcp_from_view;
 using Teuchos::Array;
 using Teuchos::as;
 using Teuchos::Comm;
@@ -262,6 +264,8 @@ compareCrsMatrix (const CrsMatrixType& A_orig, const CrsMatrixType& A, Teuchos::
   typedef typename CrsMatrixType::scalar_type ST;
   typedef typename CrsMatrixType::global_ordinal_type GO;
   typedef typename ArrayView<const GO>::size_type size_type;
+  typedef typename CrsMatrixType::nonconst_global_inds_host_view_type gids_type;
+  typedef typename CrsMatrixType::nonconst_values_host_view_type vals_type;
 
   Teuchos::OSTab tab (Teuchos::rcpFromRef (out));
   int localEqual = 1;
@@ -269,8 +273,8 @@ compareCrsMatrix (const CrsMatrixType& A_orig, const CrsMatrixType& A, Teuchos::
   //
   // Are my local matrices equal?
   //
-  Array<GO> indOrig, ind;
-  Array<ST> valOrig, val;
+  gids_type indOrig, ind;
+  vals_type valOrig, val;
   size_t numEntriesOrig = 0;
   size_t numEntries = 0;
 
@@ -285,17 +289,16 @@ compareCrsMatrix (const CrsMatrixType& A_orig, const CrsMatrixType& A, Teuchos::
       localEqual = 0;
       break;
     }
-    indOrig.resize (numEntriesOrig);
-    valOrig.resize (numEntriesOrig);
-    A_orig.getGlobalRowCopy (globalRow, indOrig (), valOrig (), numEntriesOrig);
-    ind.resize (numEntries);
-    val.resize (numEntries);
-    A.getGlobalRowCopy (globalRow, ind (), val (), numEntries);
+    Kokkos::resize(indOrig,numEntriesOrig);
+    Kokkos::resize(valOrig,numEntriesOrig);
+    A_orig.getGlobalRowCopy (globalRow, indOrig, valOrig, numEntriesOrig);
+    Kokkos::resize(ind,numEntries);
+    Kokkos::resize(val,numEntries);
+    A.getGlobalRowCopy (globalRow, ind, val, numEntries);
 
     // Global row entries are not necessarily sorted.  Sort them so
-    // we can compare them.
-    Tpetra::sort2 (indOrig.begin (), indOrig.end (), valOrig.begin ());
-    Tpetra::sort2 (ind.begin (), ind.end (), val.begin ());
+    Tpetra::sort2 (indOrig, indOrig.extent(0), valOrig);
+    Tpetra::sort2 (ind, ind.extent(0), val);
 
     for (size_t k = 0; k < numEntries; ++k) {
       // Values should be _exactly_ equal.
@@ -323,6 +326,7 @@ compareCrsMatrixValues (const CrsMatrixType& A_orig,
 {
   using Teuchos::Array;
   using Teuchos::ArrayView;
+  using Teuchos::ArrayRCP;
   using Teuchos::Comm;
   using Teuchos::RCP;
   using Teuchos::reduceAll;
@@ -334,14 +338,17 @@ compareCrsMatrixValues (const CrsMatrixType& A_orig,
   typedef Teuchos::ScalarTraits<ST> STS;
   typedef typename STS::magnitudeType MT;
   typedef Teuchos::ScalarTraits<MT> STM;
+  typedef typename CrsMatrixType::nonconst_global_inds_host_view_type gids_type;
+  typedef typename CrsMatrixType::nonconst_values_host_view_type vals_type;
 
   Teuchos::OSTab tab (Teuchos::rcpFromRef (out));
 
   //
   // Are my local matrices equal?
   //
-  Array<GO> indOrig, ind;
-  Array<ST> valOrig, val;
+
+  gids_type indOrig_v, ind_v;
+  vals_type valOrig_v, val_v;
   size_t numEntriesOrig = 0;
   size_t numEntries = 0;
 
@@ -353,32 +360,38 @@ compareCrsMatrixValues (const CrsMatrixType& A_orig,
     numEntriesOrig = A_orig.getNumEntriesInGlobalRow (globalRow);
     numEntries = A.getNumEntriesInGlobalRow (globalRow);
 
-    indOrig.resize (numEntriesOrig);
-    valOrig.resize (numEntriesOrig);
-    A_orig.getGlobalRowCopy (globalRow, indOrig (), valOrig (), numEntriesOrig);
-    ind.resize (numEntries);
-    val.resize (numEntries);
-    A.getGlobalRowCopy (globalRow, ind (), val (), numEntries);
+    Kokkos::resize(indOrig_v,numEntriesOrig);
+    Kokkos::resize(valOrig_v,numEntriesOrig);
+    A_orig.getGlobalRowCopy (globalRow, indOrig_v, valOrig_v, numEntriesOrig);
+    Kokkos::resize(ind_v,numEntries);
+    Kokkos::resize(val_v,numEntries);
+    A.getGlobalRowCopy (globalRow, ind_v , val_v, numEntries);
 
     // Global row entries are not necessarily sorted.  Sort them
     // (and their values with them) so we can merge their values.
-    Tpetra::sort2 (indOrig.begin (), indOrig.end (), valOrig.begin ());
-    Tpetra::sort2 (ind.begin (), ind.end (), val.begin ());
+    Tpetra::sort2 (indOrig_v, indOrig_v.extent(0), valOrig_v);
+    Tpetra::sort2 (ind_v, ind_v.extent(0), val_v);
+
+    auto indOrig = arcp_from_view(indOrig_v);
+    auto ind = arcp_from_view(ind_v);
+    auto valOrig = arcp_from_view(valOrig_v);
+    auto val = arcp_from_view(val_v);
 
     //
     // Merge repeated values in each set of indices and values.
     //
 
-    typename Array<GO>::iterator indOrigIter = indOrig.begin ();
-    typename Array<ST>::iterator valOrigIter = valOrig.begin ();
-    typename Array<GO>::iterator indOrigEnd = indOrig.end ();
-    typename Array<ST>::iterator valOrigEnd = valOrig.end ();
+    typename ArrayRCP<GO>::iterator indOrigIter = indOrig.begin ();
+    typename ArrayRCP<ST>::iterator valOrigIter = valOrig.begin ();
+    typename ArrayRCP<GO>::iterator indOrigEnd = indOrig.end ();
+    typename ArrayRCP<ST>::iterator valOrigEnd = valOrig.end ();
     Tpetra::merge2 (indOrigEnd, valOrigEnd, indOrigIter, indOrigEnd, valOrigIter, valOrigEnd);
 
-    typename Array<GO>::iterator indIter = ind.begin ();
-    typename Array<ST>::iterator valIter = val.begin ();
-    typename Array<GO>::iterator indEnd = ind.end ();
-    typename Array<ST>::iterator valEnd = val.end ();
+  
+    typename ArrayRCP<GO>::iterator indIter = ind.begin ();
+    typename ArrayRCP<ST>::iterator valIter = val.begin ();
+    typename ArrayRCP<GO>::iterator indEnd = ind.end ();
+    typename ArrayRCP<ST>::iterator valEnd = val.end ();
     Tpetra::merge2 (indEnd, valEnd, indIter, indEnd, valIter, valEnd);
 
     //
diff --git a/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_OperatorTest.cpp b/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_OperatorTest.cpp
index 845eba6b1fc7..daa156e42785 100644
--- a/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_OperatorTest.cpp
+++ b/packages/tpetra/core/test/inout/MatrixMarket_Tpetra_OperatorTest.cpp
@@ -42,12 +42,14 @@
 #include <MatrixMarket_Tpetra.hpp>
 #include <Tpetra_Core.hpp>
 #include <Tpetra_Util.hpp> // sort2, merge2
+#include <Tpetra_TestingUtilities.hpp>
 #include <Teuchos_UnitTestHarness.hpp>
 #include "TpetraCore_ETIHelperMacros.h"
 
 namespace { // anonymous
 
 using Teuchos::Array;
+using Teuchos::ArrayRCP;
 using Teuchos::Comm;
 using Teuchos::OSTab;
 using Teuchos::ParameterList;
@@ -57,6 +59,7 @@ using Teuchos::rcp;
 using Teuchos::REDUCE_MAX;
 using Teuchos::REDUCE_MIN;
 using Teuchos::reduceAll;
+using Tpetra::TestingUtilities::arcp_from_view;
 using std::endl;
 
 const bool callFillComplete = true;
@@ -257,6 +260,8 @@ compareCrsMatrix (const CrsMatrixType& A_orig, const CrsMatrixType& A, Teuchos::
   typedef typename CrsMatrixType::scalar_type ST;
   typedef typename CrsMatrixType::global_ordinal_type GO;
   typedef typename ArrayView<const GO>::size_type size_type;
+  using gids_type = typename CrsMatrixType::nonconst_global_inds_host_view_type;
+  using vals_type = typename CrsMatrixType::nonconst_values_host_view_type;
 
   Teuchos::OSTab tab (Teuchos::rcpFromRef (out));
   int localEqual = 1;
@@ -264,8 +269,8 @@ compareCrsMatrix (const CrsMatrixType& A_orig, const CrsMatrixType& A, Teuchos::
   //
   // Are my local matrices equal?
   //
-  Array<GO> indOrig, ind;
-  Array<ST> valOrig, val;
+  gids_type indOrig, ind;
+  vals_type valOrig, val;
   size_t numEntriesOrig = 0;
   size_t numEntries = 0;
 
@@ -280,17 +285,17 @@ compareCrsMatrix (const CrsMatrixType& A_orig, const CrsMatrixType& A, Teuchos::
       localEqual = 0;
       break;
     }
-    indOrig.resize (numEntriesOrig);
-    valOrig.resize (numEntriesOrig);
-    A_orig.getGlobalRowCopy (globalRow, indOrig (), valOrig (), numEntriesOrig);
-    ind.resize (numEntries);
-    val.resize (numEntries);
-    A.getGlobalRowCopy (globalRow, ind (), val (), numEntries);
+    Kokkos::resize(indOrig,numEntriesOrig);
+    Kokkos::resize(valOrig,numEntriesOrig);
+    A_orig.getGlobalRowCopy (globalRow, indOrig, valOrig, numEntriesOrig);
+    Kokkos::resize(ind,numEntries);
+    Kokkos::resize(val,numEntries);
+    A.getGlobalRowCopy (globalRow, ind, val, numEntries);
 
     // Global row entries are not necessarily sorted.  Sort them so
     // we can compare them.
-    Tpetra::sort2 (indOrig.begin (), indOrig.end (), valOrig.begin ());
-    Tpetra::sort2 (ind.begin (), ind.end (), val.begin ());
+    Tpetra::sort2 (indOrig, indOrig.extent(0), valOrig);
+    Tpetra::sort2 (ind, ind.extent(0), val);
 
     for (size_t k = 0; k < numEntries; ++k) {
       // Values should be _exactly_ equal.
@@ -329,14 +334,16 @@ compareCrsMatrixValues (const CrsMatrixType& A_orig,
   typedef Teuchos::ScalarTraits<ST> STS;
   typedef typename STS::magnitudeType MT;
   typedef Teuchos::ScalarTraits<MT> STM;
+  typedef typename CrsMatrixType::nonconst_global_inds_host_view_type gids_type;
+  typedef typename CrsMatrixType::nonconst_values_host_view_type vals_type;
 
   Teuchos::OSTab tab (Teuchos::rcpFromRef (out));
 
   //
   // Are my local matrices equal?
   //
-  Array<GO> indOrig, ind;
-  Array<ST> valOrig, val;
+  gids_type indOrig_v, ind_v;
+  vals_type valOrig_v, val_v;
   size_t numEntriesOrig = 0;
   size_t numEntries = 0;
 
@@ -348,32 +355,36 @@ compareCrsMatrixValues (const CrsMatrixType& A_orig,
     numEntriesOrig = A_orig.getNumEntriesInGlobalRow (globalRow);
     numEntries = A.getNumEntriesInGlobalRow (globalRow);
 
-    indOrig.resize (numEntriesOrig);
-    valOrig.resize (numEntriesOrig);
-    A_orig.getGlobalRowCopy (globalRow, indOrig (), valOrig (), numEntriesOrig);
-    ind.resize (numEntries);
-    val.resize (numEntries);
-    A.getGlobalRowCopy (globalRow, ind (), val (), numEntries);
+    Kokkos::resize(indOrig_v,numEntriesOrig);
+    Kokkos::resize(valOrig_v,numEntriesOrig);
+    A_orig.getGlobalRowCopy (globalRow, indOrig_v, valOrig_v, numEntriesOrig);
+    Kokkos::resize(ind_v,numEntries);
+    Kokkos::resize(val_v,numEntries);
+    A.getGlobalRowCopy (globalRow, ind_v, val_v, numEntries);
 
     // Global row entries are not necessarily sorted.  Sort them
     // (and their values with them) so we can merge their values.
-    Tpetra::sort2 (indOrig.begin (), indOrig.end (), valOrig.begin ());
-    Tpetra::sort2 (ind.begin (), ind.end (), val.begin ());
+    Tpetra::sort2 (indOrig_v, indOrig_v.extent(0), valOrig_v);
+    Tpetra::sort2 (ind_v, ind_v.extent(0), val_v);
+    auto indOrig = arcp_from_view(indOrig_v);
+    auto ind = arcp_from_view(ind_v);
+    auto valOrig = arcp_from_view(valOrig_v);
+    auto val = arcp_from_view(val_v);
 
     //
     // Merge repeated values in each set of indices and values.
     //
 
-    typename Array<GO>::iterator indOrigIter = indOrig.begin ();
-    typename Array<ST>::iterator valOrigIter = valOrig.begin ();
-    typename Array<GO>::iterator indOrigEnd = indOrig.end ();
-    typename Array<ST>::iterator valOrigEnd = valOrig.end ();
+    typename ArrayRCP<GO>::iterator indOrigIter = indOrig.begin ();
+    typename ArrayRCP<ST>::iterator valOrigIter = valOrig.begin ();
+    typename ArrayRCP<GO>::iterator indOrigEnd = indOrig.end ();
+    typename ArrayRCP<ST>::iterator valOrigEnd = valOrig.end ();
     Tpetra::merge2 (indOrigEnd, valOrigEnd, indOrigIter, indOrigEnd, valOrigIter, valOrigEnd);
 
-    typename Array<GO>::iterator indIter = ind.begin ();
-    typename Array<ST>::iterator valIter = val.begin ();
-    typename Array<GO>::iterator indEnd = ind.end ();
-    typename Array<ST>::iterator valEnd = val.end ();
+    typename ArrayRCP<GO>::iterator indIter = ind.begin ();
+    typename ArrayRCP<ST>::iterator valIter = val.begin ();
+    typename ArrayRCP<GO>::iterator indEnd = ind.end ();
+    typename ArrayRCP<ST>::iterator valEnd = val.end ();
     Tpetra::merge2 (indEnd, valEnd, indIter, indEnd, valIter, valEnd);
 
     //
diff --git a/packages/xpetra/src/CrsGraph/Xpetra_CrsGraph.hpp b/packages/xpetra/src/CrsGraph/Xpetra_CrsGraph.hpp
index 065cd63c449b..3e75593979e4 100644
--- a/packages/xpetra/src/CrsGraph/Xpetra_CrsGraph.hpp
+++ b/packages/xpetra/src/CrsGraph/Xpetra_CrsGraph.hpp
@@ -90,7 +90,7 @@ namespace Xpetra {
     typedef Node node_type;
 
     //! @name Constructor/Destructor Methods
-    //@{
+    //@{
 
     //! Destructor.
     virtual ~CrsGraph() { }
@@ -109,6 +109,15 @@ namespace Xpetra {
     //! Remove all graph indices from the specified local row.
     virtual void removeLocalIndices(LocalOrdinal localRow)= 0;
 
+    //! Allocates the 1D pointer arrays of the graph (row offsets \c rowptr and column indices \c colind) for \c numNonZeros entries.
+    virtual void allocateAllIndices(size_t numNonZeros,ArrayRCP<size_t> & rowptr, ArrayRCP<LocalOrdinal> & colind)=0;
+
+    //! Sets the 1D pointer arrays of the graph from the row offsets \c rowptr and the column indices \c colind.
+    virtual void setAllIndices(const ArrayRCP<size_t> & rowptr, const ArrayRCP<LocalOrdinal> & colind)=0;
+
+    //! Gets the 1D pointer arrays of the graph as read-only arrays: row offsets in \c rowptr, column indices in \c colind.
+    virtual void getAllIndices(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind) const = 0;
+
     //@}
 
     //! @name Transformational Methods
@@ -120,6 +129,15 @@ namespace Xpetra {
     //! Signal that data entry is complete.
     virtual void fillComplete(const RCP< ParameterList > &params=null)= 0;
 
+    //! Expert version of fillComplete: takes the domain and range Maps, plus an optional importer, exporter and parameter list (each defaulting to null).
+    virtual void
+    expertStaticFillComplete (const RCP<const Map < LocalOrdinal, GlobalOrdinal, Node > >& domainMap,
+                              const RCP<const Map < LocalOrdinal, GlobalOrdinal, Node > >& rangeMap,
+                              const RCP<const Import< LocalOrdinal, GlobalOrdinal, Node > >& importer =null,
+                              const RCP<const Export< LocalOrdinal, GlobalOrdinal, Node > >& exporter =null,
+                              const RCP<Teuchos::ParameterList>& params = null)=0;
+
+
     //@}
 
     //! @name Methods implementing RowGraph.
diff --git a/packages/xpetra/src/CrsGraph/Xpetra_CrsGraphFactory.hpp b/packages/xpetra/src/CrsGraph/Xpetra_CrsGraphFactory.hpp
index 177d8d647eba..b7f9d74d9253 100644
--- a/packages/xpetra/src/CrsGraph/Xpetra_CrsGraphFactory.hpp
+++ b/packages/xpetra/src/CrsGraph/Xpetra_CrsGraphFactory.hpp
@@ -100,6 +100,25 @@ namespace Xpetra {
       TEUCHOS_UNREACHABLE_RETURN(null);
     }
 
+    //! Builds a graph from a row Map, a column Map and a single \c maxNumEntriesPerRow value; this generic factory only constructs a TpetraCrsGraph.
+    static Teuchos::RCP<CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
+    Build(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap,
+          const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap,
+          size_t maxNumEntriesPerRow,
+          const Teuchos::RCP< Teuchos::ParameterList > &plist=Teuchos::null) {
+      XPETRA_MONITOR("CrsGraphFactory::Build");
+
+#ifdef HAVE_XPETRA_TPETRA
+      if (rowMap->lib() == UseTpetra)
+        return rcp( new TpetraCrsGraph<LocalOrdinal, GlobalOrdinal, Node>(rowMap, colMap, maxNumEntriesPerRow, plist) );
+#endif
+
+      XPETRA_FACTORY_ERROR_IF_EPETRA(rowMap->lib());
+      XPETRA_FACTORY_END;
+      TEUCHOS_UNREACHABLE_RETURN(null);
+    }
+
+
     //! Constructor specifying column Map and number of entries in each row.
     static Teuchos::RCP<CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
     Build(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap,
@@ -118,6 +137,26 @@ namespace Xpetra {
       TEUCHOS_UNREACHABLE_RETURN(null);
     }
 
+
+    //! Builds a graph by fused import of \c sourceGraph through \c importer; this generic factory only constructs a TpetraCrsGraph.
+    static Teuchos::RCP<CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
+    Build(const RCP<const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > >& sourceGraph,
+          const Import< LocalOrdinal, GlobalOrdinal, Node > & importer,
+          const RCP<const Map< LocalOrdinal, GlobalOrdinal, Node >>& domainMap = Teuchos::null,
+          const RCP<const Map< LocalOrdinal, GlobalOrdinal, Node > >& rangeMap = Teuchos::null,
+          const RCP<Teuchos::ParameterList>& params = Teuchos::null) {
+#ifdef HAVE_XPETRA_TPETRA
+      if (sourceGraph->getRowMap()->lib() == UseTpetra)
+        return rcp( new TpetraCrsGraph<LocalOrdinal, GlobalOrdinal, Node>(sourceGraph, importer, domainMap, rangeMap, params) );
+#endif
+
+      XPETRA_FACTORY_ERROR_IF_EPETRA(sourceGraph->getRowMap()->lib());
+      XPETRA_FACTORY_END;
+      TEUCHOS_UNREACHABLE_RETURN(null);
+    }
+
+
+
 #ifdef HAVE_XPETRA_KOKKOS_REFACTOR
 #ifdef HAVE_XPETRA_TPETRA
     /// \brief Constructor specifying column Map and arrays containing the graph in sorted, local ids.
@@ -302,6 +341,44 @@ namespace Xpetra {
       TEUCHOS_UNREACHABLE_RETURN(null);
     }
 
+    //! Builds a graph from a row Map, a column Map and a single \c maxNumEntriesPerRow value; dispatches on rowMap->lib() to TpetraCrsGraph or EpetraCrsGraphT<int,Node>.
+    static Teuchos::RCP<CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
+    Build(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap,
+          const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap,
+          size_t maxNumEntriesPerRow,
+          const Teuchos::RCP< Teuchos::ParameterList > &plist=Teuchos::null) {
+      XPETRA_MONITOR("CrsGraphFactory::Build");
+
+#ifdef HAVE_XPETRA_TPETRA
+      if (rowMap->lib() == UseTpetra)
+        return rcp( new TpetraCrsGraph<LocalOrdinal, GlobalOrdinal, Node>(rowMap, colMap, maxNumEntriesPerRow, plist) );
+#endif
+      if (rowMap->lib() == UseEpetra)
+        return rcp( new EpetraCrsGraphT<int,Node>(rowMap, colMap, maxNumEntriesPerRow, plist) );
+
+      XPETRA_FACTORY_END;
+      TEUCHOS_UNREACHABLE_RETURN(null);
+    }
+
+
+    //! Builds a graph by fused import of \c sourceGraph through \c importer; dispatches on the source row Map's library to TpetraCrsGraph or EpetraCrsGraphT<int, Node>.
+    static Teuchos::RCP<CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
+    Build(const RCP<const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > >& sourceGraph,
+          const Import< LocalOrdinal, GlobalOrdinal, Node > & importer,
+          const RCP<const Map< LocalOrdinal, GlobalOrdinal, Node >>& domainMap = Teuchos::null,
+          const RCP<const Map< LocalOrdinal, GlobalOrdinal, Node > >& rangeMap = Teuchos::null,
+          const RCP<Teuchos::ParameterList>& params = Teuchos::null) {
+#ifdef HAVE_XPETRA_TPETRA
+      if (sourceGraph->getRowMap()->lib() == UseTpetra)
+        return rcp( new TpetraCrsGraph<LocalOrdinal, GlobalOrdinal, Node>(sourceGraph, importer, domainMap, rangeMap, params) );
+#endif
+      if (sourceGraph->getRowMap()->lib() == UseEpetra)
+        return rcp( new EpetraCrsGraphT<int, Node>(sourceGraph, importer, domainMap, rangeMap, params) );
+
+      XPETRA_FACTORY_END;
+      TEUCHOS_UNREACHABLE_RETURN(null);
+    }
+
 #ifdef HAVE_XPETRA_KOKKOS_REFACTOR
 #ifdef HAVE_XPETRA_TPETRA
     /// \brief Constructor specifying column Map and arrays containing the graph in sorted, local ids.
@@ -487,6 +564,44 @@ namespace Xpetra {
       TEUCHOS_UNREACHABLE_RETURN(null);
     }
 
+    //! Builds a graph from a row Map, a column Map and a single \c maxNumEntriesPerRow value; dispatches on rowMap->lib() to TpetraCrsGraph or EpetraCrsGraphT<long long,Node>.
+    static Teuchos::RCP<CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
+    Build(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap,
+          const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap,
+          size_t maxNumEntriesPerRow,
+          const Teuchos::RCP< Teuchos::ParameterList > &plist=Teuchos::null) {
+      XPETRA_MONITOR("CrsGraphFactory::Build");
+
+#ifdef HAVE_XPETRA_TPETRA
+      if (rowMap->lib() == UseTpetra)
+        return rcp( new TpetraCrsGraph<LocalOrdinal, GlobalOrdinal, Node>(rowMap, colMap, maxNumEntriesPerRow, plist) );
+#endif
+      if (rowMap->lib() == UseEpetra)
+        return rcp( new EpetraCrsGraphT<long long,Node>(rowMap, colMap, maxNumEntriesPerRow, plist) );
+
+      XPETRA_FACTORY_END;
+      TEUCHOS_UNREACHABLE_RETURN(null);
+    }
+
+    //! Builds a graph by fused import of \c sourceGraph through \c importer; dispatches on the source row Map's library to TpetraCrsGraph or EpetraCrsGraphT<long long,Node>.
+    static Teuchos::RCP<CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
+    Build(const RCP<const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > >& sourceGraph,
+          const Import< LocalOrdinal, GlobalOrdinal, Node > & importer,
+          const RCP<const Map< LocalOrdinal, GlobalOrdinal, Node >>& domainMap = Teuchos::null,
+          const RCP<const Map< LocalOrdinal, GlobalOrdinal, Node > >& rangeMap = Teuchos::null,
+          const RCP<Teuchos::ParameterList>& params = Teuchos::null) {
+#ifdef HAVE_XPETRA_TPETRA
+      if (sourceGraph->getRowMap()->lib() == UseTpetra)
+        return rcp( new TpetraCrsGraph<LocalOrdinal, GlobalOrdinal, Node>(sourceGraph, importer, domainMap, rangeMap, params) );
+#endif
+      if (sourceGraph->getRowMap()->lib() == UseEpetra)
+        return rcp( new EpetraCrsGraphT<long long,Node>(sourceGraph, importer, domainMap, rangeMap, params) );
+
+      XPETRA_FACTORY_END;
+      TEUCHOS_UNREACHABLE_RETURN(null);
+    }
+
+
 #ifdef HAVE_XPETRA_KOKKOS_REFACTOR
 #ifdef HAVE_XPETRA_TPETRA
     /// \brief Constructor specifying column Map and arrays containing the graph in sorted, local ids.
diff --git a/packages/xpetra/src/CrsGraph/Xpetra_EpetraCrsGraph.hpp b/packages/xpetra/src/CrsGraph/Xpetra_EpetraCrsGraph.hpp
index f4305232cdd4..72bfdc7dc908 100644
--- a/packages/xpetra/src/CrsGraph/Xpetra_EpetraCrsGraph.hpp
+++ b/packages/xpetra/src/CrsGraph/Xpetra_EpetraCrsGraph.hpp
@@ -117,6 +117,18 @@ class EpetraCrsGraphT
       "Xpetra::EpetraCrsGraph only available for GO=int or GO=long long with EpetraNode (Serial or OpenMP depending on configuration)");
   }
 
+  //! Fused-import constructor of the unspecialized template: ignores its arguments and always throws Xpetra::Exceptions::RuntimeError.
+  EpetraCrsGraphT(const RCP<const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > >& sourceGraph,
+                  const Import< LocalOrdinal, GlobalOrdinal, Node > & importer,
+                  const RCP<const Map< LocalOrdinal, GlobalOrdinal, Node >>& domainMap = Teuchos::null,
+                  const RCP<const Map< LocalOrdinal, GlobalOrdinal, Node > >& rangeMap = Teuchos::null,
+                  const RCP<Teuchos::ParameterList>& params = Teuchos::null)  {
+    TEUCHOS_TEST_FOR_EXCEPTION(true, Xpetra::Exceptions::RuntimeError,
+      "Xpetra::EpetraCrsGraph only available for GO=int or GO=long long with EpetraNode (Serial or OpenMP depending on configuration)");
+  }
+
+
+
 #ifdef HAVE_XPETRA_KOKKOS_REFACTOR
 #ifdef HAVE_XPETRA_TPETRA
   //! Constructor specifying column Map, number of entries in each row and column indices in each row.
@@ -168,6 +180,16 @@ class EpetraCrsGraphT
   //! Remove all graph indices from the specified local row.
   void removeLocalIndices(LocalOrdinal localRow) {  }
 
+  //! Allocates the 1D pointer arrays of the graph. The body here is empty, so \c rowptr and \c colind are left untouched.
+  void allocateAllIndices(size_t numNonZeros,ArrayRCP<size_t> & rowptr, ArrayRCP<LocalOrdinal> & colind) { }
+  
+  //! Sets the 1D pointer arrays of the graph. The body here is empty, so the arguments are ignored.
+  void setAllIndices(const ArrayRCP<size_t> & rowptr, const ArrayRCP<LocalOrdinal> & colind){ }
+  
+  //! Gets the 1D pointer arrays of the graph. The body here is empty, so \c rowptr and \c colind are not assigned.
+  void getAllIndices(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind) const { }
+  
+
   //@}
 
   //! @name Transformational Methods
@@ -178,6 +200,18 @@ class EpetraCrsGraphT
 
   //! Signal that data entry is complete.
   void fillComplete(const RCP< ParameterList > &params=null) {  }
+
+  //! Expert version of fillComplete. The body here is empty, so all five arguments are ignored.
+  void
+  expertStaticFillComplete (const Teuchos::RCP<const Map < LocalOrdinal, GlobalOrdinal, Node > >& domainMap,
+                            const Teuchos::RCP<const Map < LocalOrdinal, GlobalOrdinal, Node > >& rangeMap,
+                            const Teuchos::RCP<const Import< LocalOrdinal, GlobalOrdinal, Node > >& importer =
+                            Teuchos::null,
+                            const Teuchos::RCP<const Export< LocalOrdinal, GlobalOrdinal, Node > >& exporter =
+                            Teuchos::null,
+                            const Teuchos::RCP<Teuchos::ParameterList>& params =
+                            Teuchos::null) { }
+
   //@}
 
   //! @name Methods implementing RowGraph.
@@ -387,6 +421,30 @@ class EpetraCrsGraphT<int, EpetraNode>
     Teuchos::Array<int> numEntriesPerRowToAlloc(NumEntriesPerRowToAlloc.begin(), NumEntriesPerRowToAlloc.end()); // convert array of "size_t" to array of "int"
     graph_ = Teuchos::rcp(new Epetra_CrsGraph(Copy, toEpetra<GlobalOrdinal,Node>(rowMap), toEpetra<GlobalOrdinal,Node>(colMap), numEntriesPerRowToAlloc.getRawPtr(), true));
   }
+  
+  //! Fused-import constructor: imports \c sourceGraph through \c importer into a new Epetra_CrsGraph, then calls FillComplete on it.
+  EpetraCrsGraphT(const RCP<const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > >& sourceGraph,
+                  const Import< LocalOrdinal, GlobalOrdinal, Node > & importer,
+                  const RCP<const Map< LocalOrdinal, GlobalOrdinal, Node >>& domainMap = Teuchos::null,
+                  const RCP<const Map< LocalOrdinal, GlobalOrdinal, Node > >& rangeMap = Teuchos::null,
+                  const RCP<Teuchos::ParameterList>& params = Teuchos::null)  {
+    // Downcast both arguments to their Epetra wrapper types.
+    XPETRA_DYNAMIC_CAST(const EpetraCrsGraphT<GlobalOrdinal XPETRA_COMMA Node>, *sourceGraph, tSourceGraph, "Xpetra::EpetraCrsGraphT() only accepts Xpetra::EpetraCrsGraphT as input arguments.");
+    XPETRA_DYNAMIC_CAST(const EpetraImportT<GlobalOrdinal XPETRA_COMMA Node>, importer, tImporter, "Xpetra::EpetraCrsGraphT() only accepts Xpetra::EpetraImportT as input arguments.");
+    RCP< const Epetra_CrsGraph> eSourceGraph = tSourceGraph.getEpetra_CrsGraph();
+
+    // NOTE: Unlike Tpetra, Epetra does not have a FusedTransfer for Graphs.  So we do this the slow way
+    graph_ = Teuchos::rcp(new Epetra_CrsGraph(Copy,eSourceGraph->RowMap(),0,false));
+    graph_->Import(*eSourceGraph,*tImporter.getEpetra_Import(),Insert);
+    // NOTE(review): the new graph is built on the source row Map rather than importer.getTargetMap(), and Import's return code is ignored -- confirm.
+    // Maps not supplied by the caller default to the source column Map (domain) and the importer's target Map (range).
+    const Epetra_BlockMap & myDomainMap = domainMap!=Teuchos::null ? toEpetra<GlobalOrdinal,Node>(domainMap) : eSourceGraph->ColMap();
+    const Epetra_BlockMap & myRangeMap  = rangeMap!=Teuchos::null  ? toEpetra<GlobalOrdinal,Node>(rangeMap) : toEpetra<GlobalOrdinal,Node>(importer.getTargetMap());
+
+    graph_->FillComplete(myDomainMap,myRangeMap);
+  }
+
+
 
 #ifdef HAVE_XPETRA_KOKKOS_REFACTOR
 #ifdef HAVE_XPETRA_TPETRA
@@ -449,6 +507,74 @@ class EpetraCrsGraphT<int, EpetraNode>
   //! Remove all graph indices from the specified local row.
   void removeLocalIndices(LocalOrdinal localRow) { XPETRA_MONITOR("EpetraCrsGraphT::removeLocalIndices"); graph_->RemoveMyIndices(localRow); }
 
+  //! Allocates and returns ArrayRCPs of the Crs arrays --- This is an Xpetra-only routine.
+  /** \warning This is an expert-only routine and should not be called from user code. */
+  void allocateAllIndices(size_t numNonZeros, ArrayRCP<size_t>& rowptr, ArrayRCP<LocalOrdinal>& colind) {
+    XPETRA_MONITOR("EpetraCrsGraphT::allocateAllIndices");
+
+    // Row offsets
+    // Unfortunately, we cannot do this in the same manner as the column
+    // indices (see below).  The problem is that Tpetra insists on using
+    // size_t, and Epetra uses int internally.  So we only resize here, and
+    // will need to copy in setAllIndices
+    rowptr.resize(getNodeNumRows()+1);
+
+    const int  lowerOffset = 0;
+    const bool ownMemory   = false; // colind does not take ownership of the graph's index array
+
+    // Column indices
+    // Extract, resize, set colind: colind becomes a non-owning view of the graph's own index array
+    Epetra_IntSerialDenseVector& myColind = graph_->ExpertExtractIndices();
+    myColind.Resize(Teuchos::as<int>(numNonZeros));
+    colind = Teuchos::arcp(myColind.Values(), lowerOffset, numNonZeros, ownMemory);
+  }
+
+  //! Sets the 1D pointer arrays of the graph.
+  void setAllIndices(const ArrayRCP<size_t>& rowptr, const ArrayRCP<LocalOrdinal>& colind) {
+    XPETRA_MONITOR("EpetraCrsGraphT::setAllIndices");
+
+    // Check sizes: rowptr must hold one offset per local row plus the closing one
+    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<size_t>(rowptr.size()) != getNodeNumRows()+1, Xpetra::Exceptions::RuntimeError,
+                               "An exception is thrown to let you know that the size of your rowptr array is incorrect.");
+    // A non-empty colind must point at the graph's own index storage
+    if (colind.size() > 0) {
+      TEUCHOS_TEST_FOR_EXCEPTION(colind.getRawPtr() != graph_->ExpertExtractIndices().Values(), Xpetra::Exceptions::RuntimeError,
+                                 "An exception is thrown to let you know that you mismatched your pointers.");
+    }
+
+    // The offsets cannot be shared the way colind is: rowptr stores size_t while
+    // Epetra stores int, so each entry is converted into Epetra's offset array
+    const size_t numOffsets = getNodeNumRows()+1;
+    Epetra_IntSerialDenseVector& epetraOffsets = graph_->ExpertExtractIndexOffset();
+    epetraOffsets.Resize(numOffsets);
+    for (size_t k = 0; k != numOffsets; ++k)
+      epetraOffsets[k] = Teuchos::as<int>(rowptr[k]);
+  }
+
+
+  //! Gets the 1D pointer arrays of the graph.
+  void getAllIndices(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind) const {
+    XPETRA_MONITOR("EpetraCrsGraphT::getAllIndices");
+
+    const int  lowerOffset = 0;
+    const bool ownMemory   = false; // colind does not take ownership of the graph's index array
+
+    const size_t n   = getNodeNumRows();
+    const size_t nnz = getNodeNumEntries();
+
+    // Row offsets
+    // We have to make a copy here, it is unavoidable (see comments in allocateAllIndices).
+    // The copy goes into a fresh mutable array handed out as const, instead of writing through a const_cast into the caller's array
+    Epetra_IntSerialDenseVector& myRowptr = graph_->ExpertExtractIndexOffset();
+    ArrayRCP<size_t> rowptrCopy(n+1);
+    for (size_t i = 0; i < n+1; i++)
+      rowptrCopy[i] = Teuchos::as<size_t>(myRowptr[i]);
+    rowptr = rowptrCopy.getConst();
+
+    // Column indices
+    colind = Teuchos::arcp(graph_->ExpertExtractIndices().Values(), lowerOffset, nnz, ownMemory);
+  }
+
   //@}
 
   //! @name Transformational Methods
@@ -473,6 +599,22 @@ class EpetraCrsGraphT<int, EpetraNode>
     if (params != null && params->get("Optimize Storage",true) == false) doOptimizeStorage = false;
     if (doOptimizeStorage) graph_->OptimizeStorage();
   }
+
+  //! Expert version of fillComplete: FillComplete with the given maps, then OptimizeStorage.
+  //! \note importer, exporter and params are part of the signature but are not used by this implementation.
+  void
+  expertStaticFillComplete (const Teuchos::RCP<const Map < LocalOrdinal, GlobalOrdinal, Node > >& domainMap,
+                            const Teuchos::RCP<const Map < LocalOrdinal, GlobalOrdinal, Node > >& rangeMap,
+                            const Teuchos::RCP<const Import< LocalOrdinal, GlobalOrdinal, Node > >& importer = Teuchos::null,
+                            const Teuchos::RCP<const Export< LocalOrdinal, GlobalOrdinal, Node > >& exporter = Teuchos::null,
+                            const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null) {
+    // Not optimized
+    const Epetra_BlockMap& eDomainMap = toEpetra<GlobalOrdinal,Node>(domainMap);
+    const Epetra_BlockMap& eRangeMap  = toEpetra<GlobalOrdinal,Node>(rangeMap);
+    graph_->FillComplete(eDomainMap, eRangeMap);
+    graph_->OptimizeStorage();
+  }
+
   //@}
 
   //! @name Methods implementing RowGraph.
@@ -816,6 +958,74 @@ class EpetraCrsGraphT<long long, EpetraNode>
   //! Remove all graph indices from the specified local row.
   void removeLocalIndices(LocalOrdinal localRow) { XPETRA_MONITOR("EpetraCrsGraphT::removeLocalIndices"); graph_->RemoveMyIndices(localRow); }
 
+  //! Allocates and returns ArrayRCPs of the Crs arrays --- This is an Xpetra-only routine.
+  /** \warning This is an expert-only routine and should not be called from user code. */
+  void allocateAllIndices(size_t numNonZeros, ArrayRCP<size_t>& rowptr, ArrayRCP<LocalOrdinal>& colind) {
+    XPETRA_MONITOR("EpetraCrsGraphT::allocateAllIndices");
+
+    // Row offsets
+    // Unfortunately, we cannot do this in the same manner as the column
+    // indices (see below).  The problem is that Tpetra insists on using
+    // size_t, and Epetra uses int internally.  So we only resize here, and
+    // will need to copy in setAllIndices
+    rowptr.resize(getNodeNumRows()+1);
+
+    const int  lowerOffset = 0;
+    const bool ownMemory   = false; // colind does not take ownership of the graph's index array
+
+    // Column indices
+    // Extract, resize, set colind: colind becomes a non-owning view of the graph's own index array
+    Epetra_IntSerialDenseVector& myColind = graph_->ExpertExtractIndices();
+    myColind.Resize(Teuchos::as<int>(numNonZeros));
+    colind = Teuchos::arcp(myColind.Values(), lowerOffset, numNonZeros, ownMemory);
+  }
+
+  //! Sets the 1D pointer arrays of the graph.
+  void setAllIndices(const ArrayRCP<size_t>& rowptr, const ArrayRCP<LocalOrdinal>& colind) {
+    XPETRA_MONITOR("EpetraCrsGraphT::setAllIndices");
+
+    // Check sizes: rowptr must hold one offset per local row plus the closing one
+    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<size_t>(rowptr.size()) != getNodeNumRows()+1, Xpetra::Exceptions::RuntimeError,
+                               "An exception is thrown to let you know that the size of your rowptr array is incorrect.");
+    // A non-empty colind must point at the graph's own index storage
+    if (colind.size() > 0) {
+      TEUCHOS_TEST_FOR_EXCEPTION(colind.getRawPtr() != graph_->ExpertExtractIndices().Values(), Xpetra::Exceptions::RuntimeError,
+                                 "An exception is thrown to let you know that you mismatched your pointers.");
+    }
+
+    // The offsets cannot be shared the way colind is: rowptr stores size_t while
+    // Epetra stores int, so each entry is converted into Epetra's offset array
+    const size_t numOffsets = getNodeNumRows()+1;
+    Epetra_IntSerialDenseVector& epetraOffsets = graph_->ExpertExtractIndexOffset();
+    epetraOffsets.Resize(numOffsets);
+    for (size_t k = 0; k != numOffsets; ++k)
+      epetraOffsets[k] = Teuchos::as<int>(rowptr[k]);
+  }
+
+
+  //! Gets the 1D pointer arrays of the graph.
+  void getAllIndices(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind) const {
+    XPETRA_MONITOR("EpetraCrsGraphT::getAllIndices");
+
+    const int  lowerOffset = 0;
+    const bool ownMemory   = false; // colind does not take ownership of the graph's index array
+
+    const size_t n   = getNodeNumRows();
+    const size_t nnz = getNodeNumEntries();
+
+    // Row offsets
+    // We have to make a copy here, it is unavoidable (see comments in allocateAllIndices).
+    // The copy goes into a fresh mutable array handed out as const, instead of writing through a const_cast into the caller's array
+    Epetra_IntSerialDenseVector& myRowptr = graph_->ExpertExtractIndexOffset();
+    ArrayRCP<size_t> rowptrCopy(n+1);
+    for (size_t i = 0; i < n+1; i++)
+      rowptrCopy[i] = Teuchos::as<size_t>(myRowptr[i]);
+    rowptr = rowptrCopy.getConst();
+
+    // Column indices
+    colind = Teuchos::arcp(graph_->ExpertExtractIndices().Values(), lowerOffset, nnz, ownMemory);
+  }
+
   //@}
 
   //! @name Transformational Methods
@@ -840,6 +1050,22 @@ class EpetraCrsGraphT<long long, EpetraNode>
     if (params != null && params->get("Optimize Storage",true) == false) doOptimizeStorage = false;
     if (doOptimizeStorage) graph_->OptimizeStorage();
   }
+
+  //! Expert version of fillComplete: FillComplete with the given maps, then OptimizeStorage.
+  //! \note importer, exporter and params are part of the signature but are not used by this implementation.
+  void
+  expertStaticFillComplete (const Teuchos::RCP<const Map < LocalOrdinal, GlobalOrdinal, Node > >& domainMap,
+                            const Teuchos::RCP<const Map < LocalOrdinal, GlobalOrdinal, Node > >& rangeMap,
+                            const Teuchos::RCP<const Import< LocalOrdinal, GlobalOrdinal, Node > >& importer = Teuchos::null,
+                            const Teuchos::RCP<const Export< LocalOrdinal, GlobalOrdinal, Node > >& exporter = Teuchos::null,
+                            const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null) {
+    // Not optimized
+    const Epetra_BlockMap& eDomainMap = toEpetra<GlobalOrdinal,Node>(domainMap);
+    const Epetra_BlockMap& eRangeMap  = toEpetra<GlobalOrdinal,Node>(rangeMap);
+    graph_->FillComplete(eDomainMap, eRangeMap);
+    graph_->OptimizeStorage();
+  }
+
   //@}
 
   //! @name Methods implementing RowGraph.
diff --git a/packages/xpetra/src/CrsGraph/Xpetra_TpetraCrsGraph_decl.hpp b/packages/xpetra/src/CrsGraph/Xpetra_TpetraCrsGraph_decl.hpp
index 03a9c146ad27..7a8b3cc3a3b2 100644
--- a/packages/xpetra/src/CrsGraph/Xpetra_TpetraCrsGraph_decl.hpp
+++ b/packages/xpetra/src/CrsGraph/Xpetra_TpetraCrsGraph_decl.hpp
@@ -68,10 +68,13 @@ namespace Xpetra {
   class TpetraCrsGraph
     : public CrsGraph<LocalOrdinal,GlobalOrdinal,Node>
   {
-
+    #undef XPETRA_TPETRACRSGRAPH_SHORT
+#include "Xpetra_UseShortNamesOrdinal.hpp"
     // The following typedef is used by the XPETRA_DYNAMIC_CAST() macro.
     typedef TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node> TpetraCrsGraphClass;
-    typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
+    typedef TpetraImport<LocalOrdinal,GlobalOrdinal,Node> TpetraImportClass;
+    typedef TpetraExport<LocalOrdinal,GlobalOrdinal,Node> TpetraExportClass;
+    typedef Map map_type;
 
 #ifdef HAVE_XPETRA_KOKKOS_REFACTOR
     typedef typename Xpetra::CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_type local_graph_type;
@@ -83,16 +86,25 @@ namespace Xpetra {
     //@{
 
     //! Constructor specifying fixed number of entries for each row.
-    TpetraCrsGraph(const RCP< const map_type > &rowMap, size_t maxNumEntriesPerRow, const RCP< ParameterList > &params=null);
+    TpetraCrsGraph(const RCP< const Map > &rowMap, size_t maxNumEntriesPerRow, const RCP< ParameterList > &params=null);
 
     //! Constructor specifying (possibly different) number of entries in each row.
-    TpetraCrsGraph(const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, const RCP< ParameterList > &params=null);
+    TpetraCrsGraph(const RCP< const Map > &rowMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, const RCP< ParameterList > &params=null);
 
     //! Constructor specifying column Map and fixed number of entries for each row.
-    TpetraCrsGraph(const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap, size_t maxNumEntriesPerRow, const RCP< ParameterList > &params=null);
+    TpetraCrsGraph(const RCP< const Map > &rowMap, const RCP< const Map > &colMap, size_t maxNumEntriesPerRow, const RCP< ParameterList > &params=null);
 
     //! Constructor specifying column Map and number of entries in each row.
-    TpetraCrsGraph(const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, const RCP< ParameterList > &params=null);
+    TpetraCrsGraph(const RCP< const Map > &rowMap, const RCP< const Map > &colMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, const RCP< ParameterList > &params=null);
+
+    //! Constructor for fused import: builds this graph by importing sourceGraph through importer (domainMap, rangeMap and params default to null).
+    TpetraCrsGraph(const RCP<const CrsGraph >& sourceGraph,
+                   const Import & importer,
+                   const RCP<const Map>& domainMap = Teuchos::null,
+                   const RCP<const Map>& rangeMap = Teuchos::null,
+                   const RCP<Teuchos::ParameterList>& params = Teuchos::null);
+    
+
 
 #ifdef HAVE_XPETRA_KOKKOS_REFACTOR
     /// \brief Constructor specifying column Map and arrays containing the graph in sorted, local ids.
@@ -114,8 +126,8 @@ namespace Xpetra {
     /// \param params [in/out] Optional list of parameters.  If not
     ///   null, any missing parameters will be filled in with their
     ///   default values.
-    TpetraCrsGraph(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap,
-                   const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap,
+    TpetraCrsGraph(const Teuchos::RCP< const Map > &rowMap,
+                   const Teuchos::RCP< const Map > &colMap,
                    const typename local_graph_type::row_map_type& rowPointers,
                    const typename local_graph_type::entries_type::non_const_type& columnIndices,
                    const Teuchos::RCP< Teuchos::ParameterList > &plist=Teuchos::null);
@@ -169,11 +181,15 @@ namespace Xpetra {
     /// \param params [in/out] Optional list of parameters.  If not
     ///   null, any missing parameters will be filled in with their
     ///   default values.
-    TpetraCrsGraph(const Teuchos::RCP<const map_type>& rowMap,
-                   const Teuchos::RCP<const map_type>& colMap,
+    TpetraCrsGraph(const Teuchos::RCP<const Map>& rowMap,
+                   const Teuchos::RCP<const Map>& colMap,
                    const local_graph_type& lclGraph,
                    const Teuchos::RCP<Teuchos::ParameterList>& params);
 
+
+
+
+
 #endif
 
     //! Destructor.
@@ -190,6 +206,15 @@ namespace Xpetra {
 
     //! Remove all graph indices from the specified local row.
     void removeLocalIndices(LocalOrdinal localRow);
+    
+    //! Allocates the 1D pointer arrays of the graph: rowptr gets getNodeNumRows()+1 entries and colind gets numNonZeros entries.
+    void allocateAllIndices(size_t numNonZeros,ArrayRCP<size_t> & rowptr, ArrayRCP<LocalOrdinal> & colind);
+
+    //! Sets the 1D pointer arrays of the graph (row offsets in rowptr, column indices in colind).
+    void setAllIndices(const ArrayRCP<size_t> & rowptr, const ArrayRCP<LocalOrdinal> & colind);
+
+    //! Gets the 1D pointer arrays of the graph: rowptr receives the row offsets and colind the packed column indices.
+    void getAllIndices(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind) const;
 
     //@}
 
@@ -197,11 +222,23 @@ namespace Xpetra {
     //@{
 
     //! Signal that data entry is complete, specifying domain and range maps.
-    void fillComplete(const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap, const RCP< ParameterList > &params=null);
+    void fillComplete(const RCP< const Map > &domainMap, const RCP< const Map > &rangeMap, const RCP< ParameterList > &params=null);
 
     //! Signal that data entry is complete.
     void fillComplete(const RCP< ParameterList > &params=null);
 
+    //! Expert version of fillComplete; importer, exporter and params are optional and default to null.
+    void
+    expertStaticFillComplete (const Teuchos::RCP<const map_type>& domainMap,
+                              const Teuchos::RCP<const map_type>& rangeMap,
+                              const Teuchos::RCP<const Import>& importer =
+                              Teuchos::null,
+                              const Teuchos::RCP<const Export>& exporter =
+                              Teuchos::null,
+                              const Teuchos::RCP<Teuchos::ParameterList>& params =
+                              Teuchos::null);
+
+
     //@}
 
     //! @name Methods implementing RowGraph.
@@ -211,22 +248,22 @@ namespace Xpetra {
     RCP< const Comm< int > > getComm() const;
 
     //! Returns the Map that describes the row distribution in this graph.
-    RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > >  getRowMap() const;
+    RCP< const Map >  getRowMap() const;
 
     //! Returns the Map that describes the column distribution in this graph.
-    RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > >  getColMap() const;
+    RCP< const Map >  getColMap() const;
 
     //! Returns the Map associated with the domain of this graph.
-    RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > >  getDomainMap() const;
+    RCP< const Map >  getDomainMap() const;
 
     //! Returns the Map associated with the domain of this graph.
-    RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > >  getRangeMap() const;
+    RCP< const Map >  getRangeMap() const;
 
     //! Returns the importer associated with this graph.
-    RCP< const Import< LocalOrdinal, GlobalOrdinal, Node > > getImporter() const;
+    RCP< const Import > getImporter() const;
 
     //! Returns the exporter associated with this graph.
-    RCP< const Export< LocalOrdinal, GlobalOrdinal, Node > > getExporter() const;
+    RCP< const Export > getExporter() const;
 
     //! Returns the number of global rows in the graph.
     global_size_t getGlobalNumRows() const;
@@ -322,23 +359,23 @@ namespace Xpetra {
     //{@
 
     //! Access function for the Tpetra::Map this DistObject was constructed with.
-    Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getMap() const;
+    Teuchos::RCP< const Map > getMap() const;
 
     //! Import.
     void doImport(const DistObject<GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node> &source,
-                  const Import< LocalOrdinal, GlobalOrdinal, Node > &importer, CombineMode CM);
+                  const Import &importer, CombineMode CM);
 
     //! Export.
     void doExport(const DistObject<GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node> &dest,
-                  const Import< LocalOrdinal, GlobalOrdinal, Node >& importer, CombineMode CM);
+                  const Import& importer, CombineMode CM);
 
     //! Import (using an Exporter).
     void doImport(const DistObject<GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node> &source,
-                  const Export< LocalOrdinal, GlobalOrdinal, Node >& exporter, CombineMode CM);
+                  const Export& exporter, CombineMode CM);
 
     //! Export (using an Importer).
     void doExport(const DistObject<GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node> &dest,
-                  const Export< LocalOrdinal, GlobalOrdinal, Node >& exporter, CombineMode CM);
+                  const Export& exporter, CombineMode CM);
 
     // @}
 
@@ -346,7 +383,7 @@ namespace Xpetra {
     //@{
 
     //! TpetraCrsGraph constructor to wrap a Tpetra::CrsGraph object
-    TpetraCrsGraph(const Teuchos::RCP<Tpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node> > &graph);
+    TpetraCrsGraph(const Teuchos::RCP<Tpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node>  > &graph);
 
     //! Get the underlying Tpetra graph
     RCP< const Tpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node> > getTpetra_CrsGraph() const;
diff --git a/packages/xpetra/src/CrsGraph/Xpetra_TpetraCrsGraph_def.hpp b/packages/xpetra/src/CrsGraph/Xpetra_TpetraCrsGraph_def.hpp
index 009d78261702..1ee60952cbc5 100644
--- a/packages/xpetra/src/CrsGraph/Xpetra_TpetraCrsGraph_def.hpp
+++ b/packages/xpetra/src/CrsGraph/Xpetra_TpetraCrsGraph_def.hpp
@@ -59,30 +59,52 @@
 
 
 namespace Xpetra {
-#ifdef HAVE_XPETRA_KOKKOS_REFACTOR
-#endif
+
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
 TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::TpetraCrsGraph(const RCP< const map_type > &rowMap, size_t maxNumEntriesPerRow, const RCP< ParameterList > &params)
 : graph_(Teuchos::rcp(new Tpetra::CrsGraph< LocalOrdinal, GlobalOrdinal, Node >(toTpetra(rowMap), maxNumEntriesPerRow, Tpetra::StaticProfile, params))) {  }
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
-TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::TpetraCrsGraph(const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, const RCP< ParameterList > &params)
+TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::TpetraCrsGraph(const RCP< const Map > &rowMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, const RCP< ParameterList > &params)
 : graph_(Teuchos::rcp(new Tpetra::CrsGraph< LocalOrdinal, GlobalOrdinal, Node >(toTpetra(rowMap), NumEntriesPerRowToAlloc(), Tpetra::StaticProfile, params))) {  }
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
-TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::TpetraCrsGraph(const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap, size_t maxNumEntriesPerRow, const RCP< ParameterList > &params)
+TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::TpetraCrsGraph(const RCP< const Map>&rowMap, const RCP< const Map > &colMap, size_t maxNumEntriesPerRow, const RCP< ParameterList > &params)
 : graph_(Teuchos::rcp(new Tpetra::CrsGraph< LocalOrdinal, GlobalOrdinal, Node >(toTpetra(rowMap), toTpetra(colMap), maxNumEntriesPerRow, Tpetra::StaticProfile, params))) {  }
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
-TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::TpetraCrsGraph(const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, const RCP< ParameterList > &params)
+TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::TpetraCrsGraph(const RCP< const Map > &rowMap, const RCP< const Map > &colMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, const RCP< ParameterList > &params)
 : graph_(Teuchos::rcp(new Tpetra::CrsGraph< LocalOrdinal, GlobalOrdinal, Node >(toTpetra(rowMap), toTpetra(colMap), NumEntriesPerRowToAlloc(), Tpetra::StaticProfile, params))) {  }
 
+// Fused import constructor: imports sourceGraph through importer and fill-completes the result in a single Tpetra call.
+template <class LocalOrdinal, class GlobalOrdinal, class Node>
+TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::
+TpetraCrsGraph(const Teuchos::RCP<const CrsGraph >& sourceGraph,
+               const Import & importer,
+               const Teuchos::RCP<const Map >& domainMap,
+               const Teuchos::RCP<const Map >& rangeMap,
+               const Teuchos::RCP<Teuchos::ParameterList>& params)
+{
+  typedef Tpetra::CrsGraph<LocalOrdinal,GlobalOrdinal,Node> MyTpetraCrsGraph;
+  XPETRA_DYNAMIC_CAST(const TpetraCrsGraphClass, *sourceGraph, tSourceGraph, "Xpetra::TpetraCrsGraph constructor only accepts Xpetra::TpetraCrsGraph as the input argument.");//TODO: remove and use toTpetra()
+  RCP< const MyTpetraCrsGraph > tpetraSource = tSourceGraph.getTpetra_CrsGraph();
+
+  // Null domain/range maps are forwarded to Tpetra as null RCPs
+  RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > myDomainMap = domainMap!=Teuchos::null ? toTpetra(domainMap) : Teuchos::null;
+  RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > myRangeMap  = rangeMap!=Teuchos::null  ? toTpetra(rangeMap)  : Teuchos::null;
+  graph_=Tpetra::importAndFillCompleteCrsGraph<MyTpetraCrsGraph>(tpetraSource,toTpetra(importer),myDomainMap,myRangeMap,params);
+  // When "Restrict Communicator" is set and the imported graph has a null row map, hold no graph at all
+  bool restrictComm=false;
+  if(!params.is_null()) restrictComm = params->get("Restrict Communicator",restrictComm);
+  if(restrictComm && graph_->getRowMap().is_null()) graph_=Teuchos::null;
+}
+
+
 #ifdef HAVE_XPETRA_KOKKOS_REFACTOR
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
 TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::
-TpetraCrsGraph(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap,
-               const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap,
+TpetraCrsGraph(const Teuchos::RCP< const Map > &rowMap,
+               const Teuchos::RCP< const Map > &colMap,
                const typename local_graph_type::row_map_type& rowPointers,
                const typename local_graph_type::entries_type::non_const_type& columnIndices,
                const Teuchos::RCP< Teuchos::ParameterList > &plist)
@@ -124,31 +146,74 @@ void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::removeLocalIndices(LocalOr
 { XPETRA_MONITOR("TpetraCrsGraph::removeLocalIndices"); graph_->removeLocalIndices(localRow); }
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
-void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::fillComplete(const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap, const RCP< ParameterList > &params)
+void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::allocateAllIndices(size_t numNonZeros, ArrayRCP<size_t>& rowptr, ArrayRCP<LocalOrdinal>& colind) {
+  rowptr.resize(getNodeNumRows()+1); // one offset per local row, plus the closing offset
+  colind.resize(numNonZeros);        // one slot per requested column index
+}
+
+// Forwards rowptr and colind to the wrapped Tpetra graph's setAllIndices.
+template<class LocalOrdinal, class GlobalOrdinal, class Node>
+void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::setAllIndices(const ArrayRCP<size_t>& rowptr, const ArrayRCP<LocalOrdinal>& colind) {
+  graph_->setAllIndices(rowptr, colind);
+}
+
+// Hands out the wrapped Tpetra graph's row offsets (rowptr) and packed column indices (colind).
+template<class LocalOrdinal, class GlobalOrdinal, class Node>
+void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getAllIndices(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind) const {
+  rowptr = graph_->getNodeRowPtrs();
+  colind = graph_->getNodePackedIndices();
+}
+
+template<class LocalOrdinal, class GlobalOrdinal, class Node>
+void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::fillComplete(const RCP< const Map > &domainMap, const RCP< const Map > &rangeMap, const RCP< ParameterList > &params)
 { XPETRA_MONITOR("TpetraCrsGraph::fillComplete"); graph_->fillComplete(toTpetra(domainMap), toTpetra(rangeMap), params); }
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
 void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::fillComplete(const RCP< ParameterList > &params)
 { XPETRA_MONITOR("TpetraCrsGraph::fillComplete"); graph_->fillComplete(params); }
 
+// Expert fillComplete: unwraps the optional Xpetra importer/exporter and forwards everything to the Tpetra graph.
+template<class LocalOrdinal, class GlobalOrdinal, class Node>
+void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::
+expertStaticFillComplete (const Teuchos::RCP<const map_type>& domainMap,
+                          const Teuchos::RCP<const map_type>& rangeMap,
+                          const Teuchos::RCP<const Import>& importer,
+                          const Teuchos::RCP<const Export>& exporter,
+                          const Teuchos::RCP<Teuchos::ParameterList>& params) {
+  XPETRA_MONITOR("TpetraCrsGraph::expertStaticFillComplete");
+  // A null importer/exporter is passed on to Tpetra as a null RCP
+  RCP<const Tpetra::Import<LocalOrdinal,GlobalOrdinal,Node> > tpetraImporter;
+  if (!importer.is_null()) {
+    XPETRA_DYNAMIC_CAST( const TpetraImportClass , *importer, wrappedImporter, "Xpetra::TpetraCrsGraph::expertStaticFillComplete only accepts Xpetra::TpetraImport.");
+    tpetraImporter = wrappedImporter.getTpetra_Import();
+  }
+  RCP<const Tpetra::Export<LocalOrdinal,GlobalOrdinal,Node> > tpetraExporter;
+  if (!exporter.is_null()) {
+    XPETRA_DYNAMIC_CAST( const TpetraExportClass , *exporter, wrappedExporter, "Xpetra::TpetraCrsGraph::expertStaticFillComplete only accepts Xpetra::TpetraExport.");
+    tpetraExporter = wrappedExporter.getTpetra_Export();
+  }
+  graph_->expertStaticFillComplete(toTpetra(domainMap), toTpetra(rangeMap), tpetraImporter, tpetraExporter, params);
+}
+
+
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
 RCP< const Comm< int > > TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getComm() const
 { XPETRA_MONITOR("TpetraCrsGraph::getComm"); return graph_->getComm(); }
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
-RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > >  TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getRowMap() const
+RCP< const Map<LocalOrdinal, GlobalOrdinal, Node> >  TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getRowMap() const
 { XPETRA_MONITOR("TpetraCrsGraph::getRowMap"); return toXpetra(graph_->getRowMap()); }
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
-RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > >  TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getColMap() const
+RCP< const Map<LocalOrdinal, GlobalOrdinal, Node> >  TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getColMap() const
 { XPETRA_MONITOR("TpetraCrsGraph::getColMap"); return toXpetra(graph_->getColMap()); }
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
-RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > >  TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getDomainMap() const
+RCP< const Map<LocalOrdinal, GlobalOrdinal, Node> >  TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getDomainMap() const
 { XPETRA_MONITOR("TpetraCrsGraph::getDomainMap"); return toXpetra(graph_->getDomainMap()); }
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
-RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > >  TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getRangeMap() const
+RCP< const Map<LocalOrdinal, GlobalOrdinal, Node> >  TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getRangeMap() const
 { XPETRA_MONITOR("TpetraCrsGraph::getRangeMap"); return toXpetra(graph_->getRangeMap()); }
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -265,12 +330,12 @@ ArrayRCP< const size_t > TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getNod
 { XPETRA_MONITOR("TpetraCrsGraph::getNodeRowPtrs"); return graph_->getNodeRowPtrs(); }
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
-Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getMap() const
-{ XPETRA_MONITOR("TpetraCrsGraph::getMap"); return rcp( new TpetraMap< LocalOrdinal, GlobalOrdinal, Node >(graph_->getMap()) ); }
+Teuchos::RCP< const Map<LocalOrdinal, GlobalOrdinal, Node> > TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::getMap() const
+{ XPETRA_MONITOR("TpetraCrsGraph::getMap"); return rcp( new TpetraMap(graph_->getMap()) ); }
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
 void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::doImport(const DistObject<GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node> &source,
-                                                               const Import< LocalOrdinal, GlobalOrdinal, Node > &importer, CombineMode CM){
+                                                               const Import &importer, CombineMode CM){
   XPETRA_MONITOR("TpetraCrsGraph::doImport");
   
   XPETRA_DYNAMIC_CAST(const TpetraCrsGraphClass, source, tSource, "Xpetra::TpetraCrsGraph::doImport only accept Xpetra::TpetraCrsGraph as input arguments.");//TODO: remove and use toTpetra()
@@ -282,7 +347,7 @@ void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::doImport(const DistObject<
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
 void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::doExport(const DistObject<GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node> &dest,
-                                                               const Import< LocalOrdinal, GlobalOrdinal, Node >& importer, CombineMode CM) {
+                                                               const Import& importer, CombineMode CM) {
   XPETRA_MONITOR("TpetraCrsGraph::doExport");
   
   XPETRA_DYNAMIC_CAST(const TpetraCrsGraphClass, dest, tDest, "Xpetra::TpetraCrsGraph::doImport only accept Xpetra::TpetraCrsGraph as input arguments.");//TODO: remove and use toTpetra()
@@ -293,7 +358,7 @@ void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::doExport(const DistObject<
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
 void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::doImport(const DistObject<GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node> &source,
-                                                               const Export< LocalOrdinal, GlobalOrdinal, Node >& exporter, CombineMode CM){
+                                                               const Export& exporter, CombineMode CM){
   XPETRA_MONITOR("TpetraCrsGraph::doImport");
   
   XPETRA_DYNAMIC_CAST(const TpetraCrsGraphClass, source, tSource, "Xpetra::TpetraCrsGraph::doImport only accept Xpetra::TpetraCrsGraph as input arguments.");//TODO: remove and use toTpetra()
@@ -305,7 +370,7 @@ void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::doImport(const DistObject<
 
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
 void TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>::doExport(const DistObject<GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node> &dest,
-                                                               const Export< LocalOrdinal, GlobalOrdinal, Node >& exporter, CombineMode CM) {
+                                                               const Export& exporter, CombineMode CM) {
   XPETRA_MONITOR("TpetraCrsGraph::doExport");
   
   XPETRA_DYNAMIC_CAST(const TpetraCrsGraphClass, dest, tDest, "Xpetra::TpetraCrsGraph::doImport only accept Xpetra::TpetraCrsGraph as input arguments.");//TODO: remove and use toTpetra()
@@ -324,6 +389,8 @@ RCP< const Tpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node> > TpetraCrsGraph<
 { return graph_; }
 
 
+
+
 #ifdef HAVE_XPETRA_EPETRA
 
 #if ((defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_OPENMP) || !defined(HAVE_TPETRA_INST_INT_INT))) || \
@@ -480,6 +547,16 @@ RCP< const Tpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node> > TpetraCrsGraph<
     //! Remove all graph indices from the specified local row.
     void removeLocalIndices(LocalOrdinal localRow) { }
 
+    //! Allocates the 1D pointer arrays of the graph
+    void allocateAllIndices(size_t numNonZeros,ArrayRCP<size_t> & rowptr, ArrayRCP<LocalOrdinal> & colind){ }
+
+    //! Sets the 1D pointer arrays of the graph.
+    void setAllIndices(const ArrayRCP<size_t> & rowptr, const ArrayRCP<LocalOrdinal> & colind){ }
+
+    //! Gets the 1D pointer arrays of the graph.
+    void getAllIndices(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind) const{ }
+
+
     //@}
 
     //! @name Transformational Methods
@@ -491,6 +568,13 @@ RCP< const Tpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node> > TpetraCrsGraph<
     //! Signal that data entry is complete.
     void fillComplete(const RCP< ParameterList > &params=null) { }
 
+    //! Expert version of fillComplete
+    void expertStaticFillComplete (const Teuchos::RCP<const map_type>& domainMap,
+                                   const Teuchos::RCP<const map_type>& rangeMap,
+                                   const Teuchos::RCP<const Import< LocalOrdinal, GlobalOrdinal, Node > >& importer = null,
+                                   const Teuchos::RCP<const Export< LocalOrdinal, GlobalOrdinal, Node > >& exporter = null,                          
+                                   const Teuchos::RCP<Teuchos::ParameterList>& params=null){ } 
+
     //@}
 
     //! @name Methods implementing RowGraph.
@@ -803,6 +887,17 @@ RCP< const Tpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node> > TpetraCrsGraph<
     //! Remove all graph indices from the specified local row.
     void removeLocalIndices(LocalOrdinal localRow) { }
 
+    
+    //! Allocates the 1D pointer arrays of the graph
+    void allocateAllIndices(size_t numNonZeros,ArrayRCP<size_t> & rowptr, ArrayRCP<LocalOrdinal> & colind){ }
+
+    //! Sets the 1D pointer arrays of the graph.
+    void setAllIndices(const ArrayRCP<size_t> & rowptr, const ArrayRCP<LocalOrdinal> & colind){ } 
+
+    //! Gets the 1D pointer arrays of the graph.
+    void getAllIndices(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind) const { } 
+
+
     //@}
 
     //! @name Transformational Methods
@@ -814,6 +909,13 @@ RCP< const Tpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node> > TpetraCrsGraph<
     //! Signal that data entry is complete.
     void fillComplete(const RCP< ParameterList > &params=null) { }
 
+    //! Expert version of fillComplete
+    void expertStaticFillComplete (const Teuchos::RCP<const map_type>& domainMap,
+                                   const Teuchos::RCP<const map_type>& rangeMap,
+                                   const Teuchos::RCP<const Import< LocalOrdinal, GlobalOrdinal, Node > >& importer = null,
+                                   const Teuchos::RCP<const Export< LocalOrdinal, GlobalOrdinal, Node > >& exporter = null,                          
+                                   const Teuchos::RCP<Teuchos::ParameterList>& params=null){ } 
+
     //@}
 
     //! @name Methods implementing RowGraph.
diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrix.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrix.hpp
index d52ce60b837c..6461e9b48570 100644
--- a/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrix.hpp
+++ b/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrix.hpp
@@ -127,6 +127,9 @@ namespace Xpetra {
     //! Gets the 1D pointer arrays of the graph.
     virtual void getAllValues(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind, ArrayRCP<const Scalar>& values) const = 0;
 
+    //! Gets the matrix values array only (no row pointers or column indices).
+    virtual void getAllValues(ArrayRCP<Scalar>& values) =0;
+
     //@}
 
     //! @name Transformational Methods
diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_EpetraCrsMatrix.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_EpetraCrsMatrix.hpp
index ffb2a83b9ef0..a436b20a7d35 100644
--- a/packages/xpetra/src/CrsMatrix/Xpetra_EpetraCrsMatrix.hpp
+++ b/packages/xpetra/src/CrsMatrix/Xpetra_EpetraCrsMatrix.hpp
@@ -167,6 +167,7 @@ class EpetraCrsMatrixT
   void allocateAllValues(size_t numNonZeros,ArrayRCP<size_t> & rowptr, ArrayRCP<LocalOrdinal> & colind, ArrayRCP<Scalar> & values) { }
   void setAllValues(const ArrayRCP<size_t> & rowptr, const ArrayRCP<LocalOrdinal> & colind, const ArrayRCP<Scalar> & values) { }
   void getAllValues(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind, ArrayRCP<const Scalar>& values) const { }
+  void getAllValues(ArrayRCP<Scalar>& values) { }
   bool haveGlobalConstants() const  { return true;}
   void expertStaticFillComplete(const RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> > & domainMap,
       const RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> > & rangeMap,
@@ -672,6 +673,18 @@ class EpetraCrsMatrixT <int, EpetraNode>
     values = Teuchos::arcp(mtx_->ExpertExtractValues(), lowerOffset, nnz, ownMemory);
   }
 
+  //! Gets the matrix values array only (no row pointers or column indices).
+  void getAllValues(ArrayRCP<Scalar>& values) {
+    XPETRA_MONITOR("EpetraCrsMatrixT::getAllValues");
+
+    int  lowerOffset = 0;
+    bool ownMemory   = false; // handed to Teuchos::arcp below as its ownership flag
+
+    const size_t nnz = getNodeNumEntries();
+    // Wrap the pointer returned by ExpertExtractValues() as an ArrayRCP of nnz entries.
+    values = Teuchos::arcp(mtx_->ExpertExtractValues(), lowerOffset, nnz, ownMemory);
+  }
+
   // Epetra always has global constants
   bool haveGlobalConstants() const  { return true; }
   //! Expert static fill complete
@@ -1691,6 +1704,20 @@ class EpetraCrsMatrixT <long long, EpetraNode>
     values = Teuchos::arcp(mtx_->ExpertExtractValues(), lowerOffset, nnz, ownMemory);
   }
 
+
+  //! Gets the matrix values array only (no row pointers or column indices).
+  void getAllValues(ArrayRCP<Scalar>& values) {
+    XPETRA_MONITOR("EpetraCrsMatrixT::getAllValues");
+
+    int  lowerOffset = 0;
+    bool ownMemory   = false; // handed to Teuchos::arcp below as its ownership flag
+
+    const size_t nnz = getNodeNumEntries();
+    // Wrap the pointer returned by ExpertExtractValues() as an ArrayRCP of nnz entries.
+    values = Teuchos::arcp(mtx_->ExpertExtractValues(), lowerOffset, nnz, ownMemory);
+  }
+
+
   // Epetra always has global constants
   bool haveGlobalConstants() const  { return true;}
 
diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_decl.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_decl.hpp
index fffa4f2b97d2..5492d6475031 100644
--- a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_decl.hpp
+++ b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_decl.hpp
@@ -200,7 +200,9 @@ namespace Xpetra {
                       ArrayRCP<const LocalOrdinal>& colind, 
                       ArrayRCP<const Scalar>& values)  const;
 
-
+    //! Gets the 1D pointer arrays of the graph (not implemented)
+    void getAllValues(ArrayRCP<Scalar>& values);                      
+                     
     //! @name Transformational Methods
 
     //!
diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_def.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_def.hpp
index 46e9a86fa7b0..41f226a99de1 100644
--- a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_def.hpp
+++ b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_def.hpp
@@ -276,11 +276,21 @@ namespace Xpetra {
     getAllValues(ArrayRCP<const size_t>& rowptr, 
                  ArrayRCP<const LocalOrdinal>& colind, 
                  ArrayRCP<const Scalar>& values) const
+    { 
+      throw std::runtime_error("Xpetra::TpetraBlockCrsMatrix function not implemented"); 
+    }  
+
+    //! Gets the 1D pointer arrays of the graph (not implemented)
+    template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+    void 
+    TpetraBlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+    getAllValues(ArrayRCP<Scalar>& values) 
     { 
       throw std::runtime_error("Xpetra::TpetraBlockCrsMatrix function not implemented"); 
     }
 
 
+
     //@}
    
     // Transformational Methods
@@ -940,7 +950,11 @@ setAllValues (const typename local_matrix_type::row_map_type& ptr,
     //! Gets the 1D pointer arrays of the graph (not implemented)
     void getAllValues(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind, ArrayRCP<const Scalar>& values) const
     {}
-
+    
+    
+    //! Gets the 1D pointer arrays of the graph (not implemented)
+    void getAllValues(ArrayRCP<Scalar>& values) 
+    {}
 
     //! @name Transformational Methods
 
@@ -1281,6 +1295,11 @@ setAllValues (const typename local_matrix_type::row_map_type& ptr,
     void getAllValues(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind, ArrayRCP<const Scalar>& values) const
     {}
 
+    
+    //! Gets the 1D pointer arrays of the graph (not implemented)
+    void getAllValues(ArrayRCP<Scalar>& values) 
+    {}
+
 
     //! @name Transformational Methods
 
diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_decl.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_decl.hpp
index bc44002e8ab6..0a13c812b48c 100644
--- a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_decl.hpp
+++ b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_decl.hpp
@@ -61,7 +61,7 @@
 #include "Xpetra_TpetraMap_decl.hpp"
 #include "Xpetra_TpetraMultiVector_decl.hpp"
 #include "Xpetra_TpetraVector_decl.hpp"
-#include "Xpetra_TpetraCrsGraph_decl.hpp"
+#include "Xpetra_TpetraCrsGraph.hpp"
 #include "Xpetra_Exceptions.hpp"
 
 namespace Xpetra {
@@ -218,6 +218,9 @@ namespace Xpetra {
     void getAllValues(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind, ArrayRCP<const Scalar>& values) const
    ;
 
+    //! Gets the matrix values array only (no row pointers or column indices).
+    void getAllValues(ArrayRCP<Scalar>& values);
+
     bool haveGlobalConstants() const
    ;
 
@@ -624,6 +627,9 @@ namespace Xpetra {
     //! Gets the 1D pointer arrays of the graph.
     void getAllValues(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind, ArrayRCP<const Scalar>& values) const {  }
 
+    //! Gets the 1D pointer arrays of the graph.
+    void getAllValues(ArrayRCP<Scalar>& values){ }
+
     bool haveGlobalConstants() const  { return false;}
 
     //@}
@@ -1024,6 +1030,10 @@ namespace Xpetra {
     //! Gets the 1D pointer arrays of the graph.
     void getAllValues(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind, ArrayRCP<const Scalar>& values) const {  }
 
+    //! Gets the 1D pointer arrays of the graph.
+    void getAllValues(ArrayRCP<Scalar>& values) { }
+
+
     bool haveGlobalConstants() const  { return false;}
 
     //@}
diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_def.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_def.hpp
index a614be84b2e0..49a116c00990 100644
--- a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_def.hpp
+++ b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_def.hpp
@@ -220,6 +220,10 @@ namespace Xpetra {
     void TpetraCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::getAllValues(ArrayRCP<const size_t>& rowptr, ArrayRCP<const LocalOrdinal>& colind, ArrayRCP<const Scalar>& values) const
     { XPETRA_MONITOR("TpetraCrsMatrix::getAllValues"); mtx_->getAllValues(rowptr,colind,values); }
 
+    template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+    void TpetraCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::getAllValues(ArrayRCP<Scalar>& values)
+    { XPETRA_MONITOR("TpetraCrsMatrix::getAllValues"); mtx_->getAllValues(values); }
+
     template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
     bool TpetraCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::haveGlobalConstants() const
     { return mtx_->haveGlobalConstants();}
diff --git a/packages/xpetra/src/Import/Xpetra_EpetraImport.hpp b/packages/xpetra/src/Import/Xpetra_EpetraImport.hpp
index 5ccdf67dca47..53a3a145fa82 100644
--- a/packages/xpetra/src/Import/Xpetra_EpetraImport.hpp
+++ b/packages/xpetra/src/Import/Xpetra_EpetraImport.hpp
@@ -93,6 +93,10 @@ namespace Xpetra {
     //! Destructor.
     ~EpetraImportT() { }
 
+    //! Special "constructor"
+    Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> >
+    createRemoteOnlyImport (const Teuchos::RCP<const map_type>& remoteTarget) const {TEUCHOS_TEST_FOR_EXCEPTION(1, Xpetra::Exceptions::NotImplemented, "TODO EpetraImportT<EpetraGlobalOrdinal>::createRemoteOnlyImport not implemented"); }
+
     //@}
 
     //! @name Import Attribute Methods
diff --git a/packages/xpetra/src/Import/Xpetra_Import.hpp b/packages/xpetra/src/Import/Xpetra_Import.hpp
index 32d67c3b76ca..44d0f50a3c05 100644
--- a/packages/xpetra/src/Import/Xpetra_Import.hpp
+++ b/packages/xpetra/src/Import/Xpetra_Import.hpp
@@ -66,6 +66,7 @@ namespace Xpetra {
     typedef LocalOrdinal local_ordinal_type;
     typedef GlobalOrdinal global_ordinal_type;
     typedef Node node_type;
+    typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
 
     //! @name Constructor/Destructor Methods
     //@{
@@ -73,6 +74,10 @@ namespace Xpetra {
     //! Destructor.
     virtual ~Import() { }
 
+    //! Special "constructor"
+    virtual Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> >
+    createRemoteOnlyImport (const Teuchos::RCP<const map_type>& remoteTarget) const = 0;
+    
    //@}
 
     //! @name Import Attribute Methods
diff --git a/packages/xpetra/src/Import/Xpetra_TpetraImport_decl.hpp b/packages/xpetra/src/Import/Xpetra_TpetraImport_decl.hpp
index 0b73084f5624..8bb3d9488a89 100644
--- a/packages/xpetra/src/Import/Xpetra_TpetraImport_decl.hpp
+++ b/packages/xpetra/src/Import/Xpetra_TpetraImport_decl.hpp
@@ -85,6 +85,10 @@ namespace Xpetra {
     //! Destructor.
     ~TpetraImport();
 
+    //! Special "constructor"
+    Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> >
+    createRemoteOnlyImport (const Teuchos::RCP<const map_type>& remoteTarget) const;
+
     //@}
 
     //! @name Import Attribute Methods
diff --git a/packages/xpetra/src/Import/Xpetra_TpetraImport_def.hpp b/packages/xpetra/src/Import/Xpetra_TpetraImport_def.hpp
index ee7f3ee487cd..de7db215fbb9 100644
--- a/packages/xpetra/src/Import/Xpetra_TpetraImport_def.hpp
+++ b/packages/xpetra/src/Import/Xpetra_TpetraImport_def.hpp
@@ -73,6 +73,15 @@ template<class LocalOrdinal, class GlobalOrdinal, class Node>
 TpetraImport<LocalOrdinal,GlobalOrdinal,Node>::~TpetraImport()
 {  }
 
+//! Calls createRemoteOnlyImport on the wrapped Tpetra::Import (remoteTarget converted via toTpetra)
+//! and returns the result wrapped in a newly allocated Xpetra::TpetraImport.
+template<class LocalOrdinal, class GlobalOrdinal, class Node>
+Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> >   
+TpetraImport<LocalOrdinal,GlobalOrdinal,Node>::createRemoteOnlyImport (const Teuchos::RCP<const map_type>& remoteTarget) const {
+  Teuchos::RCP<const Tpetra::Import< LocalOrdinal, GlobalOrdinal, Node> > newImport = import_->createRemoteOnlyImport(toTpetra(remoteTarget));
+  return Teuchos::rcp(new TpetraImport<LocalOrdinal,GlobalOrdinal,Node>(newImport));
+}
+
 template<class LocalOrdinal, class GlobalOrdinal, class Node>
 size_t TpetraImport<LocalOrdinal,GlobalOrdinal,Node>::getNumSameIDs() const
 { XPETRA_MONITOR("TpetraImport::getNumSameIDs"); return import_->getNumSameIDs(); }
@@ -178,6 +187,13 @@ RCP< const Tpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > TpetraImport<Lo
     //! Destructor.
     ~TpetraImport() {  }
 
+
+    //! Special "constructor"
+    Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> >
+    createRemoteOnlyImport (const Teuchos::RCP<const map_type>& remoteTarget) const {
+      return Teuchos::null;
+    }
+
     //@}
 
     //! @name Import Attribute Methods
@@ -280,6 +296,12 @@ RCP< const Tpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > TpetraImport<Lo
 
     //! Destructor.
     ~TpetraImport() {  }
+    
+    //! Special "constructor"
+    Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> >
+    createRemoteOnlyImport (const Teuchos::RCP<const map_type>& remoteTarget) const {
+      return Teuchos::null;
+    }
 
     //@}
 
diff --git a/packages/xpetra/src/MultiVector/Xpetra_EpetraMultiVectorFactory.cpp b/packages/xpetra/src/MultiVector/Xpetra_EpetraMultiVectorFactory.cpp
index 0ec5bfbf8c73..1caa050af40c 100644
--- a/packages/xpetra/src/MultiVector/Xpetra_EpetraMultiVectorFactory.cpp
+++ b/packages/xpetra/src/MultiVector/Xpetra_EpetraMultiVectorFactory.cpp
@@ -296,7 +296,7 @@ Build(const Teuchos::RCP<const MultiVector<double, int, long long, EpetraNode> >
 #ifdef HAVE_XPETRA_TPETRA
     if(source->getMap()->lib() == UseTpetra)
     {
-      return rcp(new TpetraMultiVector<int, int, long long, EpetraNode>(*source, copyOrView));
+      return rcp(new TpetraMultiVector<double, int, long long, EpetraNode>(*source, copyOrView));
     }
 #endif
 
diff --git a/packages/xpetra/src/MultiVector/Xpetra_MultiVectorFactory_decl.hpp b/packages/xpetra/src/MultiVector/Xpetra_MultiVectorFactory_decl.hpp
index 4b0f97e28c97..620d15fb3078 100644
--- a/packages/xpetra/src/MultiVector/Xpetra_MultiVectorFactory_decl.hpp
+++ b/packages/xpetra/src/MultiVector/Xpetra_MultiVectorFactory_decl.hpp
@@ -50,7 +50,7 @@
 #include "Xpetra_MultiVector_decl.hpp"
 
 #ifdef HAVE_XPETRA_TPETRA
-#include "Xpetra_TpetraMultiVector_decl.hpp"
+#include "Xpetra_TpetraMultiVector.hpp"
 #endif
 
 #ifdef HAVE_XPETRA_EPETRA
diff --git a/packages/xpetra/src/Utils/ClassList/SC-LO-GO-NO.classList b/packages/xpetra/src/Utils/ClassList/SC-LO-GO-NO.classList
index d91382cf9826..945554bfb2e1 100644
--- a/packages/xpetra/src/Utils/ClassList/SC-LO-GO-NO.classList
+++ b/packages/xpetra/src/Utils/ClassList/SC-LO-GO-NO.classList
@@ -14,7 +14,7 @@
 #ReorderedBlockedCrsMatrix
 TpetraBlockCrsMatrix
 TpetraCrsMatrix
-TpetraHalfPrecisionOperator
+#TpetraHalfPrecisionOperator
 ##TpetraVbrMatrix
 ##VbrMatrix
 #Vector
diff --git a/packages/xpetra/src/Utils/Xpetra_ImportUtils.hpp b/packages/xpetra/src/Utils/Xpetra_ImportUtils.hpp
new file mode 100644
index 000000000000..dc377ca53bd9
--- /dev/null
+++ b/packages/xpetra/src/Utils/Xpetra_ImportUtils.hpp
@@ -0,0 +1,348 @@
+// @HEADER
+//
+// ***********************************************************************
+//
+//             Xpetra: A linear algebra interface package
+//                  Copyright 2012 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact
+//                    Jonathan Hu        (jhu@sandia.gov)
+//                    Ray Tuminaro       (rstumin@sandia.gov)
+//                    Chris Siefert      (csiefer@sandia.gov)
+//                    Luc Berger-Vergoat (lberge@sandia.gov)
+//
+// ***********************************************************************
+//
+// @HEADER
+#ifndef PACKAGES_XPETRA_IMPORT_UTILS_HPP_
+#define PACKAGES_XPETRA_IMPORT_UTILS_HPP_
+
+#include "Xpetra_ConfigDefs.hpp"
+#include "Xpetra_Exceptions.hpp"
+#include "Xpetra_Map.hpp"          // definition of UnderlyingLib
+#include "Xpetra_Import.hpp"
+#include "Teuchos_Array.hpp"
+#include "Teuchos_ArrayView.hpp"
+
+#include <utility>
+
+#ifdef HAVE_XPETRA_EPETRA
+#include "Epetra_Util.h"
+#include "Xpetra_EpetraImport.hpp"
+#endif
+
+#ifdef HAVE_XPETRA_TPETRA
+#include "Xpetra_TpetraImport.hpp"
+#include "Tpetra_Import_Util.hpp"
+#endif
+
+namespace Xpetra {
+
+  /*!
+    @class ImportUtils
+    @brief Xpetra utility class for Import-related routines
+
+    Each routine dispatches on the underlying library of the importer's source map and forwards to the matching Tpetra (or, in the EpetraNode specializations, Epetra) utility.
+
+  */
+  template <class LocalOrdinal,            
+            class GlobalOrdinal,
+            class Node = KokkosClassic::DefaultNode::DefaultNodeType>
+  class ImportUtils {
+#undef XPETRA_IMPORTUTILS_SHORT
+
+public:
+    /// \brief For each GID in the TargetMap, find who owns the GID in the SourceMap.
+    ///
+    /// This only uses the Distributor and does not communicate.  It
+    /// returns (as an output argument) an array of (PID,GID) pairs.
+    /// If use_minus_one_for_local is true, any GIDs owned by this
+    /// processor get -1 instead of their PID.
+    void
+    getPidGidPairs (const Import<LocalOrdinal,GlobalOrdinal,Node>& Importer,
+                    Teuchos::Array< std::pair<int,GlobalOrdinal> >& gpids,
+                    bool use_minus_one_for_local) {
+      UnderlyingLib lib = Importer.getSourceMap()->lib(); // the source map's backend selects the code path
+      if(lib == Xpetra::UseEpetra) { // the generic template has no Epetra implementation
+#if defined(HAVE_XPETRA_EPETRA)
+        throw(Xpetra::Exceptions::RuntimeError("Xpetra::ImportUtils only available for GO=int or GO=long long with EpetraNode (Serial or OpenMP depending on configuration)"));
+#endif // HAVE_XPETRA_EPETRA
+      } else if(lib == Xpetra::UseTpetra) {
+#ifdef HAVE_XPETRA_TPETRA
+        Tpetra::Import_Util::getPidGidPairs(Xpetra::toTpetra(Importer),gpids,use_minus_one_for_local); // forward to the Tpetra utility
+#endif // HAVE_XPETRA_TPETRA
+      }
+    }
+
+
+    //! Like getPidGidPairs, but just gets the PIDs, ordered by the column Map.
+    void
+    getPids (const Import<LocalOrdinal,GlobalOrdinal,Node>& Importer,
+             Teuchos::Array<int>& pids,
+             bool use_minus_one_for_local) {
+      UnderlyingLib lib = Importer.getSourceMap()->lib(); // the source map's backend selects the code path
+      if(lib == Xpetra::UseEpetra) { // the generic template has no Epetra implementation
+#if defined(HAVE_XPETRA_EPETRA)
+        throw(Xpetra::Exceptions::RuntimeError("Xpetra::ImportUtils only available for GO=int or GO=long long with EpetraNode (Serial or OpenMP depending on configuration)"));
+#endif // HAVE_XPETRA_EPETRA
+      } else if(lib == Xpetra::UseTpetra) {
+#ifdef HAVE_XPETRA_TPETRA
+        Tpetra::Import_Util::getPids(Xpetra::toTpetra(Importer),pids,use_minus_one_for_local); // forward to the Tpetra utility
+#endif // HAVE_XPETRA_TPETRA
+      }
+    }
+
+   
+    //! Like getPidGidPairs, but just gets the PIDs, ordered by the column Map.
+    // Like the Teuchos::Array overload above, but takes an ArrayView and does not resize it.
+    void
+    getPids (const Import<LocalOrdinal,GlobalOrdinal,Node>& Importer,
+             Teuchos::ArrayView<int>& pids,
+             bool use_minus_one_for_local) {
+      UnderlyingLib lib = Importer.getSourceMap()->lib(); // the source map's backend selects the code path
+      if(lib == Xpetra::UseEpetra) { // the generic template has no Epetra implementation
+#if defined(HAVE_XPETRA_EPETRA)
+        throw(Xpetra::Exceptions::RuntimeError("Xpetra::ImportUtils only available for GO=int or GO=long long with EpetraNode (Serial or OpenMP depending on configuration)"));
+#endif // HAVE_XPETRA_EPETRA
+      } else if(lib == Xpetra::UseTpetra) {
+#ifdef HAVE_XPETRA_TPETRA
+        Tpetra::Import_Util::getPids(Xpetra::toTpetra(Importer),pids,use_minus_one_for_local); // forward to the Tpetra utility
+#endif // HAVE_XPETRA_TPETRA
+      }
+   }
+
+    
+    /// \brief Get a list of remote PIDs from an importer in the order
+    ///   corresponding to the remote LIDs.
+    void
+    getRemotePIDs (const Import<LocalOrdinal,GlobalOrdinal,Node>& Importer,
+                   Teuchos::Array<int>& RemotePIDs) {
+      UnderlyingLib lib = Importer.getSourceMap()->lib(); // the source map's backend selects the code path
+      if(lib == Xpetra::UseEpetra) { // the generic template has no Epetra implementation
+#if defined(HAVE_XPETRA_EPETRA)
+        throw(Xpetra::Exceptions::RuntimeError("Xpetra::ImportUtils only available for GO=int or GO=long long with EpetraNode (Serial or OpenMP depending on configuration)"));
+#endif // HAVE_XPETRA_EPETRA
+      } else if(lib == Xpetra::UseTpetra) {
+#ifdef HAVE_XPETRA_TPETRA
+        Tpetra::Import_Util::getRemotePIDs(Xpetra::toTpetra(Importer),RemotePIDs); // forward to the Tpetra utility
+#endif // HAVE_XPETRA_TPETRA
+      }
+    }
+    
+
+  }; // end class ImportUtils
+
+#ifdef HAVE_XPETRA_EPETRA
+// Specialization for int, int, EpetraNode
+  template <>
+  class ImportUtils<int,int,EpetraNode> {
+    typedef int             LocalOrdinal;
+    typedef int             GlobalOrdinal;
+    typedef EpetraNode      Node;
+#undef XPETRA_IMPORTUTILS_SHORT
+
+  public:
+    //! For each GID in the target map, stores a (PID,GID) pair in gpids, which is resized to fit.
+    //! If use_minus_one_for_local is true, GIDs owned by this process get -1 instead of their PID.
+    void
+    getPidGidPairs (const Import<LocalOrdinal,GlobalOrdinal,Node>& Importer,
+                    Teuchos::Array< std::pair<int,GlobalOrdinal> >& gpids,
+                    bool use_minus_one_for_local) {
+      UnderlyingLib lib = Importer.getSourceMap()->lib();
+      if(lib == Xpetra::UseEpetra) {
+        RCP<const Epetra_Import> e_Importer=dynamic_cast<const EpetraImportT<GlobalOrdinal,Node>& >(Importer).getEpetra_Import(); // reference cast: throws std::bad_cast on mismatch
+        std::vector< std::pair<int,GlobalOrdinal> > gpids_v(gpids.size());
+        Epetra_Util::GetPidGidPairs(*e_Importer,gpids_v,use_minus_one_for_local);
+        gpids.assign(gpids_v.begin(),gpids_v.end()); // take the size from the Epetra result; copying into a smaller gpids would overrun it
+      } else if(lib == Xpetra::UseTpetra) {
+#ifdef HAVE_XPETRA_TPETRA
+        Tpetra::Import_Util::getPidGidPairs(Xpetra::toTpetra(Importer),gpids,use_minus_one_for_local);
+#endif // HAVE_XPETRA_TPETRA
+      }
+    }
+
+    //! Like getPidGidPairs, but just gets the PIDs, ordered by the column Map; pids is resized to fit.
+    void
+    getPids (const Import<LocalOrdinal,GlobalOrdinal,Node>& Importer,
+             Teuchos::Array<int>& pids,
+             bool use_minus_one_for_local) {
+      UnderlyingLib lib = Importer.getSourceMap()->lib();
+      if(lib == Xpetra::UseEpetra) {
+        RCP<const Epetra_Import> e_Importer=dynamic_cast<const EpetraImportT<GlobalOrdinal,Node>& >(Importer).getEpetra_Import(); // reference cast: throws std::bad_cast on mismatch
+        std::vector<int > pids_v(pids.size());
+        Epetra_Util::GetPids(*e_Importer,pids_v,use_minus_one_for_local);
+        pids.assign(pids_v.begin(),pids_v.end()); // take the size from the Epetra result; copying into a smaller pids would overrun it
+      } else if(lib == Xpetra::UseTpetra) {
+#ifdef HAVE_XPETRA_TPETRA
+        Tpetra::Import_Util::getPids(Xpetra::toTpetra(Importer),pids,use_minus_one_for_local);
+#endif // HAVE_XPETRA_TPETRA
+      }
+    }
+
+   
+    //! Like the Teuchos::Array overload of getPids, but fills a caller-sized ArrayView, which is never resized.
+    void
+    getPids (const Import<LocalOrdinal,GlobalOrdinal,Node>& Importer,
+             Teuchos::ArrayView<int>& pids,
+             bool use_minus_one_for_local) {
+      UnderlyingLib lib = Importer.getSourceMap()->lib();
+      if(lib == Xpetra::UseEpetra) {
+        RCP<const Epetra_Import> e_Importer=dynamic_cast<const EpetraImportT<GlobalOrdinal,Node>& >(Importer).getEpetra_Import(); // reference cast: throws std::bad_cast on mismatch
+        std::vector<int > pids_v(pids.size());
+        Epetra_Util::GetPids(*e_Importer,pids_v,use_minus_one_for_local);
+        if(pids_v.size() > static_cast<size_t>(pids.size())) throw(Xpetra::Exceptions::RuntimeError("Xpetra::ImportUtils::getPids: ArrayView is too small to hold the PIDs")); // a view cannot grow
+        std::copy(pids_v.begin(),pids_v.end(),pids.begin()); // previously missing: the Epetra result never reached the caller
+      } else if(lib == Xpetra::UseTpetra) {
+#ifdef HAVE_XPETRA_TPETRA
+        Tpetra::Import_Util::getPids(Xpetra::toTpetra(Importer),pids,use_minus_one_for_local);
+#endif // HAVE_XPETRA_TPETRA
+      }
+    }
+    
+    /// \brief Get a list of remote PIDs from an importer in the order
+    ///   corresponding to the remote LIDs; RemotePIDs is resized to fit.
+    void
+    getRemotePIDs (const Import<LocalOrdinal,GlobalOrdinal,Node>& Importer,
+                   Teuchos::Array<int>& RemotePIDs) {
+      UnderlyingLib lib = Importer.getSourceMap()->lib();
+      if(lib == Xpetra::UseEpetra) {
+        RCP<const Epetra_Import> e_Importer=dynamic_cast<const EpetraImportT<GlobalOrdinal,Node>& >(Importer).getEpetra_Import(); // reference cast: throws std::bad_cast on mismatch
+        std::vector<int> pids_v(RemotePIDs.size());
+        Epetra_Util::GetRemotePIDs(*e_Importer,pids_v);
+        RemotePIDs.assign(pids_v.begin(),pids_v.end()); // take the size from the Epetra result; copying into a smaller array would overrun it
+      } else if(lib == Xpetra::UseTpetra) {
+#ifdef HAVE_XPETRA_TPETRA
+        Tpetra::Import_Util::getRemotePIDs(Xpetra::toTpetra(Importer),RemotePIDs);
+#endif // HAVE_XPETRA_TPETRA
+      }
+    }
+
+
+  }; // end class ImportUtils
+
+
+// Specialization for int, long long, EpetraNode
+  template <>
+  class ImportUtils<int,long long,EpetraNode> {
+    typedef int             LocalOrdinal;
+    typedef long long       GlobalOrdinal;
+    typedef EpetraNode      Node;
+#undef XPETRA_IMPORTUTILS_SHORT
+
+  public:
+
+
+    //! For each GID in the target map, stores a (PID,GID) pair in gpids, which is resized to fit.
+    //! If use_minus_one_for_local is true, GIDs owned by this process get -1 instead of their PID.
+    void
+    getPidGidPairs (const Import<LocalOrdinal,GlobalOrdinal,Node>& Importer,
+                    Teuchos::Array< std::pair<int,GlobalOrdinal> >& gpids,
+                    bool use_minus_one_for_local) {
+      UnderlyingLib lib = Importer.getSourceMap()->lib();
+      if(lib == Xpetra::UseEpetra) {
+        RCP<const Epetra_Import> e_Importer=dynamic_cast<const EpetraImportT<GlobalOrdinal,Node>& >(Importer).getEpetra_Import(); // reference cast: throws std::bad_cast on mismatch
+        std::vector< std::pair<int,GlobalOrdinal> > gpids_v(gpids.size());
+        Epetra_Util::GetPidGidPairs(*e_Importer,gpids_v,use_minus_one_for_local);
+        gpids.assign(gpids_v.begin(),gpids_v.end()); // take the size from the Epetra result; copying into a smaller gpids would overrun it
+      } else if(lib == Xpetra::UseTpetra) {
+#ifdef HAVE_XPETRA_TPETRA
+        Tpetra::Import_Util::getPidGidPairs(Xpetra::toTpetra(Importer),gpids,use_minus_one_for_local);
+#endif // HAVE_XPETRA_TPETRA
+      }
+    }
+
+    //! Like getPidGidPairs, but just gets the PIDs, ordered by the column Map; pids is resized to fit.
+    void
+    getPids (const Import<LocalOrdinal,GlobalOrdinal,Node>& Importer,
+             Teuchos::Array<int>& pids,
+             bool use_minus_one_for_local) {
+      UnderlyingLib lib = Importer.getSourceMap()->lib();
+      if(lib == Xpetra::UseEpetra) {
+        RCP<const Epetra_Import> e_Importer=dynamic_cast<const EpetraImportT<GlobalOrdinal,Node>& >(Importer).getEpetra_Import(); // reference cast: throws std::bad_cast on mismatch
+        std::vector<int > pids_v(pids.size());
+        Epetra_Util::GetPids(*e_Importer,pids_v,use_minus_one_for_local);
+        pids.assign(pids_v.begin(),pids_v.end()); // take the size from the Epetra result; copying into a smaller pids would overrun it
+      } else if(lib == Xpetra::UseTpetra) {
+#ifdef HAVE_XPETRA_TPETRA
+        Tpetra::Import_Util::getPids(Xpetra::toTpetra(Importer),pids,use_minus_one_for_local);
+#endif // HAVE_XPETRA_TPETRA
+      }
+    }
+
+   
+    //! Like the Teuchos::Array overload of getPids, but fills a caller-sized ArrayView, which is never resized.
+    void
+    getPids (const Import<LocalOrdinal,GlobalOrdinal,Node>& Importer,
+             Teuchos::ArrayView<int>& pids,
+             bool use_minus_one_for_local) {
+      UnderlyingLib lib = Importer.getSourceMap()->lib();
+      if(lib == Xpetra::UseEpetra) {
+        RCP<const Epetra_Import> e_Importer=dynamic_cast<const EpetraImportT<GlobalOrdinal,Node>& >(Importer).getEpetra_Import(); // reference cast: throws std::bad_cast on mismatch
+        std::vector<int > pids_v(pids.size());
+        Epetra_Util::GetPids(*e_Importer,pids_v,use_minus_one_for_local);
+        if(pids_v.size() > static_cast<size_t>(pids.size())) throw(Xpetra::Exceptions::RuntimeError("Xpetra::ImportUtils::getPids: ArrayView is too small to hold the PIDs")); // a view cannot grow
+        std::copy(pids_v.begin(),pids_v.end(),pids.begin());
+      } else if(lib == Xpetra::UseTpetra) {
+#ifdef HAVE_XPETRA_TPETRA
+        Tpetra::Import_Util::getPids(Xpetra::toTpetra(Importer),pids,use_minus_one_for_local);
+#endif // HAVE_XPETRA_TPETRA
+      }
+    }
+
+    
+    /// \brief Get a list of remote PIDs from an importer in the order
+    ///   corresponding to the remote LIDs; RemotePIDs is resized to fit.
+    void
+    getRemotePIDs (const Import<LocalOrdinal,GlobalOrdinal,Node>& Importer,
+                   Teuchos::Array<int>& RemotePIDs) {
+      UnderlyingLib lib = Importer.getSourceMap()->lib();
+      if(lib == Xpetra::UseEpetra) {
+        RCP<const Epetra_Import> e_Importer=dynamic_cast<const EpetraImportT<GlobalOrdinal,Node>& >(Importer).getEpetra_Import(); // reference cast: throws std::bad_cast on mismatch
+        std::vector<int> pids_v(RemotePIDs.size());
+        Epetra_Util::GetRemotePIDs(*e_Importer,pids_v);
+        RemotePIDs.assign(pids_v.begin(),pids_v.end()); // take the size from the Epetra result; copying into a smaller array would overrun it
+      } else if(lib == Xpetra::UseTpetra) {
+#ifdef HAVE_XPETRA_TPETRA
+        Tpetra::Import_Util::getRemotePIDs(Xpetra::toTpetra(Importer),RemotePIDs);
+#endif // HAVE_XPETRA_TPETRA
+      }
+    }
+
+  }; // end class ImportUtils
+#endif // HAVE_XPETRA_EPETRA for Epetra specializations
+
+} // end namespace Xpetra
+
+#define XPETRA_IMPORTUTILS_SHORT
+
+#endif // PACKAGES_XPETRA_IMPORT_UTILS_HPP_
diff --git a/packages/xpetra/sup/Utils/Xpetra_IO.hpp b/packages/xpetra/sup/Utils/Xpetra_IO.hpp
index 386785759caf..1f785e4e724c 100644
--- a/packages/xpetra/sup/Utils/Xpetra_IO.hpp
+++ b/packages/xpetra/sup/Utils/Xpetra_IO.hpp
@@ -1269,7 +1269,8 @@ namespace Xpetra {
         //TEUCHOS_TEST_FOR_EXCEPTION(true, ::Xpetra::Exceptions::BadCast, "Epetra can only be used with Scalar=double and Ordinal=int");
 #if defined(HAVE_XPETRA_EPETRA) && defined(HAVE_XPETRA_EPETRAEXT)
         Epetra_MultiVector * MV;
-        EpetraExt::MatrixMarketFileToMultiVector(fileName.c_str(), toEpetra(map), MV);
+        int rv = EpetraExt::MatrixMarketFileToMultiVector(fileName.c_str(), toEpetra(map), MV);
+        if(rv != 0) throw Exceptions::RuntimeError("EpetraExt::MatrixMarketFileToMultiVector failed");
         RCP<Epetra_MultiVector> MVrcp = rcp(MV);
         return Convert_Epetra_MultiVector_ToXpetra_MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>(MVrcp);
 #else
diff --git a/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybrid2GL.hpp b/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybrid2GL.hpp
index a4e1ded6aed9..580219b6d3b9 100644
--- a/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybrid2GL.hpp
+++ b/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybrid2GL.hpp
@@ -270,7 +270,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
     RCP<Environment> env;
     RCP<const Teuchos::Comm<int> > comm;
     bool verbose;
-
+    bool timing;
     
   private:
     //This function constructs a CSR with complete adjacency information for
@@ -688,8 +688,8 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
       std::vector<int> recvcnts(comm->getSize(), 0);
       Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
 
-      //if we're computing statistics, remove the computation imbalance from the comm timer
-      if(verbose) comm->barrier();
+      //if we're reporting times, remove the computation imbalance from the comm timer
+      if(timing) comm->barrier();
       double comm_total = 0.0;
       double comm_temp = timer();
       
@@ -720,6 +720,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
       const RCP<const Teuchos::Comm<int> > &comm_)
     : adapter(adapter_), pl(pl_), env(env_), comm(comm_){
       verbose = pl->get<bool>("verbose",false);
+      timing = pl->get<bool>("timing", false);
       modelFlag_t flags;
       flags.reset();
       buildModel(flags);
@@ -1071,7 +1072,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
       double conflict_detection = 0.0;
       
       //Number of rounds we are saving statistics for
-      //100 is a decent default.
+      //100 is a decent default. Reporting requires --verbose argument.
       const int numStatisticRecordingRounds = 100;
       
       //includes all ghosts, including the second layer.
@@ -1322,7 +1323,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
       //Done initializing, start coloring!
 
       //use a barrier if we are reporting timing info
-      if(verbose) comm->barrier();
+      if(timing) comm->barrier();
       interior_time = timer();
       total_time = timer();
       //give the entire local graph to KokkosKernels to color
@@ -1427,7 +1428,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
           vertsPerRound[distributedRounds] = verts_to_recolor_size_host(0);
         }
         
-        if(verbose) comm->barrier();
+        if(timing) comm->barrier();
         double recolor_temp = timer();
         //recolor using KokkosKernels' coloring function 
         if(verts_to_recolor_size_host(0) > 0){
@@ -1437,10 +1438,13 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
 	if(distributedRounds < numStatisticRecordingRounds){
           recoloringPerRound[distributedRounds] = timer() - recolor_temp;
           recoloring_time += recoloringPerRound[distributedRounds];
-          total_time += recoloringPerRound[distributedRounds];
           comp_time += recoloringPerRound[distributedRounds];
           compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
           totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
+	} else if(timing){
+	  double recoloring_round_time = timer() - recolor_temp;
+	  recoloring_time += recoloring_round_time;
+	  comp_time += recoloring_round_time;
 	}
         
 	//reset the ghost colors to what they were before recoloring
@@ -1461,10 +1465,6 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
           commPerRound[distributedRounds] = curr_comm_time;
 	  recvPerRound[distributedRounds] = recv;
           sentPerRound[distributedRounds] = sent;
-	  if(verbose) {
-            std::cout<<comm->getRank()<<": total sent in round "<<distributedRounds<<" = "<<sent<<"\n";
-            std::cout<<comm->getRank()<<": total recv in round "<<distributedRounds<<" = "<<recv<<"\n";
-	  }
           totalPerRound[distributedRounds] += commPerRound[distributedRounds];
 	}
         
@@ -1507,7 +1507,11 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
           compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
           totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
           comp_time += conflictDetectionPerRound[distributedRounds];
-        }
+        } else if(timing){
+	  double conflict_detection_round_time = timer() - detection_temp;
+	  conflict_detection += conflict_detection_round_time;
+	  comp_time += conflict_detection_round_time;
+	}
 
         distributedRounds++;
         size_t localDone = recoloringSize_host(0);
@@ -1535,10 +1539,9 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
 	if(distributedRounds < numStatisticRecordingRounds){
 	  vertsPerRound[distributedRounds] = recoloringSize_host(0);
 	}
-	if(verbose){
-            std::cout<<comm->getRank()<<": starting to recolor, serial\n";
-            comm->barrier();
-	}
+	if(verbose) std::cout<<comm->getRank()<<": starting to recolor, serial\n";
+        if(timing) comm->barrier();
+	
 	double recolor_temp = timer();
 	if(verts_to_recolor_size_host(0) > 0){
 	  this->colorInterior_serial(femv_colors.size(), dist_adjs_host, dist_offsets_host, femv, 
@@ -1547,10 +1550,13 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
 	if(distributedRounds < numStatisticRecordingRounds){
 	  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
 	  recoloring_time += recoloringPerRound[distributedRounds];
-	  total_time += recoloringPerRound[distributedRounds];
 	  comp_time += recoloringPerRound[distributedRounds];
 	  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
 	  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
+	} else if(timing){
+	  double recoloring_serial_round_time = timer() - recolor_temp;
+	  recoloring_time += recoloring_serial_round_time;
+	  comp_time += recoloring_serial_round_time;
 	}
 
 	//reset the ghost colors to their previous values to avoid
@@ -1566,10 +1572,6 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
 	  commPerRound[distributedRounds] = curr_comm_time;
 	  recvPerRound[distributedRounds] = recv;
 	  sentPerRound[distributedRounds] = sent;
-	  if(verbose) {
-	    std::cout<<comm->getRank()<<": total sent in round "<<distributedRounds<<" = "<<sent<<"\n";
-	    std::cout<<comm->getRank()<<": total recv in round "<<distributedRounds<<" = "<<recv<<"\n";
-	  }
 	  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
 	}
         
@@ -1579,7 +1581,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
 	  ghost_colors_host(i) = colors_host(i+n_local);
 	}
 
-	if(verbose) comm->barrier();
+	if(timing) comm->barrier();
 	double detection_temp = timer();
         
 	//zero these out, they'll be updated by detectConflicts_serial
@@ -1599,7 +1601,12 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
 	  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
 	  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
 	  comp_time += conflictDetectionPerRound[distributedRounds];
-        }
+        } else if(timing){
+	  double conflict_detection_serial_round_time = timer() - detection_temp;
+	  conflict_detection += conflict_detection_serial_round_time;
+	  comp_time += conflict_detection_serial_round_time;
+	}
+
 	size_t globalDone = 0;
 	size_t localDone = recoloringSize_host(0);
 	Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
@@ -1656,6 +1663,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
         Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound,finalRecvPerRound);
         Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound,finalSentPerRound);
         printf("Rank %d: boundary size: %ld\n",comm->getRank(),localBoundaryVertices);
+        if(comm->getRank() == 0) printf("Total boundary size: %ld\n",totalBoundarySize);
         for(int i = 0; i < std::min((int)distributedRounds,numStatisticRecordingRounds); i++){
           printf("Rank %d: recolor %ld vertices in round %d\n",comm->getRank(), vertsPerRound[i],i);
           printf("Rank %d: sentbuf had %lld entries in round %d\n", comm->getRank(), sentPerRound[i],i);
@@ -1672,6 +1680,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
             printf("comp time in round %d: %f\n",i,finalCompPerRound[i]);
           }
         }
+      } else if (timing){
         double global_total_time = 0.0;
         double global_recoloring_time = 0.0;
         double global_min_recoloring_time = 0.0;
@@ -1689,7 +1698,6 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
         comm->barrier();
         fflush(stdout);
         if(comm->getRank()==0){
-          printf("Boundary size: %ld\n",totalBoundarySize);
           printf("Total Time: %f\n",global_total_time);
           printf("Interior Time: %f\n",global_interior_time);
           printf("Recoloring Time: %f\n",global_recoloring_time);
diff --git a/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridD1-2GL.hpp b/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridD1-2GL.hpp
index d0d846252c06..1885f42714ee 100644
--- a/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridD1-2GL.hpp
+++ b/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridD1-2GL.hpp
@@ -210,7 +210,9 @@ class AlgDistance1TwoGhostLayer : public AlgTwoGhostLayer<Adapter> {
         }
       },recoloringSize(0));
       Kokkos::fence();
-      Kokkos::parallel_for(femv_colors.size(), KOKKOS_LAMBDA (const size_t& i){
+      Kokkos::parallel_for("rebuild verts_to_send and verts_to_recolor",
+		           Kokkos::RangePolicy<ExecutionSpace>(0,femv_colors.size()), 
+			   KOKKOS_LAMBDA (const size_t& i){
         if(femv_colors(i) == 0){
           if(i < n_local){
             verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
diff --git a/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridD1.hpp b/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridD1.hpp
index 9c0689e97abd..36f1d46f8054 100644
--- a/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridD1.hpp
+++ b/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridD1.hpp
@@ -232,11 +232,15 @@ class AlgDistance1 : public Algorithm<Adapter>
           }
         }
       },recoloringSize(0));
-      Kokkos::parallel_for(n_local, KOKKOS_LAMBDA(const int& i){
+      Kokkos::fence();
+      Kokkos::parallel_for("Rebuild verts_to_send_view",
+		           Kokkos::RangePolicy<ExecutionSpace>(0,n_local), 
+			   KOKKOS_LAMBDA(const int& i){
         if(femv_colors(i) == 0){
           verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
         }
       });
+      Kokkos::fence();
     }
 
   private:
@@ -317,8 +321,8 @@ class AlgDistance1 : public Algorithm<Adapter>
       std::vector<int> recvcnts(comm->getSize(), 0);
       Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
 
-      //if we're computing statistics, remove the computation imbalance from the comm timer.
-      if(verbose) comm->barrier();
+      //if we're reporting timings, remove the computation imbalance from the comm timer.
+      if(timing) comm->barrier();
       double comm_total = 0.0;
       double comm_temp = timer();
 
@@ -346,7 +350,7 @@ class AlgDistance1 : public Algorithm<Adapter>
     RCP<Environment> env;
     RCP<const Teuchos::Comm<int> > comm;
     bool verbose;
-
+    bool timing;
   public:
     //constructor for the  hybrid distributed distance-1 algorithm
     AlgDistance1(
@@ -356,6 +360,7 @@ class AlgDistance1 : public Algorithm<Adapter>
       const RCP<const Teuchos::Comm<int> > &comm_)
     : adapter(adapter_), pl(pl_), env(env_), comm(comm_) {
       verbose = pl->get<bool>("verbose",false);
+      timing = pl->get<bool>("timing", false);
       if(verbose) std::cout<<comm->getRank()<<": inside coloring constructor\n";
       modelFlag_t flags;
       flags.reset();
@@ -724,18 +729,18 @@ class AlgDistance1 : public Algorithm<Adapter>
       if(verbose) std::cout<<comm->getRank()<<": Coloring interior\n";
       //initialize interior and total timers, barrier to prevent any imbalance from setup.
       //Only use a barrier if timing is happening.
-      if(verbose) comm->barrier();
+      if(timing) comm->barrier();
       interior_time = timer();
       total_time = timer();
       //call the KokkosKernels coloring function with the Tpetra default spaces.
       bool use_vbbit = (global_max_degree < 6000);
       this->colorInterior<execution_space,memory_space>
                  (nVtx, dist_adjs, dist_offsets, femv,dist_adjs,0,use_vbbit);
-      if(verbose){
+      if(timing){
         interior_time = timer() - interior_time;
         comp_time = interior_time;
-        std::cout<<comm->getRank()<<": Going to recolor\n";
       }
+      if(verbose) std::cout<<comm->getRank()<<": Going to recolor\n";
       bool recolor_degrees = this->pl->template get<bool>("recolor_degrees", true);
 
       //if there is more than a single process, check distributed conflicts and recolor
@@ -819,7 +824,6 @@ class AlgDistance1 : public Algorithm<Adapter>
         if(distributedRounds < numStatisticRecordingRounds) {
           vertsPerRound[distributedRounds] = recoloringSize_host(0);
         }
-        if(verbose) std::cout<<comm->getRank()<<": starting to recolor\n";
 	
 	//copying the send view to the recolor view is necessary because
 	//KokkosKernels can change the view passed in, and we need the send view
@@ -844,9 +848,12 @@ class AlgDistance1 : public Algorithm<Adapter>
           comp_time += recoloringPerRound[distributedRounds];
           compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
           totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
+	} else if(timing) {
+	  double recolor_round_time = timer() - recolor_temp;
+	  recoloring_time += recolor_round_time;
+	  comp_time += recolor_round_time;
 	}
         
-        if(verbose) std::cout<<comm->getRank()<<": done recoloring\n";
 	//reset the recoloringSize device host and device views
 	//to zero
         recoloringSize_host(0) = 0;
@@ -874,10 +881,6 @@ class AlgDistance1 : public Algorithm<Adapter>
         comm_time += curr_comm_time;
         if(distributedRounds < numStatisticRecordingRounds){
 	  commPerRound[distributedRounds] = curr_comm_time;
-	  if(verbose){
-            std::cout<<comm->getRank()<<": total sent in round "<<distributedRounds<<" = "<<sent<<"\n";
-            std::cout<<comm->getRank()<<": total recv in round "<<distributedRounds<<" = "<<recv<<"\n";
-  	  }
           sentPerRound[distributedRounds] = sent;
           recvPerRound[distributedRounds] = recv;
           totalPerRound[distributedRounds] += commPerRound[distributedRounds];
@@ -913,6 +916,10 @@ class AlgDistance1 : public Algorithm<Adapter>
           compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
           totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
           comp_time += conflictDetectionPerRound[distributedRounds];
+	} else if(timing){
+	  double conflict_detection_round_time = timer()- detection_temp;
+	  conflict_detection += conflict_detection_round_time;
+	  comp_time += conflict_detection_round_time;
 	}
         //do a reduction to determine if we're done
         int globalDone = 0;
@@ -942,7 +949,6 @@ class AlgDistance1 : public Algorithm<Adapter>
 	if(distributedRounds < 100){
 	  vertsPerRound[distributedRounds] = recoloringSize_host(0);
 	}
-	if(verbose) std::cout<<comm->getRank()<<": starting to recolor, serial\n";
 
 	double recolor_temp = timer();
 	//use KokkosKernels to recolor the conflicting vertices
@@ -958,9 +964,12 @@ class AlgDistance1 : public Algorithm<Adapter>
 	  comp_time += recoloringPerRound[distributedRounds];
 	  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
 	  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
-        }
+        } else if(timing){
+	  double recolor_serial_round_time = timer() - recolor_temp;
+	  recoloring_time += recolor_serial_round_time;
+	  comp_time += recolor_serial_round_time;
+	}
 
-	if(verbose) std::cout<<comm->getRank()<<": done recoloring\n";
 	recoloringSize_host(0) = 0;
 
 	for(size_t i = 0; i < rand.size() -nVtx; i++){
@@ -980,10 +989,6 @@ class AlgDistance1 : public Algorithm<Adapter>
 
 	if(distributedRounds < numStatisticRecordingRounds){
 	  commPerRound[distributedRounds] = curr_comm_time;
-	  if(verbose){
-	    std::cout<<comm->getRank()<<": total sent in round "<<distributedRounds<<" = "<<sent<<"\n";
-	    std::cout<<comm->getRank()<<": total recv in round "<<distributedRounds<<" = "<<recv<<"\n";
-	  }
 	  sentPerRound[distributedRounds] = sent;
 	  recvPerRound[distributedRounds] = recv;
 	  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
@@ -1012,7 +1017,11 @@ class AlgDistance1 : public Algorithm<Adapter>
 	  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
 	  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
 	  comp_time += conflictDetectionPerRound[distributedRounds];
-        }
+        } else if(timing){
+	  double conflict_detection_serial_round_time = timer() - detection_temp;
+	  conflict_detection += conflict_detection_serial_round_time;
+	  comp_time += conflict_detection_serial_round_time;
+	}
 	//do a reduction to determine if we're done
 	int globalDone = 0;
 	int localDone = recoloringSize_host(0);
@@ -1036,8 +1045,8 @@ class AlgDistance1 : public Algorithm<Adapter>
         }
         //print how many rounds of speculating/correcting happened (this should be the same for all ranks):
         if(comm->getRank()==0) printf("did %d rounds of distributed coloring\n", distributedRounds);
+	int totalBoundarySize = 0;
         int totalVertsPerRound[numStatisticRecordingRounds];
-        int totalBoundarySize = 0;
         double finalTotalPerRound[numStatisticRecordingRounds];
         double maxRecoloringPerRound[numStatisticRecordingRounds];
         double finalSerialRecoloringPerRound[numStatisticRecordingRounds];
@@ -1072,6 +1081,7 @@ class AlgDistance1 : public Algorithm<Adapter>
         Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound, finalSentPerRound);
         
         printf("Rank %d: boundary size: %d\n",comm->getRank(),localBoundaryVertices);
+        if(comm->getRank()==0) printf("Total boundary size: %d\n",totalBoundarySize);
         for(int i = 0; i < std::min(distributedRounds,numStatisticRecordingRounds); i++){
           printf("Rank %d: recolor %d vertices in round %d\n",comm->getRank(),vertsPerRound[i],i);
           if(comm->getRank()==0) printf("recolored %d vertices in round %d\n",totalVertsPerRound[i],i);
@@ -1085,7 +1095,7 @@ class AlgDistance1 : public Algorithm<Adapter>
           if(comm->getRank()==0) printf("total recv in round %d: %lld\n",i,finalRecvPerRound[i]);
           if(comm->getRank()==0) printf("comp time in round %d: %f\n",i,finalCompPerRound[i]);
         }
-        
+      } else if(timing){
         double global_total_time = 0.0;
         double global_recoloring_time=0.0;
         double global_min_recoloring_time=0.0;
@@ -1103,7 +1113,6 @@ class AlgDistance1 : public Algorithm<Adapter>
         comm->barrier();
         fflush(stdout);
         if(comm->getRank()==0){
-          printf("Boundary size: %d\n",totalBoundarySize);
           printf("Total Time: %f\n",global_total_time);
           printf("Interior Time: %f\n",global_interior_time);
           printf("Recoloring Time: %f\n",global_recoloring_time);
@@ -1112,8 +1121,8 @@ class AlgDistance1 : public Algorithm<Adapter>
           printf("Comm Time: %f\n",global_comm_time);
           printf("Comp Time: %f\n",global_comp_time);
         }
-        std::cout<<comm->getRank()<<": exiting coloring\n";
       }
+      if(verbose) std::cout<<comm->getRank()<<": exiting coloring\n";
     }
 };
 
diff --git a/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridD2.hpp b/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridD2.hpp
index 6b3a4ecbd6ad..953a66d809b2 100644
--- a/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridD2.hpp
+++ b/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridD2.hpp
@@ -255,7 +255,9 @@ class AlgDistance2 : public AlgTwoGhostLayer<Adapter> {
       Kokkos::fence();
 
       //update the verts_to_send and verts_to_recolor views.
-      Kokkos::parallel_for(femv_colors.size(), KOKKOS_LAMBDA(const uint64_t& i){
+      Kokkos::parallel_for("rebuild verts_to_send and verts_to_recolor",
+		           Kokkos::RangePolicy<ExecutionSpace>(0,femv_colors.size()), 
+			   KOKKOS_LAMBDA(const uint64_t& i){
         if(femv_colors(i) == 0){
 	  //we only send vertices owned by the current process
           if(i < n_local){
diff --git a/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridPD2.hpp b/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridPD2.hpp
index 4357cab70934..75f1707e265e 100644
--- a/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridPD2.hpp
+++ b/packages/zoltan2/core/src/algorithms/color/Zoltan2_AlgHybridPD2.hpp
@@ -218,7 +218,9 @@ class AlgPartialDistance2 : public AlgTwoGhostLayer<Adapter> {
         },recoloringSize(0));
         Kokkos::fence();
 	//update the verts_to_send and verts_to_recolor views
-        Kokkos::parallel_for(femv_colors.size(), KOKKOS_LAMBDA(const uint64_t& i){
+        Kokkos::parallel_for("rebuild verts_to_send and verts_to_recolor",
+			     Kokkos::RangePolicy<ExecutionSpace>(0,femv_colors.size()),
+			     KOKKOS_LAMBDA(const uint64_t& i){
           if(femv_colors(i) == 0){
             if(i < n_local){
 	      //we only send vertices owned by the current process
diff --git a/packages/zoltan2/core/src/problems/Zoltan2_ColoringProblem.hpp b/packages/zoltan2/core/src/problems/Zoltan2_ColoringProblem.hpp
index efd16e3b32a0..12b47b95214e 100644
--- a/packages/zoltan2/core/src/problems/Zoltan2_ColoringProblem.hpp
+++ b/packages/zoltan2/core/src/problems/Zoltan2_ColoringProblem.hpp
@@ -139,6 +139,7 @@ class ColoringProblem : public Problem<Adapter>
     pl.set("color_method", "SerialGreedy", "coloring algorithm",
      color_method_Validator);
     pl.set("verbose", false, "print all output", Environment::getBoolValidator());
+    pl.set("timing", false, "print timing data", Environment::getBoolValidator());
     pl.set("serial_threshold",0,"vertices to recolor in serial",Environment::getAnyIntValidator());
     pl.set("recolor_degrees",true,"recolor based on vertex degrees",Environment::getBoolValidator());
   }
diff --git a/packages/zoltan2/test/core/color/CMakeLists.txt b/packages/zoltan2/test/core/color/CMakeLists.txt
index 3ad847cf7cb4..d749d79d4909 100644
--- a/packages/zoltan2/test/core/color/CMakeLists.txt
+++ b/packages/zoltan2/test/core/color/CMakeLists.txt
@@ -37,7 +37,7 @@ TRIBITS_ADD_TEST(
   NUM_MPI_PROCS 4
   COMM serial mpi
   ARGS
-  "--inputFile=simple --colorMethod=D1"
+  "--inputFile=simple --colorMethod=D1 --timing"
   PASS_REGULAR_EXPRESSION "PASS"
   FAIL_REGULAR_EXPRESSION "FAIL"
   )
@@ -155,7 +155,7 @@ TRIBITS_ADD_TEST(
   NUM_MPI_PROCS 4
   COMM serial mpi
   ARGS
-  "--inputFile=simple --colorMethod=D1-2GL"
+  "--inputFile=simple --colorMethod=D1-2GL --timing"
   PASS_REGULAR_EXPRESSION "PASS"
   FAIL_REGULAR_EXPRESSION "FAIL"
   )
diff --git a/packages/zoltan2/test/core/color/coloring1.cpp b/packages/zoltan2/test/core/color/coloring1.cpp
index 7a3ad1a88bfb..e783f87389f5 100644
--- a/packages/zoltan2/test/core/color/coloring1.cpp
+++ b/packages/zoltan2/test/core/color/coloring1.cpp
@@ -197,6 +197,7 @@ int main(int narg, char** arg)
   std::string outputFile = "";           // Output file to write
   std::string colorAlg = "SerialGreedy"; // Default algorithm is the serial greedy
   bool verbose = false;                  // Verbosity of output
+  bool timing = false;                   // If true, report coloring times.
   int testReturn = 0;
   bool recolorDegrees = false;
   std::string prepartition = "";    	 // Call Zoltan2 partitioning to better distribute
@@ -229,6 +230,8 @@ int main(int narg, char** arg)
 		 "number of vertices to recolor in serial");
   cmdp.setOption("recolorDegrees","recolorRandom",&recolorDegrees,
 		 "recolor based on vertex degrees or random numbers");
+  cmdp.setOption("timing", "notimes", &timing,
+		  "report how long coloring takes");
   std::cout << "Starting everything" << std::endl;
 
   //////////////////////////////////
@@ -341,6 +344,7 @@ int main(int narg, char** arg)
   params.set("color_choice", colorMethod);
   params.set("color_method", colorAlg);
   params.set("verbose", verbose);
+  params.set("timing", timing);
   params.set("serial_threshold",serialThreshold);
   params.set("recolor_degrees",recolorDegrees);
   //params.set("balance_colors", balanceColors); // TODO