diff --git a/cmake/std/sems/PullRequestClang10.0.0TestingEnv.sh b/cmake/std/sems/PullRequestClang10.0.0TestingEnv.sh
index 6042879d3fd9..e8428587f53e 100644
--- a/cmake/std/sems/PullRequestClang10.0.0TestingEnv.sh
+++ b/cmake/std/sems/PullRequestClang10.0.0TestingEnv.sh
@@ -9,13 +9,13 @@
 # $ module purge
 # or Trilinos/cmake/unload_sems_dev_env.sh
 
+module purge
+
 source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
-module load sems-git/2.10.1
 module load sems-gcc/5.3.0
 module load sems-clang/10.0.0
 module load sems-openmpi/1.10.1
-module load sems-python/2.7.9
 module load sems-boost/1.69.0/base
 module load sems-zlib/1.2.8/base
 module load sems-hdf5/1.10.6/parallel
@@ -23,8 +23,14 @@ module load sems-netcdf/4.7.3/parallel
 module load sems-parmetis/4.0.3/parallel
 module load sems-scotch/6.0.3/nopthread_64bit_parallel
 module load sems-superlu/4.3/base
+
 module load sems-cmake/3.17.1
 module load sems-ninja_fortran/1.10.0
 
+module load sems-git/2.10.1
+
+module unload sems-python
+module load sems-python/3.5.2
+
 # add the OpenMP environment variable we need
 export OMP_NUM_THREADS=2
diff --git a/cmake/std/sems/PullRequestClang7.0.1TestingEnv.sh b/cmake/std/sems/PullRequestClang7.0.1TestingEnv.sh
index b6066ad86bfe..93975363d4db 100644
--- a/cmake/std/sems/PullRequestClang7.0.1TestingEnv.sh
+++ b/cmake/std/sems/PullRequestClang7.0.1TestingEnv.sh
@@ -9,13 +9,14 @@
 # $ module purge
 # or Trilinos/cmake/unload_sems_dev_env.sh
 
+module purge
+
 source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
 module load sems-git/2.10.1
 module load sems-gcc/5.3.0
 module load sems-clang/7.0.1
 module load sems-openmpi/1.10.1
-module load sems-python/2.7.9
 module load sems-boost/1.63.0/base
 module load sems-zlib/1.2.8/base
 module load sems-hdf5/1.10.6/parallel
@@ -23,7 +24,6 @@ module load sems-netcdf/4.7.3/parallel
 module load sems-parmetis/4.0.3/parallel
 module load sems-scotch/6.0.3/nopthread_64bit_parallel
 module load sems-superlu/4.3/base
-module load sems-cmake/3.12.2
 
 # Load the SEMS CMake Module
 # - One of the SEMS modules will load CMake 3.4.x also,
@@ -41,5 +41,8 @@ module load atdm-env
 #module load atdm-cmake/3.10.1
 module load atdm-ninja_fortran/1.7.2
 
+module unload sems-python
+module load sems-python/3.5.2
+
 # add the OpenMP environment variable we need
 export OMP_NUM_THREADS=2
diff --git a/cmake/std/sems/PullRequestClang9.0.0TestingEnv.sh b/cmake/std/sems/PullRequestClang9.0.0TestingEnv.sh
index b70114233685..329895a918d2 100644
--- a/cmake/std/sems/PullRequestClang9.0.0TestingEnv.sh
+++ b/cmake/std/sems/PullRequestClang9.0.0TestingEnv.sh
@@ -9,13 +9,14 @@
 # $ module purge
 # or Trilinos/cmake/unload_sems_dev_env.sh
 
+module purge
+
 source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
 module load sems-git/2.10.1
 module load sems-gcc/5.3.0
 module load sems-clang/9.0.0
 module load sems-openmpi/1.10.1
-module load sems-python/2.7.9
 module load sems-boost/1.63.0/base
 module load sems-zlib/1.2.8/base
 module load sems-hdf5/1.10.6/parallel
@@ -23,13 +24,12 @@ module load sems-netcdf/4.7.3/parallel
 module load sems-parmetis/4.0.3/parallel
 module load sems-scotch/6.0.3/nopthread_64bit_parallel
 module load sems-superlu/4.3/base
-module load sems-cmake/3.12.2
 
 # Load the SEMS CMake Module
 # - One of the SEMS modules will load CMake 3.4.x also,
 #   so this will pull in the SEMS cmake 3.10.3 version
 #   for Trilinos compatibility.
-module load sems-cmake/3.12.2
+module load sems-cmake/3.10.3
 
 # Using CMake and Ninja modules from the ATDM project space.
 # SEMS does not yet supply a recent enough version of CMake
@@ -41,5 +41,9 @@ module load atdm-env
 #module load atdm-cmake/3.10.1
 module load atdm-ninja_fortran/1.7.2
 
+module unload sems-python
+module load sems-python/3.5.2
+
+
 # add the OpenMP environment variable we need
 export OMP_NUM_THREADS=2
diff --git a/cmake/std/sems/PullRequestGCC4.8.4TestingEnv.sh b/cmake/std/sems/PullRequestGCC4.8.4TestingEnv.sh
index 81be3b7e7a88..42e81ed73ca4 100644
--- a/cmake/std/sems/PullRequestGCC4.8.4TestingEnv.sh
+++ b/cmake/std/sems/PullRequestGCC4.8.4TestingEnv.sh
@@ -8,12 +8,12 @@
 # $ module purge
 # or Trilinos/cmake/unload_sems_dev_env.sh
  
+module purge
+
 source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
 module load sems-gcc/4.8.4
 module load sems-openmpi/1.10.1
-module load sems-python/2.7.9
-module load sems-git/2.10.1
 module load sems-boost/1.63.0/base
 module load sems-zlib/1.2.8/base
 module load sems-hdf5/1.10.6/parallel
@@ -29,5 +29,10 @@ module load sems-superlu/4.3/base
 module load sems-cmake/3.10.3
 module load sems-ninja_fortran/1.8.2
 
+module load sems-git/2.10.1
+
+module unload sems-python
+module load sems-python/3.5.2
+
 # add the OpenMP environment variable we need
 export OMP_NUM_THREADS=2
diff --git a/cmake/std/sems/PullRequestGCC4.9.3TestingEnv.sh b/cmake/std/sems/PullRequestGCC4.9.3TestingEnv.sh
index e82c8c4cbb2b..dcef5dfbbed0 100644
--- a/cmake/std/sems/PullRequestGCC4.9.3TestingEnv.sh
+++ b/cmake/std/sems/PullRequestGCC4.9.3TestingEnv.sh
@@ -4,12 +4,12 @@
 
 # usage: $ source PullRequestGCC4.9.3TestingEnv.sh
 
+module purge
+
 source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
 module load sems-gcc/4.9.3
 module load sems-openmpi/1.10.1
-module load sems-python/2.7.9
-module load sems-git/2.10.1
 module load sems-boost/1.63.0/base
 module load sems-zlib/1.2.8/base
 module load sems-hdf5/1.10.6/parallel
@@ -17,9 +17,15 @@ module load sems-netcdf/4.7.3/parallel
 module load sems-parmetis/4.0.3/parallel
 module load sems-scotch/6.0.3/nopthread_64bit_parallel
 module load sems-superlu/4.3/base
+
 module load sems-cmake/3.12.2
 module load sems-ninja_fortran/1.8.2
 
+module load sems-git/2.10.1
+
+module unload sems-python
+module load sems-python/3.5.2
+
 # add the OpenMP environment variable we need
 export OMP_NUM_THREADS=2
 
diff --git a/cmake/std/sems/PullRequestGCC4.9.3TestingEnvSERIAL.sh b/cmake/std/sems/PullRequestGCC4.9.3TestingEnvSERIAL.sh
index dce5e353de08..6cc883051734 100644
--- a/cmake/std/sems/PullRequestGCC4.9.3TestingEnvSERIAL.sh
+++ b/cmake/std/sems/PullRequestGCC4.9.3TestingEnvSERIAL.sh
@@ -4,11 +4,11 @@
 
 # usage: $ source PullRequestGCC4.9.3TestingEnv.sh
 
+module purge
+
 source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
 module load sems-gcc/4.9.3
-module load sems-python/2.7.9
-module load sems-git/2.10.1
 module load sems-boost/1.63.0/base
 module load sems-zlib/1.2.8/base
 module load sems-hdf5/1.10.6/base
@@ -16,8 +16,15 @@ module load sems-netcdf/4.7.3/base
 module load sems-metis/5.1.0/base
 # module load sems-scotch/6.0.3/nopthread_64bit_parallel
 module load sems-superlu/4.3/base
+
 module load sems-cmake/3.12.2
 module load sems-ninja_fortran/1.8.2
 
+module load sems-git/2.10.1
+
+module unload sems-python
+module load sems-python/3.5.2
+
+
 # add the OpenMP environment variable we need
 export OMP_NUM_THREADS=2
diff --git a/cmake/std/sems/PullRequestGCC7.2.0SerialTestingEnv.sh b/cmake/std/sems/PullRequestGCC7.2.0SerialTestingEnv.sh
index d0f7431346d5..7a995c6d22b1 100644
--- a/cmake/std/sems/PullRequestGCC7.2.0SerialTestingEnv.sh
+++ b/cmake/std/sems/PullRequestGCC7.2.0SerialTestingEnv.sh
@@ -4,19 +4,26 @@
 
 # usage: $ source PullRequestGCC7.2.0TestingEnv.sh
 
+module purge
+
 source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
 module load sems-gcc/7.2.0
-module load sems-python/2.7.9
-module load sems-git/2.10.1
 module load sems-boost/1.63.0/base
 module load sems-zlib/1.2.8/base
 module load sems-hdf5/1.10.6/base
 module load sems-netcdf/4.7.3/base
 module load sems-metis/5.1.0/base
 module load sems-superlu/4.3/base
+
 module load sems-cmake/3.12.2
 module load sems-ninja_fortran/1.8.2
 
+# Boost loads sems-python/2.7.x as a side effect.
+module unload sems-python
+module load sems-python/3.5.2
+
+module load sems-git/2.10.1
+
 # add the OpenMP environment variable we need
 export OMP_NUM_THREADS=2
diff --git a/cmake/std/sems/PullRequestGCC7.2.0TestingEnv.sh b/cmake/std/sems/PullRequestGCC7.2.0TestingEnv.sh
index d9972e3b6869..f858d316ed17 100644
--- a/cmake/std/sems/PullRequestGCC7.2.0TestingEnv.sh
+++ b/cmake/std/sems/PullRequestGCC7.2.0TestingEnv.sh
@@ -4,12 +4,12 @@
 
 # usage: $ source PullRequestGCC7.2.0TestingEnv.sh
 
+module purge
+
 source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
 module load sems-gcc/7.2.0
 module load sems-openmpi/1.10.1
-module load sems-python/2.7.9
-module load sems-git/2.10.1
 module load sems-boost/1.63.0/base
 module load sems-zlib/1.2.8/base
 module load sems-hdf5/1.10.6/parallel
@@ -17,9 +17,17 @@ module load sems-netcdf/4.7.3/parallel
 module load sems-parmetis/4.0.3/parallel
 module load sems-scotch/6.0.3/nopthread_64bit_parallel
 module load sems-superlu/4.3/base
-module load sems-cmake/3.12.2
+
+module load sems-cmake/3.10.3
 module load sems-ninja_fortran/1.8.2
 
+module load sems-git/2.10.1
+
+# Note: sems-boost also loads sems-python/2.7.x, we need to
+#       unload it here.
+module unload sems-python
+module load sems-python/3.5.2
+
 # add the OpenMP environment variable we need
 export OMP_NUM_THREADS=2
 
diff --git a/cmake/std/sems/PullRequestGCC8.3.0TestingEnv.sh b/cmake/std/sems/PullRequestGCC8.3.0TestingEnv.sh
index 628ac355869e..69c913cbf925 100644
--- a/cmake/std/sems/PullRequestGCC8.3.0TestingEnv.sh
+++ b/cmake/std/sems/PullRequestGCC8.3.0TestingEnv.sh
@@ -4,12 +4,12 @@
 
 # usage: $ source PullRequestGCC8.3.0TestingEnv.sh
 
+module purge
+
 source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
 module load sems-gcc/8.3.0
 module load sems-openmpi/1.10.1
-module load sems-python/2.7.9
-module load sems-git/2.10.1
 module load sems-boost/1.66.0/base
 module load sems-zlib/1.2.8/base
 module load sems-hdf5/1.10.6/parallel
@@ -17,8 +17,17 @@ module load sems-netcdf/4.7.3/parallel
 module load sems-parmetis/4.0.3/parallel
 module load sems-scotch/6.0.3/nopthread_64bit_parallel
 module load sems-superlu/4.3/base
+
 module load sems-cmake/3.17.1
 module load sems-ninja_fortran/1.10.0
 
+module load sems-git/2.10.1
+
+module unload sems-python
+module load sems-python/3.5.2
+
+
 # add the OpenMP environment variable we need
 export OMP_NUM_THREADS=2
+
+
diff --git a/cmake/std/sems/PullRequestIntel17.0.1TestingEnv.sh b/cmake/std/sems/PullRequestIntel17.0.1TestingEnv.sh
index 18fbc3611d3c..26bcf3fc970a 100644
--- a/cmake/std/sems/PullRequestIntel17.0.1TestingEnv.sh
+++ b/cmake/std/sems/PullRequestIntel17.0.1TestingEnv.sh
@@ -7,6 +7,8 @@
 # After the environment is no longer needed, it can be purged using
 # $ module purge
 # or Trilinos/cmake/unload_sems_dev_env.sh
+
+module purge
  
 source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
@@ -14,8 +16,6 @@ export SEMS_FORCE_LOCAL_COMPILER_VERSION=4.9.3
 module load sems-gcc/5.3.0
 module load sems-intel/17.0.1
 module load sems-mpich/3.2
-module load sems-python/2.7.9
-module load sems-git/2.10.1
 module load sems-boost/1.63.0/base
 module load sems-zlib/1.2.8/base
 module load sems-hdf5/1.10.6/parallel
@@ -23,9 +23,15 @@ module load sems-netcdf/4.7.3/parallel
 module load sems-parmetis/4.0.3/parallel
 module load sems-scotch/6.0.3/nopthread_64bit_parallel
 module load sems-superlu/4.3/base
+
 module load sems-cmake/3.12.2
 module load sems-ninja_fortran/1.8.2
 
+module load sems-git/2.10.1
+
+module unload sems-python
+module load sems-python/3.5.2
+
 # add the OpenMP environment variable we need
 export OMP_NUM_THREADS=2
 
diff --git a/cmake/std/sems/PullRequestIntel19.0.5TestingEnv.sh b/cmake/std/sems/PullRequestIntel19.0.5TestingEnv.sh
index 08af6094b5b7..ef908231c123 100644
--- a/cmake/std/sems/PullRequestIntel19.0.5TestingEnv.sh
+++ b/cmake/std/sems/PullRequestIntel19.0.5TestingEnv.sh
@@ -8,14 +8,14 @@
 # $ module purge
 # or Trilinos/cmake/unload_sems_dev_env.sh
 
+module purge
+
 source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
 export SEMS_FORCE_LOCAL_COMPILER_VERSION=4.9.3
 module load sems-gcc/6.1.0
 module load sems-intel/19.0.5
 module load sems-mpich/3.2
-module load sems-python/2.7.9
-module load sems-git/2.10.1
 module load sems-boost/1.63.0/base
 module load sems-zlib/1.2.8/base
 module load sems-hdf5/1.10.6/parallel
@@ -23,8 +23,14 @@ module load sems-netcdf/4.7.3/parallel
 module load sems-parmetis/4.0.3/parallel
 module load sems-scotch/6.0.3/nopthread_64bit_parallel
 module load sems-superlu/4.3/base
+
 module load sems-cmake/3.12.2
 module load sems-ninja_fortran/1.8.2
 
+module load sems-git/2.10.1
+
+module unload sems-python
+module load sems-python/3.5.2
+
 # add the OpenMP environment variable we need
 export OMP_NUM_THREADS=2
diff --git a/packages/muelu/research/regionMG/src/SetupRegionHierarchy_def.hpp b/packages/muelu/research/regionMG/src/SetupRegionHierarchy_def.hpp
index 883cbb08cbbb..57096e4961d0 100644
--- a/packages/muelu/research/regionMG/src/SetupRegionHierarchy_def.hpp
+++ b/packages/muelu/research/regionMG/src/SetupRegionHierarchy_def.hpp
@@ -154,12 +154,13 @@ void MakeCoarseLevelMaps(const int maxRegPerGID,
 
   for(int currentLevel = 1; currentLevel < numLevels; ++currentLevel) {
     RCP<Level> level        = regHierarchy->GetLevel(currentLevel);
-    RCP<Matrix> regProlong         = level->Get<RCP<Matrix> >("P");
-    RCP<Map> regRowMap             = level->Get<RCP<Map> >("regRowMap");
+    RCP<Matrix> regProlong  = level->Get<RCP<Matrix> >("P");
+    RCP<Map> regRowMap      = Teuchos::rcp_const_cast<Map>(regProlong->getColMap());
 
     RCP<Level> levelFine    = regHierarchy->GetLevel(currentLevel-1);
     RCP<Import> regRowImportFine   = levelFine->Get<RCP<Import> >("rowImport");
-    RCP<Map> regRowMapFine         = levelFine->Get<RCP<Map> >("regRowMap");
+    RCP<Matrix> regMatFine         = levelFine->Get<RCP<Matrix> >("A");
+    RCP<Map> regRowMapFine         = Teuchos::rcp_const_cast<Map>(regMatFine->getRowMap());
 
     // Extracting some basic information about local mesh in composite/region format
     const size_t numFineRegionNodes    = regProlong->getNodeNumRows();
@@ -325,15 +326,10 @@ void MakeCoarseLevelMaps(const int maxRegPerGID,
                 regionMatVecLIDs, regionInterfaceImporter);
 
     // Fill level with the outputs
-    level->Set<RCP<Map> >("compRowMap",compRowMap);
-    level->Set<RCP<Map> >("regRowMap",regRowMapCurrent);
-    level->Set<RCP<Map> >("regColMap",Teuchos::rcp_const_cast<Map>(regProlong->getColMap()));
     level->Set<Teuchos::ArrayRCP<LO> >("regionMatVecLIDs",regionMatVecLIDs);
     level->Set<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("regionInterfaceImporter", regionInterfaceImporter);
     level->Set<RCP<Xpetra::MultiVector<GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node> > >("interfaceGIDs", interfaceGIDs);
     level->Set<RCP<Xpetra::MultiVector<LO,LO,GO,NO> > >("regionsPerGIDWithGhosts", regionsPerGIDWithGhosts);
-    level->Set<RCP<Map> >("rowMap",quasiRegRowMap);
-    level->Set<RCP<Map> >("colMap",quasiRegRowMap);
     level->Set<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("rowImport",regRowImportCurrent);
 
     // Finally reset compositeToRegionLIDs
@@ -707,7 +703,6 @@ void MakeInterfaceScalingFactors(const int numLevels,
 {
 #include "Xpetra_UseShortNames.hpp"
 #include "MueLu_UseShortNames.hpp"
-  // std::cout << compRowMaps[0]->getComm()->getRank() << " | Computing interface scaling factors ..." << std::endl;
 
   const SC SC_ONE = Teuchos::ScalarTraits<SC>::one();
 
@@ -715,15 +710,15 @@ void MakeInterfaceScalingFactors(const int numLevels,
 
   for (int l = 0; l < numLevels; l++) {
     RCP<Level> level        = regHierarchy->GetLevel(l);
-    RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > regRowMap = level->Get<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("regRowMap");
+    RCP<Matrix> regMat      = level->Get<RCP<Matrix> >("A");
+    RCP<Map> regRowMap      = Teuchos::rcp_const_cast<Map>(regMat->getRowMap());
     RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > regRowImporters = level->Get<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("rowImport");
     // initialize region vector with all ones.
     RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > regInterfaceScalings = VectorFactory::Build(regRowMap);
     regInterfaceScalings->putScalar(SC_ONE);
 
     // transform to composite layout while adding interface values via the Export() combine mode
-    //RCP<Vector> compInterfaceScalingSum = VectorFactory::Build(compRowMaps[l], true);
-    RCP<Vector> compInterfaceScalingSum = VectorFactory::Build( level->Get<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("compRowMap"), true);
+    RCP<Vector> compInterfaceScalingSum = VectorFactory::Build( regRowImporters->getSourceMap() , true);
     regionalToComposite(regInterfaceScalings, compInterfaceScalingSum, regRowImporters);
 
     /* transform composite layout back to regional layout. Now, GIDs associated
@@ -779,7 +774,9 @@ void createRegionHierarchy(const int numDimensions,
   }
 
   RCP<Level> level0 = regHierarchy->GetLevel(0);
-  RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > revisedRowMap1 = level0->Get<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("regRowMap");
+  RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > regRowImporters = level0->Get<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("rowImport");
+  RCP<Matrix> regMat       = level0->Get<RCP<Matrix> >("A");
+  RCP<Map> revisedRowMap  = Teuchos::rcp_const_cast<Map>(regMat->getRowMap());
   /* Get coarse level matrices and prolongators from MueLu hierarchy
    * Note: fine level has been dealt with previously, so we start at level 1 here.
    */
@@ -795,15 +792,12 @@ void createRegionHierarchy(const int numDimensions,
 
     RCP<Matrix> regMatrices = level->Get<RCP<Matrix> >("A", MueLu::NoFactory::get());
 
-    level->Set<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("regRowMap",Teuchos::rcp_const_cast<Xpetra::Map<LO,GO,NO>>(regMatrices->getRowMap()));
-    level->Set<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("regColMap",Teuchos::rcp_const_cast<Xpetra::Map<LO,GO,NO>>(regMatrices->getColMap()));
-
     // Create residual and solution vectors and cache them for vCycle apply
     std::string levelName("level");
     levelName += std::to_string(l);
     ParameterList& levelList = hierarchyData->sublist(levelName, false, "list of data on current level");
-    RCP<Vector> regRes = VectorFactory::Build(revisedRowMap1, true);
-    RCP<Vector> regSol = VectorFactory::Build(revisedRowMap1, true);
+    RCP<Vector> regRes = VectorFactory::Build(revisedRowMap, true);
+    RCP<Vector> regSol = VectorFactory::Build(revisedRowMap, true);
 
     levelList.set<RCP<Vector> >("residual", regRes, "Cached residual vector");
     levelList.set<RCP<Vector> >("solution", regSol, "Cached solution vector");
@@ -815,7 +809,7 @@ void createRegionHierarchy(const int numDimensions,
   tmLocal = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("createRegionHierarchy: MakeCoarseLevel")));
 
   MakeCoarseLevelMaps(maxRegPerGID,
-                      regHierarchy); //TODO: CLEAN UP THIS FUNCTION
+                      regHierarchy);
 
   tmLocal = Teuchos::null;
   tmLocal = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("createRegionHierarchy: MakeInterfaceScaling")));
@@ -833,13 +827,13 @@ void createRegionHierarchy(const int numDimensions,
   for(int levelIdx = 0; levelIdx < numLevels - 1; ++levelIdx) {
     RCP<Level> level = regHierarchy->GetLevel(levelIdx);
     RCP<Matrix> regMatrix = level->Get<RCP<Matrix> >("A", MueLu::NoFactory::get());
-    RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > regRowMap = level->Get<RCP<Xpetra::Map<LO,GO,NO> > >("regRowMap");
+    RCP<Map>    regRowMap = Teuchos::rcp_const_cast<Map>(regMatrix->getRowMap());
     RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > regRowImporter = level->Get<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("rowImport");
-    RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > regInterfaceScalings1 = level->Get<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > >("regInterfaceScalings");
+    RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > regInterfaceScalings = level->Get<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > >("regInterfaceScalings");
 
     smootherParams[levelIdx]->set("smoother: level", levelIdx);
     smootherSetup(smootherParams[levelIdx], regRowMap,
-                  regMatrix, regInterfaceScalings1,
+                  regMatrix, regInterfaceScalings,
                   regRowImporter);
   }
 
@@ -850,10 +844,10 @@ void createRegionHierarchy(const int numDimensions,
   const std::string coarseSolverType = coarseSolverData->get<std::string>("coarse solver type");
   if (coarseSolverType == "smoother") {
     RCP<Level> level = regHierarchy->GetLevel(numLevels - 1);
-    RCP<Matrix> regMatrix                                                           = level->Get<RCP<Matrix> >("A", MueLu::NoFactory::get());
-    RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > regRowMap                  = level->Get<RCP<Xpetra::Map<LO,GO,NO> > >("regRowMap");
-    RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > regRowImporter          = level->Get<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("rowImport");
-    RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > regInterfaceScalings1 = level->Get<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > >("regInterfaceScalings");
+    RCP<Matrix> regMatrix = level->Get<RCP<Matrix> >("A", MueLu::NoFactory::get());
+    RCP<Map> regRowMap = Teuchos::rcp_const_cast<Map>(regMatrix->getRowMap());
+    RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > regRowImporter = level->Get<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("rowImport");
+    RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > regInterfaceScalings = level->Get<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > >("regInterfaceScalings");
     // Set the smoother on the coarsest level.
     const std::string smootherXMLFileName = coarseSolverData->get<std::string>("smoother xml file");
     RCP<ParameterList> coarseSmootherParams = smootherParams[numLevels - 1];
@@ -863,7 +857,7 @@ void createRegionHierarchy(const int numDimensions,
 
 
     smootherSetup(smootherParams[numLevels - 1], regRowMap,
-                  regMatrix, regInterfaceScalings1,
+                  regMatrix, regInterfaceScalings,
                   regRowImporter);
   } else if( (coarseSolverType == "direct") || (coarseSolverType == "amg") ) {
     // A composite coarse matrix is needed
@@ -871,12 +865,12 @@ void createRegionHierarchy(const int numDimensions,
     // std::cout << mapComp->getComm()->getRank() << " | MakeCoarseCompositeOperator ..." << std::endl;
 
     RCP<Level> level = regHierarchy->GetLevel(numLevels - 1);
-    RCP<Matrix>                                             regMatrix      = level->Get<RCP<Matrix> >("A", MueLu::NoFactory::get());
-    RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> >    compRowMap     = level->Get<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("compRowMap");
-    RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> >    regRowMap      = level->Get<RCP<Xpetra::Map<LO,GO,NO> > >("regRowMap");
+    RCP<Matrix>  regMatrix      = level->Get<RCP<Matrix> >("A", MueLu::NoFactory::get());
+    RCP<Map> regRowMap          = Teuchos::rcp_const_cast<Map>(regMatrix->getRowMap());
     RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > regRowImporter = level->Get<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("rowImport");
-    RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> >    quasiRegRowMap = level->Get<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("rowMap");
-    RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> >    quasiRegColMap = level->Get<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("colMap");
+    RCP<Map> compRowMap         = Teuchos::rcp_const_cast<Map>(regRowImporter->getSourceMap());
+    RCP<Map> quasiRegRowMap     = Teuchos::rcp_const_cast<Map>(regRowImporter->getTargetMap());
+    RCP<Map> quasiRegColMap     = Teuchos::rcp_const_cast<Map>(regRowImporter->getTargetMap());// col map same as row map.
 
     RCP<Xpetra::Matrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > coarseCompOp;
     RCP<realvaluedmultivector_type> compCoarseCoordinates;
@@ -957,12 +951,12 @@ void vCycle(const int l, ///< ID of current level
 
   RCP<Level> level = regHierarchy->GetLevel(l);
   RCP<Matrix>                                             regMatrix      = level->Get<RCP<Matrix> >("A", MueLu::NoFactory::get());
-  RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> >    compRowMap     = level->Get<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("compRowMap");
-  RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> >    regRowMap      = level->Get<RCP<Xpetra::Map<LO,GO,NO> > >("regRowMap");
   RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > regRowImporter = level->Get<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("rowImport");
-  RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> >    quasiRegRowMap = level->Get<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("rowMap");
-  RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> >    quasiRegColMap = level->Get<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("colMap");
-  RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > regInterfaceScalings1 = level->Get<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > >("regInterfaceScalings");
+  RCP<Map> regRowMap          = Teuchos::rcp_const_cast<Map>(regMatrix->getRowMap());
+  RCP<Map> compRowMap         = Teuchos::rcp_const_cast<Map>(regRowImporter->getSourceMap());
+  RCP<Map> quasiRegRowMap     = Teuchos::rcp_const_cast<Map>(regRowImporter->getTargetMap());
+  RCP<Map> quasiRegColMap     = Teuchos::rcp_const_cast<Map>(regRowImporter->getTargetMap()); // col map samle as row map
+  RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > regInterfaceScalings = level->Get<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > >("regInterfaceScalings");
 
   int cycleCount = 1;
   if(cycleType == "W" && l > 0) // W cycle and not on finest level
@@ -1003,7 +997,7 @@ void vCycle(const int l, ///< ID of current level
       tm = Teuchos::null;
       tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("vCycle: 3 - scale interface")));
 
-      scaleInterfaceDOFs(regRes, regInterfaceScalings1, true);
+      scaleInterfaceDOFs(regRes, regInterfaceScalings, true);
 
       tm = Teuchos::null;
       tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("vCycle: 4 - create coarse vectors")));
@@ -1015,7 +1009,7 @@ void vCycle(const int l, ///< ID of current level
       {
         RCP<Level> levelCoarse = regHierarchy->GetLevel(l+1);
         RCP<Matrix>                                             regProlongCoarse     = levelCoarse->Get<RCP<Matrix> >("P", MueLu::NoFactory::get());
-        RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> >    regRowMapCoarse      = levelCoarse->Get<RCP<Xpetra::Map<LO,GO,NO> > >("regRowMap");
+        RCP<const Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > regRowMapCoarse = regProlongCoarse->getColMap();
         // Get pre-communicated communication patterns for the fast MatVec
         const ArrayRCP<LocalOrdinal> regionInterfaceLIDs = smootherParams[l+1]->get<ArrayRCP<LO>>("Fast MatVec: interface LIDs");
         const RCP<Import>        regionInterfaceImporter = smootherParams[l+1]->get<RCP<Import>>("Fast MatVec: interface importer");
@@ -1096,8 +1090,8 @@ void vCycle(const int l, ///< ID of current level
       RCP<Vector> compX = VectorFactory::Build(coarseRowMap, true);
       RCP<Vector> compRhs = VectorFactory::Build(coarseRowMap, true);
       {
-        RCP<Vector> inverseInterfaceScaling = VectorFactory::Build(regInterfaceScalings1->getMap());
-        inverseInterfaceScaling->reciprocal(*regInterfaceScalings1);
+        RCP<Vector> inverseInterfaceScaling = VectorFactory::Build(regInterfaceScalings->getMap());
+        inverseInterfaceScaling->reciprocal(*regInterfaceScalings);
         fineRegB->elementWiseMultiply(SC_ONE, *fineRegB, *inverseInterfaceScaling, SC_ZERO);
 
         regionalToComposite(fineRegB, compRhs, regRowImporter);
diff --git a/packages/muelu/research/regionMG/test/structured/Driver_Structured_Regions.cpp b/packages/muelu/research/regionMG/test/structured/Driver_Structured_Regions.cpp
index c6049d03084d..f95b413b5d5a 100644
--- a/packages/muelu/research/regionMG/test/structured/Driver_Structured_Regions.cpp
+++ b/packages/muelu/research/regionMG/test/structured/Driver_Structured_Regions.cpp
@@ -714,11 +714,6 @@ int main_(Teuchos::CommandLineProcessor &clp, Xpetra::UnderlyingLib& lib, int ar
 
   {
   RCP<MueLu::Level> level0 = regHierarchy->GetLevel(0);
-  level0->Set<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("compRowMap", dofMap);
-  level0->Set<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("rowMap", rowMap);
-  level0->Set<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("colMap", colMap);
-  level0->Set<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("regRowMap", revisedRowMap);
-  level0->Set<RCP<Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> > >("regColMap", revisedColMap);
   level0->Set<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("rowImport",rowImport);
   level0->Set<ArrayView<LocalOrdinal> > ("compositeToRegionLIDs", compositeToRegionLIDs() );
   level0->Set<RCP<Xpetra::MultiVector<GlobalOrdinal, LocalOrdinal, GlobalOrdinal, Node> > >("interfaceGIDs", interfaceGIDsPerLevel[0]);
diff --git a/packages/muelu/src/Misc/MueLu_SchurComplementFactory_def.hpp b/packages/muelu/src/Misc/MueLu_SchurComplementFactory_def.hpp
index 309d5bdc9fa9..1a4dce874ff4 100644
--- a/packages/muelu/src/Misc/MueLu_SchurComplementFactory_def.hpp
+++ b/packages/muelu/src/Misc/MueLu_SchurComplementFactory_def.hpp
@@ -126,7 +126,7 @@ namespace MueLu {
       } else {
         RCP<BlockedCrsMatrix> bA00 = Teuchos::rcp_dynamic_cast<BlockedCrsMatrix>(A00);
         TEUCHOS_TEST_FOR_EXCEPTION(bA00.is_null()==false, MueLu::Exceptions::RuntimeError,"MueLu::SchurComplementFactory::Build: Mass lumping not implemented. Implement a mass lumping kernel!");
-        diag = Utilities::GetLumpedMatrixDiagonal(A00);
+        diag = Utilities::GetLumpedMatrixDiagonal(*A00);
       }
       // invert diagonal vector. Replace all entries smaller than 1e-4 by one!
       RCP<Vector> D = (!fixing ? Utilities::GetInverse(diag) : Utilities::GetInverse(diag, 1e-4, one));
diff --git a/packages/muelu/src/Smoothers/BlockedSmoothers/MueLu_BraessSarazinSmoother_def.hpp b/packages/muelu/src/Smoothers/BlockedSmoothers/MueLu_BraessSarazinSmoother_def.hpp
index 87ebaa414c31..f49bfef1ddc7 100644
--- a/packages/muelu/src/Smoothers/BlockedSmoothers/MueLu_BraessSarazinSmoother_def.hpp
+++ b/packages/muelu/src/Smoothers/BlockedSmoothers/MueLu_BraessSarazinSmoother_def.hpp
@@ -212,7 +212,7 @@ namespace MueLu {
     if (pL.get<bool>("lumping") == false) {
       A00_->getLocalDiagCopy(*diagFVector);       // extract diagonal of F
     } else {
-      diagFVector = Utilities::GetLumpedMatrixDiagonal(A00_);
+      diagFVector = Utilities::GetLumpedMatrixDiagonal(*A00_);
     }
     diagFVector->scale(omega);
     D_ = Utilities::GetInverse(diagFVector);
diff --git a/packages/muelu/src/Smoothers/BlockedSmoothers/MueLu_SimpleSmoother_def.hpp b/packages/muelu/src/Smoothers/BlockedSmoothers/MueLu_SimpleSmoother_def.hpp
index 6e6dd58ec693..2c4920f14e17 100644
--- a/packages/muelu/src/Smoothers/BlockedSmoothers/MueLu_SimpleSmoother_def.hpp
+++ b/packages/muelu/src/Smoothers/BlockedSmoothers/MueLu_SimpleSmoother_def.hpp
@@ -250,7 +250,7 @@ namespace MueLu {
         diag[i] = absRowSum;
       }*/
       // TODO this does not work if F_ is nested!
-      diagFVector = Utilities::GetLumpedMatrixDiagonal(F_);
+      diagFVector = Utilities::GetLumpedMatrixDiagonal(*F_);
     }
     diagFinv_ = Utilities::GetInverse(diagFVector);
 
diff --git a/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp b/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp
index 366d845770c0..f5e4f909fb81 100644
--- a/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp
+++ b/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp
@@ -644,7 +644,7 @@ namespace MueLu {
         doScale = paramList.get<bool>("chebyshev: use rowsumabs diagonal scaling");
         paramList.remove("chebyshev: use rowsumabs diagonal scaling");
         if (doScale) {
-          RCP<Vector> lumpedDiagonal = Utilities::GetLumpedMatrixDiagonal(currentLevel.Get<RCP<Matrix> >("A"),true);
+          RCP<Vector> lumpedDiagonal = Utilities::GetLumpedMatrixDiagonal(*(currentLevel.Get<RCP<Matrix> >("A")),true);
           const Xpetra::TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& tmpVec = dynamic_cast<const Xpetra::TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>&>(*lumpedDiagonal);
           paramList.set("chebyshev: operator inv diagonal",tmpVec.getTpetra_Vector());
         }
diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_SaPFactory_def.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_SaPFactory_def.hpp
index 0654fdeaa871..4eafe08bb1bf 100644
--- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_SaPFactory_def.hpp
+++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_SaPFactory_def.hpp
@@ -178,7 +178,7 @@ namespace MueLu {
           Coordinate stopTol = 1e-4;
           if (useAbsValueRowSum) {
             const bool returnReciprocal=true;
-            invDiag = Utilities::GetLumpedMatrixDiagonal(A,returnReciprocal);
+            invDiag = Utilities::GetLumpedMatrixDiagonal(*A,returnReciprocal);
             TEUCHOS_TEST_FOR_EXCEPTION(invDiag.is_null(), Exceptions::RuntimeError,
                                        "SaPFactory: eigenvalue estimate: diagonal reciprocal is null.");
             lambdaMax = Utilities::PowerMethod(*A, invDiag, maxEigenIterations, stopTol);
@@ -202,7 +202,7 @@ namespace MueLu {
         else if (invDiag == Teuchos::null) {
           GetOStream(Runtime0) << "Using rowsumabs diagonal" << std::endl;
           const bool returnReciprocal=true;
-          invDiag = Utilities::GetLumpedMatrixDiagonal(A,returnReciprocal);
+          invDiag = Utilities::GetLumpedMatrixDiagonal(*A,returnReciprocal);
           TEUCHOS_TEST_FOR_EXCEPTION(invDiag.is_null(), Exceptions::RuntimeError, "SaPFactory: diagonal reciprocal is null.");
         }	
 	
diff --git a/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp b/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp
index e3e9e47f8700..ae68787366ed 100644
--- a/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp
+++ b/packages/muelu/src/Utils/MueLu_UtilitiesBase_decl.hpp
@@ -166,58 +166,23 @@ namespace MueLu {
     static RCP<Vector> GetMatrixDiagonalInverse(const Matrix& A, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero()) {
       Teuchos::TimeMonitor MM = *Teuchos::TimeMonitor::getNewTimer("UtilitiesBase::GetMatrixDiagonalInverse");
 
-      RCP<const Matrix> rcpA = Teuchos::rcpFromRef(A);
-
-      RCP<const Map> rowMap = rcpA->getRowMap();
+      RCP<const Map> rowMap = A.getRowMap();
       RCP<Vector> diag = Xpetra::VectorFactory<Scalar,LocalOrdinal,GlobalOrdinal,Node>::Build(rowMap,true);
 
-      rcpA->getLocalDiagCopy(*diag);
+      A.getLocalDiagCopy(*diag);
 
       RCP<Vector> inv = MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetInverse(diag, tol, tolReplacement);
 
       return inv;
     }
 
-
-
-    /*! @brief Extract Matrix Diagonal of lumped matrix
-
-    Returns Matrix diagonal of lumped matrix in ArrayRCP.
-
-    NOTE -- it's assumed that A has been fillComplete'd.
-    */
-    static Teuchos::ArrayRCP<Scalar> GetLumpedMatrixDiagonal(const Matrix& A, const bool doReciprocal = false, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero()) {
-
-      size_t numRows = A.getRowMap()->getNodeNumElements();
-      const Scalar zero = Teuchos::ScalarTraits<Scalar>::zero();
-      const Scalar one = Teuchos::ScalarTraits<Scalar>::one();
-      Teuchos::ArrayRCP<Scalar> diag(numRows);
-      Teuchos::ArrayView<const LocalOrdinal> cols;
-      Teuchos::ArrayView<const Scalar> vals;
-      for (size_t i = 0; i < numRows; ++i) {
-        A.getLocalRowView(i, cols, vals);
-        diag[i] = zero;
-        for (LocalOrdinal j = 0; j < cols.size(); ++j) {
-          diag[i] += Teuchos::ScalarTraits<Scalar>::magnitude(vals[j]);
-        }
-      }
-      if (doReciprocal) {
-        for (size_t i = 0; i < numRows; ++i)
-          if(Teuchos::ScalarTraits<Scalar>::magnitude(diag[i]) > tol)
-            diag[i] = one / diag[i];
-          else
-            diag[i] = tolReplacement;
-      }
-      return diag;
-    }
-
     /*! @brief Extract Matrix Diagonal of lumped matrix
 
-    Returns Matrix diagonal of lumped matrix in ArrayRCP.
+    Returns Matrix diagonal of lumped matrix in RCP<Vector>.
 
     NOTE -- it's assumed that A has been fillComplete'd.
     */
-    static Teuchos::RCP<Vector> GetLumpedMatrixDiagonal(Teuchos::RCP<const Matrix> rcpA, const bool doReciprocal = false,
+    static Teuchos::RCP<Vector> GetLumpedMatrixDiagonal(Matrix const & A, const bool doReciprocal = false,
                                                         Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100,
                                                         Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero()) {
 
@@ -225,6 +190,8 @@ namespace MueLu {
       const Scalar zero = Teuchos::ScalarTraits<Scalar>::zero();
       const Scalar one = Teuchos::ScalarTraits<Scalar>::one();
 
+      Teuchos::RCP<const Matrix> rcpA = Teuchos::rcpFromRef(A);
+
       RCP<const Xpetra::BlockedCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > bA =
           Teuchos::rcp_dynamic_cast<const Xpetra::BlockedCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >(rcpA);
       if(bA == Teuchos::null) {
@@ -251,9 +218,6 @@ namespace MueLu {
       } else {
         TEUCHOS_TEST_FOR_EXCEPTION(doReciprocal, Xpetra::Exceptions::RuntimeError,
           "UtilitiesBase::GetLumpedMatrixDiagonal(): extracting reciprocal of diagonal of a blocked matrix is not supported");
-        //TEUCHOS_TEST_FOR_EXCEPTION(bA->Rows() != bA->Cols(), Xpetra::Exceptions::RuntimeError,
-        //  "UtilitiesBase::GetLumpedMatrixDiagonal(): you cannot extract the diagonal of a "<< bA->Rows() << "x"<< bA->Cols() << " blocked matrix." );
-
         diag = Xpetra::VectorFactory<Scalar,LocalOrdinal,GlobalOrdinal,Node>::Build(bA->getRangeMapExtractor()->getFullMap(),true);
 
         for (size_t row = 0; row < bA->Rows(); ++row) {
@@ -262,7 +226,7 @@ namespace MueLu {
               // if we are in Thyra mode, but the block (row,row) is again a blocked operator, we have to use (pseudo) Xpetra-style GIDs with offset!
               bool bThyraMode = bA->getRangeMapExtractor()->getThyraMode() && (Teuchos::rcp_dynamic_cast<Xpetra::BlockedCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >(bA->getMatrix(row,col)) == Teuchos::null);
               RCP<Vector> ddtemp = bA->getRangeMapExtractor()->ExtractVector(diag,row,bThyraMode);
-              RCP<const Vector> dd = GetLumpedMatrixDiagonal(bA->getMatrix(row,col));
+              RCP<const Vector> dd = GetLumpedMatrixDiagonal(*(bA->getMatrix(row,col)));
               ddtemp->update(Teuchos::as<Scalar>(1.0),*dd,Teuchos::as<Scalar>(1.0));
               bA->getRangeMapExtractor()->InsertVector(ddtemp,row,diag,bThyraMode);
             }
diff --git a/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp b/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp
index 271037ee9679..ae7ca5759d6d 100644
--- a/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp
+++ b/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp
@@ -208,9 +208,7 @@ namespace MueLu {
     static RCP<Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >          Crs2Op(RCP<Xpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > Op) { return UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::Crs2Op(Op); }
     static Teuchos::ArrayRCP<Scalar>                                             GetMatrixDiagonal(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetMatrixDiagonal(A); }
     static RCP<Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> >          GetMatrixDiagonalInverse(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetMatrixDiagonalInverse(A,tol); }
-    static Teuchos::ArrayRCP<Scalar>                                             GetLumpedMatrixDiagonal(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero())
-    { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetLumpedMatrixDiagonal(A, doReciprocal, tol, tolReplacement); }
-    static Teuchos::RCP<Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > GetLumpedMatrixDiagonal(Teuchos::RCP<const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero())
+    static Teuchos::RCP<Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > GetLumpedMatrixDiagonal(Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> const &A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero())
     { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetLumpedMatrixDiagonal(A, doReciprocal, tol, tolReplacement); }
     static RCP<Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> >          GetMatrixOverlappedDiagonal(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& A) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetMatrixOverlappedDiagonal(A); }
     static Teuchos::RCP<Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > GetInverse(Teuchos::RCP<const Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > v, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero()) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetInverse(v,tol,tolReplacement); }
@@ -585,8 +583,7 @@ namespace MueLu {
     static RCP<Matrix>                                                           Crs2Op(RCP<CrsMatrix> Op) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::Crs2Op(Op); }
     static Teuchos::ArrayRCP<Scalar>                                             GetMatrixDiagonal(const Matrix& A) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetMatrixDiagonal(A); }
     static RCP<Vector>                                                           GetMatrixDiagonalInverse(const Matrix& A, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetMatrixDiagonalInverse(A,tol); }
-    static Teuchos::ArrayRCP<Scalar>                                             GetLumpedMatrixDiagonal(const Matrix& A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero()) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetLumpedMatrixDiagonal(A, doReciprocal, tol, tolReplacement); }
-    static Teuchos::RCP<Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > GetLumpedMatrixDiagonal(Teuchos::RCP<const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero())
+    static Teuchos::RCP<Xpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > GetLumpedMatrixDiagonal(Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> const &A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero())
     { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetLumpedMatrixDiagonal(A, doReciprocal, tol, tolReplacement); }
     static RCP<Vector>                                                           GetMatrixOverlappedDiagonal(const Matrix& A) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetMatrixOverlappedDiagonal(A); }
     static RCP<Vector>                                                           GetInverse(Teuchos::RCP<const Vector> v, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero()) { return MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetInverse(v,tol,tolReplacement); }
diff --git a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_decl.hpp b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_decl.hpp
index 78af7bf51aed..5eceebd91590 100644
--- a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_decl.hpp
+++ b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_decl.hpp
@@ -386,10 +386,7 @@ namespace MueLu {
     static RCP<Vector> GetMatrixDiagonalInverse(const Matrix& A, Magnitude tol = Teuchos::ScalarTraits<SC>::eps()*100, const bool doLumped=false) {
       return UtilitiesBase::GetMatrixDiagonalInverse(A, tol, doLumped);
     }
-    static ArrayRCP<SC> GetLumpedMatrixDiagonal(const Matrix& A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero()) {
-      return UtilitiesBase::GetLumpedMatrixDiagonal(A, doReciprocal, tol, tolReplacement);
-    }
-    static RCP<Vector> GetLumpedMatrixDiagonal(RCP<const Matrix > A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero()) {
+    static RCP<Vector> GetLumpedMatrixDiagonal(Matrix const &A, const bool doReciprocal=false, Magnitude tol = Teuchos::ScalarTraits<Scalar>::eps()*100, Scalar tolReplacement = Teuchos::ScalarTraits<Scalar>::zero()) {
       return UtilitiesBase::GetLumpedMatrixDiagonal(A, doReciprocal, tol, tolReplacement);
     }
     static RCP<Vector> GetMatrixOverlappedDiagonal(const Matrix& A) {
diff --git a/packages/muelu/test/maxwell/Maxwell3D.cpp b/packages/muelu/test/maxwell/Maxwell3D.cpp
index 3f6cfd3d10cc..d13c1bdbd050 100644
--- a/packages/muelu/test/maxwell/Maxwell3D.cpp
+++ b/packages/muelu/test/maxwell/Maxwell3D.cpp
@@ -713,7 +713,7 @@ int main_(Teuchos::CommandLineProcessor &clp, Xpetra::UnderlyingLib lib, int arg
     // nodal mass matrix
     RCP<Matrix> M0_Matrix = Xpetra::IO<SC, LO, GO, NO>::Read(M0_file, node_map);
     // build lumped mass matrix inverse (M0inv_Matrix)
-    RCP<Vector> diag = Utilities::GetLumpedMatrixDiagonal(M0_Matrix);
+    RCP<Vector> diag = Utilities::GetLumpedMatrixDiagonal(*M0_Matrix);
     RCP<CrsMatrixWrap> M0inv_MatrixWrap = rcp(new CrsMatrixWrap(node_map, node_map, 0));
     RCP<CrsMatrix> M0inv_CrsMatrix = M0inv_MatrixWrap->getCrsMatrix();
     Teuchos::ArrayRCP<size_t> rowPtr;
diff --git a/packages/tpetra/core/src/Tpetra_MultiVector_decl.hpp b/packages/tpetra/core/src/Tpetra_MultiVector_decl.hpp
index 377fb52a39e8..b5f9d5514cf0 100644
--- a/packages/tpetra/core/src/Tpetra_MultiVector_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_MultiVector_decl.hpp
@@ -823,8 +823,9 @@ namespace Tpetra {
     /// must access the device View directly by calling
     /// getLocalView().  Please see modify(), sync(), and the
     /// discussion of DualView semantics elsewhere in the
-    /// documentation.  You are responsible for calling modify() and
-    /// sync(), if needed; this method doesn't do that.
+    /// documentation.
+    /// This method calls sync_host() before modifying
+    /// host data, and modify_host() afterwards.
     ///
     /// This method does not have an "atomic" option like
     /// sumIntoGlobalValue.  This is deliberate.  Replacement is not
@@ -842,7 +843,7 @@ namespace Tpetra {
     void
     replaceGlobalValue (const GlobalOrdinal gblRow,
                         const size_t col,
-                        const impl_scalar_type& value) const;
+                        const impl_scalar_type& value);
 
     /// \brief Like the above replaceGlobalValue, but only enabled if
     ///   T differs from impl_scalar_type.
@@ -859,9 +860,9 @@ namespace Tpetra {
     /// and you want to modify the non-host version of the data, you
     /// must access the device View directly by calling getLocalView().
     /// Please see modify(), sync(), and the discussion of DualView
-    /// semantics elsewhere in the documentation.  You are responsible
-    /// for calling modify() and sync(), if needed; this method
-    /// doesn't do that.
+    /// semantics elsewhere in the documentation.
+    /// This method calls sync_host() before modifying
+    /// host data, and modify_host() afterwards.
     ///
     /// This method does not have an "atomic" option like
     /// sumIntoGlobalValue.  This is deliberate.  Replacement is not
@@ -880,7 +881,7 @@ namespace Tpetra {
     typename std::enable_if<! std::is_same<T, impl_scalar_type>::value && std::is_convertible<T, impl_scalar_type>::value, void>::type
     replaceGlobalValue (GlobalOrdinal globalRow,
                         size_t col,
-                        const T& value) const
+                        const T& value)
     {
       replaceGlobalValue (globalRow, col, static_cast<impl_scalar_type> (value));
     }
@@ -897,8 +898,9 @@ namespace Tpetra {
     /// must access the device View directly by calling
     /// getLocalView().  Please see modify(), sync(), and the
     /// discussion of DualView semantics elsewhere in the
-    /// documentation.  You are responsible for calling modify() and
-    /// sync(), if needed; this method doesn't do that.
+    /// documentation.
+    /// This method calls sync_host() before modifying
+    /// host data, and modify_host() afterwards.
     ///
     /// \param gblRow [in] Global row index of the entry to modify.
     ///   This <i>must</i> be a valid global row index on the calling
@@ -912,7 +914,7 @@ namespace Tpetra {
     sumIntoGlobalValue (const GlobalOrdinal gblRow,
                         const size_t col,
                         const impl_scalar_type& value,
-                        const bool atomic = useAtomicUpdatesByDefault) const;
+                        const bool atomic = useAtomicUpdatesByDefault);
 
     /// \brief Like the above sumIntoGlobalValue, but only enabled if
     ///   T differs from impl_scalar_type.
@@ -930,8 +932,9 @@ namespace Tpetra {
     /// must access the device View directly by calling
     /// getLocalView().  Please see modify(), sync(), and the
     /// discussion of DualView semantics elsewhere in the
-    /// documentation.  You are responsible for calling modify() and
-    /// sync(), if needed; this method doesn't do that.
+    /// documentation.
+    /// This method calls sync_host() before modifying
+    /// host data, and modify_host() afterwards.
     ///
     /// \param gblRow [in] Global row index of the entry to modify.
     ///   This <i>must</i> be a valid global row index on the calling
@@ -946,7 +949,7 @@ namespace Tpetra {
     sumIntoGlobalValue (const GlobalOrdinal gblRow,
                         const size_t col,
                         const T& val,
-                        const bool atomic = useAtomicUpdatesByDefault) const
+                        const bool atomic = useAtomicUpdatesByDefault)
     {
       sumIntoGlobalValue (gblRow, col, static_cast<impl_scalar_type> (val), atomic);
     }
@@ -963,8 +966,9 @@ namespace Tpetra {
     /// must access the device View directly by calling
     /// getLocalView().  Please see modify(), sync(), and the
     /// discussion of DualView semantics elsewhere in the
-    /// documentation.  You are responsible for calling modify() and
-    /// sync(), if needed; this method doesn't do that.
+    /// documentation.
+    /// This method calls sync_host() before modifying
+    /// host data, and modify_host() afterwards.
     ///
     /// This method does not have an "atomic" option like
     /// sumIntoLocalValue.  This is deliberate.  Replacement is not
@@ -982,7 +986,7 @@ namespace Tpetra {
     void
     replaceLocalValue (const LocalOrdinal lclRow,
                        const size_t col,
-                       const impl_scalar_type& value) const;
+                       const impl_scalar_type& value);
 
     /// \brief Like the above replaceLocalValue, but only enabled if
     ///   T differs from impl_scalar_type.
@@ -1000,8 +1004,9 @@ namespace Tpetra {
     /// must access the device View directly by calling
     /// getLocalView().  Please see modify(), sync(), and the
     /// discussion of DualView semantics elsewhere in the
-    /// documentation.  You are responsible for calling modify() and
-    /// sync(), if needed; this method doesn't do that.
+    /// documentation.
+    /// This method calls sync_host() before modifying
+    /// host data, and modify_host() afterwards.
     ///
     /// This method does not have an "atomic" option like
     /// sumIntoLocalValue.  This is deliberate.  Replacement is not
@@ -1020,7 +1025,7 @@ namespace Tpetra {
     typename std::enable_if<! std::is_same<T, impl_scalar_type>::value && std::is_convertible<T, impl_scalar_type>::value, void>::type
     replaceLocalValue (const LocalOrdinal lclRow,
                        const size_t col,
-                       const T& val) const
+                       const T& val)
     {
       replaceLocalValue (lclRow, col, static_cast<impl_scalar_type> (val));
     }
@@ -1037,8 +1042,9 @@ namespace Tpetra {
     /// must access the device View directly by calling
     /// getLocalView().  Please see modify(), sync(), and the
     /// discussion of DualView semantics elsewhere in the
-    /// documentation.  You are responsible for calling modify() and
-    /// sync(), if needed; this method doesn't do that.
+    /// documentation.
+    /// This method calls sync_host() before modifying
+    /// host data, and modify_host() afterwards.
     ///
     /// \param lclRow [in] Local row index of the entry to modify.
     ///   Must be a valid local index in this MultiVector's Map on the
@@ -1052,7 +1058,7 @@ namespace Tpetra {
     sumIntoLocalValue (const LocalOrdinal lclRow,
                        const size_t col,
                        const impl_scalar_type& val,
-                       const bool atomic = useAtomicUpdatesByDefault) const;
+                       const bool atomic = useAtomicUpdatesByDefault);
 
     /// \brief Like the above sumIntoLocalValue, but only enabled if
     ///   T differs from impl_scalar_type.
@@ -1070,8 +1076,9 @@ namespace Tpetra {
     /// must access the device View directly by calling
     /// getLocalView().  Please see modify(), sync(), and the
     /// discussion of DualView semantics elsewhere in the
-    /// documentation.  You are responsible for calling modify() and
-    /// sync(), if needed; this method doesn't do that.
+    /// documentation.
+    /// This method calls sync_host() before modifying
+    /// host data, and modify_host() afterwards.
     ///
     /// \param lclRow [in] Local row index of the entry to modify.
     /// \param col [in] Column index of the entry to modify.
@@ -1084,7 +1091,7 @@ namespace Tpetra {
     sumIntoLocalValue (const LocalOrdinal lclRow,
                        const size_t col,
                        const T& val,
-                       const bool atomic = useAtomicUpdatesByDefault) const
+                       const bool atomic = useAtomicUpdatesByDefault)
     {
       sumIntoLocalValue (lclRow, col, static_cast<impl_scalar_type> (val), atomic);
     }
diff --git a/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp b/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp
index 3269dc9e22f9..3d44c695cb73 100644
--- a/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp
@@ -2044,7 +2044,7 @@ namespace Tpetra {
   namespace { // (anonymous)
     template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
     typename MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::dot_type
-    multiVectorSingleColumnDot (MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
+    multiVectorSingleColumnDot (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
                                 const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& y)
     {
       using ::Tpetra::Details::ProfilingRegion;
@@ -2070,14 +2070,9 @@ namespace Tpetra {
         dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
 
         // All non-unary kernels are executed on the device as per Tpetra policy.  Sync to device if needed.
-        if (x.need_sync_device ()) {
-          x.sync_device ();
-        }
-        if (y.need_sync_device ()) {
-          const_cast<MV&>(y).sync_device ();
-        }
+        const_cast<MV&>(x).sync_device ();
+        const_cast<MV&>(y).sync_device ();
 
-        x.modify_device ();
         auto x_2d = x.getLocalViewDevice ();
         auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
         auto y_2d = y.getLocalViewDevice ();
@@ -2139,7 +2134,7 @@ namespace Tpetra {
        numDots << " != this->getNumVectors() = " << numVecs << ".");
 
     if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
-      const dot_type gblDot = multiVectorSingleColumnDot (const_cast<MV&> (*this), A);
+      const dot_type gblDot = multiVectorSingleColumnDot (*this, A);
       dots[0] = gblDot;
     }
     else {
@@ -4145,26 +4140,30 @@ namespace Tpetra {
   MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   replaceLocalValue (const LocalOrdinal lclRow,
                      const size_t col,
-                     const impl_scalar_type& ScalarValue) const
+                     const impl_scalar_type& ScalarValue)
   {
-#ifdef HAVE_TPETRA_DEBUG
-    const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
-    const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
-    TEUCHOS_TEST_FOR_EXCEPTION(
-      lclRow < minLocalIndex || lclRow > maxLocalIndex,
-      std::runtime_error,
-      "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
-      << " is invalid.  The range of valid row indices on this process "
-      << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
-      << ", " << maxLocalIndex << "].");
-    TEUCHOS_TEST_FOR_EXCEPTION(
-      vectorIndexOutOfRange(col),
-      std::runtime_error,
-      "Tpetra::MultiVector::replaceLocalValue: vector index " << col
-      << " of the multivector is invalid.");
-#endif
+    if (::Tpetra::Details::Behavior::debug()) {
+      const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
+      const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
+      TEUCHOS_TEST_FOR_EXCEPTION(
+        lclRow < minLocalIndex || lclRow > maxLocalIndex,
+        std::runtime_error,
+        "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
+        << " is invalid.  The range of valid row indices on this process "
+        << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
+        << ", " << maxLocalIndex << "].");
+      TEUCHOS_TEST_FOR_EXCEPTION(
+        vectorIndexOutOfRange(col),
+        std::runtime_error,
+        "Tpetra::MultiVector::replaceLocalValue: vector index " << col
+        << " of the multivector is invalid.");
+    }
+
+    this->sync_host();
+
     const size_t colInd = isConstantStride () ? col : whichVectors_[col];
     view_.h_view (lclRow, colInd) = ScalarValue;
+    this->modify_host();
   }
 
 
@@ -4174,24 +4173,27 @@ namespace Tpetra {
   sumIntoLocalValue (const LocalOrdinal lclRow,
                      const size_t col,
                      const impl_scalar_type& value,
-                     const bool atomic) const
+                     const bool atomic)
   {
-#ifdef HAVE_TPETRA_DEBUG
-    const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
-    const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
-    TEUCHOS_TEST_FOR_EXCEPTION(
-      lclRow < minLocalIndex || lclRow > maxLocalIndex,
-      std::runtime_error,
-      "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
-      << " is invalid.  The range of valid row indices on this process "
-      << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
-      << ", " << maxLocalIndex << "].");
-    TEUCHOS_TEST_FOR_EXCEPTION(
-      vectorIndexOutOfRange(col),
-      std::runtime_error,
-      "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
-      << " of the multivector is invalid.");
-#endif
+    if (::Tpetra::Details::Behavior::debug()) {
+      const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
+      const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
+      TEUCHOS_TEST_FOR_EXCEPTION(
+        lclRow < minLocalIndex || lclRow > maxLocalIndex,
+        std::runtime_error,
+        "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
+        << " is invalid.  The range of valid row indices on this process "
+        << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
+        << ", " << maxLocalIndex << "].");
+      TEUCHOS_TEST_FOR_EXCEPTION(
+        vectorIndexOutOfRange(col),
+        std::runtime_error,
+        "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
+        << " of the multivector is invalid.");
+    }
+
+    this->sync_host();
+
     const size_t colInd = isConstantStride () ? col : whichVectors_[col];
     if (atomic) {
       Kokkos::atomic_add (& (view_.h_view(lclRow, colInd)), value);
@@ -4199,6 +4201,7 @@ namespace Tpetra {
     else {
       view_.h_view (lclRow, colInd) += value;
     }
+    this->modify_host();
   }
 
 
@@ -4207,22 +4210,24 @@ namespace Tpetra {
   MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   replaceGlobalValue (const GlobalOrdinal gblRow,
                       const size_t col,
-                      const impl_scalar_type& ScalarValue) const
+                      const impl_scalar_type& ScalarValue)
   {
     // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
     // touches the RCP's reference count, which isn't thread safe.
     const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
-#ifdef HAVE_TPETRA_DEBUG
-    const char tfecfFuncName[] = "replaceGlobalValue: ";
-    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-      (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
-       std::runtime_error,
-       "Global row index " << gblRow << " is not present on this process "
-       << this->getMap ()->getComm ()->getRank () << ".");
-    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
-      (this->vectorIndexOutOfRange (col), std::runtime_error,
-       "Vector index " << col << " of the MultiVector is invalid.");
-#endif // HAVE_TPETRA_DEBUG
+
+    if (::Tpetra::Details::Behavior::debug()) {
+      const char tfecfFuncName[] = "replaceGlobalValue: ";
+      TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+        (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
+         std::runtime_error,
+         "Global row index " << gblRow << " is not present on this process "
+         << this->getMap ()->getComm ()->getRank () << ".");
+      TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
+        (this->vectorIndexOutOfRange (col), std::runtime_error,
+         "Vector index " << col << " of the MultiVector is invalid.");
+    }
+
     this->replaceLocalValue (lclRow, col, ScalarValue);
   }
 
@@ -4232,24 +4237,26 @@ namespace Tpetra {
   sumIntoGlobalValue (const GlobalOrdinal globalRow,
                       const size_t col,
                       const impl_scalar_type& value,
-                      const bool atomic) const
+                      const bool atomic)
   {
     // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
     // touches the RCP's reference count, which isn't thread safe.
     const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
-#ifdef HAVE_TEUCHOS_DEBUG
-    TEUCHOS_TEST_FOR_EXCEPTION(
-      lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
-      std::runtime_error,
-      "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
-      << " is not present on this process "
-      << this->getMap ()->getComm ()->getRank () << ".");
-    TEUCHOS_TEST_FOR_EXCEPTION(
-      vectorIndexOutOfRange(col),
-      std::runtime_error,
-      "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
-      << " of the multivector is invalid.");
-#endif
+
+    if (::Tpetra::Details::Behavior::debug()) {
+      TEUCHOS_TEST_FOR_EXCEPTION(
+          lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
+          std::runtime_error,
+          "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
+          << " is not present on this process "
+          << this->getMap ()->getComm ()->getRank () << ".");
+      TEUCHOS_TEST_FOR_EXCEPTION(
+          vectorIndexOutOfRange(col),
+          std::runtime_error,
+          "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
+          << " of the multivector is invalid.");
+    }
+
     this->sumIntoLocalValue (lclRow, col, value, atomic);
   }
 
diff --git a/packages/tpetra/core/src/Tpetra_Vector_decl.hpp b/packages/tpetra/core/src/Tpetra_Vector_decl.hpp
index 387670736cbc..7116c0455c90 100644
--- a/packages/tpetra/core/src/Tpetra_Vector_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_Vector_decl.hpp
@@ -262,7 +262,7 @@ class Vector : public MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>
   //! Replace current value at the specified location with specified value.
   /** \pre \c globalRow must be a valid global element on this node, according to the row map.
    */
-  void replaceGlobalValue (const GlobalOrdinal globalRow, const Scalar& value) const;
+  void replaceGlobalValue (const GlobalOrdinal globalRow, const Scalar& value);
 
   /// \brief Add value to existing value, using global (row) index.
   ///
@@ -286,12 +286,12 @@ class Vector : public MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>
   void
   sumIntoGlobalValue (const GlobalOrdinal globalRow,
                       const Scalar& value,
-                      const bool atomic = base_type::useAtomicUpdatesByDefault) const;
+                      const bool atomic = base_type::useAtomicUpdatesByDefault);
 
   //! Replace current value at the specified location with specified values.
   /** \pre \c localRow must be a valid local element on this node, according to the row map.
    */
-  void replaceLocalValue (const LocalOrdinal myRow, const Scalar& value) const;
+  void replaceLocalValue (const LocalOrdinal myRow, const Scalar& value);
 
   /// \brief Add \c value to existing value, using local (row) index.
   ///
@@ -313,7 +313,7 @@ class Vector : public MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>
   void
   sumIntoLocalValue (const LocalOrdinal myRow,
                      const Scalar& value,
-                     const bool atomic = base_type::useAtomicUpdatesByDefault) const;
+                     const bool atomic = base_type::useAtomicUpdatesByDefault);
 
   //@}
 
diff --git a/packages/tpetra/core/src/Tpetra_Vector_def.hpp b/packages/tpetra/core/src/Tpetra_Vector_def.hpp
index 18a0c3bd16a0..90a235f1a3aa 100644
--- a/packages/tpetra/core/src/Tpetra_Vector_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_Vector_def.hpp
@@ -118,7 +118,7 @@ namespace Tpetra {
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
   Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-  replaceGlobalValue (const GlobalOrdinal globalRow, const Scalar& value) const {
+  replaceGlobalValue (const GlobalOrdinal globalRow, const Scalar& value) {
     this->base_type::replaceGlobalValue (globalRow, 0, value);
   }
 
@@ -127,7 +127,7 @@ namespace Tpetra {
   Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   sumIntoGlobalValue (const GlobalOrdinal globalRow,
                       const Scalar& value,
-                      const bool atomic) const
+                      const bool atomic)
   {
     this->base_type::sumIntoGlobalValue (globalRow, 0, value, atomic);
   }
@@ -135,7 +135,7 @@ namespace Tpetra {
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
   Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-  replaceLocalValue (const LocalOrdinal myRow, const Scalar& value) const {
+  replaceLocalValue (const LocalOrdinal myRow, const Scalar& value) {
     this->base_type::replaceLocalValue (myRow, 0, value);
   }
 
@@ -144,7 +144,7 @@ namespace Tpetra {
   Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   sumIntoLocalValue (const LocalOrdinal globalRow,
                      const Scalar& value,
-                     const bool atomic) const
+                     const bool atomic)
   {
     this->base_type::sumIntoLocalValue (globalRow, 0, value, atomic);
   }
diff --git a/packages/tpetra/core/test/MultiVector/MultiVector_UnitTests.cpp b/packages/tpetra/core/test/MultiVector/MultiVector_UnitTests.cpp
index 5f0eeb0a1a1a..4a0617779b13 100644
--- a/packages/tpetra/core/test/MultiVector/MultiVector_UnitTests.cpp
+++ b/packages/tpetra/core/test/MultiVector/MultiVector_UnitTests.cpp
@@ -1749,7 +1749,7 @@ namespace {
     mvec2.norm1(norms2());
     std::fill(ans.begin(), ans.end(), M0);
     TEST_COMPARE_FLOATING_ARRAYS(norms1,ans,M0);
-    TEST_COMPARE_FLOATING_ARRAYS(norms1,ans,M0);
+    TEST_COMPARE_FLOATING_ARRAYS(norms2,ans,M0);
     // replace local entries s.t.
     // mvec1 = [1 1]  and  mvec2 = [0 0]
     //         [0 0]               [1 1]
@@ -2705,6 +2705,8 @@ namespace {
   ////
   TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( MultiVector, ScaleAndAssign, LO , GO , Scalar , Node )
   {
+    std::cerr << std::endl;
+
     typedef typename ScalarTraits<Scalar>::magnitudeType Mag;
     typedef Tpetra::MultiVector<Scalar,LO,GO,Node> MV;
     typedef Tpetra::Vector<Scalar,LO,GO,Node>       V;
@@ -2735,12 +2737,13 @@ namespace {
     // Also, ensure that other vectors aren't changed
 
     out << "Create A, and fill with random numbers" << endl;
-    MV A (map, numVectors, false);
-    A.randomize ();
+    MV A(map, numVectors, false);
+    A.randomize();
+    A.sync_host();
 
     out << "Stash away norms of columns of A" << endl;
     Array<Mag> Anrms(numVectors);
-    A.norm1 (Anrms ());
+    A.norm1(Anrms());
 
     out << "Test B := A*2, using different methods" << endl;
     // set B = A * 2, using different techniques
@@ -2769,8 +2772,8 @@ namespace {
             std::ostringstream os;
             os << ">>> Proc " << comm->getSize ();
             os << ": A.modified_host: " << (A.need_sync_device ()?1:0);
-            os  << ", A.modified_device: " << (A.need_sync_host ()?1:0);
-            os << ": B.modified_host: " << (B.need_sync_device ()?1:0);
+            os << ", A.modified_device: " << (A.need_sync_host ()?1:0);
+            os << ", B.modified_host: " << (B.need_sync_device ()?1:0);
             os << ", B.modified_device: " << (B.need_sync_host ()?1:0);
             os << std::endl;
             std::cerr << os.str ();