diff --git a/packages/muelu/src/Utils/MueLu_PerfModels_decl.hpp b/packages/muelu/src/Utils/MueLu_PerfModels_decl.hpp
index 1366eb231e68..bd5eed1f59c1 100644
--- a/packages/muelu/src/Utils/MueLu_PerfModels_decl.hpp
+++ b/packages/muelu/src/Utils/MueLu_PerfModels_decl.hpp
@@ -78,6 +78,7 @@ namespace MueLu {
 
     /* This version is for table interpolation and works on chars, so the LOG_MAX_SIZE is for bytes */
     void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20);
+    bool has_stream_vector_table() const {return !stream_sizes_.empty();}  // true once stream_sizes_ holds at least one entry
 
     /* Lookup in the stream_vector table */
     double stream_vector_copy_lookup(int SIZE_IN_BYTES);
@@ -91,8 +92,8 @@ namespace MueLu {
     double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES);   
 
     /* Print table */
-    void print_stream_vector_table(std::ostream & out);
-    void print_latency_corrected_stream_vector_table(std::ostream & out);
+    void print_stream_vector_table(std::ostream & out, const std::string & prefix="");  // prefix is written before the header, divider and every row
+    void print_latency_corrected_stream_vector_table(std::ostream & out, const std::string & prefix="");  // same layout, with latency correction enabled
 
     /* A latency test between two processes based upon the MVAPICH OSU Micro-Benchmarks.
      * The sender process sends a message and then waits for confirmation of reception.
@@ -102,13 +103,14 @@ namespace MueLu {
      * See further: https://mvapich.cse.ohio-state.edu/benchmarks/
      */    
     void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm);
+    bool has_pingpong_table() const {return pingpong_sizes_.size() != 0;}  // true when pingpong_sizes_ is non-empty
 
     /* Lookup in the pingpong_vector table */
     double pingpong_host_lookup(int SIZE_IN_BYTES);
     double pingpong_device_lookup(int SIZE_IN_BYTES);
 
     /* Print table */
-    void print_pingpong_table(std::ostream & out);
+    void print_pingpong_table(std::ostream & out, const std::string & prefix="");  // prints nothing when the table is empty; prefix starts every printed line
 
     /* A halo-exchange based ping-pong, inspired by halo-mode in MPPTEST from ANL.
      * Here we use exactly the communication pattern specified in the import object
@@ -118,13 +120,14 @@ namespace MueLu {
      * See further: https://www.mcs.anl.gov/research/projects/mpi/mpptest/
      */
     void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal,GlobalOrdinal,Node> > & import);
+    bool has_halopong_table() const {return halopong_sizes_.size() != 0;}  // true when halopong_sizes_ is non-empty
 
     /* Lookup in the halopong_vector table */
     double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
     double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
 
     /* Print table */
-    void print_halopong_table(std::ostream & out);
+    void print_halopong_table(std::ostream & out, const std::string & prefix="");  // prints nothing when the table is empty; prefix starts every printed line
 
 
 
@@ -133,15 +136,16 @@ namespace MueLu {
      * e.g., GPUS.
      */
     void launch_latency_make_table(int KERNEL_REPEATS);
+    bool has_launch_latency_table() const  {return launch_and_wait_latency_ > 0;}  // true when the stored launch+wait latency is positive; NOTE(review): presumably the member starts at a non-positive value before launch_latency_make_table runs - confirm
 
     /* Lookup launch latency */
     double launch_latency_lookup();
        
     /* Print table */
-    void print_launch_latency_table(std::ostream & out);
+    void print_launch_latency_table(std::ostream & out, const std::string & prefix="");  // prefix is written before the single latency line
 
   private:
-    void print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction);    
+    void print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction, const std::string & prefix);    // shared worker for both public stream-table printers; prefix starts every printed line
 
 
     std::vector<int>    stream_sizes_;
diff --git a/packages/muelu/src/Utils/MueLu_PerfModels_def.hpp b/packages/muelu/src/Utils/MueLu_PerfModels_def.hpp
index d97a9e4d0c65..5c9a70b4edff 100644
--- a/packages/muelu/src/Utils/MueLu_PerfModels_def.hpp
+++ b/packages/muelu/src/Utils/MueLu_PerfModels_def.hpp
@@ -422,31 +422,33 @@ namespace MueLu {
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
-  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table(std::ostream & out) {
-    print_stream_vector_table_impl(out,false);
+  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table(std::ostream & out, const std::string & prefix) {
+    print_stream_vector_table_impl(out,false,prefix);  // false: use_latency_correction off; prefix is forwarded unchanged
   }
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
-  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_latency_corrected_stream_vector_table(std::ostream & out) {
-    print_stream_vector_table_impl(out,true);
+  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_latency_corrected_stream_vector_table(std::ostream & out, const std::string & prefix) {
+    print_stream_vector_table_impl(out,true,prefix);  // true: use_latency_correction on; prefix is forwarded unchanged
   }
 
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
-  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction) {
+  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction, const std::string & prefix) {
     using namespace std;
     std::ios old_format(NULL);
     old_format.copyfmt(out);
 
-    out << setw(20) << "Length in Scalars" << setw(1) << " "
+    out << prefix
+        << setw(20) << "Length in Scalars" << setw(1) << " "
         << setw(20) << "COPY (us)" << setw(1) << " "
         << setw(20) << "ADD (us)" << setw(1) << " "
         << setw(20) << "COPY (GB/s)" << setw(1) << " "
         << setw(20) << "ADD (GB/s)" << std::endl;
 
-    out << setw(20) << "-----------------" << setw(1) << " "
+    out << prefix
+        << setw(20) << "-----------------" << setw(1) << " "
         << setw(20) << "---------" << setw(1) << " "
         << setw(20) << "--------" << setw(1) << " "
         << setw(20) << "-----------" << setw(1) << " "
@@ -462,7 +464,8 @@ namespace MueLu {
       double a_bw = PerfDetails::convert_time_to_bandwidth_gbs(a_time,1,size*sizeof(Scalar));
 
 
-      out << setw(20) << size << setw(1) << " "
+      out << prefix
+          << setw(20) << size << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << (c_time*1e6) << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << (a_time*1e6) << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << c_bw << setw(1) << " "
@@ -502,18 +505,20 @@ namespace MueLu {
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
-  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_pingpong_table(std::ostream & out) {
+  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_pingpong_table(std::ostream & out, const std::string & prefix) {
     if(pingpong_sizes_.size() == 0) return;
 
     using namespace std;
     std::ios old_format(NULL);
     old_format.copyfmt(out);
 
-    out << setw(20) << "Message Size" << setw(1) << " "
+    out << prefix
+        << setw(20) << "Message Size" << setw(1) << " "
         << setw(20) << "Host (us)" << setw(1) << " "
         << setw(20) << "Device (us)" << std::endl;
 
-    out << setw(20) << "------------" << setw(1) << " "
+    out << prefix
+        << setw(20) << "------------" << setw(1) << " "
         << setw(20) << "---------" << setw(1) << " "
         << setw(20) << "-----------" << std::endl;
     
@@ -524,7 +529,8 @@ namespace MueLu {
       double d_time = pingpong_device_times_[i];
 
 
-      out << setw(20) << size << setw(1) << " "
+      out << prefix
+          << setw(20) << size << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl;
     }
@@ -562,18 +568,20 @@ namespace MueLu {
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
-  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_halopong_table(std::ostream & out) {
+  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_halopong_table(std::ostream & out, const std::string & prefix) {
     if(halopong_sizes_.size() == 0) return;
 
     using namespace std;
     std::ios old_format(NULL);
     old_format.copyfmt(out);
 
-    out << setw(20) << "Message Size" << setw(1) << " "
+    out << prefix
+        << setw(20) << "Message Size" << setw(1) << " "
         << setw(20) << "Host (us)" << setw(1) << " "
         << setw(20) << "Device (us)" << std::endl;
 
-    out << setw(20) << "------------" << setw(1) << " "
+    out << prefix 
+        << setw(20) << "------------" << setw(1) << " "
         << setw(20) << "---------" << setw(1) << " "
         << setw(20) << "-----------" << std::endl;
     
@@ -584,7 +592,8 @@ namespace MueLu {
       double d_time = halopong_device_times_[i];
 
 
-      out << setw(20) << size << setw(1) << " "
+      out << prefix
+          << setw(20) << size << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl;
     }
@@ -629,12 +638,13 @@ namespace MueLu {
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
-  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_launch_latency_table(std::ostream & out) {
+  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_launch_latency_table(std::ostream & out, const std::string & prefix) {
     using namespace std;
     std::ios old_format(NULL);
     old_format.copyfmt(out);
 
-    out << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " " 
+    out << prefix
+        << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " " 
         << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_*1e6) << std::endl;
 
     out.copyfmt(old_format);
diff --git a/packages/muelu/test/scaling/CMakeLists.txt b/packages/muelu/test/scaling/CMakeLists.txt
index 48739297ee57..e36192802cb2 100644
--- a/packages/muelu/test/scaling/CMakeLists.txt
+++ b/packages/muelu/test/scaling/CMakeLists.txt
@@ -40,6 +40,15 @@ IF (${PACKAGE_NAME}_HAVE_TPETRA_SOLVER_STACK OR ${PACKAGE_NAME}_HAVE_EPETRA_SOLV
 
   INSTALL(TARGETS "${PACKAGE_NAME}_Driver")
 
+  # Perf Model: runs the Driver on a 40x40x40 Laplace3D problem with --performance-model=verbose; passes when the output contains "Belos converged"
+  TRIBITS_ADD_TEST(
+    Driver
+    NAME PerformanceModel
+    COMM mpi
+    ARGS "--nx=40 --ny=40 --nz=40 --matrixType=Laplace3D --performance-model=verbose"
+    PASS_REGULAR_EXPRESSION "Belos converged"
+  )
+
   # Do a simple weak scaling experiment (4x ranks and 4x grid size)
   TRIBITS_ADD_TEST(
     Driver
diff --git a/packages/muelu/test/scaling/Driver.cpp b/packages/muelu/test/scaling/Driver.cpp
index 7a55e318b8e5..e6c822cb04e0 100644
--- a/packages/muelu/test/scaling/Driver.cpp
+++ b/packages/muelu/test/scaling/Driver.cpp
@@ -47,6 +47,7 @@
 #include <iomanip>
 #include <iostream>
 #include <unistd.h>
+#include <vector>
 #include <sys/resource.h>
 
 #include <Teuchos_XMLParameterListHelpers.hpp>
@@ -77,6 +78,7 @@
 #include <MueLu_MutuallyExclusiveTime.hpp>
 #include <MueLu_ParameterListInterpreter.hpp>
 #include <MueLu_Utilities.hpp>
+#include <MueLu_PerfModelReporter.hpp>
 #include <MatrixLoad.hpp>
 #include <DriverCore.hpp>
 
@@ -277,8 +279,9 @@ int main_(Teuchos::CommandLineProcessor &clp, Xpetra::UnderlyingLib& lib, int ar
   int numReruns = 1;                                  clp.setOption("reruns",                &numReruns,  "number of reruns");
   std::string rerunFilePrefix;                             clp.setOption("fileprefix",              &rerunFilePrefix,      "if doing reruns, optional prefix to prepend to output files");
   std::string rerunFileSuffix;                             clp.setOption("filesuffix",              &rerunFileSuffix,      "if doing reruns, optional suffix to append to output files");
-
+  std::string  levelPerformanceModel  = "no";          clp.setOption("performance-model", &levelPerformanceModel,  "run the level-by-level performance model; options: 'no', 'yes' or 'verbose'");
   clp.recogniseAllOptions(true);
+
   switch (clp.parse(argc, argv)) {
     case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:        return EXIT_SUCCESS;
     case Teuchos::CommandLineProcessor::PARSE_ERROR:
@@ -542,6 +545,25 @@ MueLu::MueLu_AMGX_initialize_plugins();
 
 
       tm = Teuchos::null;
+
+
+      // Level-by-level SpMV performance-model report (best effort: a level that fails is reported on stderr and skipped).
+      const bool runLevelPerfModel = (levelPerformanceModel=="yes" || levelPerformanceModel=="verbose");
+      if(runLevelPerfModel && !H.is_null()) {
+        for(int i=0; i < H->GetNumLevels(); i++) {
+          const std::string level_name = "Level-" + std::to_string(i) + ": ";  // label handed to the reporter for this level
+          try {
+            RCP<Matrix> A_level = H->GetLevel(i)->Get<RCP<Matrix> >("A");
+            std::vector<const char *> timers;  // left empty; NOTE(review): a name such as "MueLu: Laplace2D: Hierarchy: Solve (level=0)" was presumably meant to go here - confirm
+            MueLu::report_spmv_performance_models<Matrix>(A_level,100,timers,globalTimeMonitor,level_name,levelPerformanceModel=="verbose");
+          }
+          catch(const std::exception & e) { std::cerr << level_name << "performance model skipped: " << e.what() << std::endl; }
+          catch(...)                      { std::cerr << level_name << "performance model skipped (unknown exception)" << std::endl; }
+        }
+      }
+     
+
+
       globalTimeMonitor = Teuchos::null;
       if (useStackedTimer)
         resetStackedTimer = true;
diff --git a/packages/muelu/test/scaling/MatvecKernelDriver.cpp b/packages/muelu/test/scaling/MatvecKernelDriver.cpp
index fc499d42e8cf..a2b5c07da4fc 100644
--- a/packages/muelu/test/scaling/MatvecKernelDriver.cpp
+++ b/packages/muelu/test/scaling/MatvecKernelDriver.cpp
@@ -64,8 +64,10 @@
 #include "MueLu.hpp"
 #include "MueLu_TestHelpers.hpp"
 #include "MueLu_PerfModels.hpp"
+#include "MueLu_PerfModelReporter.hpp"
 #include <MatrixLoad.hpp>
 
+
 #include "Xpetra_TpetraMultiVector.hpp"
 #include "Xpetra_TpetraImport.hpp"
 #include "Tpetra_CrsMatrix.hpp"
@@ -123,318 +125,6 @@ void print_crs_graph(std::string name, const V1 rowptr, const V2 colind) {
   printf("\n");
 }
 
-// =========================================================================
-// Performance Routines
-// =========================================================================
-// Report bandwidth in GB / sec
-const double GB = 1024.0 * 1024.0 * 1024.0;
-
-double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes) {
-
-  double time_per_call = time / num_calls;
-
-  return memory_per_call_bytes / GB / time_per_call;
-}
-
-
-
-template<class Matrix>
-void report_performance_models(const Teuchos::RCP<const Matrix> & A, int nrepeat, bool verbose) {  
-  using Teuchos::RCP;
-  const RCP<const Teuchos::Comm<int> > comm = A->getMap()->getComm();
-  using SC = typename Matrix::scalar_type;
-  using LO = typename Matrix::local_ordinal_type;
-  using GO = typename Matrix::global_ordinal_type;
-  using NO = typename Matrix::node_type;
-
-  // NOTE: We've hardwired this to size_t for the rowptr.  This really should really get read out of a typedef,
-  // if Tpetra actually had one
-  using rowptr_type = size_t;
-  MueLu::PerfModels<SC,LO,GO,NO> PM;
-  int rank = comm->getRank(); int nproc = comm->getSize();
-  int m   = static_cast<int>(A->getLocalNumRows());
-  int n   = static_cast<int>(A->getColMap()->getLocalNumElements());
-  int nnz = static_cast<int>(A->getLocalMatrixHost().graph.entries.extent(0));
-  
-  // Generate Lookup Tables  
-  int v_log_max = ceil(log(nnz) / log(2))+1;
-  PM.stream_vector_make_table(nrepeat,v_log_max);
-
-  int m_log_max = 15;
-  PM.pingpong_make_table(nrepeat,m_log_max,comm);
-
-
-  if(A->hasCrsGraph()) {
-    auto importer = A->getCrsGraph()->getImporter();
-    if(!importer.is_null()) {
-      size_t recv_size = importer->getRemoteLIDs().size() * sizeof(SC);
-      size_t send_size = importer->getExportLIDs().size() * sizeof(SC);
-      int local_log_max = ceil(log(std::max(send_size,recv_size)) / log(2))+1;
-      int global_log_max=local_log_max;
-      Teuchos::reduceAll<int>(*comm,Teuchos::REDUCE_MAX,1,&local_log_max,&global_log_max);
-      PM.halopong_make_table(nrepeat,global_log_max, importer);   
-    }
-  }
-
-  if(verbose && rank == 0) {
-    std::cout<<"********************************************************"<<std::endl;
-    std::cout<<"Performance model results on "<<nproc<<" ranks"<<std::endl;
-    std::cout<<"****** Launch Latency Table ******"<<std::endl;
-    PM.print_launch_latency_table(std::cout);
-    std::cout<<"****** Stream Table ******"<<std::endl;
-    PM.print_stream_vector_table(std::cout);
-    std::cout<<"****** Latency Corrected Stream Table ******"<<std::endl;
-    PM.print_latency_corrected_stream_vector_table(std::cout);
-    std::cout<<"****** Pingpong Table ******"<<std::endl;
-    PM.print_pingpong_table(std::cout);
-    std::cout<<"****** Halopong Table ******"<<std::endl;
-    PM.print_halopong_table(std::cout);
-  }
-
-  // For convenience
-  const int NUM_TIMERS = 6;
-  std::string SPMV_test_names[NUM_TIMERS] = {"colind","rowptr","vals","x","y","all"};
-  std::vector<int> SPMV_num_objects(NUM_TIMERS), SPMV_object_size(NUM_TIMERS), SPMV_corrected(NUM_TIMERS);
-
-
-  // Composite model: Use latency correction
-  SPMV_num_objects[0] = nnz;     SPMV_object_size[0] = sizeof(LO);           SPMV_corrected[0] = 1;// colind
-  SPMV_num_objects[1] = (m + 1); SPMV_object_size[1] = sizeof(rowptr_type);  SPMV_corrected[1] = 1;// rowptr 
-  SPMV_num_objects[2] = nnz;     SPMV_object_size[2] = sizeof(SC);           SPMV_corrected[2] = 1;// vals
-  SPMV_num_objects[3] = n;       SPMV_object_size[3] = sizeof(SC);           SPMV_corrected[3] = 1; // x
-  SPMV_num_objects[4] = m;       SPMV_object_size[4] = sizeof(SC);           SPMV_corrected[4] = 1;// y
-
-  // All-Model: Do not use latency correction
-  SPMV_object_size[5] = 1;
-  SPMV_num_objects[5]  = (m+1)*sizeof(rowptr_type) + nnz*sizeof(LO) + nnz*sizeof(SC) + 
-    n*sizeof(SC) + m*sizeof(SC);  
-  SPMV_corrected[5] = 0;
-
-
-  std::vector<double> gb_per_sec(NUM_TIMERS);
-  if(verbose && rank == 0)
-    std::cout<<"****** Local Time Model Results ******"<<std::endl;
-  for(int i = 0; i < NUM_TIMERS; i++) {
-    double avg_time;
-
-    // Table interpolation - Take the faster of the two
-    int size_in_bytes = SPMV_object_size[i] *SPMV_num_objects[i];
-    if(SPMV_corrected[i] == 1)
-      avg_time = PM.latency_corrected_stream_vector_lookup(size_in_bytes);
-    else
-      avg_time = PM.stream_vector_lookup(size_in_bytes);
-
-    
-    // The lookup divides by transactions-per-element already
-    double memory_traffic = (double)SPMV_object_size[i] *(double)SPMV_num_objects[i];
-    gb_per_sec[i] = convert_time_to_bandwidth_gbs(avg_time,1,memory_traffic);
-    
-    if(verbose && rank == 0) {
-
-      std::cout<< "Local: "<<SPMV_test_names[i] << " # Scalars = "<<memory_traffic/sizeof(SC) << " time per call = "<<avg_time*1e6 << " us. GB/sec = "<<gb_per_sec[i]<<std::endl;
-    }
-  }
-  
-  // Get the latency info
-  double avg_latency = PM.launch_latency_lookup();    
-
-
-
-  /***************************************************************************/
-  // *** Calculate SPMV minimum time (composite) ***
-  // Model: 
-  // rowptr = One read per row
-  // colind = One read per entry
-  // values = One read per entry
-  // x      = One read per entry in values array (but we assume the cache will work its magic here)
-  // y      = One write per row
- 
-  long unsigned int spmv_memory_bytes[NUM_TIMERS] = {
-     (m+1) * sizeof(rowptr_type), //rowptr
-     nnz * sizeof(LO), //colind
-     nnz * sizeof(SC), // values
-     n  * sizeof(SC), //x
-     m * sizeof(SC), // y
-     0
-    };
-  for(int i=0; i<NUM_TIMERS-1; i++)
-    spmv_memory_bytes[NUM_TIMERS-1] += spmv_memory_bytes[i];
-
-
-  double minimum_local_composite_time = avg_latency;
-  for(int i=0; i<NUM_TIMERS-1; i++)
-    minimum_local_composite_time +=  spmv_memory_bytes[i] / (GB * gb_per_sec[i]);
-
-  double minimum_local_all_time = spmv_memory_bytes[NUM_TIMERS-1] / (GB * gb_per_sec[NUM_TIMERS-1]);
-
-  /***************************************************************************/
-  // *** Calculate Remote part of the SPMV ***
-  double time_pack_unpack_outofplace = 0.0;
-  double time_pack_unpack_inplace    = 0.0;
-  double time_communicate_ping       = 0.0;
-  double time_communicate_halo       = 0.0;
-  // Note: We'll assume that each of the permutes, remotes and exports is a unified
-  // memory transaction, even though that's not strictly speaking correct.
-  if(A->hasCrsGraph()) {
-    auto importer = A->getCrsGraph()->getImporter();
-    if(! importer.is_null()) {
-      // Sames [pack] - 1 read SC, 1 write SC 
-      // NOTE: Only if you're out-of-place
-      size_t num_sames = importer->getNumSameIDs();
-      double same_time = (num_sames == 0) ? 0.0 : 
-        2.0 * PM.latency_corrected_stream_vector_lookup(num_sames*sizeof(SC)) + avg_latency;
-
-      // Permutes [pack] - 2 reads LO [to, from] , 1 read SC [values], 1 write SC [values]
-      size_t num_permutes = importer->getNumPermuteIDs();
-      double permute_time = (num_permutes == 0) ? 0.0 : 
-        2.0 * PM.latency_corrected_stream_vector_lookup(num_permutes*sizeof(LO)) + 
-        2.0 * PM.latency_corrected_stream_vector_lookup(num_permutes*sizeof(SC)) + avg_latency;
-
-      // Exports [pack] - 1 read LO [exportLIDs], 1 read SC [values], 1 write SC [buffer]
-      // This is what Epetra does at least      
-      size_t num_exports = importer->getNumExportIDs();
-      double export_time = (num_exports == 0) ? 0.0 :
-        PM.latency_corrected_stream_vector_lookup(num_exports*sizeof(LO)) + 
-        2.0 * PM.latency_corrected_stream_vector_lookup(num_exports*sizeof(SC)) + avg_latency;
-
-      // Remotes [unpack] - 1 read LO [remoteLIDs],  1 read SC [buffer], 1 write SC [values]
-      // NOTE: Only if you're out of place
-      size_t num_remotes = importer->getNumRemoteIDs();
-      double remote_time = (num_remotes == 0) ? 0.0 :
-        PM.latency_corrected_stream_vector_lookup(num_remotes*sizeof(LO)) + 
-        2.0 * PM.latency_corrected_stream_vector_lookup(num_remotes*sizeof(SC)) + avg_latency;
-
-
-      // Total pack / unpack time
-      time_pack_unpack_outofplace = same_time + permute_time + export_time + remote_time;
-      time_pack_unpack_inplace    = permute_time + export_time;
-
-      // We now need to get the size of each message for the ping-pong costs.
-      double send_time = 0.0;
-      double recv_time = 0.0;
-      double halo_time = 0.0;
-      size_t total_send_length=0, total_recv_length=0;
-      double avg_size_per_msg = 0.0;
-      RCP<const Xpetra::TpetraImport<LO,GO,NO> > t_importer = Teuchos::rcp_dynamic_cast<const Xpetra::TpetraImport<LO,GO,NO> >(importer);
-      if(!t_importer.is_null()) {
-        RCP<const Tpetra::Import<LO,GO,NO> > tt_i  = t_importer->getTpetra_Import();
-        Tpetra::Distributor & distor = tt_i->getDistributor();
-        Teuchos::ArrayView<const size_t> recv_lengths = distor.getLengthsFrom();
-        Teuchos::ArrayView<const size_t> send_lengths = distor.getLengthsTo();
-        
-        for (int i=0; i<(int) send_lengths.size(); i++) {
-          send_time += PM.pingpong_device_lookup(send_lengths[i] * sizeof(SC));
-          total_send_length = send_lengths[i]*sizeof(SC);
-        }
-        
-        for (int i=0; i<(int) recv_lengths.size(); i++)  {
-          recv_time += PM.pingpong_device_lookup(recv_lengths[i] * sizeof(SC));                  
-          total_recv_length = recv_lengths[i]*sizeof(SC);
-        }
- 
-        avg_size_per_msg = (double)total_send_length/(2.0*send_lengths.size()) +  (double)total_recv_length/(2.0*recv_lengths.size());
-        halo_time = PM.halopong_device_lookup(avg_size_per_msg);
-      }
-
-      if(verbose && rank == 0) {
-         std::cout<<"****** Remote Time Model Results ******"<<std::endl;
-        std::cout << "Remote: same      = "<<same_time*1e6<<" us.\n"
-                  << "Remote: permutes  = "<<permute_time*1e6<<" us.\n"
-                  << "Remote: exports   = "<<export_time*1e6<<" us.\n"
-                  << "Remote: remotes   = "<<remote_time*1e6<<" us.\n"
-                  << "Remote: sends len = "<<total_send_length<<" time = "<<send_time*1e6<<" us.\n"
-                  << "Remote: recvs len = "<<total_recv_length<<" time  = "<<recv_time*1e6<<" us.\n"
-                  << "Remote: halo avg  = "<<(size_t)avg_size_per_msg<<" time  = "<<halo_time*1e6<<" us.\n"<<std::endl;
-      }
-
-      // NOTE: For now we'll do comm time as the larger of send/recv.  Not sure this is
-      // really the optimal thing to do, but we'll start here.
-      time_communicate_ping = std::max(send_time,recv_time);
-      time_communicate_halo = halo_time;
-    }
-  }
-  double minimum_time_in_place_ping     = time_communicate_ping + time_pack_unpack_inplace;
-  double minimum_time_out_of_place_ping = time_communicate_ping + time_pack_unpack_outofplace;
-  double minimum_time_in_place_halo     = time_communicate_halo + time_pack_unpack_inplace;
-  double minimum_time_out_of_place_halo = time_communicate_halo + time_pack_unpack_outofplace;
-
-
-  /***************************************************************************/
-  if(rank == 0)
-    std::cout << "\n\n========================================================\n"
-              << "Minimum time model (composite) : " << minimum_local_composite_time << std::endl
-              << "Minimum time model (all)       : " << minimum_local_all_time << std::endl
-              << "Pack/unpack in-place           : " << time_pack_unpack_inplace << std::endl
-              << "Pack/unpack out-of-place       : " << time_pack_unpack_outofplace << std::endl
-              << "Communication time (ping)      : " << time_communicate_ping << std::endl
-              << "Communication time (halo)      : " << time_communicate_halo << std::endl;
-  
-
-  // Get global average/max time sums
-  constexpr int NUM_TIMES=10;
-  double avg_times[NUM_TIMES];
-  double max_times[NUM_TIMES];
-  {
-    double comp      = minimum_local_composite_time;
-    double alls      = minimum_local_all_time;
-    double p_inplace = minimum_time_in_place_ping;
-    double p_ooplace = minimum_time_out_of_place_ping;
-    double h_inplace = minimum_time_in_place_halo;
-    double h_ooplace = minimum_time_out_of_place_halo;
-
-    double all_times_local[NUM_TIMES] = {
-                 comp,comp+p_inplace,comp+p_ooplace,comp+h_inplace,comp+h_ooplace,
-                 alls,alls+p_inplace,alls+p_ooplace,alls+h_inplace,alls+h_ooplace};
-
-    if(nproc > 1) {
-      Teuchos::reduceAll(*comm, Teuchos::REDUCE_SUM, NUM_TIMES, &all_times_local[0], &avg_times[0]);
-      Teuchos::reduceAll(*comm, Teuchos::REDUCE_MAX, NUM_TIMES, &all_times_local[0], &max_times[0]);
-      for(int i=0; i<NUM_TIMES; i++)
-        avg_times[i] /= nproc;
-    }
-    else {
-      for(int i=0; i<NUM_TIMES; i++)
-        avg_times[i] = max_times[i] = all_times_local[i];
-    }
-  }
-
-  // Iterate through all of the "MV" timers
-  // NOTE: This is a hack since TimeMonitor does not give you a way to
-  // iterate through the timers
-  std::vector<const char *> timer_names = {"MV MKL: Total",
-                                           "MV KK: Total",
-                                           "MV Tpetra: Total",
-                                           "MV CuSparse: Total",
-                                           "MV MagmaSparse: Total",
-                                           "MV HYPRE: Total",
-                                           "MV Petsc: Total"};
-
-  if(rank == 0) {
-    if(!globalTimeMonitor.is_null()) {
-      const std::string l[NUM_TIMES]={"Comp","Comp+ping+inplace","Comp+ping+ooplace","Comp+halo+inplace","Comp+halo+ooplace",
-                                    "All", "All+ping+inplace", "All+ping+ooplace", "All+halo+inplace", "All+halo+ooplace"};
-      const std::string div={"-------------------"};
-      printf("%-60s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n","Timer",l[0].c_str(),l[1].c_str(),l[2].c_str(),l[3].c_str(),l[4].c_str(),
-             l[5].c_str(),l[6].c_str(),l[7].c_str(),l[8].c_str(),l[9].c_str());
-      printf("%-60s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n","-----",div.c_str(),div.c_str(),div.c_str(),div.c_str(),div.c_str(),
-             div.c_str(),div.c_str(),div.c_str(),div.c_str(),div.c_str());
-      for(int i=0; i<(int)timer_names.size(); i++) {
-        Teuchos::RCP<Teuchos::Time> t = globalTimeMonitor->lookupCounter(timer_names[i]);
-        if(!t.is_null()) {
-          double time_per_call = t->totalElapsedTime() / t->numCalls();
-          printf("%-60s %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f\n",timer_names[i],
-                 max_times[0]/time_per_call,max_times[1]/time_per_call,max_times[2]/time_per_call,max_times[3]/time_per_call,max_times[4]/time_per_call,
-                 max_times[5]/time_per_call,max_times[6]/time_per_call,max_times[7]/time_per_call,max_times[8]/time_per_call,max_times[9]/time_per_call);
-
-        }
-      }
-    }
-    else {
-      std::cout<<"Note: Minimum time model individual timers only work with stacked timers off."<<std::endl;
-    }
-  }
-
-}
   
 
 //==============================================================================
@@ -1579,7 +1269,15 @@ int main_(Teuchos::CommandLineProcessor &clp, Xpetra::UnderlyingLib& lib, int ar
     // ==========================================
     // Performance Models
     // ==========================================
-    report_performance_models<Matrix>(A,nrepeat,verboseModel);
+    std::vector<const char *> timer_names = {"MV MKL: Total",  // timer labels handed to the reporter below
+                                             "MV KK: Total",
+                                             "MV Tpetra: Total",
+                                             "MV CuSparse: Total",
+                                             "MV MagmaSparse: Total",
+                                             "MV HYPRE: Total",
+                                             "MV Petsc: Total"};
+    
+    MueLu::report_spmv_performance_models<Matrix>(A,nrepeat,timer_names,globalTimeMonitor,"",verboseModel);  // NOTE(review): "" looks like an output prefix/label argument - confirm against MueLu_PerfModelReporter.hpp
     globalTimeMonitor = Teuchos::null;
 
     success = true;
diff --git a/packages/muelu/test/scaling/MueLu_PerfModelReporter.hpp b/packages/muelu/test/scaling/MueLu_PerfModelReporter.hpp
new file mode 100644
index 000000000000..022d4b77e298
--- /dev/null
+++ b/packages/muelu/test/scaling/MueLu_PerfModelReporter.hpp
@@ -0,0 +1,378 @@
+// @HEADER
+//
+// ***********************************************************************
+//
+//        MueLu: A package for multigrid based preconditioning
+//                  Copyright 2012 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact
+//                    Jonathan Hu       (jhu@sandia.gov)
+//                    Andrey Prokopenko (aprokop@sandia.gov)
+//                    Ray Tuminaro      (rstumin@sandia.gov)
+//
+// ***********************************************************************
+//
+// @HEADER
+#ifndef MUELU_PERFMODEL_REPORTER_HPP
+#define MUELU_PERFMODEL_REPORTER_HPP
+#include <vector>
+
+#include <Teuchos_RCP.hpp>
+#include <Xpetra_Matrix.hpp>
+
+#include "MueLu_PerfModels.hpp"
+
+namespace MueLu {
+
+
+
+// =========================================================================
+// Performance Routines
+// =========================================================================
+// Report bandwidth in GB / sec
+
+
+/* @brief Generate performance model reports for SPMV of a given matrix
+   Collective over A's communicator (reduceAll is used); the report itself is printed by rank 0.
+   Input:
+     A             - Matrix to evaluate
+     nrepeat       - # times to repeat the tests in model construction
+     timer_names   - Names of Teuchos timers which are SPMVs to compare against the model
+     myTimeMonitor - TimeMonitor used to look up the timers in timer_names (may be null)
+     prefix        - String prepended to the reported model lines
+     verbose       - Enable verbose output
+*/
+template<class Matrix>
+void report_spmv_performance_models(const Teuchos::RCP<const Matrix> & A, int nrepeat,const std::vector<const char *> & timer_names , Teuchos::RCP<Teuchos::TimeMonitor> & myTimeMonitor, const std::string prefix = "", bool verbose=false) {
+  using Teuchos::RCP;
+  using Teuchos::rcp;
+  const RCP<const Teuchos::Comm<int> > comm = A->getMap()->getComm();
+  using SC = typename Matrix::scalar_type;
+  using LO = typename Matrix::local_ordinal_type;
+  using GO = typename Matrix::global_ordinal_type;
+  using NO = typename Matrix::node_type;
+
+  const double GB = 1024.0 * 1024.0 * 1024.0;
+
+  // Conversion function as a lambda
+  auto convert_time_to_bandwidth_gbs = [=](double time, int num_calls, double memory_per_call_bytes) {
+    double time_per_call = time / num_calls;
+
+    return memory_per_call_bytes / GB / time_per_call;
+  };
+
+  // NOTE: We've hardwired this to size_t for the rowptr.  This really should get read out of a typedef,
+  // if Tpetra actually had one
+  using rowptr_type = size_t;
+
+  // Build a fresh set of performance models for this call
+  MueLu::PerfModels<SC,LO,GO,NO> PM;
+
+  int rank = comm->getRank(); int nproc = comm->getSize();
+  int m   = static_cast<int>(A->getLocalNumRows());
+  int n   = static_cast<int>(A->getColMap()->getLocalNumElements());
+  int nnz = static_cast<int>(A->getLocalMatrixHost().graph.entries.extent(0));
+
+  // Generate Lookup Tables
+  // Clamp to 1 so a rank with no local entries does not evaluate log(0)
+  int v_log_max = ceil(log(std::max(nnz,1)) / log(2))+1;
+  PM.stream_vector_make_table(nrepeat,v_log_max);
+
+  int m_log_max = 15;
+  PM.pingpong_make_table(nrepeat,m_log_max,comm);
+
+  if(A->hasCrsGraph()) {
+    auto importer = A->getCrsGraph()->getImporter();
+    if(!importer.is_null()) {
+      size_t recv_size = importer->getRemoteLIDs().size() * sizeof(SC);
+      size_t send_size = importer->getExportLIDs().size() * sizeof(SC);
+      // Clamp to 1 byte so a rank that neither sends nor receives does not evaluate log(0)
+      int local_log_max = ceil(log(std::max(std::max(send_size,recv_size),size_t(1))) / log(2))+1;
+      int global_log_max=local_log_max;
+      Teuchos::reduceAll<int>(*comm,Teuchos::REDUCE_MAX,1,&local_log_max,&global_log_max);
+      PM.halopong_make_table(nrepeat,global_log_max, importer);   
+    }
+  }
+
+  if(verbose && rank == 0) {
+    std::cout<<prefix<<"********************************************************"<<std::endl;
+    std::cout<<prefix<<"Performance model results on "<<nproc<<" ranks"<<std::endl;
+    std::cout<<prefix<<"****** Launch Latency Table ******"<<std::endl;
+    PM.print_launch_latency_table(std::cout,prefix);
+    std::cout<<prefix<<"****** Stream Table ******"<<std::endl;
+    PM.print_stream_vector_table(std::cout,prefix);
+    std::cout<<prefix<<"****** Latency Corrected Stream Table ******"<<std::endl;
+    PM.print_latency_corrected_stream_vector_table(std::cout,prefix);
+    std::cout<<prefix<<"****** Pingpong Table ******"<<std::endl;
+    PM.print_pingpong_table(std::cout,prefix);
+    std::cout<<prefix<<"****** Halopong Table ******"<<std::endl;
+    PM.print_halopong_table(std::cout,prefix);
+  }
+
+  // For convenience
+  const int NUM_TIMERS = 6;
+  std::string SPMV_test_names[NUM_TIMERS] = {"colind","rowptr","vals","x","y","all"};
+  std::vector<int> SPMV_num_objects(NUM_TIMERS), SPMV_object_size(NUM_TIMERS), SPMV_corrected(NUM_TIMERS);
+
+  // Composite model: Use latency correction
+  SPMV_num_objects[0] = nnz;     SPMV_object_size[0] = sizeof(LO);           SPMV_corrected[0] = 1;// colind
+  SPMV_num_objects[1] = (m + 1); SPMV_object_size[1] = sizeof(rowptr_type);  SPMV_corrected[1] = 1;// rowptr 
+  SPMV_num_objects[2] = nnz;     SPMV_object_size[2] = sizeof(SC);           SPMV_corrected[2] = 1;// vals
+  SPMV_num_objects[3] = n;       SPMV_object_size[3] = sizeof(SC);           SPMV_corrected[3] = 1; // x
+  SPMV_num_objects[4] = m;       SPMV_object_size[4] = sizeof(SC);           SPMV_corrected[4] = 1;// y
+
+  // All-Model: Do not use latency correction
+  SPMV_object_size[5] = 1;
+  SPMV_num_objects[5]  = (m+1)*sizeof(rowptr_type) + nnz*sizeof(LO) + nnz*sizeof(SC) + 
+    n*sizeof(SC) + m*sizeof(SC);  
+  SPMV_corrected[5] = 0;
+
+  std::vector<double> gb_per_sec(NUM_TIMERS);
+  if(verbose && rank == 0)
+    std::cout<<prefix<<"****** Local Time Model Results ******"<<std::endl;
+  for(int i = 0; i < NUM_TIMERS; i++) {
+    double avg_time;
+
+    // Table interpolation - latency-corrected table when SPMV_corrected is set, plain table otherwise
+    int size_in_bytes = SPMV_object_size[i] *SPMV_num_objects[i];
+    if(SPMV_corrected[i] == 1)
+      avg_time = PM.latency_corrected_stream_vector_lookup(size_in_bytes);
+    else
+      avg_time = PM.stream_vector_lookup(size_in_bytes);
+
+    // The lookup divides by transactions-per-element already
+    double memory_traffic = (double)SPMV_object_size[i] *(double)SPMV_num_objects[i];
+    gb_per_sec[i] = convert_time_to_bandwidth_gbs(avg_time,1,memory_traffic);
+
+    if(verbose && rank == 0) {
+      std::cout<<prefix<< "Local: "<<SPMV_test_names[i] << " # Scalars = "<<memory_traffic/sizeof(SC) << " time per call = "<<avg_time*1e6 << " us. GB/sec = "<<gb_per_sec[i]<<std::endl;
+    }
+  }
+
+  // Get the latency info
+  double avg_latency = PM.launch_latency_lookup();    
+
+  /***************************************************************************/
+  // *** Calculate SPMV minimum time (composite) ***
+  // Model: 
+  // colind = One read per entry
+  // rowptr = One read per row
+  // values = One read per entry
+  // x      = One read per entry in values array (but we assume the cache will work its magic here)
+  // y      = One write per row
+  // NOTE: spmv_memory_bytes must use the same entry order as SPMV_test_names / gb_per_sec
+
+  long unsigned int spmv_memory_bytes[NUM_TIMERS] = {
+     nnz * sizeof(LO), //colind
+     (m+1) * sizeof(rowptr_type), //rowptr
+     nnz * sizeof(SC), // values
+     n  * sizeof(SC), //x
+     m * sizeof(SC), // y
+     0
+    };
+  for(int i=0; i<NUM_TIMERS-1; i++)
+    spmv_memory_bytes[NUM_TIMERS-1] += spmv_memory_bytes[i];
+
+  double minimum_local_composite_time = avg_latency;
+  for(int i=0; i<NUM_TIMERS-1; i++)
+    if(spmv_memory_bytes[i] > 0) minimum_local_composite_time +=  spmv_memory_bytes[i] / (GB * gb_per_sec[i]); // empty arrays add no time (avoids 0/0)
+
+  double minimum_local_all_time = spmv_memory_bytes[NUM_TIMERS-1] / (GB * gb_per_sec[NUM_TIMERS-1]);
+
+  /***************************************************************************/
+  // *** Calculate Remote part of the SPMV ***
+  double time_pack_unpack_outofplace = 0.0;
+  double time_pack_unpack_inplace    = 0.0;
+  double time_communicate_ping       = 0.0;
+  double time_communicate_halo       = 0.0;
+  // Note: We'll assume that each of the permutes, remotes and exports is a unified
+  // memory transaction, even though that's not strictly speaking correct.
+  if(A->hasCrsGraph()) {
+    auto importer = A->getCrsGraph()->getImporter();
+    if(! importer.is_null()) {
+      // Sames [pack] - 1 read SC, 1 write SC 
+      // NOTE: Only if you're out-of-place
+      size_t num_sames = importer->getNumSameIDs();
+      double same_time = (num_sames == 0) ? 0.0 : 
+        2.0 * PM.latency_corrected_stream_vector_lookup(num_sames*sizeof(SC)) + avg_latency;
+
+      // Permutes [pack] - 2 reads LO [to, from] , 1 read SC [values], 1 write SC [values]
+      size_t num_permutes = importer->getNumPermuteIDs();
+      double permute_time = (num_permutes == 0) ? 0.0 : 
+        2.0 * PM.latency_corrected_stream_vector_lookup(num_permutes*sizeof(LO)) + 
+        2.0 * PM.latency_corrected_stream_vector_lookup(num_permutes*sizeof(SC)) + avg_latency;
+
+      // Exports [pack] - 1 read LO [exportLIDs], 1 read SC [values], 1 write SC [buffer]
+      // This is what Epetra does at least      
+      size_t num_exports = importer->getNumExportIDs();
+      double export_time = (num_exports == 0) ? 0.0 :
+        PM.latency_corrected_stream_vector_lookup(num_exports*sizeof(LO)) + 
+        2.0 * PM.latency_corrected_stream_vector_lookup(num_exports*sizeof(SC)) + avg_latency;
+
+      // Remotes [unpack] - 1 read LO [remoteLIDs],  1 read SC [buffer], 1 write SC [values]
+      // NOTE: Only if you're out of place
+      size_t num_remotes = importer->getNumRemoteIDs();
+      double remote_time = (num_remotes == 0) ? 0.0 :
+        PM.latency_corrected_stream_vector_lookup(num_remotes*sizeof(LO)) + 
+        2.0 * PM.latency_corrected_stream_vector_lookup(num_remotes*sizeof(SC)) + avg_latency;
+
+      // Total pack / unpack time
+      time_pack_unpack_outofplace = same_time + permute_time + export_time + remote_time;
+      time_pack_unpack_inplace    = permute_time + export_time;
+
+      // We now need to get the size of each message for the ping-pong costs.
+      double send_time = 0.0;
+      double recv_time = 0.0;
+      double halo_time = 0.0;
+      size_t total_send_length=0, total_recv_length=0;
+      double avg_size_per_msg = 0.0;
+      RCP<const Xpetra::TpetraImport<LO,GO,NO> > t_importer = Teuchos::rcp_dynamic_cast<const Xpetra::TpetraImport<LO,GO,NO> >(importer);
+      if(!t_importer.is_null()) {
+        RCP<const Tpetra::Import<LO,GO,NO> > tt_i  = t_importer->getTpetra_Import();
+        Tpetra::Distributor & distor = tt_i->getDistributor();
+        Teuchos::ArrayView<const size_t> recv_lengths = distor.getLengthsFrom();
+        Teuchos::ArrayView<const size_t> send_lengths = distor.getLengthsTo();
+
+        for (int i=0; i<(int) send_lengths.size(); i++) {
+          send_time += PM.pingpong_device_lookup(send_lengths[i] * sizeof(SC));
+          total_send_length += send_lengths[i]*sizeof(SC);
+        }
+
+        for (int i=0; i<(int) recv_lengths.size(); i++)  {
+          recv_time += PM.pingpong_device_lookup(recv_lengths[i] * sizeof(SC));                  
+          total_recv_length += recv_lengths[i]*sizeof(SC);
+        }
+
+        // Guard the per-message average: a rank with no sends/recvs would otherwise divide 0 by 0
+        const double num_sends = (double)send_lengths.size(), num_recvs = (double)recv_lengths.size();
+        if(num_sends > 0) avg_size_per_msg += (double)total_send_length/(2.0*num_sends);
+        if(num_recvs > 0) avg_size_per_msg += (double)total_recv_length/(2.0*num_recvs);
+        if(num_sends + num_recvs > 0) halo_time = PM.halopong_device_lookup(avg_size_per_msg);
+      }
+
+      if(verbose && rank == 0) {
+         std::cout<<prefix<<"****** Remote Time Model Results ******"<<std::endl;
+         std::cout<<prefix<< "Remote: same      = "<<same_time*1e6<<" us.\n"
+                  <<prefix<< "Remote: permutes  = "<<permute_time*1e6<<" us.\n"
+                  <<prefix<< "Remote: exports   = "<<export_time*1e6<<" us.\n"
+                  <<prefix<< "Remote: remotes   = "<<remote_time*1e6<<" us.\n"
+                  <<prefix<< "Remote: sends len = "<<total_send_length<<" time = "<<send_time*1e6<<" us.\n"
+                  <<prefix<< "Remote: recvs len = "<<total_recv_length<<" time  = "<<recv_time*1e6<<" us.\n"
+                  <<prefix<< "Remote: halo avg  = "<<(size_t)avg_size_per_msg<<" time  = "<<halo_time*1e6<<" us.\n"<<std::endl;
+      }
+
+      // NOTE: For now we'll do comm time as the larger of send/recv.  Not sure this is
+      // really the optimal thing to do, but we'll start here.
+      time_communicate_ping = std::max(send_time,recv_time);
+      time_communicate_halo = halo_time;
+    }
+  }
+  double minimum_time_in_place_ping     = time_communicate_ping + time_pack_unpack_inplace;
+  double minimum_time_out_of_place_ping = time_communicate_ping + time_pack_unpack_outofplace;
+  double minimum_time_in_place_halo     = time_communicate_halo + time_pack_unpack_inplace;
+  double minimum_time_out_of_place_halo = time_communicate_halo + time_pack_unpack_outofplace;
+
+
+  /***************************************************************************/
+  if(rank == 0)
+    std::cout << "\n\n========================================================\n"
+              <<prefix<< "Minimum time model (composite) : " << minimum_local_composite_time << std::endl
+              <<prefix<< "Minimum time model (all)       : " << minimum_local_all_time << std::endl
+              <<prefix<< "Pack/unpack in-place           : " << time_pack_unpack_inplace << std::endl
+              <<prefix<< "Pack/unpack out-of-place       : " << time_pack_unpack_outofplace << std::endl
+              <<prefix<< "Communication time (ping)      : " << time_communicate_ping << std::endl
+              <<prefix<< "Communication time (halo)      : " << time_communicate_halo << std::endl;
+
+
+  // Get global average/max time sums
+  constexpr int NUM_TIMES=10;
+  double avg_times[NUM_TIMES];
+  double max_times[NUM_TIMES];
+  {
+    double comp      = minimum_local_composite_time;
+    double alls      = minimum_local_all_time;
+    double p_inplace = minimum_time_in_place_ping;
+    double p_ooplace = minimum_time_out_of_place_ping;
+    double h_inplace = minimum_time_in_place_halo;
+    double h_ooplace = minimum_time_out_of_place_halo;
+
+    double all_times_local[NUM_TIMES] = {
+                 comp,comp+p_inplace,comp+p_ooplace,comp+h_inplace,comp+h_ooplace,
+                 alls,alls+p_inplace,alls+p_ooplace,alls+h_inplace,alls+h_ooplace};
+
+    // Reduce the ten model combinations across ranks: the sum becomes the average, the max is what gets reported
+    if(nproc > 1) {
+      Teuchos::reduceAll(*comm, Teuchos::REDUCE_SUM, NUM_TIMES, &all_times_local[0], &avg_times[0]);
+      Teuchos::reduceAll(*comm, Teuchos::REDUCE_MAX, NUM_TIMES, &all_times_local[0], &max_times[0]);
+      for(int i=0; i<NUM_TIMES; i++)
+        avg_times[i] /= nproc;
+    }
+    else {
+      for(int i=0; i<NUM_TIMES; i++)
+        avg_times[i] = max_times[i] = all_times_local[i];
+    }
+  }
+
+  // Iterate through all of the "MV" timers
+
+  if(rank == 0) {
+    if(!myTimeMonitor.is_null() && timer_names.size() > 0) {
+      const std::string l[NUM_TIMES]={"Comp","Comp+ping+inplace","Comp+ping+ooplace","Comp+halo+inplace","Comp+halo+ooplace",
+                                    "All", "All+ping+inplace", "All+ping+ooplace", "All+halo+inplace", "All+halo+ooplace"};
+      const std::string div={"-------------------"};
+      printf("%-60s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n","Timer",l[0].c_str(),l[1].c_str(),l[2].c_str(),l[3].c_str(),l[4].c_str(),
+             l[5].c_str(),l[6].c_str(),l[7].c_str(),l[8].c_str(),l[9].c_str());
+      printf("%-60s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n","-----",div.c_str(),div.c_str(),div.c_str(),div.c_str(),div.c_str(),
+             div.c_str(),div.c_str(),div.c_str(),div.c_str(),div.c_str());
+      for(int i=0; i<(int)timer_names.size(); i++) {
+        Teuchos::RCP<Teuchos::Time> t = myTimeMonitor->lookupCounter(timer_names[i]);
+        // Skip timers that were never called: their time per call would be 0/0
+        if(!t.is_null() && t->numCalls() > 0) {
+          double time_per_call = t->totalElapsedTime() / t->numCalls();
+          printf("%-60s %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f\n",timer_names[i],
+                 max_times[0]/time_per_call,max_times[1]/time_per_call,max_times[2]/time_per_call,max_times[3]/time_per_call,max_times[4]/time_per_call,
+                 max_times[5]/time_per_call,max_times[6]/time_per_call,max_times[7]/time_per_call,max_times[8]/time_per_call,max_times[9]/time_per_call);
+        }
+      }
+    }
+    else {
+      std::cout<<prefix<<"Note: Minimum time model individual timers only work with stacked timers off."<<std::endl;
+    }
+  }
+
+}
+
+
+}// end MueLu namespace
+
+#endif