diff --git a/packages/muelu/src/Utils/MueLu_PerfModels_decl.hpp b/packages/muelu/src/Utils/MueLu_PerfModels_decl.hpp index 1366eb231e68..bd5eed1f59c1 100644 --- a/packages/muelu/src/Utils/MueLu_PerfModels_decl.hpp +++ b/packages/muelu/src/Utils/MueLu_PerfModels_decl.hpp @@ -78,6 +78,7 @@ namespace MueLu { /* This version is for table interpolation and works on chars, so the LOG_MAX_SIZE is for bytes */ void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20); + bool has_stream_vector_table() const {return stream_sizes_.size() > 0;} /* Lookup in the stream_vector table */ double stream_vector_copy_lookup(int SIZE_IN_BYTES); @@ -91,8 +92,8 @@ namespace MueLu { double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES); /* Print table */ - void print_stream_vector_table(std::ostream & out); - void print_latency_corrected_stream_vector_table(std::ostream & out); + void print_stream_vector_table(std::ostream & out, const std::string & prefix=""); + void print_latency_corrected_stream_vector_table(std::ostream & out, const std::string & prefix=""); /* A latency test between two processes based upon the MVAPICH OSU Micro-Benchmarks. * The sender process sends a message and then waits for confirmation of reception. @@ -102,13 +103,14 @@ namespace MueLu { * See further: https://mvapich.cse.ohio-state.edu/benchmarks/ */ void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP > &comm); + bool has_pingpong_table() const {return pingpong_sizes_.size() > 0;} /* Lookup in the pingpong_vector table */ double pingpong_host_lookup(int SIZE_IN_BYTES); double pingpong_device_lookup(int SIZE_IN_BYTES); /* Print table */ - void print_pingpong_table(std::ostream & out); + void print_pingpong_table(std::ostream & out, const std::string & prefix=""); /* A halo-exchange based ping-pong, inspired by halo-mode in MPPTEST from ANL. 
* Here we use exactly the communication pattern specified in the import object @@ -118,13 +120,14 @@ namespace MueLu { * See further: https://www.mcs.anl.gov/research/projects/mpi/mpptest/ */ void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP > & import); + bool has_halopong_table() const {return halopong_sizes_.size() > 0;} /* Lookup in the halopong_vector table */ double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE); double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE); /* Print table */ - void print_halopong_table(std::ostream & out); + void print_halopong_table(std::ostream & out, const std::string & prefix=""); @@ -133,15 +136,16 @@ namespace MueLu { * e.g., GPUS. */ void launch_latency_make_table(int KERNEL_REPEATS); + bool has_launch_latency_table() const {return launch_and_wait_latency_ > 0;} /* Lookup launch latency */ double launch_latency_lookup(); /* Print table */ - void print_launch_latency_table(std::ostream & out); + void print_launch_latency_table(std::ostream & out, const std::string & prefix=""); private: - void print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction); + void print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction, const std::string & prefix); std::vector stream_sizes_; diff --git a/packages/muelu/src/Utils/MueLu_PerfModels_def.hpp b/packages/muelu/src/Utils/MueLu_PerfModels_def.hpp index d97a9e4d0c65..5c9a70b4edff 100644 --- a/packages/muelu/src/Utils/MueLu_PerfModels_def.hpp +++ b/packages/muelu/src/Utils/MueLu_PerfModels_def.hpp @@ -422,31 +422,33 @@ namespace MueLu { template void - PerfModels::print_stream_vector_table(std::ostream & out) { - print_stream_vector_table_impl(out,false); + PerfModels::print_stream_vector_table(std::ostream & out, const std::string & prefix) { + print_stream_vector_table_impl(out,false,prefix); } template void - PerfModels::print_latency_corrected_stream_vector_table(std::ostream & out) { - 
print_stream_vector_table_impl(out,true); + PerfModels::print_latency_corrected_stream_vector_table(std::ostream & out, const std::string & prefix) { + print_stream_vector_table_impl(out,true,prefix); } template void - PerfModels::print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction) { + PerfModels::print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction, const std::string & prefix) { using namespace std; std::ios old_format(NULL); old_format.copyfmt(out); - out << setw(20) << "Length in Scalars" << setw(1) << " " + out << prefix + << setw(20) << "Length in Scalars" << setw(1) << " " << setw(20) << "COPY (us)" << setw(1) << " " << setw(20) << "ADD (us)" << setw(1) << " " << setw(20) << "COPY (GB/s)" << setw(1) << " " << setw(20) << "ADD (GB/s)" << std::endl; - out << setw(20) << "-----------------" << setw(1) << " " + out << prefix + << setw(20) << "-----------------" << setw(1) << " " << setw(20) << "---------" << setw(1) << " " << setw(20) << "--------" << setw(1) << " " << setw(20) << "-----------" << setw(1) << " " @@ -462,7 +464,8 @@ namespace MueLu { double a_bw = PerfDetails::convert_time_to_bandwidth_gbs(a_time,1,size*sizeof(Scalar)); - out << setw(20) << size << setw(1) << " " + out << prefix + << setw(20) << size << setw(1) << " " << setw(20) << fixed << setprecision(4) << (c_time*1e6) << setw(1) << " " << setw(20) << fixed << setprecision(4) << (a_time*1e6) << setw(1) << " " << setw(20) << fixed << setprecision(4) << c_bw << setw(1) << " " @@ -502,18 +505,20 @@ namespace MueLu { template void - PerfModels::print_pingpong_table(std::ostream & out) { + PerfModels::print_pingpong_table(std::ostream & out, const std::string & prefix) { if(pingpong_sizes_.size() == 0) return; using namespace std; std::ios old_format(NULL); old_format.copyfmt(out); - out << setw(20) << "Message Size" << setw(1) << " " + out << prefix + << setw(20) << "Message Size" << setw(1) << " " << setw(20) << "Host (us)" << setw(1) << " " << 
setw(20) << "Device (us)" << std::endl; - out << setw(20) << "------------" << setw(1) << " " + out << prefix + << setw(20) << "------------" << setw(1) << " " << setw(20) << "---------" << setw(1) << " " << setw(20) << "-----------" << std::endl; @@ -524,7 +529,8 @@ namespace MueLu { double d_time = pingpong_device_times_[i]; - out << setw(20) << size << setw(1) << " " + out << prefix + << setw(20) << size << setw(1) << " " << setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) << " " << setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl; } @@ -562,18 +568,20 @@ namespace MueLu { template void - PerfModels::print_halopong_table(std::ostream & out) { + PerfModels::print_halopong_table(std::ostream & out, const std::string & prefix) { if(halopong_sizes_.size() == 0) return; using namespace std; std::ios old_format(NULL); old_format.copyfmt(out); - out << setw(20) << "Message Size" << setw(1) << " " + out << prefix + << setw(20) << "Message Size" << setw(1) << " " << setw(20) << "Host (us)" << setw(1) << " " << setw(20) << "Device (us)" << std::endl; - out << setw(20) << "------------" << setw(1) << " " + out << prefix + << setw(20) << "------------" << setw(1) << " " << setw(20) << "---------" << setw(1) << " " << setw(20) << "-----------" << std::endl; @@ -584,7 +592,8 @@ namespace MueLu { double d_time = halopong_device_times_[i]; - out << setw(20) << size << setw(1) << " " + out << prefix + << setw(20) << size << setw(1) << " " << setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) << " " << setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl; } @@ -629,12 +638,13 @@ namespace MueLu { template void - PerfModels::print_launch_latency_table(std::ostream & out) { + PerfModels::print_launch_latency_table(std::ostream & out, const std::string & prefix) { using namespace std; std::ios old_format(NULL); old_format.copyfmt(out); - out << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " " + 
out << prefix + << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " " << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_*1e6) << std::endl; out.copyfmt(old_format); diff --git a/packages/muelu/test/scaling/CMakeLists.txt b/packages/muelu/test/scaling/CMakeLists.txt index 48739297ee57..e36192802cb2 100644 --- a/packages/muelu/test/scaling/CMakeLists.txt +++ b/packages/muelu/test/scaling/CMakeLists.txt @@ -40,6 +40,15 @@ IF (${PACKAGE_NAME}_HAVE_TPETRA_SOLVER_STACK OR ${PACKAGE_NAME}_HAVE_EPETRA_SOLV INSTALL(TARGETS "${PACKAGE_NAME}_Driver") + # Perf Model + TRIBITS_ADD_TEST( + Driver + NAME PerformanceModel + COMM mpi + ARGS "--nx=40 --ny=40 --nz=40 --matrixType=Laplace3D --performance-model=verbose" + PASS_REGULAR_EXPRESSION "Belos converged" + ) + # Do a simple weak scaling experiment (4x ranks and 4x grid size) TRIBITS_ADD_TEST( Driver diff --git a/packages/muelu/test/scaling/Driver.cpp b/packages/muelu/test/scaling/Driver.cpp index 7a55e318b8e5..e6c822cb04e0 100644 --- a/packages/muelu/test/scaling/Driver.cpp +++ b/packages/muelu/test/scaling/Driver.cpp @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -77,6 +78,7 @@ #include #include #include +#include #include #include @@ -277,8 +279,9 @@ int main_(Teuchos::CommandLineProcessor &clp, Xpetra::UnderlyingLib& lib, int ar int numReruns = 1; clp.setOption("reruns", &numReruns, "number of reruns"); std::string rerunFilePrefix; clp.setOption("fileprefix", &rerunFilePrefix, "if doing reruns, optional prefix to prepend to output files"); std::string rerunFileSuffix; clp.setOption("filesuffix", &rerunFileSuffix, "if doing reruns, optional suffix to append to output files"); - + std::string levelPerformanceModel = "no"; clp.setOption("performance-model", &levelPerformanceModel, "runs the level-by-level performance mode options- 'no', 'yes' or 'verbose'"); clp.recogniseAllOptions(true); + switch (clp.parse(argc, argv)) { case 
Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED: return EXIT_SUCCESS; case Teuchos::CommandLineProcessor::PARSE_ERROR: @@ -542,6 +545,25 @@ MueLu::MueLu_AMGX_initialize_plugins(); tm = Teuchos::null; + + + // If we want Level-specific performance model diagnostics, now is the time! + if( (levelPerformanceModel=="yes" || levelPerformanceModel=="verbose") + && !H.is_null()) { + for(int i=0; i < H->GetNumLevels(); i++) { + RCP level = H->GetLevel(i); + try { + RCP A_level = level->Get >("A"); + std::string level_name = std::string("Level-") + std::to_string(i) + std::string(": "); + std::vector timers;//MueLu: Laplace2D: Hierarchy: Solve (level=0) + MueLu::report_spmv_performance_models(A_level,100,timers,globalTimeMonitor,level_name,levelPerformanceModel=="verbose"); + } + catch(...) {;} + } + } + + + globalTimeMonitor = Teuchos::null; if (useStackedTimer) resetStackedTimer = true; diff --git a/packages/muelu/test/scaling/MatvecKernelDriver.cpp b/packages/muelu/test/scaling/MatvecKernelDriver.cpp index fc499d42e8cf..a2b5c07da4fc 100644 --- a/packages/muelu/test/scaling/MatvecKernelDriver.cpp +++ b/packages/muelu/test/scaling/MatvecKernelDriver.cpp @@ -64,8 +64,10 @@ #include "MueLu.hpp" #include "MueLu_TestHelpers.hpp" #include "MueLu_PerfModels.hpp" +#include "MueLu_PerfModelReporter.hpp" #include + #include "Xpetra_TpetraMultiVector.hpp" #include "Xpetra_TpetraImport.hpp" #include "Tpetra_CrsMatrix.hpp" @@ -123,318 +125,6 @@ void print_crs_graph(std::string name, const V1 rowptr, const V2 colind) { printf("\n"); } -// ========================================================================= -// Performance Routines -// ========================================================================= -// Report bandwidth in GB / sec -const double GB = 1024.0 * 1024.0 * 1024.0; - -double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes) { - - double time_per_call = time / num_calls; - - return memory_per_call_bytes / GB / 
time_per_call; -} - - - -template -void report_performance_models(const Teuchos::RCP & A, int nrepeat, bool verbose) { - using Teuchos::RCP; - const RCP > comm = A->getMap()->getComm(); - using SC = typename Matrix::scalar_type; - using LO = typename Matrix::local_ordinal_type; - using GO = typename Matrix::global_ordinal_type; - using NO = typename Matrix::node_type; - - // NOTE: We've hardwired this to size_t for the rowptr. This really should really get read out of a typedef, - // if Tpetra actually had one - using rowptr_type = size_t; - MueLu::PerfModels PM; - int rank = comm->getRank(); int nproc = comm->getSize(); - int m = static_cast(A->getLocalNumRows()); - int n = static_cast(A->getColMap()->getLocalNumElements()); - int nnz = static_cast(A->getLocalMatrixHost().graph.entries.extent(0)); - - // Generate Lookup Tables - int v_log_max = ceil(log(nnz) / log(2))+1; - PM.stream_vector_make_table(nrepeat,v_log_max); - - int m_log_max = 15; - PM.pingpong_make_table(nrepeat,m_log_max,comm); - - - if(A->hasCrsGraph()) { - auto importer = A->getCrsGraph()->getImporter(); - if(!importer.is_null()) { - size_t recv_size = importer->getRemoteLIDs().size() * sizeof(SC); - size_t send_size = importer->getExportLIDs().size() * sizeof(SC); - int local_log_max = ceil(log(std::max(send_size,recv_size)) / log(2))+1; - int global_log_max=local_log_max; - Teuchos::reduceAll(*comm,Teuchos::REDUCE_MAX,1,&local_log_max,&global_log_max); - PM.halopong_make_table(nrepeat,global_log_max, importer); - } - } - - if(verbose && rank == 0) { - std::cout<<"********************************************************"< SPMV_num_objects(NUM_TIMERS), SPMV_object_size(NUM_TIMERS), SPMV_corrected(NUM_TIMERS); - - - // Composite model: Use latency correction - SPMV_num_objects[0] = nnz; SPMV_object_size[0] = sizeof(LO); SPMV_corrected[0] = 1;// colind - SPMV_num_objects[1] = (m + 1); SPMV_object_size[1] = sizeof(rowptr_type); SPMV_corrected[1] = 1;// rowptr - SPMV_num_objects[2] = nnz; 
SPMV_object_size[2] = sizeof(SC); SPMV_corrected[2] = 1;// vals - SPMV_num_objects[3] = n; SPMV_object_size[3] = sizeof(SC); SPMV_corrected[3] = 1; // x - SPMV_num_objects[4] = m; SPMV_object_size[4] = sizeof(SC); SPMV_corrected[4] = 1;// y - - // All-Model: Do not use latency correction - SPMV_object_size[5] = 1; - SPMV_num_objects[5] = (m+1)*sizeof(rowptr_type) + nnz*sizeof(LO) + nnz*sizeof(SC) + - n*sizeof(SC) + m*sizeof(SC); - SPMV_corrected[5] = 0; - - - std::vector gb_per_sec(NUM_TIMERS); - if(verbose && rank == 0) - std::cout<<"****** Local Time Model Results ******"< 1) { - Teuchos::reduceAll(*comm, Teuchos::REDUCE_SUM, NUM_TIMES, &all_times_local[0], &avg_times[0]); - Teuchos::reduceAll(*comm, Teuchos::REDUCE_MAX, NUM_TIMES, &all_times_local[0], &max_times[0]); - for(int i=0; i timer_names = {"MV MKL: Total", - "MV KK: Total", - "MV Tpetra: Total", - "MV CuSparse: Total", - "MV MagmaSparse: Total", - "MV HYPRE: Total", - "MV Petsc: Total"}; - - if(rank == 0) { - if(!globalTimeMonitor.is_null()) { - const std::string l[NUM_TIMES]={"Comp","Comp+ping+inplace","Comp+ping+ooplace","Comp+halo+inplace","Comp+halo+ooplace", - "All", "All+ping+inplace", "All+ping+ooplace", "All+halo+inplace", "All+halo+ooplace"}; - const std::string div={"-------------------"}; - printf("%-60s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n","Timer",l[0].c_str(),l[1].c_str(),l[2].c_str(),l[3].c_str(),l[4].c_str(), - l[5].c_str(),l[6].c_str(),l[7].c_str(),l[8].c_str(),l[9].c_str()); - printf("%-60s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n","-----",div.c_str(),div.c_str(),div.c_str(),div.c_str(),div.c_str(), - div.c_str(),div.c_str(),div.c_str(),div.c_str(),div.c_str()); - for(int i=0; i<(int)timer_names.size(); i++) { - Teuchos::RCP t = globalTimeMonitor->lookupCounter(timer_names[i]); - if(!t.is_null()) { - double time_per_call = t->totalElapsedTime() / t->numCalls(); - printf("%-60s %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f 
%20.2f\n",timer_names[i], - max_times[0]/time_per_call,max_times[1]/time_per_call,max_times[2]/time_per_call,max_times[3]/time_per_call,max_times[4]/time_per_call, - max_times[5]/time_per_call,max_times[6]/time_per_call,max_times[7]/time_per_call,max_times[8]/time_per_call,max_times[9]/time_per_call); - - } - } - } - else { - std::cout<<"Note: Minimum time model individual timers only work with stacked timers off."<(A,nrepeat,verboseModel); + std::vector timer_names = {"MV MKL: Total", + "MV KK: Total", + "MV Tpetra: Total", + "MV CuSparse: Total", + "MV MagmaSparse: Total", + "MV HYPRE: Total", + "MV Petsc: Total"}; + + MueLu::report_spmv_performance_models(A,nrepeat,timer_names,globalTimeMonitor,"",verboseModel); globalTimeMonitor = Teuchos::null; success = true; diff --git a/packages/muelu/test/scaling/MueLu_PerfModelReporter.hpp b/packages/muelu/test/scaling/MueLu_PerfModelReporter.hpp new file mode 100644 index 000000000000..022d4b77e298 --- /dev/null +++ b/packages/muelu/test/scaling/MueLu_PerfModelReporter.hpp @@ -0,0 +1,378 @@ +// @HEADER +// +// *********************************************************************** +// +// MueLu: A package for multigrid based preconditioning +// Copyright 2012 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact +// Jonathan Hu (jhu@sandia.gov) +// Andrey Prokopenko (aprokop@sandia.gov) +// Ray Tuminaro (rstumin@sandia.gov) +// +// *********************************************************************** +// +// @HEADER +#ifndef MUELU_PERFMODEL_REPORTER_HPP +#define MUELU_PERFMODEL_REPORTER_HPP +#include + +#include +#include + +#include "MueLu_PerfModels.hpp" + +namespace MueLu { + + + +// ========================================================================= +// Performance Routines +// ========================================================================= +// Report bandwidth in GB / sec + + +/* @brief Generate performance model reports for SPMV of a given matrix + Input: + A - Matrix to evaluate + nrepeat - # times to repeat the tests in model construction + timer_names - Names of Teuchos timers which are SPMVs to compare against the model + verbose - Enable verbose output +*/ +template +void report_spmv_performance_models(const Teuchos::RCP & A, int nrepeat,const std::vector & 
timer_names , Teuchos::RCP & myTimeMonitor, const std::string prefix = "", bool verbose=false) { + using Teuchos::RCP; + using Teuchos::rcp; + const RCP > comm = A->getMap()->getComm(); + using SC = typename Matrix::scalar_type; + using LO = typename Matrix::local_ordinal_type; + using GO = typename Matrix::global_ordinal_type; + using NO = typename Matrix::node_type; + + const double GB = 1024.0 * 1024.0 * 1024.0; + + + // Conversion function as a lambda + auto convert_time_to_bandwidth_gbs = [=](double time, int num_calls, double memory_per_call_bytes) { + double time_per_call = time / num_calls; + + return memory_per_call_bytes / GB / time_per_call; + }; + + // NOTE: We've hardwired this to size_t for the rowptr. This really should get read out of a typedef, + // if Tpetra actually had one + using rowptr_type = size_t; + + // Make a new model if we need one + MueLu::PerfModels PM; + + int rank = comm->getRank(); int nproc = comm->getSize(); + int m = static_cast(A->getLocalNumRows()); + int n = static_cast(A->getColMap()->getLocalNumElements()); + int nnz = static_cast(A->getLocalMatrixHost().graph.entries.extent(0)); + + // Generate Lookup Tables + int v_log_max = ceil(log(nnz) / log(2))+1; + PM.stream_vector_make_table(nrepeat,v_log_max); + + int m_log_max = 15; + PM.pingpong_make_table(nrepeat,m_log_max,comm); + + + if(A->hasCrsGraph()) { + auto importer = A->getCrsGraph()->getImporter(); + if(!importer.is_null()) { + size_t recv_size = importer->getRemoteLIDs().size() * sizeof(SC); + size_t send_size = importer->getExportLIDs().size() * sizeof(SC); + int local_log_max = ceil(log(std::max(send_size,recv_size)) / log(2))+1; + int global_log_max=local_log_max; + Teuchos::reduceAll(*comm,Teuchos::REDUCE_MAX,1,&local_log_max,&global_log_max); + PM.halopong_make_table(nrepeat,global_log_max, importer); + } + } + + if(verbose && rank == 0) { + std::cout< SPMV_num_objects(NUM_TIMERS), SPMV_object_size(NUM_TIMERS), SPMV_corrected(NUM_TIMERS); + + + // 
Composite model: Use latency correction + SPMV_num_objects[0] = nnz; SPMV_object_size[0] = sizeof(LO); SPMV_corrected[0] = 1;// colind + SPMV_num_objects[1] = (m + 1); SPMV_object_size[1] = sizeof(rowptr_type); SPMV_corrected[1] = 1;// rowptr + SPMV_num_objects[2] = nnz; SPMV_object_size[2] = sizeof(SC); SPMV_corrected[2] = 1;// vals + SPMV_num_objects[3] = n; SPMV_object_size[3] = sizeof(SC); SPMV_corrected[3] = 1; // x + SPMV_num_objects[4] = m; SPMV_object_size[4] = sizeof(SC); SPMV_corrected[4] = 1;// y + + // All-Model: Do not use latency correction + SPMV_object_size[5] = 1; + SPMV_num_objects[5] = (m+1)*sizeof(rowptr_type) + nnz*sizeof(LO) + nnz*sizeof(SC) + + n*sizeof(SC) + m*sizeof(SC); + SPMV_corrected[5] = 0; + + + std::vector gb_per_sec(NUM_TIMERS); + if(verbose && rank == 0) + std::cout< 1) { + Teuchos::reduceAll(*comm, Teuchos::REDUCE_SUM, NUM_TIMES, &all_times_local[0], &avg_times[0]); + Teuchos::reduceAll(*comm, Teuchos::REDUCE_MAX, NUM_TIMES, &all_times_local[0], &max_times[0]); + for(int i=0; i 0) { + const std::string l[NUM_TIMES]={"Comp","Comp+ping+inplace","Comp+ping+ooplace","Comp+halo+inplace","Comp+halo+ooplace", + "All", "All+ping+inplace", "All+ping+ooplace", "All+halo+inplace", "All+halo+ooplace"}; + const std::string div={"-------------------"}; + printf("%-60s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n","Timer",l[0].c_str(),l[1].c_str(),l[2].c_str(),l[3].c_str(),l[4].c_str(), + l[5].c_str(),l[6].c_str(),l[7].c_str(),l[8].c_str(),l[9].c_str()); + printf("%-60s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n","-----",div.c_str(),div.c_str(),div.c_str(),div.c_str(),div.c_str(), + div.c_str(),div.c_str(),div.c_str(),div.c_str(),div.c_str()); + for(int i=0; i<(int)timer_names.size(); i++) { + Teuchos::RCP t = myTimeMonitor->lookupCounter(timer_names[i]); + if(!t.is_null()) { + double time_per_call = t->totalElapsedTime() / t->numCalls(); + printf("%-60s %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f 
%20.2f\n",timer_names[i], + max_times[0]/time_per_call,max_times[1]/time_per_call,max_times[2]/time_per_call,max_times[3]/time_per_call,max_times[4]/time_per_call, + max_times[5]/time_per_call,max_times[6]/time_per_call,max_times[7]/time_per_call,max_times[8]/time_per_call,max_times[9]/time_per_call); + + } + } + } + else { + std::cout<