Merge Pull Request #12084 from trilinos/Trilinos/csiefer-e46c1b9

Automatically Merged using Trilinos Pull Request AutoTester PR Title: MueLu: Adding SPMV PerfModel as Driver option PR Author: csiefer2
trilinos · Jul 28, 2023 · 15c7c1a · 15c7c1a
2 parents 74523c6 + e46c1b9
commit 15c7c1a
Show file tree

Hide file tree

Showing 6 changed files with 459 additions and 338 deletions.
diff --git a/packages/muelu/src/Utils/MueLu_PerfModels_decl.hpp b/packages/muelu/src/Utils/MueLu_PerfModels_decl.hpp
@@ -78,6 +78,7 @@ namespace MueLu {
 
     /* This version is for table interpolation and works on chars, so the LOG_MAX_SIZE is for bytes */
     void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20);
+    bool has_stream_vector_table() const {return stream_sizes_.size() > 0;}
 
     /* Lookup in the stream_vector table */
     double stream_vector_copy_lookup(int SIZE_IN_BYTES);
@@ -91,8 +92,8 @@ namespace MueLu {
     double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES);   
 
     /* Print table */
-    void print_stream_vector_table(std::ostream & out);
-    void print_latency_corrected_stream_vector_table(std::ostream & out);
+    void print_stream_vector_table(std::ostream & out, const std::string & prefix="");
+    void print_latency_corrected_stream_vector_table(std::ostream & out, const std::string & prefix="");
 
     /* A latency test between two processes based upon the MVAPICH OSU Micro-Benchmarks.
      * The sender process sends a message and then waits for confirmation of reception.
@@ -102,13 +103,14 @@ namespace MueLu {
      * See further: https://mvapich.cse.ohio-state.edu/benchmarks/
      */    
     void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm);
+    bool has_pingpong_table() const {return pingpong_sizes_.size() > 0;}
 
     /* Lookup in the pingpong_vector table */
     double pingpong_host_lookup(int SIZE_IN_BYTES);
     double pingpong_device_lookup(int SIZE_IN_BYTES);
 
     /* Print table */
-    void print_pingpong_table(std::ostream & out);
+    void print_pingpong_table(std::ostream & out, const std::string & prefix="");
 
     /* A halo-exchange based ping-pong, inspired by halo-mode in MPPTEST from ANL.
      * Here we use exactly the communication pattern specified in the import object
@@ -118,13 +120,14 @@ namespace MueLu {
      * See further: https://www.mcs.anl.gov/research/projects/mpi/mpptest/
      */
     void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal,GlobalOrdinal,Node> > & import);
+    bool has_halopong_table() const {return halopong_sizes_.size() > 0;}
 
     /* Lookup in the halopong_vector table */
     double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
     double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
 
     /* Print table */
-    void print_halopong_table(std::ostream & out);
+    void print_halopong_table(std::ostream & out, const std::string & prefix="");
 
 
 
@@ -133,15 +136,16 @@ namespace MueLu {
      * e.g., GPUS.
      */
     void launch_latency_make_table(int KERNEL_REPEATS);
+    bool has_launch_latency_table() const  {return launch_and_wait_latency_ > 0;}
 
     /* Lookup launch latency */
     double launch_latency_lookup();
 
     /* Print table */
-    void print_launch_latency_table(std::ostream & out);
+    void print_launch_latency_table(std::ostream & out, const std::string & prefix="");
 
   private:
-    void print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction);    
+    void print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction, const std::string & prefix);    
 
 
     std::vector<int>    stream_sizes_;

diff --git a/packages/muelu/src/Utils/MueLu_PerfModels_def.hpp b/packages/muelu/src/Utils/MueLu_PerfModels_def.hpp
@@ -422,31 +422,33 @@ namespace MueLu {
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
-  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table(std::ostream & out) {
-    print_stream_vector_table_impl(out,false);
+  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table(std::ostream & out, const std::string & prefix) {
+    print_stream_vector_table_impl(out,false,prefix);
   }
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
-  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_latency_corrected_stream_vector_table(std::ostream & out) {
-    print_stream_vector_table_impl(out,true);
+  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_latency_corrected_stream_vector_table(std::ostream & out, const std::string & prefix) {
+    print_stream_vector_table_impl(out,true,prefix);
   }
 
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
-  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction) {
+  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction, const std::string & prefix) {
     using namespace std;
     std::ios old_format(NULL);
     old_format.copyfmt(out);
 
-    out << setw(20) << "Length in Scalars" << setw(1) << " "
+    out << prefix
+        << setw(20) << "Length in Scalars" << setw(1) << " "
         << setw(20) << "COPY (us)" << setw(1) << " "
         << setw(20) << "ADD (us)" << setw(1) << " "
         << setw(20) << "COPY (GB/s)" << setw(1) << " "
         << setw(20) << "ADD (GB/s)" << std::endl;
 
-    out << setw(20) << "-----------------" << setw(1) << " "
+    out << prefix
+        << setw(20) << "-----------------" << setw(1) << " "
         << setw(20) << "---------" << setw(1) << " "
         << setw(20) << "--------" << setw(1) << " "
         << setw(20) << "-----------" << setw(1) << " "
@@ -462,7 +464,8 @@ namespace MueLu {
       double a_bw = PerfDetails::convert_time_to_bandwidth_gbs(a_time,1,size*sizeof(Scalar));
 
 
-      out << setw(20) << size << setw(1) << " "
+      out << prefix
+          << setw(20) << size << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << (c_time*1e6) << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << (a_time*1e6) << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << c_bw << setw(1) << " "
@@ -502,18 +505,20 @@ namespace MueLu {
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
-  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_pingpong_table(std::ostream & out) {
+  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_pingpong_table(std::ostream & out, const std::string & prefix) {
     if(pingpong_sizes_.size() == 0) return;
 
     using namespace std;
     std::ios old_format(NULL);
     old_format.copyfmt(out);
 
-    out << setw(20) << "Message Size" << setw(1) << " "
+    out << prefix
+        << setw(20) << "Message Size" << setw(1) << " "
         << setw(20) << "Host (us)" << setw(1) << " "
         << setw(20) << "Device (us)" << std::endl;
 
-    out << setw(20) << "------------" << setw(1) << " "
+    out << prefix
+        << setw(20) << "------------" << setw(1) << " "
         << setw(20) << "---------" << setw(1) << " "
         << setw(20) << "-----------" << std::endl;
 
@@ -524,7 +529,8 @@ namespace MueLu {
       double d_time = pingpong_device_times_[i];
 
 
-      out << setw(20) << size << setw(1) << " "
+      out << prefix
+          << setw(20) << size << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl;
     }
@@ -562,18 +568,20 @@ namespace MueLu {
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
-  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_halopong_table(std::ostream & out) {
+  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_halopong_table(std::ostream & out, const std::string & prefix) {
     if(halopong_sizes_.size() == 0) return;
 
     using namespace std;
     std::ios old_format(NULL);
     old_format.copyfmt(out);
 
-    out << setw(20) << "Message Size" << setw(1) << " "
+    out << prefix
+        << setw(20) << "Message Size" << setw(1) << " "
         << setw(20) << "Host (us)" << setw(1) << " "
         << setw(20) << "Device (us)" << std::endl;
 
-    out << setw(20) << "------------" << setw(1) << " "
+    out << prefix 
+        << setw(20) << "------------" << setw(1) << " "
         << setw(20) << "---------" << setw(1) << " "
         << setw(20) << "-----------" << std::endl;
 
@@ -584,7 +592,8 @@ namespace MueLu {
       double d_time = halopong_device_times_[i];
 
 
-      out << setw(20) << size << setw(1) << " "
+      out << prefix
+          << setw(20) << size << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) << " "
           << setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl;
     }
@@ -629,12 +638,13 @@ namespace MueLu {
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void
-  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_launch_latency_table(std::ostream & out) {
+  PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_launch_latency_table(std::ostream & out, const std::string & prefix) {
     using namespace std;
     std::ios old_format(NULL);
     old_format.copyfmt(out);
 
-    out << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " " 
+    out << prefix
+        << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " " 
         << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_*1e6) << std::endl;
 
     out.copyfmt(old_format);

diff --git a/packages/muelu/test/scaling/CMakeLists.txt b/packages/muelu/test/scaling/CMakeLists.txt
@@ -40,6 +40,15 @@ IF (${PACKAGE_NAME}_HAVE_TPETRA_SOLVER_STACK OR ${PACKAGE_NAME}_HAVE_EPETRA_SOLV
 
   INSTALL(TARGETS "${PACKAGE_NAME}_Driver")
 
+  # Perf Model
+  TRIBITS_ADD_TEST(
+    Driver
+    NAME PerformanceModel
+    COMM mpi
+    ARGS "--nx=40 --ny=40 --nz=40 --matrixType=Laplace3D --performance-model=verbose"
+    PASS_REGULAR_EXPRESSION "Belos converged"
+  )
+
   # Do a simple weak scaling experiment (4x ranks and 4x grid size)
   TRIBITS_ADD_TEST(
     Driver

diff --git a/packages/muelu/test/scaling/Driver.cpp b/packages/muelu/test/scaling/Driver.cpp
@@ -47,6 +47,7 @@
 #include <iomanip>
 #include <iostream>
 #include <unistd.h>
+#include <vector>
 #include <sys/resource.h>
 
 #include <Teuchos_XMLParameterListHelpers.hpp>
@@ -77,6 +78,7 @@
 #include <MueLu_MutuallyExclusiveTime.hpp>
 #include <MueLu_ParameterListInterpreter.hpp>
 #include <MueLu_Utilities.hpp>
+#include <MueLu_PerfModelReporter.hpp>
 #include <MatrixLoad.hpp>
 #include <DriverCore.hpp>
 
@@ -277,8 +279,9 @@ int main_(Teuchos::CommandLineProcessor &clp, Xpetra::UnderlyingLib& lib, int ar
   int numReruns = 1;                                  clp.setOption("reruns",                &numReruns,  "number of reruns");
   std::string rerunFilePrefix;                             clp.setOption("fileprefix",              &rerunFilePrefix,      "if doing reruns, optional prefix to prepend to output files");
   std::string rerunFileSuffix;                             clp.setOption("filesuffix",              &rerunFileSuffix,      "if doing reruns, optional suffix to append to output files");
-
+  std::string  levelPerformanceModel  = "no";          clp.setOption("performance-model", &levelPerformanceModel,  "runs the level-by-level performance mode options- 'no', 'yes' or 'verbose'");
   clp.recogniseAllOptions(true);
+
   switch (clp.parse(argc, argv)) {
     case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:        return EXIT_SUCCESS;
     case Teuchos::CommandLineProcessor::PARSE_ERROR:
@@ -542,6 +545,25 @@ MueLu::MueLu_AMGX_initialize_plugins();
 
 
       tm = Teuchos::null;
+
+
+      // If we want Level-specific performance model diagnostics, now is the time!
+      if( (levelPerformanceModel=="yes" || levelPerformanceModel=="verbose")
+          && !H.is_null()) {
+        for(int i=0; i < H->GetNumLevels(); i++) {
+          RCP<Level> level = H->GetLevel(i);
+          try {
+            RCP<Matrix> A_level = level->Get<RCP<Matrix> >("A");            
+            std::string level_name = std::string("Level-") + std::to_string(i) + std::string(": ");
+            std::vector<const char *> timers;//MueLu: Laplace2D: Hierarchy: Solve (level=0)  
+            MueLu::report_spmv_performance_models<Matrix>(A_level,100,timers,globalTimeMonitor,level_name,levelPerformanceModel=="verbose");
+          }
+          catch(...) {;}
+        }        
+      }
+
+
+
       globalTimeMonitor = Teuchos::null;
       if (useStackedTimer)
         resetStackedTimer = true;