Skip to content

Commit

Permalink
Merge Pull Request #12084 from trilinos/Trilinos/csiefer-e46c1b9
Browse files Browse the repository at this point in the history
Automatically Merged using Trilinos Pull Request AutoTester
PR Title: MueLu: Adding SPMV PerfModel as Driver option
PR Author: csiefer2
  • Loading branch information
trilinos-autotester authored Jul 28, 2023
2 parents 74523c6 + e46c1b9 commit 15c7c1a
Show file tree
Hide file tree
Showing 6 changed files with 459 additions and 338 deletions.
16 changes: 10 additions & 6 deletions packages/muelu/src/Utils/MueLu_PerfModels_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ namespace MueLu {

/* This version is for table interpolation and works on chars, so the LOG_MAX_SIZE is for bytes */
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20);
bool has_stream_vector_table() const {return stream_sizes_.size() > 0;}

/* Lookup in the stream_vector table */
double stream_vector_copy_lookup(int SIZE_IN_BYTES);
Expand All @@ -91,8 +92,8 @@ namespace MueLu {
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES);

/* Print table */
void print_stream_vector_table(std::ostream & out);
void print_latency_corrected_stream_vector_table(std::ostream & out);
void print_stream_vector_table(std::ostream & out, const std::string & prefix="");
void print_latency_corrected_stream_vector_table(std::ostream & out, const std::string & prefix="");

/* A latency test between two processes based upon the MVAPICH OSU Micro-Benchmarks.
* The sender process sends a message and then waits for confirmation of reception.
Expand All @@ -102,13 +103,14 @@ namespace MueLu {
* See further: https://mvapich.cse.ohio-state.edu/benchmarks/
*/
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm);
bool has_pingpong_table() const {return pingpong_sizes_.size() > 0;}

/* Lookup in the pingpong_vector table */
double pingpong_host_lookup(int SIZE_IN_BYTES);
double pingpong_device_lookup(int SIZE_IN_BYTES);

/* Print table */
void print_pingpong_table(std::ostream & out);
void print_pingpong_table(std::ostream & out, const std::string & prefix="");

/* A halo-exchange based ping-pong, inspired by halo-mode in MPPTEST from ANL.
* Here we use exactly the communication pattern specified in the import object
Expand All @@ -118,13 +120,14 @@ namespace MueLu {
* See further: https://www.mcs.anl.gov/research/projects/mpi/mpptest/
*/
void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal,GlobalOrdinal,Node> > & import);
bool has_halopong_table() const {return halopong_sizes_.size() > 0;}

/* Lookup in the halopong_vector table */
double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE);

/* Print table */
void print_halopong_table(std::ostream & out);
void print_halopong_table(std::ostream & out, const std::string & prefix="");



Expand All @@ -133,15 +136,16 @@ namespace MueLu {
* e.g., GPUS.
*/
void launch_latency_make_table(int KERNEL_REPEATS);
bool has_launch_latency_table() const {return launch_and_wait_latency_ > 0;}

/* Lookup launch latency */
double launch_latency_lookup();

/* Print table */
void print_launch_latency_table(std::ostream & out);
void print_launch_latency_table(std::ostream & out, const std::string & prefix="");

private:
void print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction);
void print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction, const std::string & prefix);


std::vector<int> stream_sizes_;
Expand Down
46 changes: 28 additions & 18 deletions packages/muelu/src/Utils/MueLu_PerfModels_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,31 +422,33 @@ namespace MueLu {

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table(std::ostream & out) {
print_stream_vector_table_impl(out,false);
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table(std::ostream & out, const std::string & prefix) {
print_stream_vector_table_impl(out,false,prefix);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_latency_corrected_stream_vector_table(std::ostream & out) {
print_stream_vector_table_impl(out,true);
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_latency_corrected_stream_vector_table(std::ostream & out, const std::string & prefix) {
print_stream_vector_table_impl(out,true,prefix);
}


template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction) {
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table_impl(std::ostream & out,bool use_latency_correction, const std::string & prefix) {
using namespace std;
std::ios old_format(NULL);
old_format.copyfmt(out);

out << setw(20) << "Length in Scalars" << setw(1) << " "
out << prefix
<< setw(20) << "Length in Scalars" << setw(1) << " "
<< setw(20) << "COPY (us)" << setw(1) << " "
<< setw(20) << "ADD (us)" << setw(1) << " "
<< setw(20) << "COPY (GB/s)" << setw(1) << " "
<< setw(20) << "ADD (GB/s)" << std::endl;

out << setw(20) << "-----------------" << setw(1) << " "
out << prefix
<< setw(20) << "-----------------" << setw(1) << " "
<< setw(20) << "---------" << setw(1) << " "
<< setw(20) << "--------" << setw(1) << " "
<< setw(20) << "-----------" << setw(1) << " "
Expand All @@ -462,7 +464,8 @@ namespace MueLu {
double a_bw = PerfDetails::convert_time_to_bandwidth_gbs(a_time,1,size*sizeof(Scalar));


out << setw(20) << size << setw(1) << " "
out << prefix
<< setw(20) << size << setw(1) << " "
<< setw(20) << fixed << setprecision(4) << (c_time*1e6) << setw(1) << " "
<< setw(20) << fixed << setprecision(4) << (a_time*1e6) << setw(1) << " "
<< setw(20) << fixed << setprecision(4) << c_bw << setw(1) << " "
Expand Down Expand Up @@ -502,18 +505,20 @@ namespace MueLu {

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_pingpong_table(std::ostream & out) {
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_pingpong_table(std::ostream & out, const std::string & prefix) {
if(pingpong_sizes_.size() == 0) return;

using namespace std;
std::ios old_format(NULL);
old_format.copyfmt(out);

out << setw(20) << "Message Size" << setw(1) << " "
out << prefix
<< setw(20) << "Message Size" << setw(1) << " "
<< setw(20) << "Host (us)" << setw(1) << " "
<< setw(20) << "Device (us)" << std::endl;

out << setw(20) << "------------" << setw(1) << " "
out << prefix
<< setw(20) << "------------" << setw(1) << " "
<< setw(20) << "---------" << setw(1) << " "
<< setw(20) << "-----------" << std::endl;

Expand All @@ -524,7 +529,8 @@ namespace MueLu {
double d_time = pingpong_device_times_[i];


out << setw(20) << size << setw(1) << " "
out << prefix
<< setw(20) << size << setw(1) << " "
<< setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) << " "
<< setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl;
}
Expand Down Expand Up @@ -562,18 +568,20 @@ namespace MueLu {

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_halopong_table(std::ostream & out) {
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_halopong_table(std::ostream & out, const std::string & prefix) {
if(halopong_sizes_.size() == 0) return;

using namespace std;
std::ios old_format(NULL);
old_format.copyfmt(out);

out << setw(20) << "Message Size" << setw(1) << " "
out << prefix
<< setw(20) << "Message Size" << setw(1) << " "
<< setw(20) << "Host (us)" << setw(1) << " "
<< setw(20) << "Device (us)" << std::endl;

out << setw(20) << "------------" << setw(1) << " "
out << prefix
<< setw(20) << "------------" << setw(1) << " "
<< setw(20) << "---------" << setw(1) << " "
<< setw(20) << "-----------" << std::endl;

Expand All @@ -584,7 +592,8 @@ namespace MueLu {
double d_time = halopong_device_times_[i];


out << setw(20) << size << setw(1) << " "
out << prefix
<< setw(20) << size << setw(1) << " "
<< setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) << " "
<< setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl;
}
Expand Down Expand Up @@ -629,12 +638,13 @@ namespace MueLu {

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_launch_latency_table(std::ostream & out) {
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_launch_latency_table(std::ostream & out, const std::string & prefix) {
using namespace std;
std::ios old_format(NULL);
old_format.copyfmt(out);

out << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " "
out << prefix
<< setw(20) << "Launch+Wait Latency (us)" << setw(1) << " "
<< setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_*1e6) << std::endl;

out.copyfmt(old_format);
Expand Down
9 changes: 9 additions & 0 deletions packages/muelu/test/scaling/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,15 @@ IF (${PACKAGE_NAME}_HAVE_TPETRA_SOLVER_STACK OR ${PACKAGE_NAME}_HAVE_EPETRA_SOLV

INSTALL(TARGETS "${PACKAGE_NAME}_Driver")

# Perf Model
TRIBITS_ADD_TEST(
Driver
NAME PerformanceModel
COMM mpi
ARGS "--nx=40 --ny=40 --nz=40 --matrixType=Laplace3D --performance-model=verbose"
PASS_REGULAR_EXPRESSION "Belos converged"
)

# Do a simple weak scaling experiment (4x ranks and 4x grid size)
TRIBITS_ADD_TEST(
Driver
Expand Down
24 changes: 23 additions & 1 deletion packages/muelu/test/scaling/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include <iomanip>
#include <iostream>
#include <unistd.h>
#include <vector>
#include <sys/resource.h>

#include <Teuchos_XMLParameterListHelpers.hpp>
Expand Down Expand Up @@ -77,6 +78,7 @@
#include <MueLu_MutuallyExclusiveTime.hpp>
#include <MueLu_ParameterListInterpreter.hpp>
#include <MueLu_Utilities.hpp>
#include <MueLu_PerfModelReporter.hpp>
#include <MatrixLoad.hpp>
#include <DriverCore.hpp>

Expand Down Expand Up @@ -277,8 +279,9 @@ int main_(Teuchos::CommandLineProcessor &clp, Xpetra::UnderlyingLib& lib, int ar
int numReruns = 1; clp.setOption("reruns", &numReruns, "number of reruns");
std::string rerunFilePrefix; clp.setOption("fileprefix", &rerunFilePrefix, "if doing reruns, optional prefix to prepend to output files");
std::string rerunFileSuffix; clp.setOption("filesuffix", &rerunFileSuffix, "if doing reruns, optional suffix to append to output files");

std::string levelPerformanceModel = "no"; clp.setOption("performance-model", &levelPerformanceModel, "runs the level-by-level performance mode options- 'no', 'yes' or 'verbose'");
clp.recogniseAllOptions(true);

switch (clp.parse(argc, argv)) {
case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED: return EXIT_SUCCESS;
case Teuchos::CommandLineProcessor::PARSE_ERROR:
Expand Down Expand Up @@ -542,6 +545,25 @@ MueLu::MueLu_AMGX_initialize_plugins();


tm = Teuchos::null;


// If we want Level-specific performance model diagnostics, now is the time!
if( (levelPerformanceModel=="yes" || levelPerformanceModel=="verbose")
&& !H.is_null()) {
for(int i=0; i < H->GetNumLevels(); i++) {
RCP<Level> level = H->GetLevel(i);
try {
RCP<Matrix> A_level = level->Get<RCP<Matrix> >("A");
std::string level_name = std::string("Level-") + std::to_string(i) + std::string(": ");
std::vector<const char *> timers;//MueLu: Laplace2D: Hierarchy: Solve (level=0)
MueLu::report_spmv_performance_models<Matrix>(A_level,100,timers,globalTimeMonitor,level_name,levelPerformanceModel=="verbose");
}
catch(...) {;}
}
}



globalTimeMonitor = Teuchos::null;
if (useStackedTimer)
resetStackedTimer = true;
Expand Down
Loading

0 comments on commit 15c7c1a

Please sign in to comment.