From 13dbcbeaab89f7470e73a5a3af9ec9c6b93beb99 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Mon, 10 Aug 2020 15:39:56 -0700 Subject: [PATCH 1/5] Fixing bug in memcpy activity The stream ID wasn't getting captured, causing overlapping timers in the OTF2 trace. --- src/apex/activity_trace_async.cpp | 149 ++++++++++++++++++------------ 1 file changed, 88 insertions(+), 61 deletions(-) diff --git a/src/apex/activity_trace_async.cpp b/src/apex/activity_trace_async.cpp index 1ec5a965..83f71a4b 100644 --- a/src/apex/activity_trace_async.cpp +++ b/src/apex/activity_trace_async.cpp @@ -284,7 +284,7 @@ static void memoryActivity(CUpti_Activity *record) { CUpti_ActivityMemcpy *memcpy = (CUpti_ActivityMemcpy *) record; std::string name{getMemcpyKindString(memcpy->copyKind)}; store_profiler_data(name, memcpy->correlationId, memcpy->start, - memcpy->end, memcpy->deviceId, memcpy->contextId, 0); + memcpy->end, memcpy->deviceId, memcpy->contextId, memcpy->streamId); if (apex::apex_options::use_cuda_counters()) { store_counter_data("GPU: Bytes", name, memcpy->end, memcpy->bytes, true); @@ -307,6 +307,8 @@ static void unifiedMemoryActivity(CUpti_Activity *record) { CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD || memcpy->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH) { + // The context isn't available, and the streamID isn't valid + // (per CUPTI documentation) store_profiler_data(name, 0, memcpy->start, memcpy->end, device, 0, 0); if (apex::apex_options::use_cuda_counters()) { @@ -661,6 +663,75 @@ bool getBytesIfMalloc(CUpti_CallbackId id, const void* params, std::string conte return true; } +void register_new_context(const void *params) { + //printf("New Context\n"); + APEX_UNUSED(params); +#if 0 + CUpti_ResourceData * rd = (CUpti_ResourceData*)(params); + /* Register for async activity ON THIS CONTEXT! 
*/ + CUPTI_CALL(cuptiActivityEnableContext(rd->context, + CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); // 10 + CUPTI_CALL(cuptiActivityEnableContext(rd->context, + CUPTI_ACTIVITY_KIND_MEMCPY)); // 1 + CUPTI_CALL(cuptiActivityEnableContext(rd->context, + CUPTI_ACTIVITY_KIND_MEMCPY2)); // 22 + CUPTI_CALL(cuptiActivityEnableContext(rd->context, + CUPTI_ACTIVITY_KIND_MEMSET)); // 2 +#else + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); // 10 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); // 1 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY2)); // 22 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); // 2 +#endif + //CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OPENACC_DATA)); // 33 + //CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH)); // 34 +#if 0 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); // 3 <- disables concurrency + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); // 4 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); // 5 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EVENT)); // 6 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_METRIC)); // 7 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); // 8 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); // 9 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME)); // 11 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER)); // 12 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER_DATA)); // 13 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR)); // 14 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS)); // 15 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_BRANCH)); // 16 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); // 17 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CDP_KERNEL)); // 18 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PREEMPTION)); // 19 + 
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_ENVIRONMENT)); // 20 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EVENT_INSTANCE)); // 21 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_METRIC_INSTANCE)); // 23 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION)); // 24 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER)); // 25 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_FUNCTION)); // 26 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MODULE)); // 27 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE)); // 28 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_SHARED_ACCESS)); // 29 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING)); // 30 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO)); // 31 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION)); // 32 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OPENACC_OTHER)); // 35 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CUDA_EVENT)); // 36 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_STREAM)); // 37 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); // 38 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); // 39 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NVLINK)); // 40 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT)); // 41 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE)); // 42 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC)); // 43 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE)); // 44 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMORY)); // 45 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PCIE)); // 46 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OPENMP)); // 47 + 
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API)); // 48 + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_COUNT)); // 49 +#endif +} + void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain, CUpti_CallbackId id, const void *params) { static bool initialized = initialize_first_time(); @@ -677,6 +748,13 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain, APEX_UNUSED(domain); if (!apex::thread_instance::is_worker()) { return; } if (params == NULL) { return; } + + if (domain == CUPTI_CB_DOMAIN_RESOURCE && + id == CUPTI_CBID_RESOURCE_CONTEXT_CREATED) { + register_new_context(params); + return; + } + CUpti_CallbackData * cbdata = (CUpti_CallbackData*)(params); if (cbdata->callbackSite == CUPTI_API_ENTER) { @@ -747,72 +825,19 @@ void initTrace() { if (apex::apex_options::use_cuda_driver_api()) { CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API)); } + // Make sure we see CUPTI_CBID_RESOURCE_CONTEXT_CREATED events! + CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE)); /* These events aren't begin/end callbacks, so no need to support them. 
*/ //CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_SYNCHRONIZE)); - //CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE)); //CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_NVTX)); - /* Register for async activity */ - - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); // 10 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); // 1 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY2)); // 22 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); // 2 - //CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OPENACC_DATA)); // 33 - //CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH)); // 34 -#if 0 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); // 3 <- disables concurrency - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); // 4 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); // 5 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EVENT)); // 6 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_METRIC)); // 7 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); // 8 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); // 9 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME)); // 11 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER)); // 12 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER_DATA)); // 13 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR)); // 14 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS)); // 15 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_BRANCH)); // 16 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); // 17 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CDP_KERNEL)); // 18 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PREEMPTION)); // 19 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_ENVIRONMENT)); // 20 - 
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EVENT_INSTANCE)); // 21 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_METRIC_INSTANCE)); // 23 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION)); // 24 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER)); // 25 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_FUNCTION)); // 26 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MODULE)); // 27 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE)); // 28 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_SHARED_ACCESS)); // 29 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING)); // 30 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO)); // 31 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION)); // 32 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OPENACC_OTHER)); // 35 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CUDA_EVENT)); // 36 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_STREAM)); // 37 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); // 38 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); // 39 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NVLINK)); // 40 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT)); // 41 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE)); // 42 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC)); // 43 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE)); // 44 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMORY)); // 45 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PCIE)); // 46 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OPENMP)); // 47 - CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API)); // 48 - 
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_COUNT)); // 49 -#endif - // synchronize timestamps startTimestampCPU = apex::profiler::get_time_ns(); cuptiGetTimestamp(&startTimestampGPU); // assume CPU timestamp is greater than GPU deltaTimestamp = (int64_t)(startTimestampCPU) - (int64_t)(startTimestampGPU); - printf("Delta computed to be: %ld\n", deltaTimestamp); + //printf("Delta computed to be: %ld\n", deltaTimestamp); } /* This is the global "shutdown" method for flushing the buffer. This is @@ -821,10 +846,12 @@ void initTrace() { namespace apex { void flushTrace(void) { if ((num_buffers_processed + 10) < num_buffers) { - flushing = true; - std::cout << "Flushing remaining " << std::fixed - << num_buffers-num_buffers_processed << " of " << num_buffers - << " CUDA/CUPTI buffers..." << std::endl; + if (apex::instance()->get_node_id() == 0) { + //flushing = true; + std::cout << "Flushing remaining " << std::fixed + << num_buffers-num_buffers_processed << " of " << num_buffers + << " CUDA/CUPTI buffers..." << std::endl; + } } cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_NONE); if (flushing) { From 788ee8d2ba01a84fd85ba40255fceaf82739ce8f Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Mon, 10 Aug 2020 15:40:35 -0700 Subject: [PATCH 2/5] Add MPI_Finalize wrapper When configuring APEX with MPI support, wrap the MPI_Finalize function so that we can use MPI functions during OTF2 event unification instead of the filesystem. 
--- src/apex/apex.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/apex/apex.cpp b/src/apex/apex.cpp index 0fee121f..11509aa6 100644 --- a/src/apex/apex.cpp +++ b/src/apex/apex.cpp @@ -78,6 +78,10 @@ thread_instance::get_id(), __func__, __LINE__); fflush(stdout); #define FUNCTION_EXIT #endif +#if defined(APEX_HAVE_MPI) +#include "mpi.h" +#endif + APEX_NATIVE_TLS bool _registered = false; APEX_NATIVE_TLS bool _exited = false; static bool _initialized = false; @@ -2051,6 +2055,18 @@ extern "C" { return hardware_concurrency(); } +/* When running with MPI and OTF (or other event unification at the end of + * execution) we need to finalize APEX before MPI_Finalize() is called, so + * that we can use MPI for the wrap-up. We can override the weak MPI + * implementation of Finalize, and do what we need to. */ +#if defined(APEX_HAVE_MPI) + int MPI_Finalize(void) { + apex::finalize(); + int retval = PMPI_Finalize(); + apex::cleanup(); + return retval; + } +#endif } // extern "C" From bba82073826607a0f21a6b70199006a48fdc4a73 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Mon, 10 Aug 2020 15:41:29 -0700 Subject: [PATCH 3/5] Unify the final timestamp At the end of execution, exchange final timestamps so that the OTF2 trace has an accurate final timestamp. --- src/apex/otf2_listener.cpp | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/apex/otf2_listener.cpp b/src/apex/otf2_listener.cpp index c83fd381..8a744b95 100644 --- a/src/apex/otf2_listener.cpp +++ b/src/apex/otf2_listener.cpp @@ -1519,6 +1519,7 @@ namespace apex { // first, output our number of threads. //metric_file << thread_instance::get_num_threads() << endl; metric_file << _event_threads.size() << endl; + metric_file << saved_end_timestamp << endl; // then iterate over the metrics and write them out. 
for (auto const &i : global_metric_indices) { string id = i.first; @@ -1587,6 +1588,12 @@ namespace apex { std::getline(metric_file, metric_line); std::string::size_type sz; // alias of size_t rank_thread_map[i] = std::stoi(metric_line,&sz); + // get the last timestamp + std::getline(metric_file, metric_line); + uint64_t tmp_timestamp = std::stol(metric_line, &sz); + if (saved_end_timestamp < tmp_timestamp) { + saved_end_timestamp = tmp_timestamp; + } // read the map from that rank while (std::getline(metric_file, metric_line)) { rank_metric_map[i] = rank_metric_map[i] + 1; @@ -1935,6 +1942,7 @@ namespace apex { // first, output our number of threads. //metric_file << thread_instance::get_num_threads() << endl; metric_file << _event_threads.size() << endl; + metric_file << saved_end_timestamp << endl; // then iterate over the metrics and write them out. for (auto const &i : global_metric_indices) { string id = i.first; @@ -1986,6 +1994,12 @@ namespace apex { std::getline(metric_file, metric_line); std::string::size_type sz; // alias of size_t rank_thread_map[i] = std::stoi(metric_line,&sz); + // get the last timestamp + std::getline(metric_file, metric_line); + uint64_t tmp_timestamp = std::stol(metric_line, &sz); + if (saved_end_timestamp < tmp_timestamp) { + saved_end_timestamp = tmp_timestamp; + } // read the map from that rank while (std::getline(metric_file, metric_line)) { rank_metric_map[i] = rank_metric_map[i] + 1; @@ -2159,7 +2173,7 @@ namespace apex { << ".\nIgnoring event " << p->tt_ptr->task_id->get_name() << " with timestamp of " << stamp << " after last event " << "with timestamp of " << last << std::endl; - */ + */ return; } // don't close the archive on us! 
From 359d78fd5e4e000b436fa9a440d22a00cf4dd78f Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Mon, 10 Aug 2020 15:42:28 -0700 Subject: [PATCH 4/5] Don't finalize profiles if background stats not computed --- src/apex/profiler_listener.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/apex/profiler_listener.cpp b/src/apex/profiler_listener.cpp index 07610cd9..f289b8b6 100644 --- a/src/apex/profiler_listener.cpp +++ b/src/apex/profiler_listener.cpp @@ -1114,6 +1114,10 @@ node_color * get_node_color(double v,double vmin,double vmax) if (apex_options::use_tau()) { tau_listener::Tau_start_wrapper("profiler_listener::process_profiles"); } + /* + static auto prof = new_task(__func__); + start(prof); + */ std::shared_ptr p; task_dependency* td; @@ -1206,6 +1210,7 @@ node_color * get_node_color(double v,double vmin,double vmax) */ #endif + //stop(prof); if (apex_options::use_tau()) { tau_listener::Tau_stop_wrapper("profiler_listener::process_profiles"); } @@ -1406,7 +1411,9 @@ if (rc != 0) cout << "PAPI error! " << name << ": " << PAPI_strerror(rc) << endl if (ignored > 100000) { std::cerr << "done." << std::endl; } - finalize_profiles(data); + if (apex_options::process_async_state()) { + finalize_profiles(data); + } } if (apex_options::use_taskgraph_output() && node_id == 0) { From 5373cf4900ce199e152e0ec30def14a437a74466 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Mon, 10 Aug 2020 15:43:02 -0700 Subject: [PATCH 5/5] Adding MPI to some CUDA examples to test the event unification support. 
--- src/unit_tests/CUDA/CMakeLists.txt | 15 +++++++++++---- src/unit_tests/CUDA/apex_cuda.cu | 14 ++++++++++++++ src/unit_tests/CUDA/pi.cu | 18 ++++++++++++++++-- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/src/unit_tests/CUDA/CMakeLists.txt b/src/unit_tests/CUDA/CMakeLists.txt index 54ad957f..1e179b4a 100644 --- a/src/unit_tests/CUDA/CMakeLists.txt +++ b/src/unit_tests/CUDA/CMakeLists.txt @@ -1,5 +1,12 @@ +if(MPI_CXX_FOUND) + set(APEX_CUDA_CXX_FLAGS ${MPI_COMPILE_FLAGS}) + set(APEX_CUDA_C_FLAGS ${MPI_COMPILE_FLAGS}) + set(APEX_CUDA_EXTRA_INCLUDE ${MPI_CXX_INCLUDE_PATH}) + set(APEX_CUDA_CXX_LINK_FLAGS ${MPI_CXX_LINK_FLAGS} ${MPI_CXX_LIBRARIES}) +endif() + # Make sure the compiler can find include files from our Apex library. -include_directories (${APEX_SOURCE_DIR}/src/apex) +include_directories (${APEX_SOURCE_DIR}/src/apex ${APEX_CUDA_EXTRA_INCLUDE}) # Make sure the linker can find the Apex library once it is built. link_directories (${APEX_BINARY_DIR}/src/apex) @@ -18,14 +25,14 @@ endif (OPENMP_FOUND) message(INFO "Using CUDA libraries: ${CUDA_LIBRARIES}") -set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} ${APEX_CUDA_C_FLAGS}") +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} ${APEX_CUDA_CXX_FLAGS}") foreach(example_program ${example_programs}) set(sources ${example_program}.cu) source_group("Source Files" FILES ${sources}) add_executable("${example_program}_cu" ${sources}) - target_link_libraries ("${example_program}_cu" apex ${LIBS} OpenMP::OpenMP_CXX CUDA::cuda_driver CUDA::curand) + target_link_libraries ("${example_program}_cu" apex ${LIBS} OpenMP::OpenMP_CXX CUDA::cuda_driver CUDA::curand ${APEX_CUDA_CXX_LINK_FLAGS}) if (BUILD_STATIC_EXECUTABLES) set_target_properties("${example_program}_cu" PROPERTIES LINK_SEARCH_START_STATIC 1 LINK_SEARCH_END_STATIC 1) endif() diff --git 
a/src/unit_tests/CUDA/apex_cuda.cu b/src/unit_tests/CUDA/apex_cuda.cu index 4f405d0a..34e7f1c2 100644 --- a/src/unit_tests/CUDA/apex_cuda.cu +++ b/src/unit_tests/CUDA/apex_cuda.cu @@ -1,6 +1,9 @@ #include #include #include "apex_api.hpp" +#if defined(APEX_HAVE_MPI) +#include "mpi.h" +#endif #define ITERATIONS 4 @@ -36,9 +39,17 @@ void launch(DataElement *elem) { int main(int argc, char * argv[]) { +#if defined(APEX_HAVE_MPI) + MPI_Init(&argc, &argv); + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + apex::init("apex::cuda unit test", rank, size); +#else APEX_UNUSED(argc); APEX_UNUSED(argv); apex::init("apex::cuda unit test", 0, 1); +#endif apex::apex_options::use_screen_output(true); DataElement *e; RUNTIME_API_CALL(cudaMallocManaged((void**)&e, sizeof(DataElement))); @@ -56,6 +67,9 @@ int main(int argc, char * argv[]) RUNTIME_API_CALL(cudaFree(e->name)); RUNTIME_API_CALL(cudaFree(e)); +#if defined(APEX_HAVE_MPI) + MPI_Finalize(); +#endif apex::finalize(); apex::cleanup(); } diff --git a/src/unit_tests/CUDA/pi.cu b/src/unit_tests/CUDA/pi.cu index 70b148cb..ec5603de 100644 --- a/src/unit_tests/CUDA/pi.cu +++ b/src/unit_tests/CUDA/pi.cu @@ -4,6 +4,9 @@ #include #include #include "apex_api.hpp" +#if defined(APEX_HAVE_MPI) +#include "mpi.h" +#endif #define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \ printf("Error at %s Line %d: %s\n",__FILE__,__LINE__,cudaGetErrorString(x));}} while(0) @@ -26,12 +29,20 @@ __global__ void montecarlo(float* pt1, float* pt2, int* result, int total_thread } int main(int argc, char * argv[]) { - apex::init("apex cuda openmpi test", 0, 1); +#if defined(APEX_HAVE_MPI) + MPI_Init(&argc, &argv); + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + apex::init("apex::cuda PI test", rank, size); +#else + apex::init("apex cuda PI test", 0, 1); +#endif apex::apex_options::use_screen_output(true); omp_set_num_threads(2); int num_darts = 1<<23; // - int N = 
1<<29; // can't be more than 2^30 or memory errors + int N = 1<<25; // can't be more than 2^30 or memory errors int Nx = omp_get_num_threads()*2; // must be even, can be arbitrarily large int num_threads = 256; int num_blocks = 128; @@ -161,6 +172,9 @@ int main(int argc, char * argv[]) { free(results_host[i]); } free(results_host); +#if defined(APEX_HAVE_MPI) + MPI_Finalize(); +#endif apex::finalize(); apex::cleanup(); }