From 2f787503632ca1e425d0a47d104e9e49ec22e4d1 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Sun, 9 May 2021 09:37:48 -0700 Subject: [PATCH] Writing a memory wrapper report. There's a huge amount of CUPTI memory leaks, and they happen when the first real call to CUDA happens. I can't force that call, or ignore memory during the first "real" call, yet. --- src/apex/activity_trace_async.cpp | 94 ++++++++- src/apex/apex.cpp | 9 +- src/apex/apex_kokkos_tuning.cpp | 26 ++- src/apex/apex_options.cpp | 2 +- src/apex/apex_types.h | 22 +- src/apex/otf2_listener.cpp | 5 + src/apex/proc_read.cpp | 2 + src/apex/profile.hpp | 41 ++++ src/apex/profiler.hpp | 11 + src/apex/profiler_listener.cpp | 107 ++++++++-- src/apex/profiler_listener.hpp | 2 + src/scripts/apex_exec | 1 + src/wrappers/CMakeLists.txt | 3 + src/wrappers/memory_wrapper.cpp | 14 +- src/wrappers/memory_wrapper_internal.cpp | 235 ++++++++++++++++++++-- src/wrappers/pthread_wrapper_internal.cpp | 7 +- 16 files changed, 528 insertions(+), 53 deletions(-) diff --git a/src/apex/activity_trace_async.cpp b/src/apex/activity_trace_async.cpp index a28277a0..089ce37b 100644 --- a/src/apex/activity_trace_async.cpp +++ b/src/apex/activity_trace_async.cpp @@ -754,6 +754,7 @@ static void openmpActivity(CUpti_Activity *record) { } static void syncActivity(CUpti_Activity *record) { + // when tracking memory allocations, ignore the ones in cuda device synchronize CUpti_ActivitySynchronization *data = (CUpti_ActivitySynchronization *) record; /* Check whether there is timing information */ @@ -865,6 +866,8 @@ static void printActivity(CUpti_Activity *record) { void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) { + // when tracking memory allocations, ignore these + apex::in_apex prevent_nonsense; num_buffers++; uint8_t *bfr = (uint8_t *) malloc(BUF_SIZE + ALIGN_SIZE); if (bfr == NULL) { @@ -883,6 +886,8 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, //auto p = apex::scoped_timer("APEX: CUPTI Buffer Completed"); //printf("%s...", __func__); fflush(stdout); static bool registered = register_myself(false); + // when tracking memory allocations, ignore these + apex::in_apex prevent_nonsense; num_buffers_processed++; if (flushing) { std::cout << "." << std::flush; } APEX_UNUSED(registered); @@ -1380,6 +1385,8 @@ double get_nvtx_payload(const nvtxEventAttributes_t * eventAttrib) { void handle_nvtx_callback(CUpti_CallbackId id, const void *cbdata, std::stack >& timer_stack) { + // disable memory management tracking in APEX during this callback + apex::in_apex prevent_deadlocks; /* Unfortunately, when ranges are started/ended, they can overlap. * Unlike push/pop, which are a true stack. Even worse, CUDA/CUPTI @@ -1541,8 +1548,22 @@ void handle_nvtx_callback(CUpti_CallbackId id, const void *cbdata, } } +inline bool ignoreMalloc(CUpti_CallbackId id) { + if (id == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020 || + id == CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceCount_v3020 || + id == CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 || + id == CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 || + id == CUPTI_RUNTIME_TRACE_CBID_cudaChooseDevice_v3020 || + id == CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v3020) { + return true; + } + return false; +} + void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain, CUpti_CallbackId id, const void *params) { + // disable memory management tracking in APEX during this callback + apex::in_apex prevent_deadlocks; static bool initialized = initialize_first_time(); APEX_UNUSED(initialized); static APEX_NATIVE_TLS bool registered = register_myself(true); @@ -1563,7 +1584,7 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain, if (id == CUPTI_CBID_RESOURCE_CONTEXT_CREATED) { register_new_context(params); } - return; + //return; } /* Check for user-level instrumentation */ @@ -1575,6 +1596,8 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain, CUpti_CallbackData * cbdata = (CUpti_CallbackData*)(params); if (cbdata->callbackSite == CUPTI_API_ENTER) { + // sadly, CUPTI leaks a lot of memory. Don't track memory in CUPTI or CUDA. + // apex::in_apex::get()++; std::stringstream ss; ss << cbdata->functionName; if (apex::apex_options::use_cuda_kernel_details()) { @@ -1594,6 +1617,12 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain, map_mutex.unlock(); getBytesIfMalloc(id, cbdata->functionParams, tmp, true); } else { + // sadly, CUPTI leaks a lot of memory. Don't track memory in CUPTI or CUDA. + /* + if (cbdata->callbackSite == CUPTI_API_EXIT) { + apex::in_apex::get()--; + } + */ /* Not sure how to use this yet... if this is a kernel launch, we can * run a function on the host, launched from the stream. That gives us * a synchronous callback to tell us an event when the kernel finished. @@ -1640,6 +1669,9 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain, } void initTrace() { + // disable memory management tracking in APEX during this initialization + apex::in_apex prevent_deadlocks; + // make a first call into CUDA bool& registered = get_registered(); registered = true; @@ -1681,7 +1713,6 @@ void initTrace() { CUPTI_CALL(cuptiActivityEnableLatencyTimestamps(enable)); } - // get user-added instrumentation CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_NVTX)); // Make sure we see CUPTI_CBID_RESOURCE_CONTEXT_CREATED events! @@ -1724,4 +1755,63 @@ namespace apex { std::cout << std::endl; } } + + void finalizeCuda(void) { + std::cout << "Finalizing CUPTI " << std::endl; + flushTrace(); + CUPTI_CALL(cuptiUnsubscribe(subscriber)); + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER)); // 25 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); // 10 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); // 1 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY2)); // 22 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); // 2 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); // 38 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENACC_DATA)); // 33 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH)); // 34 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENACC_OTHER)); // 35 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENMP)); // 47 + #if 0 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL)); // 3 <- disables concurrency + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER)); // 4 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); // 5 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EVENT)); // 6 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_METRIC)); // 7 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); // 8 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); // 9 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); // 11 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); // 12 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER_DATA)); // 13 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR)); // 14 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS)); // 15 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_BRANCH)); // 16 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); // 17 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CDP_KERNEL)); // 18 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PREEMPTION)); // 19 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_ENVIRONMENT)); // 20 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EVENT_INSTANCE)); // 21 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_METRIC_INSTANCE)); // 23 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION)); // 24 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER)); // 25 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_FUNCTION)); // 26 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MODULE)); // 27 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE)); // 28 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SHARED_ACCESS)); // 29 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PC_SAMPLING)); // 30 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO)); // 31 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION)); // 32 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CUDA_EVENT)); // 36 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_STREAM)); // 37 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); // 38 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); // 39 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NVLINK)); // 40 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT)); // 41 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE)); // 42 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC)); // 43 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE)); // 44 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMORY)); // 45 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PCIE)); // 46 + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_COUNT)); // 49 + #endif + cuptiFinalize(); + } } diff --git a/src/apex/apex.cpp b/src/apex/apex.cpp index de50d182..8ac8c5aa 100644 --- a/src/apex/apex.cpp +++ b/src/apex/apex.cpp @@ -1110,12 +1110,16 @@ void yield(std::shared_ptr tt_ptr) void sample_value(const std::string &name, double value, bool threaded) { in_apex prevent_deadlocks; + // check these before checking the options, because if we have already + // cleaned up, checking the options can cause deadlock. This can + // happen if we are tracking memory. + if (_exited || _measurement_stopped) return; // protect against calls after finalization // if APEX is disabled, do nothing. if (apex_options::disable() == true) { return; } // if APEX is suspended, do nothing. if (apex_options::suspend() == true) { return; } apex* instance = apex::instance(); // get the Apex static instance - if (!instance || _exited) return; // protect against calls after finalization + if (!instance) return; // protect against calls after finalization // parse the counter name // either /threadqueue{locality#0/total}/length // or /threadqueue{locality#0/worker-thread#0}/length @@ -1411,6 +1415,7 @@ void finalize_plugins(void) { // forward declare CUPTI buffer flushing #ifdef APEX_WITH_CUDA void flushTrace(void); +void finalizeCuda(void); #endif std::string dump(bool reset) { @@ -1494,6 +1499,7 @@ void finalize() /* This could take a while */ #ifdef APEX_WITH_CUDA flushTrace(); + finalizeCuda(); #endif #ifdef APEX_HAVE_TCMALLOC //tcmalloc::destroy_hook(); @@ -1571,6 +1577,7 @@ void cleanup(void) { } */ delete(instance); + // stop tracking memory! FUNCTION_EXIT } diff --git a/src/apex/apex_kokkos_tuning.cpp b/src/apex/apex_kokkos_tuning.cpp index b66fbbf4..127bd83a 100644 --- a/src/apex/apex_kokkos_tuning.cpp +++ b/src/apex/apex_kokkos_tuning.cpp @@ -463,7 +463,9 @@ extern "C" { */ void kokkosp_declare_output_type(const char* name, const size_t id, Kokkos_Tools_VariableInfo& info) { - if (!apex::apex_options::use_kokkos_tuning()) { return; } + // don't track memory in this function. + apex::in_apex prevent_memory_tracking; + //if (!apex::apex_options::use_kokkos_tuning()) { return; } if(getSession().verbose) { std::cout << std::string(getDepth(), ' '); std::cout << __func__ << std::endl; @@ -483,7 +485,9 @@ void kokkosp_declare_output_type(const char* name, const size_t id, */ void kokkosp_declare_input_type(const char* name, const size_t id, Kokkos_Tools_VariableInfo& info) { - if (!apex::apex_options::use_kokkos_tuning()) { return; } + // don't track memory in this function. + apex::in_apex prevent_memory_tracking; + //if (!apex::apex_options::use_kokkos_tuning()) { return; } if(getSession().verbose) { std::cout << std::string(getDepth(), ' '); std::cout << __func__ << std::endl; @@ -519,7 +523,8 @@ void kokkosp_request_values( const Kokkos_Tools_VariableValue* contextVariableValues, const size_t numTuningVariables, Kokkos_Tools_VariableValue* tuningVariableValues) { - if (!apex::apex_options::use_kokkos_tuning()) { return; } + // don't track memory in this function. + apex::in_apex prevent_memory_tracking; if (getSession().verbose) { std::cout << std::string(getDepth(), ' '); std::cout << __func__ << " ctx: " << contextId; @@ -527,9 +532,11 @@ void kokkosp_request_values( } std::string name{hashContext(numContextVariables, contextVariableValues, getSession().inputs)}; - handle_start(name, numTuningVariables, tuningVariableValues); getSession().active_requests.insert( std::pair(contextId, name)); + if (apex::apex_options::use_kokkos_tuning()) { + handle_start(name, numTuningVariables, tuningVariableValues); + } if (getSession().verbose) { std::cout << std::endl << std::string(getDepth(), ' '); printTuning(numTuningVariables, tuningVariableValues); @@ -543,7 +550,9 @@ void kokkosp_request_values( * starting measurement. */ void kokkosp_begin_context(size_t contextId) { - if (!apex::apex_options::use_kokkos_tuning()) { return; } + // don't track memory in this function. + apex::in_apex prevent_memory_tracking; + //if (!apex::apex_options::use_kokkos_tuning()) { return; } if (getSession().verbose) { std::cout << std::string(getDepth()++, ' '); std::cout << __func__ << "\t" << contextId << std::endl; @@ -558,7 +567,8 @@ void kokkosp_begin_context(size_t contextId) { * values can now be associated with a result. */ void kokkosp_end_context(const size_t contextId) { - if (!apex::apex_options::use_kokkos_tuning()) { return; } + // don't track memory in this function. + apex::in_apex prevent_memory_tracking; if (getSession().verbose) { std::cout << std::string(--getDepth(), ' '); std::cout << __func__ << "\t" << contextId << std::endl; @@ -569,7 +579,9 @@ void kokkosp_end_context(const size_t contextId) { if (name != getSession().active_requests.end() && start != getSession().context_starts.end()) { apex::sample_value(name->second, (double)(end-start->second)); - handle_stop(name->second); + if (apex::apex_options::use_kokkos_tuning()) { + handle_stop(name->second); + } getSession().active_requests.erase(contextId); getSession().context_starts.erase(contextId); } diff --git a/src/apex/apex_options.cpp b/src/apex/apex_options.cpp index 18250c7d..bd7fa5e5 100644 --- a/src/apex/apex_options.cpp +++ b/src/apex/apex_options.cpp @@ -51,8 +51,8 @@ namespace apex #endif } } - conf_file.close(); } + conf_file.close(); char* option = nullptr; // getenv is not thread-safe, but the constructor for this static singleton is. diff --git a/src/apex/apex_types.h b/src/apex/apex_types.h index 99c967d3..e1bdb487 100644 --- a/src/apex/apex_types.h +++ b/src/apex/apex_types.h @@ -189,16 +189,20 @@ typedef enum _profile_type { */ typedef struct _profile { - double calls; /*!< Number of times a timer was called, or the number - of samples collected for a counter */ - double accumulated; /*!< Accumulated values for all calls/samples */ - double sum_squares; /*!< Running sum of squares calculation for all - calls/samples */ - double minimum; /*!< Minimum value seen by the timer or counter */ - double maximum; /*!< Maximum value seen by the timer or counter */ + double calls; /*!< Number of times a timer was called, or the number + of samples collected for a counter */ + double accumulated; /*!< Accumulated values for all calls/samples */ + double sum_squares; /*!< Running sum of squares calculation for all + calls/samples */ + double minimum; /*!< Minimum value seen by the timer or counter */ + double maximum; /*!< Maximum value seen by the timer or counter */ apex_profile_type type; /*!< Whether this is a timer or a counter */ - double papi_metrics[8]; /*!< Array of accumulated PAPI hardware metrics */ - int times_reset; /*!< How many times was this timer reset */ + double papi_metrics[8]; /*!< Array of accumulated PAPI hardware metrics */ + size_t allocations; /*!< total calls to [m/c/re]alloc and related */ + size_t frees; /*!< total calls to free and related (realloc) */ + size_t bytes_allocated; /*!< total bytes allocated in this task */ + size_t bytes_freed; /*!< total bytes freed in this task */ + int times_reset; /*!< How many times was this timer reset */ } apex_profile; /** Rather than use void pointers everywhere, be explicit about diff --git a/src/apex/otf2_listener.cpp b/src/apex/otf2_listener.cpp index 5f8cb787..58e7234c 100644 --- a/src/apex/otf2_listener.cpp +++ b/src/apex/otf2_listener.cpp @@ -924,6 +924,11 @@ namespace apex { static bool _finalized = false; if (_finalized) { return; } _finalized = true; + // if we are tracking memory, there are some alloc/free events + // we recorded before the end of the trace. + if (apex_options::track_memory()) { + saved_end_timestamp = get_time(); + } // get an exclusive lock, to make sure no other threads // are writing to the archive. write_lock_type lock(_archive_mutex); diff --git a/src/apex/proc_read.cpp b/src/apex/proc_read.cpp index 87ea27af..09967861 100644 --- a/src/apex/proc_read.cpp +++ b/src/apex/proc_read.cpp @@ -1101,6 +1101,8 @@ namespace apex { /* This is the main function for the reader thread. */ void* proc_data_reader::read_proc(void * _ptw) { in_apex prevent_deadlocks; + // when tracking memory allocations, ignore these + in_apex prevent_nonsense; pthread_wrapper* ptw = (pthread_wrapper*)_ptw; // make sure APEX knows this is not a worker thread thread_instance::instance(false); diff --git a/src/apex/profile.hpp b/src/apex/profile.hpp index 34248501..4dcf191e 100644 --- a/src/apex/profile.hpp +++ b/src/apex/profile.hpp @@ -42,6 +42,34 @@ class profile { _profile.minimum = initial; _profile.maximum = initial; #endif + _profile.allocations = 0; + _profile.frees = 0; + _profile.bytes_allocated = 0; + _profile.bytes_freed = 0; + }; + profile(double initial, int num_metrics, double * papi_metrics, bool + yielded, double allocations, double frees, double bytes_allocated, + double bytes_freed) { + _profile.type = APEX_TIMER; + if (!yielded) { + _profile.calls = 1.0; + } else { + _profile.calls = 0.0; + } + _profile.accumulated = initial; + for (int i = 0 ; i < num_metrics ; i++) { + _profile.papi_metrics[i] = papi_metrics[i]; + } + _profile.times_reset = 0; +#ifdef FULL_STATISTICS + _profile.sum_squares = initial*initial; + _profile.minimum = initial; + _profile.maximum = initial; +#endif + _profile.allocations = allocations; + _profile.frees = frees; + _profile.bytes_allocated = bytes_allocated; + _profile.bytes_freed = bytes_freed; }; void increment(double increase, int num_metrics, double * papi_metrics, bool yielded) { @@ -59,6 +87,15 @@ class profile { _profile.calls = _profile.calls + 1.0; } } + void increment(double increase, int num_metrics, double * papi_metrics, + double allocations, double frees, double bytes_allocated, double bytes_freed, + bool yielded) { + increment(increase, num_metrics, papi_metrics, yielded); + _profile.allocations += allocations; + _profile.frees += frees; + _profile.bytes_allocated += bytes_allocated; + _profile.bytes_freed += bytes_freed; + } void reset() { _profile.calls = 0.0; _profile.accumulated = 0.0; @@ -103,6 +140,10 @@ class profile { return _profile.sum_squares; } double get_stddev() { return sqrt(get_variance()); } + double get_allocations() { return _profile.allocations; } + double get_frees() { return _profile.frees; } + double get_bytes_allocated() { return _profile.bytes_allocated; } + double get_bytes_freed() { return _profile.bytes_freed; } apex_profile_type get_type() { return _profile.type; } apex_profile * get_profile() { return &_profile; }; diff --git a/src/apex/profiler.hpp b/src/apex/profiler.hpp index 88a5c8f7..fb04cbc2 100644 --- a/src/apex/profiler.hpp +++ b/src/apex/profiler.hpp @@ -56,6 +56,10 @@ class profiler { long long papi_start_values[8]; long long papi_stop_values[8]; #endif + double allocations; + double frees; + double bytes_allocated; + double bytes_freed; double value; double children_value; uint64_t guid; @@ -80,6 +84,7 @@ class profiler { papi_start_values{0,0,0,0,0,0,0,0}, papi_stop_values{0,0,0,0,0,0,0,0}, #endif + allocations(0), frees(0), bytes_allocated(0), bytes_freed(0), value(0.0), children_value(0.0), guid(task->guid), @@ -97,6 +102,7 @@ class profiler { papi_start_values{0,0,0,0,0,0,0,0}, papi_stop_values{0,0,0,0,0,0,0,0}, #endif + allocations(0), frees(0), bytes_allocated(0), bytes_freed(0), value(0.0), children_value(0.0), guid(0), @@ -112,6 +118,7 @@ class profiler { papi_start_values{0,0,0,0,0,0,0,0}, papi_stop_values{0,0,0,0,0,0,0,0}, #endif + allocations(0), frees(0), bytes_allocated(0), bytes_freed(0), value(value_), children_value(0.0), is_counter(true), @@ -123,6 +130,10 @@ class profiler { tt_ptr(in.tt_ptr), start_ns(in.start_ns), end_ns(in.end_ns), + allocations(in.allocations), + frees(in.frees), + bytes_allocated(in.bytes_allocated), + bytes_freed(in.bytes_freed), value(in.value), children_value(in.children_value), guid(in.guid), diff --git a/src/apex/profiler_listener.cpp b/src/apex/profiler_listener.cpp index f2652f9c..6981815c 100644 --- a/src/apex/profiler_listener.cpp +++ b/src/apex/profiler_listener.cpp @@ -287,8 +287,14 @@ std::unordered_set free_profiles; if(p->is_reset == reset_type::CURRENT) { theprofile->reset(); } else { - theprofile->increment(p->elapsed(), tmp_num_counters, - values, p->is_resume); + if (apex_options::track_memory()) { + theprofile->increment(p->elapsed(), tmp_num_counters, + values, p->allocations, p->frees, p->bytes_allocated, + p->bytes_freed, p->is_resume); + } else { + theprofile->increment(p->elapsed(), tmp_num_counters, + values, p->is_resume); + } } #if defined(APEX_THROTTLE) if (!apex_options::use_tau()) { @@ -324,11 +330,20 @@ std::unordered_set free_profiles; #endif } else { // Create a new profile for this name. - theprofile = new profile(p->is_reset == - reset_type::CURRENT ? 0.0 : p->elapsed(), - tmp_num_counters, values, p->is_resume, - p->is_counter ? APEX_COUNTER : APEX_TIMER); - task_map[*(p->get_task_id())] = theprofile; + if (apex_options::track_memory() && !p->is_counter) { + theprofile = new profile(p->is_reset == + reset_type::CURRENT ? 0.0 : p->elapsed(), + tmp_num_counters, values, p->is_resume, + p->allocations, p->frees, p->bytes_allocated, + p->bytes_freed); + task_map[*(p->get_task_id())] = theprofile; + } else { + theprofile = new profile(p->is_reset == + reset_type::CURRENT ? 0.0 : p->elapsed(), + tmp_num_counters, values, p->is_resume, + p->is_counter ? APEX_COUNTER : APEX_TIMER); + task_map[*(p->get_task_id())] = theprofile; + } task_map_lock.unlock(); #ifdef APEX_HAVE_HPX #ifdef APEX_REGISTER_HPX3_COUNTERS @@ -548,7 +563,7 @@ std::unordered_set free_profiles; double tmp = ((p->get_accumulated_seconds()) /total_main)*100.0; if (tmp > 100.0) { - screen_output << " --n/a-- " ; + screen_output << " --n/a--" ; } else { screen_output << string_format(FORMAT_PERCENT, tmp); } @@ -560,6 +575,43 @@ std::unordered_set free_profiles; csv_output << "," << std::llround(p->get_papi_metrics()[i]); } #endif + if (apex_options::track_memory()) { + if (p->get_allocations() > 999999) { + screen_output << " " << string_format(FORMAT_SCIENTIFIC, + (p->get_allocations())); + } else { + screen_output << " " << string_format(PAD_WITH_SPACES, + to_string((int)p->get_allocations()).c_str()); + } + csv_output << "," << std::llround(p->get_allocations()); + + if (p->get_bytes_allocated() > 999999) { + screen_output << " " << string_format(FORMAT_SCIENTIFIC, + (p->get_bytes_allocated())); + } else { + screen_output << " " << string_format(PAD_WITH_SPACES, + to_string((int)p->get_bytes_allocated()).c_str()); + } + csv_output << "," << std::llround(p->get_bytes_allocated()); + + if (p->get_frees() > 999999) { + screen_output << " " << string_format(FORMAT_SCIENTIFIC, + (p->get_frees())); + } else { + screen_output << " " << string_format(PAD_WITH_SPACES, + to_string((int)p->get_frees()).c_str()); + } + csv_output << "," << std::llround(p->get_frees()); + + if (p->get_bytes_freed() > 999999) { + screen_output << " " << string_format(FORMAT_SCIENTIFIC, + (p->get_bytes_freed())); + } else { + screen_output << " " << string_format(PAD_WITH_SPACES, + to_string((int)p->get_bytes_freed()).c_str()); + } + csv_output << "," << std::llround(p->get_bytes_freed()); + } screen_output << endl; csv_output << endl; } else { @@ -669,8 +721,8 @@ std::unordered_set free_profiles; } } screen_output << "------------------------------------------" - << "------------------------------------------------------" << endl - << endl;; + << "------------------------------------------------------\n"; + screen_output << "\n" << endl; } csv_output << "\n\n\"task\",\"num calls\",\"total microseconds\""; #if APEX_HAVE_PAPI @@ -678,6 +730,9 @@ std::unordered_set free_profiles; csv_output << ",\"" << metric_names[i] << "\""; } #endif + if (apex_options::track_memory()) { + csv_output << ",\"allocations\", \"bytes allocated\", \"frees\", \"bytes freed\""; + } csv_output << endl; std::string re("PAPI_"); std::string tmpstr(apex_options::papi_metrics()); @@ -693,10 +748,18 @@ std::unordered_set free_profiles; } screen_output << "Timer : " << "#calls | mean | total | % total " - << tmpstr << endl; + << tmpstr; + if (apex_options::track_memory()) { + screen_output << "| allocs | (bytes) | frees | (bytes) "; + } + screen_output << endl; screen_output << "----------------------------------------------" - << "--------------------------------------------------" << endl; - id_vector.clear(); + << "--------------------------------------------------"; + if (apex_options::track_memory()) { + screen_output << "--------------------------------------------"; + } + screen_output << endl; + id_vector.clear(); // iterate over the timers for(it2 = task_map.begin(); it2 != task_map.end(); it2++) { profile * p = it2->second; @@ -732,7 +795,11 @@ std::unordered_set free_profiles; ((idle_rate/total_main)*100)) << endl; } screen_output << "--------------------------------------------------" - << "----------------------------------------------" << endl; + << "----------------------------------------------"; + if (apex_options::track_memory()) { + screen_output << "--------------------------------------------"; + } + screen_output << endl; screen_output << string_format("%52s", "Total timers") << " : "; //if (total_hpx_threads < 999999) { //screen_output << string_format(PAD_WITH_SPACES, @@ -1748,6 +1815,18 @@ if (rc != 0) cout << "PAPI error! " << name << ": " << PAPI_strerror(rc) << endl main_timer->restart(); } + void profiler_listener::increment_main_timer_allocations(double bytes) { + APEX_ASSERT(main_timer != nullptr); + main_timer->allocations++; + main_timer->bytes_allocated += bytes; + } + + void profiler_listener::increment_main_timer_frees(double bytes) { + APEX_ASSERT(main_timer != nullptr); + main_timer->frees++; + main_timer->bytes_freed += bytes; + } + void profiler_listener::stop_main_timer(void) { static bool stopped{false}; if (!stopped) { diff --git a/src/apex/profiler_listener.hpp b/src/apex/profiler_listener.hpp index fd41cb82..c11a0d16 100644 --- a/src/apex/profiler_listener.hpp +++ b/src/apex/profiler_listener.hpp @@ -254,6 +254,8 @@ class profiler_listener : public event_listener { void stop_main_timer(void); void yield_main_timer(void); void resume_main_timer(void); + void increment_main_timer_allocations(double bytes); + void increment_main_timer_frees(double bytes); void push_profiler_public(std::shared_ptr &p); }; diff --git a/src/scripts/apex_exec b/src/scripts/apex_exec index 6e0a91e7..5b72c81c 100755 --- a/src/scripts/apex_exec +++ b/src/scripts/apex_exec @@ -123,6 +123,7 @@ while (( "$#" )); do --apex:memory) memory=yes export APEX_TRACK_MEMORY=1 + export APEX_SOURCE_LOCATION=1 shift ;; --apex:otf2) diff --git a/src/wrappers/CMakeLists.txt b/src/wrappers/CMakeLists.txt index 382f1460..7ee1fb82 100644 --- a/src/wrappers/CMakeLists.txt +++ b/src/wrappers/CMakeLists.txt @@ -1,6 +1,9 @@ # Make sure the compiler can find include files from our Apex library. include_directories (${APEX_SOURCE_DIR}/src/apex ${APEX_SOURCE_DIR}/src/wrappers) +# add the binary tree to the search path for include files +# so that we will find ApexConfig.h +include_directories("${PROJECT_BINARY_DIR}/src/apex") # Make sure the linker can find the Apex library once it is built. link_directories (${APEX_BINARY_DIR}/src/apex) diff --git a/src/wrappers/memory_wrapper.cpp b/src/wrappers/memory_wrapper.cpp index b2c52779..aac1b728 100644 --- a/src/wrappers/memory_wrapper.cpp +++ b/src/wrappers/memory_wrapper.cpp @@ -54,11 +54,16 @@ void apex_memory_initialized() { } extern "C" -void apex_memory_finalized() { +void apex_memory_lights_out() { apex_ready() = false; apex_report_leaks(); } +extern "C" +void apex_memory_finalized() { + //apex_memory_lights_out(); +} + extern "C" void apex_memory_dl_initialized() { dl_ready() = true; @@ -71,12 +76,19 @@ void apex_memory_dl_initialized() { char bootstrap_heap[BOOTSTRAP_HEAP_SIZE]; char * bootstrap_base = bootstrap_heap; +uintptr_t reportHeapLocation() { + printf("Bootstrap heap located at: %p\n", &bootstrap_heap[0]); + return (uintptr_t)&bootstrap_heap[0]; +} + static inline int is_bootstrap(void * ptr) { char const * const p = (char*)ptr; return (p < bootstrap_heap + BOOTSTRAP_HEAP_SIZE) && (bootstrap_heap < p); } static void * bootstrap_alloc(size_t align, size_t size) { + //static uintptr_t dummy = reportHeapLocation(); + //APEX_UNUSED(dummy); char * ptr; // Check alignment. Default alignment is sizeof(long) diff --git a/src/wrappers/memory_wrapper_internal.cpp b/src/wrappers/memory_wrapper_internal.cpp index 267240c6..a15008bd 100644 --- a/src/wrappers/memory_wrapper_internal.cpp +++ b/src/wrappers/memory_wrapper_internal.cpp @@ -12,35 +12,142 @@ #include #include #include +#include +#include #include "apex_api.hpp" +#include "thread_instance.hpp" +#include "address_resolution.hpp" #include "utils.hpp" //#include +//for unwinding +#include +#include +// for backtrace +#include /////////////////////////////////////////////////////////////////////////////// // Below is the malloc wrapper /////////////////////////////////////////////////////////////////////////////// -typedef struct book_s { +typedef enum allocator { + MALLOC = 0, + CALLOC, + REALLOC +} allocator_t; + +const char * allocator_strings[] = { + "malloc", "calloc", "realloc" +}; + +class record_t { +public: + size_t bytes; + apex::task_identifier * id; + size_t tid; + allocator_t alloc; + record_t() : bytes(0), id(nullptr), tid(0), alloc(MALLOC) {} + record_t(size_t b, size_t t, allocator_t a) : bytes(b), id(nullptr), tid(t), alloc(a) {} + //std::vector backtrace; + std::array backtrace; + size_t size; +}; + +void apex_report_leaks(); + +extern "C" void apex_memory_lights_out(); + +class book_t { +public: + size_t saved_node_id; std::atomic totalAllocated = 0.0; - std::unordered_map memoryMap; + std::unordered_map memoryMap; std::mutex mapMutex; -} book_t; + ~book_t() { + apex_memory_lights_out(); + } +}; book_t& getBook() { static book_t book; return book; } -void record_alloc(size_t bytes, void* ptr) { +class backtrace_record_t { +public: + size_t skip; + std::vector& _stack; + backtrace_record_t(size_t s, std::vector& _s) : skip(s), _stack(_s) {} +}; + +/* Unwind library callback routine. This is passed to + _Unwind_Backtrace. */ + +/* +static _Unwind_Reason_Code +default_unwind (struct _Unwind_Context *context, void *vdata) +{ + backtrace_record_t *data = (backtrace_record_t*) vdata; + uintptr_t pc; + +#ifdef _Unwind_GetIPInfo + int ip_before_insn = 0; + pc = _Unwind_GetIPInfo (context, &ip_before_insn); + if (!ip_before_insn) --pc; +#else + pc = _Unwind_GetIP (context); +#endif + + if (data->skip > 0) { + --data->skip; + return _URC_NO_REASON; + } + + data->_stack.push_back(pc); + + return _URC_NO_REASON; +} +*/ + +inline void printBacktrace() { + void *trace[32]; + size_t size, i; + char **strings; + size = backtrace( trace, 32 ); + strings = backtrace_symbols( trace, size ); + std::cerr << std::endl; + // skip the first frame, it is this handler + for( i = 1; i < size; i++ ){ + std::cerr << apex::demangle(strings[i]) << std::endl; + } +} + +void record_alloc(size_t bytes, void* ptr, allocator_t alloc) { static book_t& book = getBook(); double value = (double)(bytes); apex::sample_value("Memory: Bytes Allocated", value, true); + apex::profiler * p = apex::thread_instance::instance().get_current_profiler(); + record_t tmp(value, apex::thread_instance::instance().get_id(), alloc); + if (p != nullptr) { tmp.id = p->get_task_id(); } + //backtrace_record_t rec(3,tmp.backtrace); + //_Unwind_Backtrace (default_unwind, &(rec)); + tmp.size = backtrace(tmp.backtrace.data(), tmp.backtrace.size()); book.mapMutex.lock(); - book.memoryMap[ptr] = value; + //book.memoryMap[ptr] = value; + book.memoryMap.insert(std::pair(ptr, tmp)); book.mapMutex.unlock(); book.totalAllocated.fetch_add(bytes, std::memory_order_relaxed); value = (double)(book.totalAllocated); apex::sample_value("Memory: Total Bytes Occupied", value); + if (p == nullptr) { + auto i = apex::apex::instance(); + // might be after finalization, so double-check! + if (i != nullptr) { + i->the_profiler_listener->increment_main_timer_allocations(value); + } + } else { + p->allocations++; + p->bytes_allocated += value; + } } void record_free(void* ptr) { @@ -48,9 +155,12 @@ void record_free(void* ptr) { size_t bytes; book.mapMutex.lock(); if (book.memoryMap.count(ptr) > 0) { - bytes = book.memoryMap[ptr]; + record_t& tmp = book.memoryMap[ptr]; + bytes = tmp.bytes; book.memoryMap.erase(ptr); } else { + //std::cout << std::hex << ptr << std::dec << " NOT FOUND" << std::endl; + //printBacktrace(); book.mapMutex.unlock(); return; } @@ -60,6 +170,17 @@ void record_free(void* ptr) { book.totalAllocated.fetch_sub(bytes, std::memory_order_relaxed); value = (double)(book.totalAllocated); apex::sample_value("Memory: Total Bytes Occupied", value); + apex::profiler * p = apex::thread_instance::instance().get_current_profiler(); + if (p == nullptr) { + auto i = apex::apex::instance(); + // might be after finalization, so double-check! + if (i != nullptr) { + i->the_profiler_listener->increment_main_timer_frees(value); + } + } else { + p->frees++; + p->bytes_freed += value; + } } /* We need to access this global before the memory wrapper is enabled. @@ -67,6 +188,8 @@ void record_free(void* ptr) { * could end up with a deadlock. */ void apex_memory_wrapper_init() { static book_t& book = getBook(); + apex::apex_options::track_memory(true); + getBook().saved_node_id = apex::apex::instance()->get_node_id(); APEX_UNUSED(book); } @@ -84,7 +207,7 @@ void* apex_malloc_wrapper(malloc_p malloc_call, size_t size) { // do the allocation auto retval = malloc_call(size); // record the state - record_alloc(size, retval); + record_alloc(size, retval, MALLOC); inWrapper() = false; return retval; } @@ -96,7 +219,7 @@ void apex_free_wrapper(free_p free_call, void* ptr) { } inWrapper() = true; // record the state - record_free(ptr); + if (ptr != nullptr) { record_free(ptr); } // do the allocation free_call(ptr); inWrapper() = false; @@ -104,32 +227,110 @@ void apex_free_wrapper(free_p free_call, void* ptr) { } // Comparator function to sort pairs descending, according to second value -bool cmp(std::pair& a, - std::pair& b) +bool cmp(std::pair& a, + std::pair& b) +{ + return a.second.bytes > b.second.bytes; +} + +// Comparator function to sort pairs descending, according to second value +bool cmp2(std::pair& a, + std::pair& b) { return a.second > b.second; } void apex_report_leaks() { static book_t& book = getBook(); + std::stringstream ss; + ss << "memory_report." << book.saved_node_id << ".txt"; + std::string tmp{ss.str()}; + std::ofstream report (tmp); // Declare vector of pairs - std::vector > sorted; + std::vector > sorted; + + if (book.saved_node_id == 0) { + std::cout << "APEX Memory Report:" << std::endl; + std::cout << "sorting " << book.memoryMap.size() << " leaks by size..." << std::endl; + } // Copy key-value pair from Map // to vector of pairs - book.mapMutex.lock(); for (auto& it : book.memoryMap) { sorted.push_back(it); } - book.mapMutex.unlock(); // Sort using comparator function sort(sorted.begin(), sorted.end(), cmp); + //std::unordered_map locations; + + if (book.saved_node_id == 0) { + std::cout << "Aggregating leaks by task and writing report..." << std::endl; +#ifdef APEX_WITH_CUDA + std::cout << "Ignoring known leaks in CUPTI..." << std::endl; +#endif + } // Print the sorted value for (auto& it : sorted) { - std::cout << it.first << " leaked " << it.second << " bytes." << std::endl; + std::stringstream ss; + //if (it.second.bytes > 1000) { + ss << it.second.bytes << " bytes leaked at " << std::hex << it.first << std::dec << " from task "; + //} else { + //break; + //} + std::string name{"(no timer)"}; + if (it.second.id != nullptr) { + name = it.second.id->name; + // skip known CUPTI leaks. + if (name.rfind("cuda", 0) == 0) { continue; } + } + ss << name << " on tid " << it.second.tid << " with backtrace: " << std::endl; + ss << "\t" << allocator_strings[it.second.alloc] << std::endl; + char** strings = backtrace_symbols( it.second.backtrace.data(), it.second.size ); + for(size_t i = 3; i < it.second.size; i++ ){ + ss << "\t" << apex::demangle(strings[i]) << std::endl; + } + + /* + //for (auto a : it.second.backtrace) { + for (size_t a = 2 ; a < it.second.size; a++) { + //std::string * tmp = apex::lookup_address(a, true); + std::string * tmp = apex::lookup_address((uintptr_t)it.second.backtrace[a], true); + std::string demangled = apex::demangle(*tmp); + ss << "\t" << demangled << std::endl; + } + */ + ss << std::endl; + /* + if (locations.count(name) > 0) { + locations[name] += it.second.bytes; + } else { + locations[name] = it.second.bytes; + } + */ + report << ss.str(); + } + report.close(); + +/* + std::cout << "sorting task leaks by size..." << std::endl; + // Declare vector of pairs + std::vector > sorted2; + + // Copy key-value pair from Map to vector of pairs + for (auto& it : locations) { + sorted2.push_back(it); + } + + // Sort using comparator function + sort(sorted2.begin(), sorted2.end(), cmp2); + + // print the locations + for (auto& l : sorted2) { + std::cout << l.first << " leaked " << l.second << " bytes." << std::endl; } +*/ } void* apex_calloc_wrapper(calloc_p calloc_call, size_t nmemb, size_t size) { @@ -141,7 +342,7 @@ void* apex_calloc_wrapper(calloc_p calloc_call, size_t nmemb, size_t size) { // do the allocation auto retval = calloc_call(nmemb, size); // record the state - record_alloc(size, retval); + record_alloc(size, retval, CALLOC); inWrapper() = false; return retval; } @@ -153,11 +354,11 @@ void* apex_realloc_wrapper(realloc_p realloc_call, void* ptr, size_t size) { } inWrapper() = true; // record the state - record_free(ptr); + if (ptr != nullptr) { record_free(ptr); } // do the allocation auto retval = realloc_call(ptr, size); // record the state - record_alloc(size, retval); + record_alloc(size, retval, REALLOC); inWrapper() = false; return retval; } diff --git a/src/wrappers/pthread_wrapper_internal.cpp b/src/wrappers/pthread_wrapper_internal.cpp index 8525c708..db4f28d4 100644 --- a/src/wrappers/pthread_wrapper_internal.cpp +++ b/src/wrappers/pthread_wrapper_internal.cpp @@ -52,7 +52,6 @@ struct apex_system_wrapper_t } } virtual ~apex_system_wrapper_t() { - apex::apex_options::use_screen_output(true); apex::finalize(); } }; @@ -210,6 +209,9 @@ int apex_pthread_create_wrapper(pthread_create_p pthread_create_call, pthread_t * threadp, const pthread_attr_t * attr, start_routine_p start_routine, void * arg) { + // disable the memory wrapper + apex::in_apex prevent_problems; + apex::profiler * p = apex::start("pthread_create"); // JUST ONCE, create the key (void) pthread_once(&key_once, make_key); // get the thread-local variable @@ -238,6 +240,7 @@ int apex_pthread_create_wrapper(pthread_create_p pthread_create_call, } wrapper->_wrapped = false; } + apex::stop(p); return retval; } @@ -245,6 +248,8 @@ extern "C" int apex_pthread_join_wrapper(pthread_join_p pthread_join_call, pthread_t thread, void ** retval) { + // disable the memory wrapper + apex::in_apex prevent_problems; apex_wrapper * wrapper = get_tl_wrapper(); int ret;