From 2f787503632ca1e425d0a47d104e9e49ec22e4d1 Mon Sep 17 00:00:00 2001
From: Kevin Huck <khuck@cs.uoregon.edu>
Date: Sun, 9 May 2021 09:37:48 -0700
Subject: [PATCH] Writing a memory wrapper report. There's a huge amount of
 CUPTI memory leaks, and they happen when the first real call to CUDA happens.
  I can't force that call, or ignore memory during the first "real" call, yet.

---
 src/apex/activity_trace_async.cpp         |  94 ++++++++-
 src/apex/apex.cpp                         |   9 +-
 src/apex/apex_kokkos_tuning.cpp           |  26 ++-
 src/apex/apex_options.cpp                 |   2 +-
 src/apex/apex_types.h                     |  22 +-
 src/apex/otf2_listener.cpp                |   5 +
 src/apex/proc_read.cpp                    |   2 +
 src/apex/profile.hpp                      |  41 ++++
 src/apex/profiler.hpp                     |  11 +
 src/apex/profiler_listener.cpp            | 107 ++++++++--
 src/apex/profiler_listener.hpp            |   2 +
 src/scripts/apex_exec                     |   1 +
 src/wrappers/CMakeLists.txt               |   3 +
 src/wrappers/memory_wrapper.cpp           |  14 +-
 src/wrappers/memory_wrapper_internal.cpp  | 235 ++++++++++++++++++++--
 src/wrappers/pthread_wrapper_internal.cpp |   7 +-
 16 files changed, 528 insertions(+), 53 deletions(-)

diff --git a/src/apex/activity_trace_async.cpp b/src/apex/activity_trace_async.cpp
index a28277a0..089ce37b 100644
--- a/src/apex/activity_trace_async.cpp
+++ b/src/apex/activity_trace_async.cpp
@@ -754,6 +754,7 @@ static void openmpActivity(CUpti_Activity *record) {
 }
 
 static void syncActivity(CUpti_Activity *record) {
+    // when tracking memory allocations, ignore the ones in cuda device synchronize
     CUpti_ActivitySynchronization *data =
         (CUpti_ActivitySynchronization *) record;
     /* Check whether there is timing information */
@@ -865,6 +866,8 @@ static void printActivity(CUpti_Activity *record) {
 void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
     size_t *maxNumRecords)
 {
+    // when tracking memory allocations, ignore these
+    apex::in_apex prevent_nonsense;
     num_buffers++;
     uint8_t *bfr = (uint8_t *) malloc(BUF_SIZE + ALIGN_SIZE);
     if (bfr == NULL) {
@@ -883,6 +886,8 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId,
     //auto p = apex::scoped_timer("APEX: CUPTI Buffer Completed");
     //printf("%s...", __func__); fflush(stdout);
     static bool registered = register_myself(false);
+    // when tracking memory allocations, ignore these
+    apex::in_apex prevent_nonsense;
     num_buffers_processed++;
     if (flushing) { std::cout << "." << std::flush; }
     APEX_UNUSED(registered);
@@ -1380,6 +1385,8 @@ double get_nvtx_payload(const nvtxEventAttributes_t * eventAttrib) {
 
 void handle_nvtx_callback(CUpti_CallbackId id, const void *cbdata,
     std::stack<std::shared_ptr<apex::task_wrapper> >& timer_stack) {
+    // disable memory management tracking in APEX during this callback
+    apex::in_apex prevent_deadlocks;
 
     /* Unfortunately, when ranges are started/ended, they can overlap.
      * Unlike push/pop, which are a true stack.  Even worse, CUDA/CUPTI
@@ -1541,8 +1548,22 @@ void handle_nvtx_callback(CUpti_CallbackId id, const void *cbdata,
     }
 }
 
+inline bool ignoreMalloc(CUpti_CallbackId id) {
+    if (id == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020 ||
+        id == CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceCount_v3020 ||
+        id == CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 ||
+        id == CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 ||
+        id == CUPTI_RUNTIME_TRACE_CBID_cudaChooseDevice_v3020 ||
+        id == CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v3020) {
+        return true;
+    }
+    return false;
+}
+
 void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
         CUpti_CallbackId id, const void *params) {
+    // disable memory management tracking in APEX during this callback
+    apex::in_apex prevent_deadlocks;
     static bool initialized = initialize_first_time();
     APEX_UNUSED(initialized);
     static APEX_NATIVE_TLS bool registered = register_myself(true);
@@ -1563,7 +1584,7 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
         if (id == CUPTI_CBID_RESOURCE_CONTEXT_CREATED) {
             register_new_context(params);
         }
-        return;
+        //return;
     }
 
     /* Check for user-level instrumentation */
@@ -1575,6 +1596,8 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
     CUpti_CallbackData * cbdata = (CUpti_CallbackData*)(params);
 
     if (cbdata->callbackSite == CUPTI_API_ENTER) {
+        // sadly, CUPTI leaks a lot of memory.  Don't track memory in CUPTI or CUDA.
+        // apex::in_apex::get()++;
         std::stringstream ss;
         ss << cbdata->functionName;
         if (apex::apex_options::use_cuda_kernel_details()) {
@@ -1594,6 +1617,12 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
         map_mutex.unlock();
         getBytesIfMalloc(id, cbdata->functionParams, tmp, true);
     } else {
+        // sadly, CUPTI leaks a lot of memory.  Don't track memory in CUPTI or CUDA.
+        /*
+        if (cbdata->callbackSite == CUPTI_API_EXIT) {
+            apex::in_apex::get()--;
+        }
+        */
         /* Not sure how to use this yet... if this is a kernel launch, we can
          * run a function on the host, launched from the stream.  That gives us
          * a synchronous callback to tell us an event when the kernel finished.
@@ -1640,6 +1669,9 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
 }
 
 void initTrace() {
+    // disable memory management tracking in APEX during this initialization
+    apex::in_apex prevent_deadlocks;
+    // make a first call into CUDA
     bool& registered = get_registered();
     registered = true;
 
@@ -1681,7 +1713,6 @@ void initTrace() {
         CUPTI_CALL(cuptiActivityEnableLatencyTimestamps(enable));
     }
 
-
     // get user-added instrumentation
     CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_NVTX));
     // Make sure we see CUPTI_CBID_RESOURCE_CONTEXT_CREATED events!
@@ -1724,4 +1755,63 @@ namespace apex {
             std::cout << std::endl;
         }
     }
+
+    void finalizeCuda(void) {
+        std::cout << "Finalizing CUPTI " << std::endl;
+        flushTrace();
+        CUPTI_CALL(cuptiUnsubscribe(subscriber));
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER)); // 25
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); // 10
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); // 1
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY2)); // 22
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); // 2
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); // 38
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENACC_DATA)); // 33
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH)); // 34
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENACC_OTHER)); // 35
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENMP)); // 47
+        #if 0
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL)); // 3   <- disables concurrency
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER)); // 4
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); // 5
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EVENT)); // 6
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_METRIC)); // 7
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); // 8
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); // 9
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); // 11
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); // 12
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER_DATA)); // 13
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR)); // 14
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS)); // 15
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_BRANCH)); // 16
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); // 17
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CDP_KERNEL)); // 18
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PREEMPTION)); // 19
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_ENVIRONMENT)); // 20
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EVENT_INSTANCE)); // 21
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_METRIC_INSTANCE)); // 23
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION)); // 24
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER)); // 25
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_FUNCTION)); // 26
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MODULE)); // 27
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE)); // 28
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SHARED_ACCESS)); // 29
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PC_SAMPLING)); // 30
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO)); // 31
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION)); // 32
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CUDA_EVENT)); // 36
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_STREAM)); // 37
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); // 38
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); // 39
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NVLINK)); // 40
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT)); // 41
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE)); // 42
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC)); // 43
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE)); // 44
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMORY)); // 45
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PCIE)); // 46
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_COUNT)); // 49
+        #endif
+        cuptiFinalize();
+    }
 }
diff --git a/src/apex/apex.cpp b/src/apex/apex.cpp
index de50d182..8ac8c5aa 100644
--- a/src/apex/apex.cpp
+++ b/src/apex/apex.cpp
@@ -1110,12 +1110,16 @@ void yield(std::shared_ptr<task_wrapper> tt_ptr)
 void sample_value(const std::string &name, double value, bool threaded)
 {
     in_apex prevent_deadlocks;
+    // check these before checking the options, because if we have already
+    // cleaned up, checking the options can cause deadlock. This can
+    // happen if we are tracking memory.
+    if (_exited || _measurement_stopped) return; // protect against calls after finalization
     // if APEX is disabled, do nothing.
     if (apex_options::disable() == true) { return; }
     // if APEX is suspended, do nothing.
     if (apex_options::suspend() == true) { return; }
     apex* instance = apex::instance(); // get the Apex static instance
-    if (!instance || _exited) return; // protect against calls after finalization
+    if (!instance) return; // protect against calls after finalization
     // parse the counter name
     // either /threadqueue{locality#0/total}/length
     // or     /threadqueue{locality#0/worker-thread#0}/length
@@ -1411,6 +1415,7 @@ void finalize_plugins(void) {
 // forward declare CUPTI buffer flushing
 #ifdef APEX_WITH_CUDA
 void flushTrace(void);
+void finalizeCuda(void);
 #endif
 
 std::string dump(bool reset) {
@@ -1494,6 +1499,7 @@ void finalize()
     /* This could take a while */
 #ifdef APEX_WITH_CUDA
     flushTrace();
+    finalizeCuda();
 #endif
 #ifdef APEX_HAVE_TCMALLOC
     //tcmalloc::destroy_hook();
@@ -1571,6 +1577,7 @@ void cleanup(void) {
     }
     */
     delete(instance);
+    // stop tracking memory!
     FUNCTION_EXIT
 }
 
diff --git a/src/apex/apex_kokkos_tuning.cpp b/src/apex/apex_kokkos_tuning.cpp
index b66fbbf4..127bd83a 100644
--- a/src/apex/apex_kokkos_tuning.cpp
+++ b/src/apex/apex_kokkos_tuning.cpp
@@ -463,7 +463,9 @@ extern "C" {
  */
 void kokkosp_declare_output_type(const char* name, const size_t id,
     Kokkos_Tools_VariableInfo& info) {
-    if (!apex::apex_options::use_kokkos_tuning()) { return; }
+    // don't track memory in this function.
+    apex::in_apex prevent_memory_tracking;
+    //if (!apex::apex_options::use_kokkos_tuning()) { return; }
     if(getSession().verbose) {
         std::cout << std::string(getDepth(), ' ');
         std::cout << __func__ << std::endl;
@@ -483,7 +485,9 @@ void kokkosp_declare_output_type(const char* name, const size_t id,
  */
 void kokkosp_declare_input_type(const char* name, const size_t id,
     Kokkos_Tools_VariableInfo& info) {
-    if (!apex::apex_options::use_kokkos_tuning()) { return; }
+    // don't track memory in this function.
+    apex::in_apex prevent_memory_tracking;
+    //if (!apex::apex_options::use_kokkos_tuning()) { return; }
     if(getSession().verbose) {
         std::cout << std::string(getDepth(), ' ');
         std::cout << __func__ << std::endl;
@@ -519,7 +523,8 @@ void kokkosp_request_values(
     const Kokkos_Tools_VariableValue* contextVariableValues,
     const size_t numTuningVariables,
     Kokkos_Tools_VariableValue* tuningVariableValues) {
-    if (!apex::apex_options::use_kokkos_tuning()) { return; }
+    // don't track memory in this function.
+    apex::in_apex prevent_memory_tracking;
     if (getSession().verbose) {
         std::cout << std::string(getDepth(), ' ');
         std::cout << __func__ << " ctx: " << contextId;
@@ -527,9 +532,11 @@ void kokkosp_request_values(
     }
     std::string name{hashContext(numContextVariables, contextVariableValues,
         getSession().inputs)};
-    handle_start(name, numTuningVariables, tuningVariableValues);
     getSession().active_requests.insert(
         std::pair<uint32_t, std::string>(contextId, name));
+    if (apex::apex_options::use_kokkos_tuning()) {
+        handle_start(name, numTuningVariables, tuningVariableValues);
+    }
     if (getSession().verbose) {
         std::cout << std::endl << std::string(getDepth(), ' ');
         printTuning(numTuningVariables, tuningVariableValues);
@@ -543,7 +550,9 @@ void kokkosp_request_values(
  * starting measurement.
  */
 void kokkosp_begin_context(size_t contextId) {
-    if (!apex::apex_options::use_kokkos_tuning()) { return; }
+    // don't track memory in this function.
+    apex::in_apex prevent_memory_tracking;
+    //if (!apex::apex_options::use_kokkos_tuning()) { return; }
     if (getSession().verbose) {
         std::cout << std::string(getDepth()++, ' ');
         std::cout << __func__ << "\t" << contextId << std::endl;
@@ -558,7 +567,8 @@ void kokkosp_begin_context(size_t contextId) {
  * values can now be associated with a result.
  */
 void kokkosp_end_context(const size_t contextId) {
-    if (!apex::apex_options::use_kokkos_tuning()) { return; }
+    // don't track memory in this function.
+    apex::in_apex prevent_memory_tracking;
     if (getSession().verbose) {
         std::cout << std::string(--getDepth(), ' ');
         std::cout << __func__ << "\t" << contextId << std::endl;
@@ -569,7 +579,9 @@ void kokkosp_end_context(const size_t contextId) {
     if (name != getSession().active_requests.end() &&
         start != getSession().context_starts.end()) {
         apex::sample_value(name->second, (double)(end-start->second));
-        handle_stop(name->second);
+        if (apex::apex_options::use_kokkos_tuning()) {
+            handle_stop(name->second);
+        }
         getSession().active_requests.erase(contextId);
         getSession().context_starts.erase(contextId);
     }
diff --git a/src/apex/apex_options.cpp b/src/apex/apex_options.cpp
index 18250c7d..bd7fa5e5 100644
--- a/src/apex/apex_options.cpp
+++ b/src/apex/apex_options.cpp
@@ -51,8 +51,8 @@ namespace apex
 #endif
                 }
             }
-            conf_file.close();
         }
+        conf_file.close();
 
         char* option = nullptr;
 // getenv is not thread-safe, but the constructor for this static singleton is.
diff --git a/src/apex/apex_types.h b/src/apex/apex_types.h
index 99c967d3..e1bdb487 100644
--- a/src/apex/apex_types.h
+++ b/src/apex/apex_types.h
@@ -189,16 +189,20 @@ typedef enum _profile_type {
  */
 typedef struct _profile
 {
-    double calls;         /*!< Number of times a timer was called, or the number
-                              of samples collected for a counter */
-    double accumulated;   /*!< Accumulated values for all calls/samples */
-    double sum_squares;   /*!< Running sum of squares calculation for all
-                              calls/samples */
-    double minimum;       /*!< Minimum value seen by the timer or counter */
-    double maximum;       /*!< Maximum value seen by the timer or counter */
+    double calls;           /*!< Number of times a timer was called, or the number
+                                 of samples collected for a counter */
+    double accumulated;     /*!< Accumulated values for all calls/samples */
+    double sum_squares;     /*!< Running sum of squares calculation for all
+                                 calls/samples */
+    double minimum;         /*!< Minimum value seen by the timer or counter */
+    double maximum;         /*!< Maximum value seen by the timer or counter */
     apex_profile_type type; /*!< Whether this is a timer or a counter */
-    double papi_metrics[8];  /*!< Array of accumulated PAPI hardware metrics */
-    int times_reset;      /*!< How many times was this timer reset */
+    double papi_metrics[8]; /*!< Array of accumulated PAPI hardware metrics */
+    size_t allocations;     /*!< total calls to [m/c/re]alloc and related */
+    size_t frees;           /*!< total calls to free and related (realloc) */
+    size_t bytes_allocated; /*!< total bytes allocated in this task */
+    size_t bytes_freed;     /*!< total bytes freed in this task */
+    int times_reset;        /*!< How many times was this timer reset */
 } apex_profile;
 
 /** Rather than use void pointers everywhere, be explicit about
diff --git a/src/apex/otf2_listener.cpp b/src/apex/otf2_listener.cpp
index 5f8cb787..58e7234c 100644
--- a/src/apex/otf2_listener.cpp
+++ b/src/apex/otf2_listener.cpp
@@ -924,6 +924,11 @@ namespace apex {
         static bool _finalized = false;
         if (_finalized) { return; }
         _finalized = true;
+        // if we are tracking memory, there are some alloc/free events
+        // we recorded before the end of the trace.
+        if (apex_options::track_memory()) {
+            saved_end_timestamp = get_time();
+        }
          // get an exclusive lock, to make sure no other threads
         // are writing to the archive.
         write_lock_type lock(_archive_mutex);
diff --git a/src/apex/proc_read.cpp b/src/apex/proc_read.cpp
index 87ea27af..09967861 100644
--- a/src/apex/proc_read.cpp
+++ b/src/apex/proc_read.cpp
@@ -1101,6 +1101,8 @@ namespace apex {
     /* This is the main function for the reader thread. */
     void* proc_data_reader::read_proc(void * _ptw) {
         in_apex prevent_deadlocks;
+        // when tracking memory allocations, ignore these
+        in_apex prevent_nonsense;
         pthread_wrapper* ptw = (pthread_wrapper*)_ptw;
         // make sure APEX knows this is not a worker thread
         thread_instance::instance(false);
diff --git a/src/apex/profile.hpp b/src/apex/profile.hpp
index 34248501..4dcf191e 100644
--- a/src/apex/profile.hpp
+++ b/src/apex/profile.hpp
@@ -42,6 +42,34 @@ class profile {
         _profile.minimum = initial;
         _profile.maximum = initial;
 #endif
+        _profile.allocations = 0;
+        _profile.frees = 0;
+        _profile.bytes_allocated = 0;
+        _profile.bytes_freed = 0;
+    };
+    profile(double initial, int num_metrics, double * papi_metrics, bool
+        yielded, double allocations, double frees, double bytes_allocated,
+        double bytes_freed) {
+        _profile.type = APEX_TIMER;
+        if (!yielded) {
+            _profile.calls = 1.0;
+        } else {
+            _profile.calls = 0.0;
+        }
+        _profile.accumulated = initial;
+        for (int i = 0 ; i < num_metrics ; i++) {
+            _profile.papi_metrics[i] = papi_metrics[i];
+        }
+        _profile.times_reset = 0;
+#ifdef FULL_STATISTICS
+        _profile.sum_squares = initial*initial;
+        _profile.minimum = initial;
+        _profile.maximum = initial;
+#endif
+        _profile.allocations = allocations;
+        _profile.frees = frees;
+        _profile.bytes_allocated = bytes_allocated;
+        _profile.bytes_freed = bytes_freed;
     };
     void increment(double increase, int num_metrics, double * papi_metrics,
         bool yielded) {
@@ -59,6 +87,15 @@ class profile {
           _profile.calls = _profile.calls + 1.0;
         }
     }
+    void increment(double increase, int num_metrics, double * papi_metrics,
+        double allocations, double frees, double bytes_allocated, double bytes_freed,
+        bool yielded) {
+        increment(increase, num_metrics, papi_metrics, yielded);
+        _profile.allocations += allocations;
+        _profile.frees += frees;
+        _profile.bytes_allocated += bytes_allocated;
+        _profile.bytes_freed += bytes_freed;
+    }
     void reset() {
         _profile.calls = 0.0;
         _profile.accumulated = 0.0;
@@ -103,6 +140,10 @@ class profile {
         return _profile.sum_squares;
     }
     double get_stddev() { return sqrt(get_variance()); }
+    double get_allocations() { return _profile.allocations; }
+    double get_frees() { return _profile.frees; }
+    double get_bytes_allocated() { return _profile.bytes_allocated; }
+    double get_bytes_freed() { return _profile.bytes_freed; }
     apex_profile_type get_type() { return _profile.type; }
     apex_profile * get_profile() { return &_profile; };
 
diff --git a/src/apex/profiler.hpp b/src/apex/profiler.hpp
index 88a5c8f7..fb04cbc2 100644
--- a/src/apex/profiler.hpp
+++ b/src/apex/profiler.hpp
@@ -56,6 +56,10 @@ class profiler {
     long long papi_start_values[8];
     long long papi_stop_values[8];
 #endif
+    double allocations;
+    double frees;
+    double bytes_allocated;
+    double bytes_freed;
     double value;
     double children_value;
     uint64_t guid;
@@ -80,6 +84,7 @@ class profiler {
         papi_start_values{0,0,0,0,0,0,0,0},
         papi_stop_values{0,0,0,0,0,0,0,0},
 #endif
+        allocations(0), frees(0), bytes_allocated(0), bytes_freed(0),
         value(0.0),
         children_value(0.0),
         guid(task->guid),
@@ -97,6 +102,7 @@ class profiler {
         papi_start_values{0,0,0,0,0,0,0,0},
         papi_stop_values{0,0,0,0,0,0,0,0},
 #endif
+        allocations(0), frees(0), bytes_allocated(0), bytes_freed(0),
         value(0.0),
         children_value(0.0),
         guid(0),
@@ -112,6 +118,7 @@ class profiler {
         papi_start_values{0,0,0,0,0,0,0,0},
         papi_stop_values{0,0,0,0,0,0,0,0},
 #endif
+        allocations(0), frees(0), bytes_allocated(0), bytes_freed(0),
         value(value_),
         children_value(0.0),
         is_counter(true),
@@ -123,6 +130,10 @@ class profiler {
         tt_ptr(in.tt_ptr),
         start_ns(in.start_ns),
         end_ns(in.end_ns),
+        allocations(in.allocations),
+        frees(in.frees),
+        bytes_allocated(in.bytes_allocated),
+        bytes_freed(in.bytes_freed),
         value(in.value),
         children_value(in.children_value),
         guid(in.guid),
diff --git a/src/apex/profiler_listener.cpp b/src/apex/profiler_listener.cpp
index f2652f9c..6981815c 100644
--- a/src/apex/profiler_listener.cpp
+++ b/src/apex/profiler_listener.cpp
@@ -287,8 +287,14 @@ std::unordered_set<profile*> free_profiles;
         if(p->is_reset == reset_type::CURRENT) {
             theprofile->reset();
         } else {
-            theprofile->increment(p->elapsed(), tmp_num_counters,
-                values, p->is_resume);
+            if (apex_options::track_memory()) {
+                theprofile->increment(p->elapsed(), tmp_num_counters,
+                    values, p->allocations, p->frees, p->bytes_allocated,
+                    p->bytes_freed, p->is_resume);
+            } else {
+                theprofile->increment(p->elapsed(), tmp_num_counters,
+                    values, p->is_resume);
+            }
         }
 #if defined(APEX_THROTTLE)
         if (!apex_options::use_tau()) {
@@ -324,11 +330,20 @@ std::unordered_set<profile*> free_profiles;
 #endif
       } else {
         // Create a new profile for this name.
-        theprofile = new profile(p->is_reset ==
-            reset_type::CURRENT ? 0.0 : p->elapsed(),
-            tmp_num_counters, values, p->is_resume,
-            p->is_counter ? APEX_COUNTER : APEX_TIMER);
-        task_map[*(p->get_task_id())] = theprofile;
+        if (apex_options::track_memory() && !p->is_counter) {
+            theprofile = new profile(p->is_reset ==
+                reset_type::CURRENT ? 0.0 : p->elapsed(),
+                tmp_num_counters, values, p->is_resume,
+                p->allocations, p->frees, p->bytes_allocated,
+                p->bytes_freed);
+            task_map[*(p->get_task_id())] = theprofile;
+        } else {
+            theprofile = new profile(p->is_reset ==
+                reset_type::CURRENT ? 0.0 : p->elapsed(),
+                tmp_num_counters, values, p->is_resume,
+                p->is_counter ? APEX_COUNTER : APEX_TIMER);
+            task_map[*(p->get_task_id())] = theprofile;
+        }
         task_map_lock.unlock();
 #ifdef APEX_HAVE_HPX
 #ifdef APEX_REGISTER_HPX3_COUNTERS
@@ -548,7 +563,7 @@ std::unordered_set<profile*> free_profiles;
             double tmp = ((p->get_accumulated_seconds())
                 /total_main)*100.0;
             if (tmp > 100.0) {
-                screen_output << " --n/a--   " ;
+                screen_output << " --n/a--" ;
             } else {
                 screen_output << string_format(FORMAT_PERCENT, tmp);
             }
@@ -560,6 +575,43 @@ std::unordered_set<profile*> free_profiles;
             csv_output << "," << std::llround(p->get_papi_metrics()[i]);
         }
 #endif
+        if (apex_options::track_memory()) {
+            if (p->get_allocations() > 999999) {
+                screen_output  << "   " << string_format(FORMAT_SCIENTIFIC,
+                    (p->get_allocations()));
+            } else {
+                screen_output  << "   " << string_format(PAD_WITH_SPACES,
+                    to_string((int)p->get_allocations()).c_str());
+            }
+            csv_output << "," << std::llround(p->get_allocations());
+
+            if (p->get_bytes_allocated() > 999999) {
+                screen_output  << "   " << string_format(FORMAT_SCIENTIFIC,
+                    (p->get_bytes_allocated()));
+            } else {
+                screen_output  << "   " << string_format(PAD_WITH_SPACES,
+                    to_string((int)p->get_bytes_allocated()).c_str());
+            }
+            csv_output << "," << std::llround(p->get_bytes_allocated());
+
+            if (p->get_frees() > 999999) {
+                screen_output  << "   " << string_format(FORMAT_SCIENTIFIC,
+                    (p->get_frees()));
+            } else {
+                screen_output  << "   " << string_format(PAD_WITH_SPACES,
+                    to_string((int)p->get_frees()).c_str());
+            }
+            csv_output << "," << std::llround(p->get_frees());
+
+            if (p->get_bytes_freed() > 999999) {
+                screen_output  << "   " << string_format(FORMAT_SCIENTIFIC,
+                    (p->get_bytes_freed()));
+            } else {
+                screen_output  << "   " << string_format(PAD_WITH_SPACES,
+                    to_string((int)p->get_bytes_freed()).c_str());
+            }
+            csv_output << "," << std::llround(p->get_bytes_freed());
+        }
         screen_output << endl;
         csv_output << endl;
       } else {
@@ -669,8 +721,8 @@ std::unordered_set<profile*> free_profiles;
             }
         }
         screen_output << "------------------------------------------"
-            << "------------------------------------------------------" << endl
-            << endl;;
+            << "------------------------------------------------------\n";
+        screen_output << "\n" << endl;
     }
     csv_output << "\n\n\"task\",\"num calls\",\"total microseconds\"";
 #if APEX_HAVE_PAPI
@@ -678,6 +730,9 @@ std::unordered_set<profile*> free_profiles;
        csv_output << ",\"" << metric_names[i] << "\"";
     }
 #endif
+    if (apex_options::track_memory()) {
+       csv_output << ",\"allocations\", \"bytes allocated\", \"frees\", \"bytes freed\"";
+    }
     csv_output << endl;
     std::string re("PAPI_");
     std::string tmpstr(apex_options::papi_metrics());
@@ -693,10 +748,18 @@ std::unordered_set<profile*> free_profiles;
     }
     screen_output << "Timer                                                : "
         << "#calls  |    mean  |   total  |  % total  "
-        << tmpstr << endl;
+        << tmpstr;
+    if (apex_options::track_memory()) {
+       screen_output << "|  allocs |  (bytes) |    frees |   (bytes) ";
+    }
+    screen_output << endl;
     screen_output << "----------------------------------------------"
-        << "--------------------------------------------------" << endl;
-     id_vector.clear();
+        << "--------------------------------------------------";
+    if (apex_options::track_memory()) {
+        screen_output << "--------------------------------------------";
+    }
+    screen_output << endl;
+    id_vector.clear();
     // iterate over the timers
     for(it2 = task_map.begin(); it2 != task_map.end(); it2++) {
         profile * p = it2->second;
@@ -732,7 +795,11 @@ std::unordered_set<profile*> free_profiles;
         ((idle_rate/total_main)*100)) << endl;
     }
     screen_output << "--------------------------------------------------"
-        << "----------------------------------------------" << endl;
+        << "----------------------------------------------";
+    if (apex_options::track_memory()) {
+        screen_output << "--------------------------------------------";
+    }
+    screen_output << endl;
     screen_output << string_format("%52s", "Total timers") << " : ";
     //if (total_hpx_threads < 999999) {
         //screen_output << string_format(PAD_WITH_SPACES,
@@ -1748,6 +1815,18 @@ if (rc != 0) cout << "PAPI error! " << name << ": " << PAPI_strerror(rc) << endl
       main_timer->restart();
   }
 
+  void profiler_listener::increment_main_timer_allocations(double bytes) {
+      APEX_ASSERT(main_timer != nullptr);
+      main_timer->allocations++;
+      main_timer->bytes_allocated += bytes;
+  }
+
+  void profiler_listener::increment_main_timer_frees(double bytes) {
+      APEX_ASSERT(main_timer != nullptr);
+      main_timer->frees++;
+      main_timer->bytes_freed += bytes;
+  }
+
   void profiler_listener::stop_main_timer(void) {
     static bool stopped{false};
     if (!stopped) {
diff --git a/src/apex/profiler_listener.hpp b/src/apex/profiler_listener.hpp
index fd41cb82..c11a0d16 100644
--- a/src/apex/profiler_listener.hpp
+++ b/src/apex/profiler_listener.hpp
@@ -254,6 +254,8 @@ class profiler_listener : public event_listener {
   void stop_main_timer(void);
   void yield_main_timer(void);
   void resume_main_timer(void);
+  void increment_main_timer_allocations(double bytes);
+  void increment_main_timer_frees(double bytes);
   void push_profiler_public(std::shared_ptr<profiler> &p);
 };
 
diff --git a/src/scripts/apex_exec b/src/scripts/apex_exec
index 6e0a91e7..5b72c81c 100755
--- a/src/scripts/apex_exec
+++ b/src/scripts/apex_exec
@@ -123,6 +123,7 @@ while (( "$#" )); do
     --apex:memory)
       memory=yes
       export APEX_TRACK_MEMORY=1
+      export APEX_SOURCE_LOCATION=1
       shift
       ;;
     --apex:otf2)
diff --git a/src/wrappers/CMakeLists.txt b/src/wrappers/CMakeLists.txt
index 382f1460..7ee1fb82 100644
--- a/src/wrappers/CMakeLists.txt
+++ b/src/wrappers/CMakeLists.txt
@@ -1,6 +1,9 @@
 # Make sure the compiler can find include files from our Apex library.
 include_directories (${APEX_SOURCE_DIR}/src/apex
 ${APEX_SOURCE_DIR}/src/wrappers)
+# add the binary tree to the search path for include files
+# so that we will find ApexConfig.h
+include_directories("${PROJECT_BINARY_DIR}/src/apex")
 
 # Make sure the linker can find the Apex library once it is built.
 link_directories (${APEX_BINARY_DIR}/src/apex)
diff --git a/src/wrappers/memory_wrapper.cpp b/src/wrappers/memory_wrapper.cpp
index b2c52779..aac1b728 100644
--- a/src/wrappers/memory_wrapper.cpp
+++ b/src/wrappers/memory_wrapper.cpp
@@ -54,11 +54,16 @@ void apex_memory_initialized() {
 }
 
 extern "C"
-void apex_memory_finalized() {
+void apex_memory_lights_out() {
     apex_ready() = false;
     apex_report_leaks();
 }
 
+extern "C"
+void apex_memory_finalized() {
+    //apex_memory_lights_out();
+}
+
 extern "C"
 void apex_memory_dl_initialized() {
     dl_ready() = true;
@@ -71,12 +76,19 @@ void apex_memory_dl_initialized() {
 char bootstrap_heap[BOOTSTRAP_HEAP_SIZE];
 char * bootstrap_base = bootstrap_heap;
 
+uintptr_t reportHeapLocation() {
+    printf("Bootstrap heap located at: %p\n", &bootstrap_heap[0]);
+    return (uintptr_t)&bootstrap_heap[0];
+}
+
 static inline int is_bootstrap(void * ptr) {
     char const * const p = (char*)ptr;
     return (p < bootstrap_heap + BOOTSTRAP_HEAP_SIZE) && (bootstrap_heap < p);
 }
 
 static void * bootstrap_alloc(size_t align, size_t size) {
+    //static uintptr_t dummy = reportHeapLocation();
+    //APEX_UNUSED(dummy);
     char * ptr;
 
     // Check alignment.  Default alignment is sizeof(long)
diff --git a/src/wrappers/memory_wrapper_internal.cpp b/src/wrappers/memory_wrapper_internal.cpp
index 267240c6..a15008bd 100644
--- a/src/wrappers/memory_wrapper_internal.cpp
+++ b/src/wrappers/memory_wrapper_internal.cpp
@@ -12,35 +12,142 @@
 #include <unordered_map>
 #include <mutex>
 #include <vector>
+#include <iostream>
+#include <fstream>
 #include "apex_api.hpp"
+#include "thread_instance.hpp"
+#include "address_resolution.hpp"
 #include "utils.hpp"
 //#include <bits/stdc++.h>
+//for unwinding
+#include <unwind.h>
+#include <stdint.h>
+// for backtrace
+#include <execinfo.h>
 
 ///////////////////////////////////////////////////////////////////////////////
 // Below is the malloc wrapper
 ///////////////////////////////////////////////////////////////////////////////
 
-typedef struct book_s {
+typedef enum allocator {
+    MALLOC = 0,
+    CALLOC,
+    REALLOC
+} allocator_t;
+
+const char * allocator_strings[] = {
+    "malloc", "calloc", "realloc"
+};
+
+class record_t {
+public:
+    size_t bytes;
+    apex::task_identifier * id;
+    size_t tid;
+    allocator_t alloc;
+    record_t() : bytes(0), id(nullptr), tid(0), alloc(MALLOC) {}
+    record_t(size_t b, size_t t, allocator_t a) : bytes(b), id(nullptr), tid(t), alloc(a) {}
+    //std::vector<uintptr_t> backtrace;
+    std::array<void*,32> backtrace;
+    size_t size;
+};
+
+void apex_report_leaks();
+
+extern "C" void apex_memory_lights_out();
+
+class book_t {
+public:
+    size_t saved_node_id;
     std::atomic<size_t> totalAllocated = 0.0;
-    std::unordered_map<void*,size_t> memoryMap;
+    std::unordered_map<void*,record_t> memoryMap;
     std::mutex mapMutex;
-} book_t;
+    ~book_t() {
+        apex_memory_lights_out();
+    }
+};
 
 book_t& getBook() {
     static book_t book;
     return book;
 }
 
-void record_alloc(size_t bytes, void* ptr) {
+class backtrace_record_t {
+public:
+    size_t skip;
+    std::vector<uintptr_t>& _stack;
+    backtrace_record_t(size_t s, std::vector<uintptr_t>& _s) : skip(s), _stack(_s) {}
+};
+
+/* Unwind library callback routine.  This is passed to
+   _Unwind_Backtrace.  */
+
+/*
+static _Unwind_Reason_Code
+default_unwind (struct _Unwind_Context *context, void *vdata)
+{
+    backtrace_record_t *data = (backtrace_record_t*) vdata;
+    uintptr_t pc;
+
+#ifdef _Unwind_GetIPInfo
+    int ip_before_insn = 0;
+    pc = _Unwind_GetIPInfo (context, &ip_before_insn);
+    if (!ip_before_insn) --pc;
+#else
+    pc = _Unwind_GetIP (context);
+#endif
+
+    if (data->skip > 0) {
+        --data->skip;
+        return _URC_NO_REASON;
+    }
+
+    data->_stack.push_back(pc);
+
+    return _URC_NO_REASON;
+}
+*/
+
+inline void printBacktrace() {
+  void *trace[32];
+  size_t size, i;
+  char **strings;
+  size    = backtrace( trace, 32 );
+  strings = backtrace_symbols( trace, size );
+  std::cerr << std::endl;
+  // skip the first frame, it is this handler
+  for( i = 1; i < size; i++ ){
+   std::cerr << apex::demangle(strings[i]) << std::endl;
+  }
+}
+
+void record_alloc(size_t bytes, void* ptr, allocator_t alloc) {
     static book_t& book = getBook();
     double value = (double)(bytes);
     apex::sample_value("Memory: Bytes Allocated", value, true);
+    apex::profiler * p = apex::thread_instance::instance().get_current_profiler();
+    record_t tmp(value, apex::thread_instance::instance().get_id(), alloc);
+    if (p != nullptr) { tmp.id = p->get_task_id(); }
+    //backtrace_record_t rec(3,tmp.backtrace);
+    //_Unwind_Backtrace (default_unwind, &(rec));
+    tmp.size = backtrace(tmp.backtrace.data(), tmp.backtrace.size());
     book.mapMutex.lock();
-    book.memoryMap[ptr] = value;
+    //book.memoryMap[ptr] = value;
+    book.memoryMap.insert(std::pair<void*,record_t>(ptr, tmp));
     book.mapMutex.unlock();
     book.totalAllocated.fetch_add(bytes, std::memory_order_relaxed);
     value = (double)(book.totalAllocated);
     apex::sample_value("Memory: Total Bytes Occupied", value);
+    if (p == nullptr) {
+        auto i = apex::apex::instance();
+        // might be after finalization, so double-check!
+        if (i != nullptr) {
+            i->the_profiler_listener->increment_main_timer_allocations(value);
+        }
+    } else {
+        p->allocations++;
+        p->bytes_allocated += value;
+    }
 }
 
 void record_free(void* ptr) {
@@ -48,9 +155,12 @@ void record_free(void* ptr) {
     size_t bytes;
     book.mapMutex.lock();
     if (book.memoryMap.count(ptr) > 0) {
-        bytes = book.memoryMap[ptr];
+        record_t& tmp = book.memoryMap[ptr];
+        bytes = tmp.bytes;
         book.memoryMap.erase(ptr);
     } else {
+        //std::cout << std::hex << ptr << std::dec << " NOT FOUND" << std::endl;
+        //printBacktrace();
         book.mapMutex.unlock();
         return;
     }
@@ -60,6 +170,17 @@ void record_free(void* ptr) {
     book.totalAllocated.fetch_sub(bytes, std::memory_order_relaxed);
     value = (double)(book.totalAllocated);
     apex::sample_value("Memory: Total Bytes Occupied", value);
+    apex::profiler * p = apex::thread_instance::instance().get_current_profiler();
+    if (p == nullptr) {
+        auto i = apex::apex::instance();
+        // might be after finalization, so double-check!
+        if (i != nullptr) {
+            i->the_profiler_listener->increment_main_timer_frees(value);
+        }
+    } else {
+        p->frees++;
+        p->bytes_freed += value;
+    }
 }
 
 /* We need to access this global before the memory wrapper is enabled.
@@ -67,6 +188,8 @@ void record_free(void* ptr) {
  * could end up with a deadlock. */
 void apex_memory_wrapper_init() {
     static book_t& book = getBook();
+    apex::apex_options::track_memory(true);
+    getBook().saved_node_id = apex::apex::instance()->get_node_id();
     APEX_UNUSED(book);
 }
 
@@ -84,7 +207,7 @@ void* apex_malloc_wrapper(malloc_p malloc_call, size_t size) {
     // do the allocation
     auto retval = malloc_call(size);
     // record the state
-    record_alloc(size, retval);
+    record_alloc(size, retval, MALLOC);
     inWrapper() = false;
     return retval;
 }
@@ -96,7 +219,7 @@ void apex_free_wrapper(free_p free_call, void* ptr) {
     }
     inWrapper() = true;
     // record the state
-    record_free(ptr);
+    if (ptr != nullptr) { record_free(ptr); }
     // do the allocation
     free_call(ptr);
     inWrapper() = false;
@@ -104,32 +227,110 @@ void apex_free_wrapper(free_p free_call, void* ptr) {
 }
 
 // Comparator function to sort pairs descending, according to second value
-bool cmp(std::pair<void*, size_t>& a,
-        std::pair<void*, size_t>& b)
+bool cmp(std::pair<void*, record_t>& a,
+        std::pair<void*, record_t>& b)
+{
+    return a.second.bytes > b.second.bytes;
+}
+
+// Comparator function to sort pairs descending, according to second value
+bool cmp2(std::pair<std::string, size_t>& a,
+        std::pair<std::string, size_t>& b)
 {
     return a.second > b.second;
 }
 
 void apex_report_leaks() {
     static book_t& book = getBook();
+    std::stringstream ss;
+    ss << "memory_report." << book.saved_node_id << ".txt";
+    std::string tmp{ss.str()};
+    std::ofstream report (tmp);
     // Declare vector of pairs
-    std::vector<std::pair<void*, size_t> > sorted;
+    std::vector<std::pair<void*, record_t> > sorted;
+
+    if (book.saved_node_id == 0) {
+        std::cout << "APEX Memory Report:" << std::endl;
+        std::cout << "sorting " << book.memoryMap.size() << " leaks by size..." << std::endl;
+    }
 
     // Copy key-value pair from Map
     // to vector of pairs
-    book.mapMutex.lock();
     for (auto& it : book.memoryMap) {
         sorted.push_back(it);
     }
-    book.mapMutex.unlock();
 
     // Sort using comparator function
     sort(sorted.begin(), sorted.end(), cmp);
 
+    //std::unordered_map<std::string, size_t> locations;
+
+    if (book.saved_node_id == 0) {
+        std::cout << "Aggregating leaks by task and writing report..." << std::endl;
+#ifdef APEX_WITH_CUDA
+        std::cout << "Ignoring known leaks in CUPTI..." << std::endl;
+#endif
+    }
     // Print the sorted value
     for (auto& it : sorted) {
-        std::cout << it.first << " leaked " << it.second << " bytes." << std::endl;
+        std::stringstream ss;
+        //if (it.second.bytes > 1000) {
+            ss << it.second.bytes << " bytes leaked at " << std::hex << it.first << std::dec << " from task ";
+        //} else {
+            //break;
+        //}
+        std::string name{"(no timer)"};
+        if (it.second.id != nullptr) {
+            name = it.second.id->name;
+            // skip known CUPTI leaks.
+            if (name.rfind("cuda", 0) == 0) { continue; }
+        }
+        ss << name << " on tid " << it.second.tid << " with backtrace: " << std::endl;
+        ss << "\t" << allocator_strings[it.second.alloc] << std::endl;
+        char** strings = backtrace_symbols( it.second.backtrace.data(), it.second.size );
+        for(size_t i = 3; i < it.second.size; i++ ){
+            ss << "\t" << apex::demangle(strings[i]) << std::endl;
+        }
+
+        /*
+        //for (auto a : it.second.backtrace) {
+        for (size_t a = 2 ; a <  it.second.size; a++) {
+            //std::string * tmp = apex::lookup_address(a, true);
+            std::string * tmp = apex::lookup_address((uintptr_t)it.second.backtrace[a], true);
+            std::string demangled = apex::demangle(*tmp);
+            ss << "\t" << demangled << std::endl;
+        }
+        */
+        ss << std::endl;
+        /*
+        if (locations.count(name) > 0) {
+            locations[name] += it.second.bytes;
+        } else {
+            locations[name] = it.second.bytes;
+        }
+        */
+        report << ss.str();
+    }
+    report.close();
+
+/*
+    std::cout << "sorting task leaks by size..." << std::endl;
+    // Declare vector of pairs
+    std::vector<std::pair<std::string, size_t> > sorted2;
+
+    // Copy key-value pair from Map to vector of pairs
+    for (auto& it : locations) {
+        sorted2.push_back(it);
+    }
+
+    // Sort using comparator function
+    sort(sorted2.begin(), sorted2.end(), cmp2);
+
+    // print the locations
+    for (auto& l : sorted2) {
+        std::cout << l.first << " leaked " << l.second << " bytes." << std::endl;
     }
+*/
 }
 
 void* apex_calloc_wrapper(calloc_p calloc_call, size_t nmemb, size_t size) {
@@ -141,7 +342,7 @@ void* apex_calloc_wrapper(calloc_p calloc_call, size_t nmemb, size_t size) {
     // do the allocation
     auto retval = calloc_call(nmemb, size);
     // record the state
-    record_alloc(size, retval);
+    record_alloc(size, retval, CALLOC);
     inWrapper() = false;
     return retval;
 }
@@ -153,11 +354,11 @@ void* apex_realloc_wrapper(realloc_p realloc_call, void* ptr, size_t size) {
     }
     inWrapper() = true;
     // record the state
-    record_free(ptr);
+    if (ptr != nullptr) { record_free(ptr); }
     // do the allocation
     auto retval = realloc_call(ptr, size);
     // record the state
-    record_alloc(size, retval);
+    record_alloc(size, retval, REALLOC);
     inWrapper() = false;
     return retval;
 }
diff --git a/src/wrappers/pthread_wrapper_internal.cpp b/src/wrappers/pthread_wrapper_internal.cpp
index 8525c708..db4f28d4 100644
--- a/src/wrappers/pthread_wrapper_internal.cpp
+++ b/src/wrappers/pthread_wrapper_internal.cpp
@@ -52,7 +52,6 @@ struct apex_system_wrapper_t
     }
   }
   virtual ~apex_system_wrapper_t() {
-    apex::apex_options::use_screen_output(true);
     apex::finalize();
   }
 };
@@ -210,6 +209,9 @@ int apex_pthread_create_wrapper(pthread_create_p pthread_create_call,
     pthread_t * threadp, const pthread_attr_t * attr,
     start_routine_p start_routine, void * arg)
 {
+  // disable the memory wrapper
+  apex::in_apex prevent_problems;
+  apex::profiler * p = apex::start("pthread_create");
   // JUST ONCE, create the key
   (void) pthread_once(&key_once, make_key);
   // get the thread-local variable
@@ -238,6 +240,7 @@ int apex_pthread_create_wrapper(pthread_create_p pthread_create_call,
     }
     wrapper->_wrapped = false;
   }
+  apex::stop(p);
   return retval;
 }
 
@@ -245,6 +248,8 @@ extern "C"
 int apex_pthread_join_wrapper(pthread_join_p pthread_join_call,
     pthread_t thread, void ** retval)
 {
+  // disable the memory wrapper
+  apex::in_apex prevent_problems;
   apex_wrapper * wrapper = get_tl_wrapper();
 
   int ret;