Writing a memory wrapper report.

There are a huge number of CUPTI memory leaks, and they happen when the first real call into CUDA is made. I can't yet force that call, or ignore memory allocated during the first "real" call.
UO-OACISS · May 9, 2021 · 2f78750 · 2f78750
1 parent ce8c8cb
commit 2f78750
Show file tree

Hide file tree

Showing 16 changed files with 528 additions and 53 deletions.
diff --git a/src/apex/activity_trace_async.cpp b/src/apex/activity_trace_async.cpp
@@ -754,6 +754,7 @@ static void openmpActivity(CUpti_Activity *record) {
 }
 
 static void syncActivity(CUpti_Activity *record) {
+    // when tracking memory allocations, ignore the ones in cuda device synchronize
     CUpti_ActivitySynchronization *data =
         (CUpti_ActivitySynchronization *) record;
     /* Check whether there is timing information */
@@ -865,6 +866,8 @@ static void printActivity(CUpti_Activity *record) {
 void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
     size_t *maxNumRecords)
 {
+    // when tracking memory allocations, ignore these
+    apex::in_apex prevent_nonsense;
     num_buffers++;
     uint8_t *bfr = (uint8_t *) malloc(BUF_SIZE + ALIGN_SIZE);
     if (bfr == NULL) {
@@ -883,6 +886,8 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId,
     //auto p = apex::scoped_timer("APEX: CUPTI Buffer Completed");
     //printf("%s...", __func__); fflush(stdout);
     static bool registered = register_myself(false);
+    // when tracking memory allocations, ignore these
+    apex::in_apex prevent_nonsense;
     num_buffers_processed++;
     if (flushing) { std::cout << "." << std::flush; }
     APEX_UNUSED(registered);
@@ -1380,6 +1385,8 @@ double get_nvtx_payload(const nvtxEventAttributes_t * eventAttrib) {
 
 void handle_nvtx_callback(CUpti_CallbackId id, const void *cbdata,
     std::stack<std::shared_ptr<apex::task_wrapper> >& timer_stack) {
+    // disable memory management tracking in APEX during this callback
+    apex::in_apex prevent_deadlocks;
 
     /* Unfortunately, when ranges are started/ended, they can overlap.
      * Unlike push/pop, which are a true stack.  Even worse, CUDA/CUPTI
@@ -1541,8 +1548,22 @@ void handle_nvtx_callback(CUpti_CallbackId id, const void *cbdata,
     }
 }
 
+inline bool ignoreMalloc(CUpti_CallbackId id) { // Returns true when id matches one of the six CUDA runtime callback ids tested below.
+    if (id == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020 || // NOTE(review): only the id is compared; the callback domain is not checked here
+        id == CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceCount_v3020 ||
+        id == CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 ||
+        id == CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 ||
+        id == CUPTI_RUNTIME_TRACE_CBID_cudaChooseDevice_v3020 ||
+        id == CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v3020) {
+        return true; // device synchronize / device query / device selection call
+    }
+    return false; // any other callback id (NOTE(review): no caller is visible in this diff; presumably used to skip memory tracking - confirm)
+}
+
 void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
         CUpti_CallbackId id, const void *params) {
+    // disable memory management tracking in APEX during this callback
+    apex::in_apex prevent_deadlocks;
     static bool initialized = initialize_first_time();
     APEX_UNUSED(initialized);
     static APEX_NATIVE_TLS bool registered = register_myself(true);
@@ -1563,7 +1584,7 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
         if (id == CUPTI_CBID_RESOURCE_CONTEXT_CREATED) {
             register_new_context(params);
         }
-        return;
+        //return;
     }
 
     /* Check for user-level instrumentation */
@@ -1575,6 +1596,8 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
     CUpti_CallbackData * cbdata = (CUpti_CallbackData*)(params);
 
     if (cbdata->callbackSite == CUPTI_API_ENTER) {
+        // sadly, CUPTI leaks a lot of memory.  Don't track memory in CUPTI or CUDA.
+        // apex::in_apex::get()++;
         std::stringstream ss;
         ss << cbdata->functionName;
         if (apex::apex_options::use_cuda_kernel_details()) {
@@ -1594,6 +1617,12 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
         map_mutex.unlock();
         getBytesIfMalloc(id, cbdata->functionParams, tmp, true);
     } else {
+        // sadly, CUPTI leaks a lot of memory.  Don't track memory in CUPTI or CUDA.
+        /*
+        if (cbdata->callbackSite == CUPTI_API_EXIT) {
+            apex::in_apex::get()--;
+        }
+        */
         /* Not sure how to use this yet... if this is a kernel launch, we can
          * run a function on the host, launched from the stream.  That gives us
          * a synchronous callback to tell us an event when the kernel finished.
@@ -1640,6 +1669,9 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
 }
 
 void initTrace() {
+    // disable memory management tracking in APEX during this initialization
+    apex::in_apex prevent_deadlocks;
+    // make a first call into CUDA
     bool& registered = get_registered();
     registered = true;
 
@@ -1681,7 +1713,6 @@ void initTrace() {
         CUPTI_CALL(cuptiActivityEnableLatencyTimestamps(enable));
     }
 
-
     // get user-added instrumentation
     CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_NVTX));
     // Make sure we see CUPTI_CBID_RESOURCE_CONTEXT_CREATED events!
@@ -1724,4 +1755,63 @@ namespace apex {
             std::cout << std::endl;
         }
     }
+
+    void finalizeCuda(void) { // Tears down CUPTI: flush, unsubscribe, disable a set of activity kinds, then cuptiFinalize().
+        std::cout << "Finalizing CUPTI " << std::endl; // printed unconditionally to stdout
+        flushTrace(); // flush CUPTI buffers first; NOTE(review): apex::finalize() also calls flushTrace() immediately before this function
+        CUPTI_CALL(cuptiUnsubscribe(subscriber)); // drop the callback subscription held in `subscriber`
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER)); // 25
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); // 10
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); // 1
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY2)); // 22
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); // 2
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); // 38
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENACC_DATA)); // 33
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH)); // 34
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENACC_OTHER)); // 35
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENMP)); // 47
+        #if 0 // compiled out: the remaining activity kinds (some repeat kinds already disabled above)
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL)); // 3   <- disables concurrency
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER)); // 4
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); // 5
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EVENT)); // 6
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_METRIC)); // 7
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); // 8
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); // 9
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); // 11
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); // 12
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER_DATA)); // 13
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR)); // 14
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS)); // 15
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_BRANCH)); // 16
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); // 17
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CDP_KERNEL)); // 18
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PREEMPTION)); // 19
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_ENVIRONMENT)); // 20
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EVENT_INSTANCE)); // 21
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_METRIC_INSTANCE)); // 23
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION)); // 24
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER)); // 25
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_FUNCTION)); // 26
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MODULE)); // 27
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE)); // 28
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SHARED_ACCESS)); // 29
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PC_SAMPLING)); // 30
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO)); // 31
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION)); // 32
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CUDA_EVENT)); // 36
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_STREAM)); // 37
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); // 38
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); // 39
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NVLINK)); // 40
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT)); // 41
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE)); // 42
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC)); // 43
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE)); // 44
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMORY)); // 45
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PCIE)); // 46
+        CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_COUNT)); // 49
+        #endif
+        cuptiFinalize(); // NOTE(review): return status is not checked, unlike the CUPTI_CALL-wrapped calls above - confirm this is intentional
+    }
 }
diff --git a/src/apex/apex.cpp b/src/apex/apex.cpp
@@ -1110,12 +1110,16 @@ void yield(std::shared_ptr<task_wrapper> tt_ptr)
 void sample_value(const std::string &name, double value, bool threaded)
 {
     in_apex prevent_deadlocks;
+    // check these before checking the options, because if we have already
+    // cleaned up, checking the options can cause deadlock. This can
+    // happen if we are tracking memory.
+    if (_exited || _measurement_stopped) return; // protect against calls after finalization
     // if APEX is disabled, do nothing.
     if (apex_options::disable() == true) { return; }
     // if APEX is suspended, do nothing.
     if (apex_options::suspend() == true) { return; }
     apex* instance = apex::instance(); // get the Apex static instance
-    if (!instance || _exited) return; // protect against calls after finalization
+    if (!instance) return; // protect against calls after finalization
     // parse the counter name
     // either /threadqueue{locality#0/total}/length
     // or     /threadqueue{locality#0/worker-thread#0}/length
@@ -1411,6 +1415,7 @@ void finalize_plugins(void) {
 // forward declare CUPTI buffer flushing
 #ifdef APEX_WITH_CUDA
 void flushTrace(void);
+void finalizeCuda(void);
 #endif
 
 std::string dump(bool reset) {
@@ -1494,6 +1499,7 @@ void finalize()
     /* This could take a while */
 #ifdef APEX_WITH_CUDA
     flushTrace();
+    finalizeCuda();
 #endif
 #ifdef APEX_HAVE_TCMALLOC
     //tcmalloc::destroy_hook();
@@ -1571,6 +1577,7 @@ void cleanup(void) {
     }
     */
     delete(instance);
+    // stop tracking memory!
     FUNCTION_EXIT
 }
 

diff --git a/src/apex/apex_kokkos_tuning.cpp b/src/apex/apex_kokkos_tuning.cpp
@@ -463,7 +463,9 @@ extern "C" {
  */
 void kokkosp_declare_output_type(const char* name, const size_t id,
     Kokkos_Tools_VariableInfo& info) {
-    if (!apex::apex_options::use_kokkos_tuning()) { return; }
+    // don't track memory in this function.
+    apex::in_apex prevent_memory_tracking;
+    //if (!apex::apex_options::use_kokkos_tuning()) { return; }
     if(getSession().verbose) {
         std::cout << std::string(getDepth(), ' ');
         std::cout << __func__ << std::endl;
@@ -483,7 +485,9 @@ void kokkosp_declare_output_type(const char* name, const size_t id,
  */
 void kokkosp_declare_input_type(const char* name, const size_t id,
     Kokkos_Tools_VariableInfo& info) {
-    if (!apex::apex_options::use_kokkos_tuning()) { return; }
+    // don't track memory in this function.
+    apex::in_apex prevent_memory_tracking;
+    //if (!apex::apex_options::use_kokkos_tuning()) { return; }
     if(getSession().verbose) {
         std::cout << std::string(getDepth(), ' ');
         std::cout << __func__ << std::endl;
@@ -519,17 +523,20 @@ void kokkosp_request_values(
     const Kokkos_Tools_VariableValue* contextVariableValues,
     const size_t numTuningVariables,
     Kokkos_Tools_VariableValue* tuningVariableValues) {
-    if (!apex::apex_options::use_kokkos_tuning()) { return; }
+    // don't track memory in this function.
+    apex::in_apex prevent_memory_tracking;
     if (getSession().verbose) {
         std::cout << std::string(getDepth(), ' ');
         std::cout << __func__ << " ctx: " << contextId;
         printContext(numContextVariables, contextVariableValues);
     }
     std::string name{hashContext(numContextVariables, contextVariableValues,
         getSession().inputs)};
-    handle_start(name, numTuningVariables, tuningVariableValues);
     getSession().active_requests.insert(
         std::pair<uint32_t, std::string>(contextId, name));
+    if (apex::apex_options::use_kokkos_tuning()) {
+        handle_start(name, numTuningVariables, tuningVariableValues);
+    }
     if (getSession().verbose) {
         std::cout << std::endl << std::string(getDepth(), ' ');
         printTuning(numTuningVariables, tuningVariableValues);
@@ -543,7 +550,9 @@ void kokkosp_request_values(
  * starting measurement.
  */
 void kokkosp_begin_context(size_t contextId) {
-    if (!apex::apex_options::use_kokkos_tuning()) { return; }
+    // don't track memory in this function.
+    apex::in_apex prevent_memory_tracking;
+    //if (!apex::apex_options::use_kokkos_tuning()) { return; }
     if (getSession().verbose) {
         std::cout << std::string(getDepth()++, ' ');
         std::cout << __func__ << "\t" << contextId << std::endl;
@@ -558,7 +567,8 @@ void kokkosp_begin_context(size_t contextId) {
  * values can now be associated with a result.
  */
 void kokkosp_end_context(const size_t contextId) {
-    if (!apex::apex_options::use_kokkos_tuning()) { return; }
+    // don't track memory in this function.
+    apex::in_apex prevent_memory_tracking;
     if (getSession().verbose) {
         std::cout << std::string(--getDepth(), ' ');
         std::cout << __func__ << "\t" << contextId << std::endl;
@@ -569,7 +579,9 @@ void kokkosp_end_context(const size_t contextId) {
     if (name != getSession().active_requests.end() &&
         start != getSession().context_starts.end()) {
         apex::sample_value(name->second, (double)(end-start->second));
-        handle_stop(name->second);
+        if (apex::apex_options::use_kokkos_tuning()) {
+            handle_stop(name->second);
+        }
         getSession().active_requests.erase(contextId);
         getSession().context_starts.erase(contextId);
     }

diff --git a/src/apex/apex_options.cpp b/src/apex/apex_options.cpp
@@ -51,8 +51,8 @@ namespace apex
 #endif
                 }
             }
-            conf_file.close();
         }
+        conf_file.close();
 
         char* option = nullptr;
 // getenv is not thread-safe, but the constructor for this static singleton is.

diff --git a/src/apex/apex_types.h b/src/apex/apex_types.h
@@ -189,16 +189,20 @@ typedef enum _profile_type {
  */
 typedef struct _profile
 {
-    double calls;         /*!< Number of times a timer was called, or the number
-                              of samples collected for a counter */
-    double accumulated;   /*!< Accumulated values for all calls/samples */
-    double sum_squares;   /*!< Running sum of squares calculation for all
-                              calls/samples */
-    double minimum;       /*!< Minimum value seen by the timer or counter */
-    double maximum;       /*!< Maximum value seen by the timer or counter */
+    double calls;           /*!< Number of times a timer was called, or the number
+                                 of samples collected for a counter */
+    double accumulated;     /*!< Accumulated values for all calls/samples */
+    double sum_squares;     /*!< Running sum of squares calculation for all
+                                 calls/samples */
+    double minimum;         /*!< Minimum value seen by the timer or counter */
+    double maximum;         /*!< Maximum value seen by the timer or counter */
     apex_profile_type type; /*!< Whether this is a timer or a counter */
-    double papi_metrics[8];  /*!< Array of accumulated PAPI hardware metrics */
-    int times_reset;      /*!< How many times was this timer reset */
+    double papi_metrics[8]; /*!< Array of accumulated PAPI hardware metrics */
+    size_t allocations;     /*!< total calls to [m/c/re]alloc and related */
+    size_t frees;           /*!< total calls to free and related (realloc) */
+    size_t bytes_allocated; /*!< total bytes allocated in this task */
+    size_t bytes_freed;     /*!< total bytes freed in this task */
+    int times_reset;        /*!< How many times was this timer reset */
 } apex_profile;
 
 /** Rather than use void pointers everywhere, be explicit about

diff --git a/src/apex/otf2_listener.cpp b/src/apex/otf2_listener.cpp
@@ -924,6 +924,11 @@ namespace apex {
         static bool _finalized = false;
         if (_finalized) { return; }
         _finalized = true;
+        // if we are tracking memory, there are some alloc/free events
+        // we recorded before the end of the trace.
+        if (apex_options::track_memory()) {
+            saved_end_timestamp = get_time();
+        }
          // get an exclusive lock, to make sure no other threads
         // are writing to the archive.
         write_lock_type lock(_archive_mutex);

diff --git a/src/apex/proc_read.cpp b/src/apex/proc_read.cpp
@@ -1101,6 +1101,8 @@ namespace apex {
     /* This is the main function for the reader thread. */
     void* proc_data_reader::read_proc(void * _ptw) {
         in_apex prevent_deadlocks;
+        // when tracking memory allocations, ignore these
+        in_apex prevent_nonsense;
         pthread_wrapper* ptw = (pthread_wrapper*)_ptw;
         // make sure APEX knows this is not a worker thread
         thread_instance::instance(false);