Skip to content

Commit

Permalink
Writing a memory wrapper report.
Browse files Browse the repository at this point in the history
There are a huge number of CUPTI memory leaks, and they occur
when the first real call to CUDA happens.  I can't yet force that call,
or ignore the memory allocated during the first "real" call.
  • Loading branch information
khuck committed May 9, 2021
1 parent ce8c8cb commit 2f78750
Show file tree
Hide file tree
Showing 16 changed files with 528 additions and 53 deletions.
94 changes: 92 additions & 2 deletions src/apex/activity_trace_async.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -754,6 +754,7 @@ static void openmpActivity(CUpti_Activity *record) {
}

static void syncActivity(CUpti_Activity *record) {
// when tracking memory allocations, ignore the ones in cuda device synchronize
CUpti_ActivitySynchronization *data =
(CUpti_ActivitySynchronization *) record;
/* Check whether there is timing information */
Expand Down Expand Up @@ -865,6 +866,8 @@ static void printActivity(CUpti_Activity *record) {
void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
size_t *maxNumRecords)
{
// when tracking memory allocations, ignore these
apex::in_apex prevent_nonsense;
num_buffers++;
uint8_t *bfr = (uint8_t *) malloc(BUF_SIZE + ALIGN_SIZE);
if (bfr == NULL) {
Expand All @@ -883,6 +886,8 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId,
//auto p = apex::scoped_timer("APEX: CUPTI Buffer Completed");
//printf("%s...", __func__); fflush(stdout);
static bool registered = register_myself(false);
// when tracking memory allocations, ignore these
apex::in_apex prevent_nonsense;
num_buffers_processed++;
if (flushing) { std::cout << "." << std::flush; }
APEX_UNUSED(registered);
Expand Down Expand Up @@ -1380,6 +1385,8 @@ double get_nvtx_payload(const nvtxEventAttributes_t * eventAttrib) {

void handle_nvtx_callback(CUpti_CallbackId id, const void *cbdata,
std::stack<std::shared_ptr<apex::task_wrapper> >& timer_stack) {
// disable memory management tracking in APEX during this callback
apex::in_apex prevent_deadlocks;

/* Unfortunately, when ranges are started/ended, they can overlap.
* Unlike push/pop, which are a true stack. Even worse, CUDA/CUPTI
Expand Down Expand Up @@ -1541,8 +1548,22 @@ void handle_nvtx_callback(CUpti_CallbackId id, const void *cbdata,
}
}

/* Returns true when the callback id is one of the CUDA runtime device
 * synchronize / query / select calls enumerated below, and false for
 * every other id.
 * NOTE(review): presumably callers use this to skip memory-allocation
 * tracking while inside those calls -- confirm at the call site. */
inline bool ignoreMalloc(CUpti_CallbackId id) {
    switch (id) {
        case CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020:
        case CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceCount_v3020:
        case CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020:
        case CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020:
        case CUPTI_RUNTIME_TRACE_CBID_cudaChooseDevice_v3020:
        case CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v3020:
            return true;
        default:
            return false;
    }
}

void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
CUpti_CallbackId id, const void *params) {
// disable memory management tracking in APEX during this callback
apex::in_apex prevent_deadlocks;
static bool initialized = initialize_first_time();
APEX_UNUSED(initialized);
static APEX_NATIVE_TLS bool registered = register_myself(true);
Expand All @@ -1563,7 +1584,7 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
if (id == CUPTI_CBID_RESOURCE_CONTEXT_CREATED) {
register_new_context(params);
}
return;
//return;
}

/* Check for user-level instrumentation */
Expand All @@ -1575,6 +1596,8 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
CUpti_CallbackData * cbdata = (CUpti_CallbackData*)(params);

if (cbdata->callbackSite == CUPTI_API_ENTER) {
// sadly, CUPTI leaks a lot of memory. Don't track memory in CUPTI or CUDA.
// apex::in_apex::get()++;
std::stringstream ss;
ss << cbdata->functionName;
if (apex::apex_options::use_cuda_kernel_details()) {
Expand All @@ -1594,6 +1617,12 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
map_mutex.unlock();
getBytesIfMalloc(id, cbdata->functionParams, tmp, true);
} else {
// sadly, CUPTI leaks a lot of memory. Don't track memory in CUPTI or CUDA.
/*
if (cbdata->callbackSite == CUPTI_API_EXIT) {
apex::in_apex::get()--;
}
*/
/* Not sure how to use this yet... if this is a kernel launch, we can
* run a function on the host, launched from the stream. That gives us
* a synchronous callback to tell us an event when the kernel finished.
Expand Down Expand Up @@ -1640,6 +1669,9 @@ void apex_cupti_callback_dispatch(void *ud, CUpti_CallbackDomain domain,
}

void initTrace() {
// disable memory management tracking in APEX during this initialization
apex::in_apex prevent_deadlocks;
// make a first call into CUDA
bool& registered = get_registered();
registered = true;

Expand Down Expand Up @@ -1681,7 +1713,6 @@ void initTrace() {
CUPTI_CALL(cuptiActivityEnableLatencyTimestamps(enable));
}


// get user-added instrumentation
CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_NVTX));
// Make sure we see CUPTI_CBID_RESOURCE_CONTEXT_CREATED events!
Expand Down Expand Up @@ -1724,4 +1755,63 @@ namespace apex {
std::cout << std::endl;
}
}

/* Tear down the CUPTI side of the tracing support.
 * In order: announce the shutdown on stdout, flush the trace via
 * flushTrace(), unsubscribe the callback subscriber, disable a fixed set
 * of activity kinds, and finally call cuptiFinalize().
 * Takes no arguments and returns nothing. */
void finalizeCuda(void) {
std::cout << "Finalizing CUPTI " << std::endl;
// flush the trace before unsubscribing and disabling anything below
flushTrace();
CUPTI_CALL(cuptiUnsubscribe(subscriber));
// Disable the activity kinds listed here.
// NOTE(review): presumably this mirrors the set of kinds enabled during
// initialization, and the trailing numbers look like the numeric values
// of the CUpti_ActivityKind enumerators -- confirm against cupti headers.
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER)); // 25
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); // 10
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); // 1
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY2)); // 22
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); // 2
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); // 38
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENACC_DATA)); // 33
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH)); // 34
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENACC_OTHER)); // 35
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OPENMP)); // 47
// Everything between #if 0 and #endif is compiled out; it is kept only as
// a reference list of the remaining activity kinds.
#if 0
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL)); // 3 <- disables concurrency
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER)); // 4
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); // 5
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EVENT)); // 6
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_METRIC)); // 7
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); // 8
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); // 9
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); // 11
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); // 12
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER_DATA)); // 13
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR)); // 14
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS)); // 15
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_BRANCH)); // 16
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); // 17
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CDP_KERNEL)); // 18
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PREEMPTION)); // 19
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_ENVIRONMENT)); // 20
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EVENT_INSTANCE)); // 21
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_METRIC_INSTANCE)); // 23
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION)); // 24
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER)); // 25
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_FUNCTION)); // 26
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MODULE)); // 27
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE)); // 28
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SHARED_ACCESS)); // 29
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PC_SAMPLING)); // 30
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO)); // 31
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION)); // 32
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CUDA_EVENT)); // 36
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_STREAM)); // 37
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); // 38
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); // 39
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NVLINK)); // 40
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT)); // 41
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE)); // 42
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC)); // 43
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE)); // 44
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMORY)); // 45
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PCIE)); // 46
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_COUNT)); // 49
#endif
// NOTE(review): unlike the calls above, the result of cuptiFinalize() is
// not wrapped in CUPTI_CALL, so its return value is discarded here --
// confirm that ignoring a failure at this point is intentional.
cuptiFinalize();
}
}
9 changes: 8 additions & 1 deletion src/apex/apex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1110,12 +1110,16 @@ void yield(std::shared_ptr<task_wrapper> tt_ptr)
void sample_value(const std::string &name, double value, bool threaded)
{
in_apex prevent_deadlocks;
// check these before checking the options, because if we have already
// cleaned up, checking the options can cause deadlock. This can
// happen if we are tracking memory.
if (_exited || _measurement_stopped) return; // protect against calls after finalization
// if APEX is disabled, do nothing.
if (apex_options::disable() == true) { return; }
// if APEX is suspended, do nothing.
if (apex_options::suspend() == true) { return; }
apex* instance = apex::instance(); // get the Apex static instance
if (!instance || _exited) return; // protect against calls after finalization
if (!instance) return; // protect against calls after finalization
// parse the counter name
// either /threadqueue{locality#0/total}/length
// or /threadqueue{locality#0/worker-thread#0}/length
Expand Down Expand Up @@ -1411,6 +1415,7 @@ void finalize_plugins(void) {
// forward declare CUPTI buffer flushing
#ifdef APEX_WITH_CUDA
void flushTrace(void);
void finalizeCuda(void);
#endif

std::string dump(bool reset) {
Expand Down Expand Up @@ -1494,6 +1499,7 @@ void finalize()
/* This could take a while */
#ifdef APEX_WITH_CUDA
flushTrace();
finalizeCuda();
#endif
#ifdef APEX_HAVE_TCMALLOC
//tcmalloc::destroy_hook();
Expand Down Expand Up @@ -1571,6 +1577,7 @@ void cleanup(void) {
}
*/
delete(instance);
// stop tracking memory!
FUNCTION_EXIT
}

Expand Down
26 changes: 19 additions & 7 deletions src/apex/apex_kokkos_tuning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,9 @@ extern "C" {
*/
void kokkosp_declare_output_type(const char* name, const size_t id,
Kokkos_Tools_VariableInfo& info) {
if (!apex::apex_options::use_kokkos_tuning()) { return; }
// don't track memory in this function.
apex::in_apex prevent_memory_tracking;
//if (!apex::apex_options::use_kokkos_tuning()) { return; }
if(getSession().verbose) {
std::cout << std::string(getDepth(), ' ');
std::cout << __func__ << std::endl;
Expand All @@ -483,7 +485,9 @@ void kokkosp_declare_output_type(const char* name, const size_t id,
*/
void kokkosp_declare_input_type(const char* name, const size_t id,
Kokkos_Tools_VariableInfo& info) {
if (!apex::apex_options::use_kokkos_tuning()) { return; }
// don't track memory in this function.
apex::in_apex prevent_memory_tracking;
//if (!apex::apex_options::use_kokkos_tuning()) { return; }
if(getSession().verbose) {
std::cout << std::string(getDepth(), ' ');
std::cout << __func__ << std::endl;
Expand Down Expand Up @@ -519,17 +523,20 @@ void kokkosp_request_values(
const Kokkos_Tools_VariableValue* contextVariableValues,
const size_t numTuningVariables,
Kokkos_Tools_VariableValue* tuningVariableValues) {
if (!apex::apex_options::use_kokkos_tuning()) { return; }
// don't track memory in this function.
apex::in_apex prevent_memory_tracking;
if (getSession().verbose) {
std::cout << std::string(getDepth(), ' ');
std::cout << __func__ << " ctx: " << contextId;
printContext(numContextVariables, contextVariableValues);
}
std::string name{hashContext(numContextVariables, contextVariableValues,
getSession().inputs)};
handle_start(name, numTuningVariables, tuningVariableValues);
getSession().active_requests.insert(
std::pair<uint32_t, std::string>(contextId, name));
if (apex::apex_options::use_kokkos_tuning()) {
handle_start(name, numTuningVariables, tuningVariableValues);
}
if (getSession().verbose) {
std::cout << std::endl << std::string(getDepth(), ' ');
printTuning(numTuningVariables, tuningVariableValues);
Expand All @@ -543,7 +550,9 @@ void kokkosp_request_values(
* starting measurement.
*/
void kokkosp_begin_context(size_t contextId) {
if (!apex::apex_options::use_kokkos_tuning()) { return; }
// don't track memory in this function.
apex::in_apex prevent_memory_tracking;
//if (!apex::apex_options::use_kokkos_tuning()) { return; }
if (getSession().verbose) {
std::cout << std::string(getDepth()++, ' ');
std::cout << __func__ << "\t" << contextId << std::endl;
Expand All @@ -558,7 +567,8 @@ void kokkosp_begin_context(size_t contextId) {
* values can now be associated with a result.
*/
void kokkosp_end_context(const size_t contextId) {
if (!apex::apex_options::use_kokkos_tuning()) { return; }
// don't track memory in this function.
apex::in_apex prevent_memory_tracking;
if (getSession().verbose) {
std::cout << std::string(--getDepth(), ' ');
std::cout << __func__ << "\t" << contextId << std::endl;
Expand All @@ -569,7 +579,9 @@ void kokkosp_end_context(const size_t contextId) {
if (name != getSession().active_requests.end() &&
start != getSession().context_starts.end()) {
apex::sample_value(name->second, (double)(end-start->second));
handle_stop(name->second);
if (apex::apex_options::use_kokkos_tuning()) {
handle_stop(name->second);
}
getSession().active_requests.erase(contextId);
getSession().context_starts.erase(contextId);
}
Expand Down
2 changes: 1 addition & 1 deletion src/apex/apex_options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ namespace apex
#endif
}
}
conf_file.close();
}
conf_file.close();

char* option = nullptr;
// getenv is not thread-safe, but the constructor for this static singleton is.
Expand Down
22 changes: 13 additions & 9 deletions src/apex/apex_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,16 +189,20 @@ typedef enum _profile_type {
*/
typedef struct _profile
{
double calls; /*!< Number of times a timer was called, or the number
of samples collected for a counter */
double accumulated; /*!< Accumulated values for all calls/samples */
double sum_squares; /*!< Running sum of squares calculation for all
calls/samples */
double minimum; /*!< Minimum value seen by the timer or counter */
double maximum; /*!< Maximum value seen by the timer or counter */
double calls; /*!< Number of times a timer was called, or the number
of samples collected for a counter */
double accumulated; /*!< Accumulated values for all calls/samples */
double sum_squares; /*!< Running sum of squares calculation for all
calls/samples */
double minimum; /*!< Minimum value seen by the timer or counter */
double maximum; /*!< Maximum value seen by the timer or counter */
apex_profile_type type; /*!< Whether this is a timer or a counter */
double papi_metrics[8]; /*!< Array of accumulated PAPI hardware metrics */
int times_reset; /*!< How many times was this timer reset */
double papi_metrics[8]; /*!< Array of accumulated PAPI hardware metrics */
size_t allocations; /*!< total calls to [m/c/re]alloc and related */
size_t frees; /*!< total calls to free and related (realloc) */
size_t bytes_allocated; /*!< total bytes allocated in this task */
size_t bytes_freed; /*!< total bytes freed in this task */
int times_reset; /*!< How many times was this timer reset */
} apex_profile;

/** Rather than use void pointers everywhere, be explicit about
Expand Down
5 changes: 5 additions & 0 deletions src/apex/otf2_listener.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -924,6 +924,11 @@ namespace apex {
static bool _finalized = false;
if (_finalized) { return; }
_finalized = true;
// if we are tracking memory, there are some alloc/free events
// we recorded before the end of the trace.
if (apex_options::track_memory()) {
saved_end_timestamp = get_time();
}
// get an exclusive lock, to make sure no other threads
// are writing to the archive.
write_lock_type lock(_archive_mutex);
Expand Down
2 changes: 2 additions & 0 deletions src/apex/proc_read.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,8 @@ namespace apex {
/* This is the main function for the reader thread. */
void* proc_data_reader::read_proc(void * _ptw) {
in_apex prevent_deadlocks;
// when tracking memory allocations, ignore these
in_apex prevent_nonsense;
pthread_wrapper* ptw = (pthread_wrapper*)_ptw;
// make sure APEX knows this is not a worker thread
thread_instance::instance(false);
Expand Down
Loading

0 comments on commit 2f78750

Please sign in to comment.