diff --git a/CMakeLists.txt b/CMakeLists.txt
index 25a6d76e..57223ea5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,6 +30,8 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake/Modules)
 if(APEX_USE_CUDA)
     enable_language(CUDA)
     find_package(CUDAToolkit REQUIRED QUIET COMPONENTS CUPTI)
+    # NOTE(review): CUDA builds force USE_CLOCK_TIMESTAMP on; confirm the reason.
+    set(USE_CLOCK_TIMESTAMP ON)
 endif(APEX_USE_CUDA)
 
 # if the user didn't specify, set the installation directory.
@@ -572,6 +574,17 @@ endif()
 # Just in case, to prevent concurrent builds
 add_dependencies (project_otf2 project_activeharmony)
 
+################################################################################
+# CUDA configuration
+################################################################################
+
+if (APEX_USE_CUDA)
+    # Add the CUDA driver library and CUPTI to LIBS, and define APEX_USE_CUDA
+    # so the code guarded by that macro is compiled.
+    set(LIBS ${LIBS} cuda ${CUDA_cupti_LIBRARY})
+    add_definitions(-DAPEX_USE_CUDA)
+endif (APEX_USE_CUDA)
+
 ################################################################################
 # MSR configuration
 ################################################################################
diff --git a/src/apex/CMakeLists.standalone b/src/apex/CMakeLists.standalone
index f43ee5e9..ac8f342a 100644
--- a/src/apex/CMakeLists.standalone
+++ b/src/apex/CMakeLists.standalone
@@ -25,8 +25,14 @@ if (OTF2_FOUND)
 SET(OTF2_SOURCE otf2_listener.cpp)
 endif(OTF2_FOUND)
 
+# The CUPTI activity tracing source is only compiled when CUDA is enabled.
+if (APEX_USE_CUDA)
+SET(CUPTI_SOURCE activity_trace_async.cpp)
+endif(APEX_USE_CUDA)
+
 # Try to keep this in alphabetical order
 SET(all_SOURCE
+${CUPTI_SOURCE}
 apex.cpp
 apex_options.cpp
 apex_policies.cpp
@@ -75,6 +81,11 @@ if (MPI_CXX_FOUND)
   include_directories (${MPI_CXX_INCLUDE_PATH})
 endif(MPI_CXX_FOUND)
 
+if (APEX_USE_CUDA)
+  # CUDAToolkit_INCLUDE_DIRS is the documented FindCUDAToolkit result variable.
+  include_directories (${CUDAToolkit_INCLUDE_DIRS})
+endif(APEX_USE_CUDA)
+
 # If we are building libapex.so, we want to include all the other libraries,
 # so that we can LD_PRELOAD this library with all requirements met.
 if (NOT BUILD_STATIC_EXECUTABLES)
diff --git a/src/apex/activity_trace_async.cpp b/src/apex/activity_trace_async.cpp
new file mode 100644
index 00000000..4876feb3
--- /dev/null
+++ b/src/apex/activity_trace_async.cpp
@@ -0,0 +1,474 @@
+/*
+ * Copyright 2011-2015 NVIDIA Corporation. All rights reserved
+ *
+ * Sample CUPTI app to print a trace of CUDA API and GPU activity
+ * using asynchronous handling of activity buffers.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+#include <cupti.h>
+#include "apex.hpp"
+
+static void __attribute__((constructor)) initTrace(void);
+//static void __attribute__((destructor)) flushTrace(void);
+
+// Evaluate a CUPTI call; on failure print the call site and the CUPTI error
+// string to stderr, then terminate the process.
+#define CUPTI_CALL(call)                                                \
+  do {                                                                  \
+    CUptiResult _status = call;                                         \
+    if (_status != CUPTI_SUCCESS) {                                     \
+      const char *errstr;                                               \
+      cuptiGetResultString(_status, &errstr);                           \
+      fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
+              __FILE__, __LINE__, #call, errstr);                       \
+      exit(-1);                                                         \
+    }                                                                   \
+  } while (0)
+
+// Size and alignment, in bytes, of each activity buffer handed to CUPTI.
+#define BUF_SIZE (32 * 1024)
+#define ALIGN_SIZE (8)
+
+// Timestamp at trace initialization time. Used to normalize other
+// timestamps
+static uint64_t startTimestamp;
+
+// Map a CUPTI memcpy kind to a short label; "<unknown>" for other values.
+static const char *
+getMemcpyKindString(CUpti_ActivityMemcpyKind kind)
+{
+  switch (kind) {
+  case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
+    return "HtoD";
+  case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
+    return "DtoH";
+  case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
+    return "HtoA";
+  case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
+    return "AtoH";
+  case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
+    return "AtoA";
+  case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
+    return "AtoD";
+  case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
+    return "DtoA";
+  case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
+    return "DtoD";
+  case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
+    return "HtoH";
+  default:
+    break;
+  }
+
+  return "<unknown>";
+}
+
+// Map a CUPTI overhead kind to a short label; "<unknown>" for other values.
+const char *
+getActivityOverheadKindString(CUpti_ActivityOverheadKind kind)
+{
+  switch (kind) {
+  case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER:
+    return "COMPILER";
+  case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH:
+    return "BUFFER_FLUSH";
+  case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION:
+    return "INSTRUMENTATION";
+  case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE:
+    return "RESOURCE";
+  default:
+    break;
+  }
+
+  return "<unknown>";
+}
+
+// Map a CUPTI object kind to a short label; "<unknown>" for other values.
+const char *
+getActivityObjectKindString(CUpti_ActivityObjectKind kind)
+{
+  switch (kind) {
+  case CUPTI_ACTIVITY_OBJECT_PROCESS:
+    return "PROCESS";
+  case CUPTI_ACTIVITY_OBJECT_THREAD:
+    return "THREAD";
+  case CUPTI_ACTIVITY_OBJECT_DEVICE:
+    return "DEVICE";
+  case CUPTI_ACTIVITY_OBJECT_CONTEXT:
+    return "CONTEXT";
+  case CUPTI_ACTIVITY_OBJECT_STREAM:
+    return "STREAM";
+  default:
+    break;
+  }
+
+  return "<unknown>";
+}
+
+// Extract the id matching `kind` from an object-id union; 0xffffffff when
+// the kind is not one of the object kinds handled here.
+uint32_t
+getActivityObjectKindId(CUpti_ActivityObjectKind kind, CUpti_ActivityObjectKindId *id)
+{
+  switch (kind) {
+  case CUPTI_ACTIVITY_OBJECT_PROCESS:
+    return id->pt.processId;
+  case CUPTI_ACTIVITY_OBJECT_THREAD:
+    return id->pt.threadId;
+  case CUPTI_ACTIVITY_OBJECT_DEVICE:
+    return id->dcs.deviceId;
+  case CUPTI_ACTIVITY_OBJECT_CONTEXT:
+    return id->dcs.contextId;
+  case CUPTI_ACTIVITY_OBJECT_STREAM:
+    return id->dcs.streamId;
+  default:
+    break;
+  }
+
+  return 0xffffffff;
+}
+
+// Map a CUPTI compute API kind to a short label; "<unknown>" otherwise.
+static const char *
+getComputeApiKindString(CUpti_ActivityComputeApiKind kind)
+{
+  switch (kind) {
+  case CUPTI_ACTIVITY_COMPUTE_API_CUDA:
+    return "CUDA";
+  case CUPTI_ACTIVITY_COMPUTE_API_CUDA_MPS:
+    return "CUDA_MPS";
+  default:
+    break;
+  }
+
+  return "<unknown>";
+}
+
+// Handle one CUPTI activity record. Kernel records are converted into APEX
+// profilers and pushed to the profiler listener; every other record kind
+// that reaches this function is printed to stdout.
+static void
+printActivity(CUpti_Activity *record)
+{
+  switch (record->kind)
+  {
+  case CUPTI_ACTIVITY_KIND_DEVICE:
+    {
+      CUpti_ActivityDevice2 *device = (CUpti_ActivityDevice2 *) record;
+      printf("DEVICE %s (%u), capability %u.%u, global memory (bandwidth %u GB/s, size %u MB), "
+             "multiprocessors %u, clock %u MHz\n",
+             device->name, device->id,
+             device->computeCapabilityMajor, device->computeCapabilityMinor,
+             (unsigned int) (device->globalMemoryBandwidth / 1024 / 1024),
+             (unsigned int) (device->globalMemorySize / 1024 / 1024),
+             device->numMultiprocessors, (unsigned int) (device->coreClockRate / 1000));
+      break;
+    }
+  case CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE:
+    {
+      CUpti_ActivityDeviceAttribute *attribute = (CUpti_ActivityDeviceAttribute *)record;
+      printf("DEVICE_ATTRIBUTE %u, device %u, value=0x%llx\n",
+             attribute->attribute.cupti, attribute->deviceId, (unsigned long long)attribute->value.vUint64);
+      break;
+    }
+  case CUPTI_ACTIVITY_KIND_CONTEXT:
+    {
+      CUpti_ActivityContext *context = (CUpti_ActivityContext *) record;
+      printf("CONTEXT %u, device %u, compute API %s, NULL stream %d\n",
+             context->contextId, context->deviceId,
+             getComputeApiKindString((CUpti_ActivityComputeApiKind) context->computeApiKind),
+             (int) context->nullStreamId);
+      break;
+    }
+  case CUPTI_ACTIVITY_KIND_MEMCPY:
+    {
+      CUpti_ActivityMemcpy *memcpy = (CUpti_ActivityMemcpy *) record;
+      printf("MEMCPY %s [ %llu - %llu ] device %u, context %u, stream %u, correlation %u/r%u\n",
+             getMemcpyKindString((CUpti_ActivityMemcpyKind) memcpy->copyKind),
+             (unsigned long long) (memcpy->start - startTimestamp),
+             (unsigned long long) (memcpy->end - startTimestamp),
+             memcpy->deviceId, memcpy->contextId, memcpy->streamId,
+             memcpy->correlationId, memcpy->runtimeCorrelationId);
+      break;
+    }
+  case CUPTI_ACTIVITY_KIND_MEMSET:
+    {
+      CUpti_ActivityMemset *memset = (CUpti_ActivityMemset *) record;
+      printf("MEMSET value=%u [ %llu - %llu ] device %u, context %u, stream %u, correlation %u\n",
+             memset->value,
+             (unsigned long long) (memset->start - startTimestamp),
+             (unsigned long long) (memset->end - startTimestamp),
+             memset->deviceId, memset->contextId, memset->streamId,
+             memset->correlationId);
+      break;
+    }
+  case CUPTI_ACTIVITY_KIND_KERNEL:
+  case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
+  case CUPTI_ACTIVITY_KIND_CDP_KERNEL:
+    {
+      CUpti_ActivityKernel4 *kernel = (CUpti_ActivityKernel4 *) record;
+#if 0
+      const char* kindString = (record->kind == CUPTI_ACTIVITY_KIND_KERNEL) ? "KERNEL" : "CONC KERNEL";
+      printf("%s \"%s\" [ %llu - %llu ] device %u, context %u, stream %u, correlation %u\n",
+             kindString,
+             kernel->name,
+             (unsigned long long) (kernel->start - startTimestamp),
+             (unsigned long long) (kernel->end - startTimestamp),
+             kernel->deviceId, kernel->contextId, kernel->streamId,
+             kernel->correlationId);
+      printf("    grid [%u,%u,%u], block [%u,%u,%u], shared memory (static %u, dynamic %u)\n",
+             kernel->gridX, kernel->gridY, kernel->gridZ,
+             kernel->blockX, kernel->blockY, kernel->blockZ,
+             kernel->staticSharedMemory, kernel->dynamicSharedMemory);
+      printf("%f\n", nanoseconds);
+#endif
+      // NOTE(review): CUPTI documents kernel start/end in nanoseconds, so
+      // this value looks like microseconds despite the variable name --
+      // confirm the unit apex::profiler expects before renaming.
+      double nanoseconds = ((double)(kernel->end - kernel->start)/1000.0);
+      std::shared_ptr<apex::profiler> p =
+        std::make_shared<apex::profiler>(apex::task_identifier::get_task_id(
+          kernel->name), nanoseconds);
+      // Presumably this selects the counter constructor (TODO confirm), so
+      // the flags are set explicitly to mark the sample as a device timer.
+      p->is_counter = false;
+      p->is_device = true;
+      apex::apex* instance = apex::apex::instance();
+      instance->the_profiler_listener->push_profiler_public(p);
+      break;
+    }
+  case CUPTI_ACTIVITY_KIND_DRIVER:
+    {
+      CUpti_ActivityAPI *api = (CUpti_ActivityAPI *) record;
+      printf("DRIVER cbid=%u [ %llu - %llu ] process %u, thread %u, correlation %u\n",
+             api->cbid,
+             (unsigned long long) (api->start - startTimestamp),
+             (unsigned long long) (api->end - startTimestamp),
+             api->processId, api->threadId, api->correlationId);
+      break;
+    }
+  case CUPTI_ACTIVITY_KIND_RUNTIME:
+    {
+      CUpti_ActivityAPI *api = (CUpti_ActivityAPI *) record;
+      printf("RUNTIME cbid=%u [ %llu - %llu ] process %u, thread %u, correlation %u\n",
+             api->cbid,
+             (unsigned long long) (api->start - startTimestamp),
+             (unsigned long long) (api->end - startTimestamp),
+             api->processId, api->threadId, api->correlationId);
+      break;
+    }
+  case CUPTI_ACTIVITY_KIND_NAME:
+    {
+      CUpti_ActivityName *name = (CUpti_ActivityName *) record;
+      switch (name->objectKind)
+      {
+      case CUPTI_ACTIVITY_OBJECT_CONTEXT:
+        printf("NAME  %s %u %s id %u, name %s\n",
+               getActivityObjectKindString(name->objectKind),
+               getActivityObjectKindId(name->objectKind, &name->objectId),
+               getActivityObjectKindString(CUPTI_ACTIVITY_OBJECT_DEVICE),
+               getActivityObjectKindId(CUPTI_ACTIVITY_OBJECT_DEVICE, &name->objectId),
+               name->name);
+        break;
+      case CUPTI_ACTIVITY_OBJECT_STREAM:
+        printf("NAME %s %u %s %u %s id %u, name %s\n",
+               getActivityObjectKindString(name->objectKind),
+               getActivityObjectKindId(name->objectKind, &name->objectId),
+               getActivityObjectKindString(CUPTI_ACTIVITY_OBJECT_CONTEXT),
+               getActivityObjectKindId(CUPTI_ACTIVITY_OBJECT_CONTEXT, &name->objectId),
+               getActivityObjectKindString(CUPTI_ACTIVITY_OBJECT_DEVICE),
+               getActivityObjectKindId(CUPTI_ACTIVITY_OBJECT_DEVICE, &name->objectId),
+               name->name);
+        break;
+      default:
+        printf("NAME %s id %u, name %s\n",
+               getActivityObjectKindString(name->objectKind),
+               getActivityObjectKindId(name->objectKind, &name->objectId),
+               name->name);
+        break;
+      }
+      break;
+    }
+  case CUPTI_ACTIVITY_KIND_MARKER:
+    {
+      CUpti_ActivityMarker2 *marker = (CUpti_ActivityMarker2 *) record;
+      printf("MARKER id %u [ %llu ], name %s, domain %s\n",
+             marker->id, (unsigned long long) marker->timestamp, marker->name, marker->domain);
+      break;
+    }
+  case CUPTI_ACTIVITY_KIND_MARKER_DATA:
+    {
+      CUpti_ActivityMarkerData *marker = (CUpti_ActivityMarkerData *) record;
+      printf("MARKER_DATA id %u, color 0x%x, category %u, payload %llu/%f\n",
+             marker->id, marker->color, marker->category,
+             (unsigned long long) marker->payload.metricValueUint64,
+             marker->payload.metricValueDouble);
+      break;
+    }
+  case CUPTI_ACTIVITY_KIND_OVERHEAD:
+    {
+      CUpti_ActivityOverhead *overhead = (CUpti_ActivityOverhead *) record;
+      printf("OVERHEAD %s [ %llu, %llu ] %s id %u\n",
+             getActivityOverheadKindString(overhead->overheadKind),
+             (unsigned long long) overhead->start - startTimestamp,
+             (unsigned long long) overhead->end - startTimestamp,
+             getActivityObjectKindString(overhead->objectKind),
+             getActivityObjectKindId(overhead->objectKind, &overhead->objectId));
+      break;
+    }
+  default:
+    printf("  <unknown>\n");
+    break;
+  }
+}
+
+// CUPTI buffer-request callback: hand CUPTI a fresh BUF_SIZE-byte buffer
+// aligned to ALIGN_SIZE. The buffer comes straight from posix_memalign(), so
+// the pointer given to CUPTI is the same one bufferCompleted() later frees.
+void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords)
+{
+  void *bfr = NULL;
+  if (posix_memalign(&bfr, ALIGN_SIZE, BUF_SIZE) != 0 || bfr == NULL) {
+    printf("Error: out of memory\n");
+    exit(-1);
+  }
+
+  *size = BUF_SIZE;
+  *buffer = (uint8_t *) bfr;
+  *maxNumRecords = 0;
+}
+
+// CUPTI buffer-completed callback: walk every record in the returned buffer,
+// report how many records CUPTI dropped, then release the buffer.
+void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize)
+{
+  CUptiResult status;
+  CUpti_Activity *record = NULL;
+  APEX_UNUSED(size);
+
+  if (validSize > 0) {
+    do {
+      status = cuptiActivityGetNextRecord(buffer, validSize, &record);
+      if (status == CUPTI_SUCCESS) {
+        printActivity(record);
+      }
+      else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED)
+        break;
+      else {
+        CUPTI_CALL(status);
+      }
+    } while (1);
+
+    // report any records dropped from the queue
+    size_t dropped;
+    CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
+    if (dropped != 0) {
+      printf("Dropped %u activity records\n", (unsigned int) dropped);
+    }
+  }
+
+  free(buffer);
+}
+
+// Library constructor: enable the CUPTI activity kinds that are consumed
+// here, register the buffer callbacks, double the device buffer size and
+// pool limit, and record the trace start timestamp.
+void
+initTrace()
+{
+  size_t attrValue = 0, attrValueSize = sizeof(size_t);
+  // Device activity record is created when CUDA initializes, so we
+  // want to enable it before cuInit() or any CUDA runtime call.
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); // 8
+  // Enable concurrent kernel records; every other kind is disabled below.
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); // 10
+#if 0
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); // 1
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); // 2
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); // 3
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); // 4
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); // 5
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EVENT)); // 6
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_METRIC)); // 7
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); // 9
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME)); // 11
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER)); // 12
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER_DATA)); // 13
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR)); // 14
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS)); // 15
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_BRANCH)); // 16
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); // 17
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CDP_KERNEL)); // 18
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PREEMPTION)); // 19
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_ENVIRONMENT)); // 20
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EVENT_INSTANCE)); // 21
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY2)); // 22
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_METRIC_INSTANCE)); // 23
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION)); // 24
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER)); // 25
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_FUNCTION)); // 26
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MODULE)); // 27
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE)); // 28
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_SHARED_ACCESS)); // 29
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING)); // 30
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO)); // 31
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION)); // 32
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OPENACC_DATA)); // 33
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH)); // 34
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OPENACC_OTHER)); // 35
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CUDA_EVENT)); // 36
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_STREAM)); // 37
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); // 38
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); // 39
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NVLINK)); // 40
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT)); // 41
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE)); // 42
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC)); // 43
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE)); // 44
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMORY)); // 45
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PCIE)); // 46
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OPENMP)); // 47
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API)); // 48
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_COUNT)); // 49
+#endif
+
+  // Register callbacks for buffer requests and for buffers completed by CUPTI.
+  CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
+
+  // Get and set activity attributes.
+  // Attributes can be set by the CUPTI client to change behavior of the activity API.
+  // Some attributes require to be set before any CUDA context is created to be effective,
+  // e.g. to be applied to all device buffer allocations (see documentation).
+  CUPTI_CALL(cuptiActivityGetAttribute(
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue));
+  printf("%s = %llu\n",
+    "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE",
+    (long long unsigned)attrValue);
+  attrValue *= 2;
+  CUPTI_CALL(cuptiActivitySetAttribute(
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue));
+
+  CUPTI_CALL(cuptiActivityGetAttribute(
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue));
+  printf("%s = %llu\n",
+    "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT",
+    (long long unsigned)attrValue);
+  attrValue *= 2;
+  CUPTI_CALL(cuptiActivitySetAttribute(
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue));
+
+  CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
+}
+
+/* This is the global "shutdown" method for flushing the buffer.  This is
+ * called from apex::finalize().  It's the only function in the CUDA support
+ * that APEX will call directly. */
+namespace apex {
+    void flushTrace(void) {
+        cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_NONE);
+    }
+}
diff --git a/src/apex/apex.cpp b/src/apex/apex.cpp
index cca1cff9..53b3131f 100644
--- a/src/apex/apex.cpp
+++ b/src/apex/apex.cpp
@@ -1352,8 +1352,15 @@ std::string dump(bool reset) {
     return(std::string(""));
 }
 
+// Defined in activity_trace_async.cpp, which is only built with APEX_USE_CUDA.
+void flushTrace(void);
+
 void finalize()
 {
+#ifdef APEX_USE_CUDA
+    // Flush outstanding CUPTI activity buffers before anything else.
+    flushTrace();
+#endif
 #ifdef APEX_WITH_JUPYTER_SUPPORT
     // reset all counters, and return.
     //reset(APEX_NULL_FUNCTION_ADDRESS);
diff --git a/src/apex/profiler.hpp b/src/apex/profiler.hpp
index 0f6ae25e..661798cb 100644
--- a/src/apex/profiler.hpp
+++ b/src/apex/profiler.hpp
@@ -87,6 +87,7 @@ class profiler {
     //bool have_name;
     uint64_t guid;
     bool is_counter;
+    bool is_device; // set for profilers built from device activity records
     bool is_resume; // for yield or resume
     reset_type is_reset;
     bool stopped;
@@ -111,6 +112,7 @@ class profiler {
         children_value(0.0),
         guid(0),
         is_counter(false),
+        is_device(false),
         is_resume(resume),
         is_reset(reset), stopped(false) { task->prof = this; };
     // this constructor is for resetting profile values
@@ -128,6 +130,7 @@ class profiler {
         children_value(0.0),
         guid(0),
         is_counter(false),
+        is_device(false),
         is_resume(resume),
         is_reset(reset), stopped(false) { };
     // this constructor is for counters
@@ -142,6 +145,7 @@ class profiler {
         value(value_),
         children_value(0.0),
         is_counter(true),
+        is_device(false), // plain counters are not device records
         is_resume(false),
         is_reset(reset_type::NONE), stopped(true) { };
     //copy constructor
@@ -153,6 +157,7 @@ class profiler {
         value(in.value),
         children_value(in.children_value),
         is_counter(in.is_counter),
+        is_device(in.is_device),
         is_resume(in.is_resume), // for yield or resume
         is_reset(in.is_reset),
         stopped(in.stopped)
@@ -182,6 +187,8 @@ class profiler {
     double elapsed(void) {
         if(is_counter) {
             return value;
+        } else if (is_device) { // duration is carried in `value`, not start/end
+            return value/1000.0;
         } else {
             std::chrono::duration<double> time_span =
                 std::chrono::duration_cast<std::chrono::duration<double>>(end -
@@ -258,7 +265,7 @@ class profiler {
         return duration;
     }
     double normalized_timestamp(void) {
-        if(is_counter) {
+        if(is_counter || is_device) {
             return value;
         } else {
             std::chrono::duration<double> time_span =
diff --git a/src/apex/profiler_listener.hpp b/src/apex/profiler_listener.hpp
index 9258a71d..218053b1 100644
--- a/src/apex/profiler_listener.hpp
+++ b/src/apex/profiler_listener.hpp
@@ -238,6 +238,11 @@ class profiler_listener : public event_listener {
 #if APEX_HAVE_PAPI
   std::vector<std::string>& get_metric_names(void) { return metric_names; };
 #endif
+  // Public wrapper so callers outside the listener can hand over a finished
+  // profiler; forwards to push_profiler() with 0 as the first argument.
+  void push_profiler_public(std::shared_ptr<profiler> &p) {
+    push_profiler(0, p);
+  }
 };
 
 }
diff --git a/src/unit_tests/CUDA/CMakeLists.txt b/src/unit_tests/CUDA/CMakeLists.txt
index 6823c46a..393539ed 100644
--- a/src/unit_tests/CUDA/CMakeLists.txt
+++ b/src/unit_tests/CUDA/CMakeLists.txt
@@ -7,13 +7,16 @@ link_directories (${APEX_BINARY_DIR}/src/apex)
 
 set(example_programs
     apex_cuda
+    apex_vector
    )
 
+message(STATUS "Using CUDA libraries: ${CUDA_LIBRARIES}") # NOTE(review): CUDA_LIBRARIES is a legacy FindCUDA variable; confirm it is set here
+
 foreach(example_program ${example_programs})
   set(sources ${example_program}.cu)
   source_group("Source Files" FILES ${sources})
   add_executable("${example_program}_cu" ${sources})
-  target_link_libraries ("${example_program}_cu" apex ${LIBS})
+  target_link_libraries ("${example_program}_cu" apex ${LIBS} cuda)
   if (BUILD_STATIC_EXECUTABLES)
     set_target_properties("${example_program}_cu" PROPERTIES LINK_SEARCH_START_STATIC 1 LINK_SEARCH_END_STATIC 1)
   endif()