diff --git a/src/apex/apex_types.h b/src/apex/apex_types.h index 4866c45e..525e29c2 100644 --- a/src/apex/apex_types.h +++ b/src/apex/apex_types.h @@ -133,11 +133,11 @@ typedef enum {APEX_SIMPLE_HYSTERESIS, /*!< optimize using sliding window of /** * Typedef for enumerating the different asynchronous activity types */ -typedef enum {APEX_CUDA_KERNEL, /* CUDA Kernel */ - APEX_CUDA_MEMORY, /* CUDA memory copy */ - APEX_CUDA_SYNCHRONIZE, /* CUDA Synchronization events */ - APEX_CUDA_OTHER /* CUDA OpenACC "other" events */ -} apex_cuda_async_activity_t; +typedef enum {APEX_ASYNC_KERNEL, /* Kernel */ + APEX_ASYNC_MEMORY, /* memory copy */ + APEX_ASYNC_SYNCHRONIZE, /* Synchronization events */ + APEX_ASYNC_OTHER /* OpenACC or "other" events */ +} apex_async_activity_t; /** * Structure that holds a profiler ID @@ -319,6 +319,13 @@ inline unsigned int sc_nprocessors_onln() macro (APEX_CUDA_SYNC_ACTIVITY, use_cuda_sync_activity, bool, true) \ macro (APEX_CUDA_MEMORY_ACTIVITY, use_cuda_memory_activity, bool, true) \ macro (APEX_CUDA_KERNEL_ACTIVITY, use_cuda_kernel_activity, bool, true) \ + macro (APEX_HIP_COUNTERS, use_hip_counters, int, false) \ + macro (APEX_HIP_KERNEL_DETAILS, use_hip_kernel_details, int, false) \ + macro (APEX_HIP_RUNTIME_API, use_hip_runtime_api, bool, true) \ + macro (APEX_HIP_KFD_API, use_hip_kfd_api, bool, false) \ + macro (APEX_HIP_SYNC_ACTIVITY, use_hip_sync_activity, bool, true) \ + macro (APEX_HIP_MEMORY_ACTIVITY, use_hip_memory_activity, bool, true) \ + macro (APEX_HIP_KERNEL_ACTIVITY, use_hip_kernel_activity, bool, true) \ macro (APEX_JUPYTER_SUPPORT, use_jupyter_support, int, false) \ macro (APEX_KOKKOS_VERBOSE, use_kokkos_verbose, bool, false) \ macro (APEX_KOKKOS_TUNING, use_kokkos_tuning, bool, true) \ diff --git a/src/apex/async_thread_node.hpp b/src/apex/async_thread_node.hpp new file mode 100644 index 00000000..fb3eff40 --- /dev/null +++ b/src/apex/async_thread_node.hpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2014-2021 Kevin Huck + * Copyright (c) 2014-2021 University of Oregon + * + * Distributed under the Boost Software License, Version 1.0. 
(See accompanying + * file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + */ + +#pragma once + +namespace apex { + + class cuda_thread_node { + public: + uint32_t _device; + uint32_t _context; + uint32_t _stream; + apex_async_activity_t _activity; + cuda_thread_node(uint32_t device, uint32_t context, uint32_t stream, + apex_async_activity_t activity) : + _device(device), _context(context), _stream(stream), + _activity(activity) { } + bool operator==(const cuda_thread_node &rhs) const { + return (_device == rhs._device && + _context == rhs._context && + _stream == rhs._stream && + _activity == rhs._activity); + } + bool operator<(const cuda_thread_node &rhs) const { + if (_device #include @@ -570,7 +570,7 @@ static void memcpyActivity2(CUpti_Activity *record) { << memcpy->deviceId << "->" << memcpy->dstDeviceId; std::string name{ss.str()}; apex::cuda_thread_node node(memcpy->deviceId, memcpy->contextId, - memcpy->streamId, APEX_CUDA_MEMORY); + memcpy->streamId, APEX_ASYNC_MEMORY); store_profiler_data(name, memcpy->correlationId, memcpy->start, memcpy->end, node); if (apex::apex_options::use_cuda_counters()) { @@ -594,7 +594,7 @@ static void memcpyActivity(CUpti_Activity *record) { } std::string name{getMemcpyKindString(memcpy->copyKind)}; apex::cuda_thread_node node(memcpy->deviceId, memcpy->contextId, - memcpy->streamId, APEX_CUDA_MEMORY); + memcpy->streamId, APEX_ASYNC_MEMORY); store_profiler_data(name, memcpy->correlationId, memcpy->start, memcpy->end, node); if (apex::apex_options::use_cuda_counters()) { @@ -618,7 +618,7 @@ static void unifiedMemoryActivity(CUpti_Activity *record) { uint32_t device = getUvmCounterDevice( (CUpti_ActivityUnifiedMemoryCounterKind) memcpy->counterKind, memcpy->srcId, memcpy->dstId); - apex::cuda_thread_node node(device, 0, 0, APEX_CUDA_MEMORY); + apex::cuda_thread_node node(device, 0, 0, APEX_ASYNC_MEMORY); if (memcpy->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD || memcpy->counterKind == @@ -659,7 +659,7 @@ static void memsetActivity(CUpti_Activity *record) { CUpti_ActivityMemset *memset = (CUpti_ActivityMemset *) record; static std::string name{"Memset"}; apex::cuda_thread_node node(memset->deviceId, memset->contextId, - memset->streamId, APEX_CUDA_MEMORY); + memset->streamId, APEX_ASYNC_MEMORY); store_profiler_data(name, memset->correlationId, memset->start, memset->end, node); } @@ -670,7 +670,7 @@ static void kernelActivity(CUpti_Activity *record) { std::string tmp = std::string(kernel->name); //DEBUG_PRINT("Kernel CorrelationId: %u\n", kernel->correlationId); apex::cuda_thread_node node(kernel->deviceId, kernel->contextId, - kernel->streamId, APEX_CUDA_KERNEL); + kernel->streamId, APEX_ASYNC_KERNEL); store_profiler_data(tmp, kernel->correlationId, kernel->start, kernel->end, node); if (apex::apex_options::use_cuda_counters()) { @@ -719,7 +719,7 @@ static void openaccDataActivity(CUpti_Activity *record) { CUpti_ActivityOpenAccData *data = (CUpti_ActivityOpenAccData *) record; std::string label{openacc_event_names[data->eventKind]}; apex::cuda_thread_node node(data->cuDeviceId, data->cuContextId, - data->cuStreamId, APEX_CUDA_MEMORY); + data->cuStreamId, APEX_ASYNC_MEMORY); store_profiler_data(label, data->externalId, data->start, data->end, node); static std::string bytes{"Bytes Transferred"}; store_counter_data(label.c_str(), bytes, data->end, data->bytes, node); @@ -729,7 +729,7 @@ static void openaccKernelActivity(CUpti_Activity *record) { CUpti_ActivityOpenAccLaunch *data = 
(CUpti_ActivityOpenAccLaunch *) record; std::string label{openacc_event_names[data->eventKind]}; apex::cuda_thread_node node(data->cuDeviceId, data->cuContextId, - data->cuStreamId, APEX_CUDA_KERNEL); + data->cuStreamId, APEX_ASYNC_KERNEL); store_profiler_data(label, data->externalId, data->start, data->end, node); static std::string gangs{"Num Gangs"}; @@ -744,7 +744,7 @@ static void openaccOtherActivity(CUpti_Activity *record) { CUpti_ActivityOpenAccOther *data = (CUpti_ActivityOpenAccOther *) record; std::string label{openacc_event_names[data->eventKind]}; apex::cuda_thread_node node(data->cuDeviceId, data->cuContextId, - data->cuStreamId, APEX_CUDA_OTHER); + data->cuStreamId, APEX_ASYNC_OTHER); store_profiler_data(label, data->externalId, data->start, data->end, node); } @@ -771,7 +771,7 @@ static void syncActivity(CUpti_Activity *record) { data->type == CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE) { stream = data->streamId; } - apex::cuda_thread_node node(device, context, stream, APEX_CUDA_SYNCHRONIZE); + apex::cuda_thread_node node(device, context, stream, APEX_ASYNC_SYNCHRONIZE); /* Event Synchronize doesn't have a stream ID, and can come from any thread, * and can overlap. So if we are OTF2 tracing, ignore them. */ if (apex::apex_options::use_otf2() && diff --git a/src/apex/hip_trace.cpp b/src/apex/hip_trace.cpp index d58dba72..0c8cf770 100644 --- a/src/apex/hip_trace.cpp +++ b/src/apex/hip_trace.cpp @@ -37,15 +37,21 @@ using namespace std; #include #include "apex_api.hpp" +#include "apex.hpp" +#include "async_thread_node.hpp" +#include "trace_event_listener.hpp" +#ifdef APEX_HAVE_OTF2 +#include "otf2_listener.hpp" +#endif #include #include #include +#if 0 static thread_local const size_t msg_size = 512; static thread_local char* msg_buf = NULL; static thread_local char* message = NULL; -#if 0 void SPRINT(const char* fmt, ...) { if (msg_buf == NULL) { msg_buf = (char*) calloc(msg_size, 1); @@ -65,7 +71,7 @@ void SFLUSH() { fflush(stdout); } #else -void SPRINT(const char* fmt, ...) { } +void SPRINT(const char* fmt, ...) { APEX_UNUSED(fmt); } void SFLUSH() { } #endif @@ -84,24 +90,31 @@ static void __attribute__((constructor)) init_tracing(void); #include /* For SYS_xxx definitions */ // Macro to check ROC-tracer calls status -#define ROCTRACER_CALL(call) \ - do { \ - int err = call; \ - if (err != 0) { \ - fprintf(stderr, "%s\n", roctracer_error_string()); \ - abort(); \ - } \ +#define ROCTRACER_CALL(call) \ + do { \ + int err = call; \ + if (err != 0) { \ + fprintf(stderr, "%s\n", roctracer_error_string()); \ + abort(); \ + } \ } while (0) static inline uint32_t GetTid() { return syscall(__NR_gettid); } static inline uint32_t GetPid() { return syscall(__NR_getpid); } +// Timestamp at trace initialization time. Used to normalized other +// timestamps +static uint64_t startTimestampGPU{0}; +static uint64_t startTimestampCPU{0}; +static int64_t deltaTimestamp{0}; + /* This is like the CUDA NVTX API. User-added instrumentation for * ranges that can be pushed/popped on a stack (and common to a thread * of execution) or started/stopped (and can be started by one thread * and stopped by another). 
 */
 void handle_roctx(uint32_t cid, const void* callback_data, void* arg) {
+    APEX_UNUSED(arg);
     static thread_local std::stack<std::shared_ptr<apex::task_wrapper> > timer_stack;
     static std::map<uint64_t, std::shared_ptr<apex::task_wrapper> > timer_map;
     static std::mutex map_lock;
@@ -149,8 +162,15 @@ void handle_roctx(uint32_t cid, const void* callback_data, void* arg) {
     return;
 }
+/* The map that holds correlation IDs and matches them to GUIDs */
+std::unordered_map<uint64_t, std::shared_ptr<apex::task_wrapper>> correlation_map;
+std::unordered_map<uint64_t, const void*> correlation_kernel_map;
+std::mutex correlation_map_mutex;
+
 /* This is the "low level" API - lots of events if interested. */
 void handle_roc_kfd(uint32_t cid, const void* callback_data, void* arg) {
+    APEX_UNUSED(arg);
+    static APEX_NATIVE_TLS std::stack<std::shared_ptr<apex::task_wrapper> > timer_stack;
     const kfd_api_data_t* data = (const kfd_api_data_t*)(callback_data);
     SPRINT("<%s id(%u)\tcorrelation_id(%lu) %s pid(%d) tid(%d)>\n",
         roctracer_op_string(ACTIVITY_DOMAIN_KFD_API, cid, 0),
@@ -158,17 +178,29 @@
         data->correlation_id,
         (data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit",
         GetPid(), GetTid());
+    if (data->phase == ACTIVITY_API_PHASE_ENTER) {
+        auto timer = apex::new_task(
+            roctracer_op_string(ACTIVITY_DOMAIN_KFD_API, cid, 0));
+        apex::start(timer);
+        timer_stack.push(timer);
+        correlation_map_mutex.lock();
+        correlation_map[data->correlation_id] = timer;
+        correlation_map_mutex.unlock();
+    } else {
+        if (!timer_stack.empty()) {
+            auto timer = timer_stack.top();
+            apex::stop(timer);
+            timer_stack.pop();
+        }
+    }
     return;
 }
-/* The map that holds correlation IDs and matches them to GUIDs */
-std::unordered_map<uint64_t, std::shared_ptr<apex::task_wrapper>> correlation_map;
-std::mutex correlation_map_mutex;
-
 /* The HIP callback API. For these events, we have to check whether it's
  * the entry or exit event, and act accordingly.
 */
 void handle_hip(uint32_t cid, const void* callback_data, void* arg) {
+    APEX_UNUSED(arg);
     static APEX_NATIVE_TLS std::stack<std::shared_ptr<apex::task_wrapper> > timer_stack;
     const hip_api_data_t* data = (const hip_api_data_t*)(callback_data);
     SPRINT("<%s id(%u)\tcorrelation_id(%lu) %s pid(%d) tid(%d)> ",
         roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, cid, 0), cid,
         data->correlation_id,
         (data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit",
         GetPid(), GetTid());
@@ -202,10 +234,30 @@
         case HIP_API_ID_hipFree:
             SPRINT("ptr(%p)", data->args.hipFree.ptr);
             break;
+        case HIP_API_ID_hipLaunchKernel:
+            correlation_map_mutex.lock();
+            correlation_kernel_map[data->correlation_id] = (const void*)data->args.hipLaunchKernel.function_address;
+            correlation_map_mutex.unlock();
+            break;
         case HIP_API_ID_hipModuleLaunchKernel:
-            SPRINT("kernel(\"%s\") stream(%p)",
-                hipKernelNameRef(data->args.hipModuleLaunchKernel.f),
-                data->args.hipModuleLaunchKernel.stream);
+            correlation_map_mutex.lock();
+            correlation_kernel_map[data->correlation_id] = (const void*)data->args.hipModuleLaunchKernel.f;
+            correlation_map_mutex.unlock();
+            break;
+        case HIP_API_ID_hipHccModuleLaunchKernel:
+            correlation_map_mutex.lock();
+            correlation_kernel_map[data->correlation_id] = (const void*)data->args.hipHccModuleLaunchKernel.f;
+            correlation_map_mutex.unlock();
+            break;
+        case HIP_API_ID_hipExtModuleLaunchKernel:
+            correlation_map_mutex.lock();
+            correlation_kernel_map[data->correlation_id] = (const void*)data->args.hipExtModuleLaunchKernel.f;
+            correlation_map_mutex.unlock();
+            break;
+        case HIP_API_ID_hipExtLaunchKernel:
+            correlation_map_mutex.lock();
+            correlation_kernel_map[data->correlation_id] = (const void*)data->args.hipExtLaunchKernel.function_address;
+            correlation_map_mutex.unlock();
             break;
         default:
             break;
@@ -229,10 +281,27 @@
     SFLUSH();
 }
+bool run_once() {
+    // synchronize timestamps
+    // We'll take a CPU timestamp before and after taking a GPU timestamp, then
+    // take the average of those two, hoping that it's roughly at the same time
+    // as the GPU timestamp.
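+    // (The resulting deltaTimestamp is what store_profiler_data() later adds to
+    // the raw GPU begin/end timestamps, so that device activity lands on the
+    // same clock as the host-side timers.)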
+    startTimestampCPU = apex::profiler::now_ns();
+    roctracer_get_timestamp(&startTimestampGPU);
+    startTimestampCPU += apex::profiler::now_ns();
+    startTimestampCPU = startTimestampCPU / 2;
+
+    // assume CPU timestamp is greater than GPU
+    deltaTimestamp = (int64_t)(startTimestampCPU) - (int64_t)(startTimestampGPU);
+    return true;
+}
+
 // Runtime API callback function
 void api_callback(uint32_t domain, uint32_t cid,
     const void* callback_data, void* arg) {
     (void)arg;
+    static bool dummy = run_once();
+    APEX_UNUSED(dummy);
     if (domain == ACTIVITY_DOMAIN_ROCTX) {
         handle_roctx(cid, callback_data, arg);
@@ -246,9 +315,95 @@
         handle_hip(cid, callback_data, arg);
     }
 }
+void store_profiler_data(const std::string &name, uint32_t correlationId,
+    uint64_t start, uint64_t end, apex::hip_thread_node &node,
+    bool otf2_trace = true) {
+    apex::in_apex prevent_deadlocks;
+    // Get the singleton APEX instance
+    static apex::apex* instance = apex::apex::instance();
+    // get the parent GUID, then erase the correlation from the map
+    std::shared_ptr<apex::task_wrapper> parent = nullptr;
+    if (correlationId > 0) {
+        correlation_map_mutex.lock();
+        parent = correlation_map[correlationId];
+        correlation_map.erase(correlationId);
+        correlation_map_mutex.unlock();
+    }
+    // Build the name
+    std::stringstream ss;
+    ss << "GPU: " << std::string(name);
+    std::string tmp{ss.str()};
+    // create a task_wrapper, as a GPU child of the parent on the CPU side
+    auto tt = apex::new_task(tmp, UINT64_MAX, parent);
+    // create an APEX profiler to store this data - we can't start
+    // then stop because we have timestamps already.
+    auto prof = std::make_shared<apex::profiler>(tt);
+    prof->set_start(start + deltaTimestamp);
+    prof->set_end(end + deltaTimestamp);
+    // important! Otherwise we might get the wrong end timestamp.
+ prof->stopped = true; + // fake out the profiler_listener + instance->the_profiler_listener->push_profiler_public(prof); + // Handle tracing, if necessary + if (apex::apex_options::use_trace_event()) { + apex::trace_event_listener * tel = + (apex::trace_event_listener*)instance->the_trace_event_listener; + tel->on_async_event(node, prof); + } +#ifdef APEX_HAVE_OTF2 + if (apex::apex_options::use_otf2() && otf2_trace) { + apex::otf2_listener * tol = + (apex::otf2_listener*)instance->the_otf2_listener; + tol->on_async_event(node, prof); + } +#else + APEX_UNUSED(otf2_trace); +#endif + // have the listeners handle the end of this task + instance->complete_task(tt); +} + +void handle_hip_activity(const roctracer_record_t* record) { + const char * name = roctracer_op_string(record->domain, record->op, record->kind); + switch(record->op) { + case HIP_OP_ID_DISPATCH: { + correlation_map_mutex.lock(); + const void* f = correlation_kernel_map[record->correlation_id]; + correlation_kernel_map.erase(record->correlation_id); + correlation_map_mutex.unlock(); + apex::hip_thread_node node(record->device_id, record->queue_id, APEX_ASYNC_KERNEL); + std::stringstream ss; + ss << "UNRESOLVED ADDR " << std::hex << f ; + store_profiler_data(ss.str(), record->correlation_id, record->begin_ns, + record->end_ns, node); + break; + } + case HIP_OP_ID_COPY: { + apex::hip_thread_node node(record->device_id, record->queue_id, APEX_ASYNC_MEMORY); + store_profiler_data(name, record->correlation_id, record->begin_ns, + record->end_ns, node); + break; + } + case HIP_OP_ID_BARRIER: { + apex::hip_thread_node node(record->device_id, record->queue_id, APEX_ASYNC_SYNCHRONIZE); + store_profiler_data(name, record->correlation_id, record->begin_ns, + record->end_ns, node); + break; + } + case HIP_OP_ID_NUMBER: + default: { + apex::hip_thread_node node(record->device_id, record->queue_id, APEX_ASYNC_OTHER); + store_profiler_data(name, record->correlation_id, record->begin_ns, + record->end_ns, node); + break; + } + } +} + // Activity tracing callback // hipMalloc id(3) correlation_id(1): begin_ns(1525888652762640464) end_ns(1525888652762877067) void activity_callback(const char* begin, const char* end, void* arg) { + APEX_UNUSED(arg); const roctracer_record_t* record = (const roctracer_record_t*)(begin); const roctracer_record_t* end_record = (const roctracer_record_t*)(end); @@ -260,21 +415,27 @@ void activity_callback(const char* begin, const char* end, void* arg) { record->correlation_id, record->begin_ns, record->end_ns); + /* Not interested in these. This is the same as the callback call + * and we don't need to record it twice. */ + /* if ((record->domain == ACTIVITY_DOMAIN_HIP_API) || (record->domain == ACTIVITY_DOMAIN_KFD_API)) { SPRINT(" process_id(%u) thread_id(%u)", record->process_id, record->thread_id); - } else if (record->domain == ACTIVITY_DOMAIN_HCC_OPS) { + } else */ if (record->domain == ACTIVITY_DOMAIN_HIP_OPS) { + // FYI, ACTIVITY_DOMAIN_HIP_OPS = ACTIVITY_DOMAIN_HCC_OPS = ACTIVITY_DOMAIN_HIP_VDI... 
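+        // HIP_OPS carries the asynchronous device records (kernel dispatches,
+        // memory copies, barriers); the handle_hip_activity() call below turns
+        // each record into an APEX GPU timer via store_profiler_data().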
SPRINT(" device_id(%d) queue_id(%lu)", record->device_id, record->queue_id); + handle_hip_activity(record); if (record->op == HIP_OP_ID_COPY) SPRINT(" bytes(0x%zx)", record->bytes); - } else if (record->domain == ACTIVITY_DOMAIN_HSA_OPS) { + } /* We have no interest in the samples, either - for now */ + /* else if (record->domain == ACTIVITY_DOMAIN_HSA_OPS) { SPRINT(" se(%u) cycle(%lu) pc(%lx)", record->pc_sample.se, record->pc_sample.cycle, record->pc_sample.pc); - } else if (record->domain == ACTIVITY_DOMAIN_EXT_API) { + } */ else if (record->domain == ACTIVITY_DOMAIN_EXT_API) { SPRINT(" external_id(%lu)", record->external_id); } else { fprintf(stderr, "Bad domain %d\n\n", record->domain); @@ -298,20 +459,24 @@ void init_tracing() { properties.buffer_size = 0x1000; properties.buffer_callback_fun = activity_callback; ROCTRACER_CALL(roctracer_open_pool(&properties)); + // Enable HIP API callbacks ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, api_callback, NULL)); - // Enable HIP activity tracing - //#if HIP_API_ACTIVITY_ON - ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); - //#endif - ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS)); - // Enable PC sampling - ROCTRACER_CALL(roctracer_enable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_RESERVED1)); // Enable KFD API tracing - //ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_KFD_API, api_callback, NULL)); + if (apex::apex_options::use_hip_kfd_api()) { + ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_KFD_API, api_callback, NULL)); + } // Enable rocTX ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, api_callback, NULL)); + + // Enable HIP activity tracing + //ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); + // FYI, ACTIVITY_DOMAIN_HIP_OPS = ACTIVITY_DOMAIN_HCC_OPS = ACTIVITY_DOMAIN_HIP_VDI... + ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_OPS)); + // Enable PC sampling + //ROCTRACER_CALL(roctracer_enable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_RESERVED1)); roctracer_start(); + } namespace apex { @@ -322,14 +487,19 @@ namespace apex { void stop_hip_trace() { roctracer_stop(); + /* CAllbacks */ ROCTRACER_CALL(roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API)); - //#if HIP_API_ACTIVITY_ON - ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); - //#endif - ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS)); - ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HSA_OPS)); - ROCTRACER_CALL(roctracer_disable_domain_callback(ACTIVITY_DOMAIN_KFD_API)); + if (apex_options::use_hip_kfd_api()) { + ROCTRACER_CALL(roctracer_disable_domain_callback(ACTIVITY_DOMAIN_KFD_API)); + } ROCTRACER_CALL(roctracer_disable_domain_callback(ACTIVITY_DOMAIN_ROCTX)); + + /* Activity */ + //ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); + // FYI, ACTIVITY_DOMAIN_HIP_OPS = ACTIVITY_DOMAIN_HCC_OPS = ACTIVITY_DOMAIN_HIP_VDI... 
+    ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_OPS));
+    ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_EXT_API));
+    //ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HSA_OPS));
     ROCTRACER_CALL(roctracer_flush_activity());
     printf("# STOP #############################\n");
 }
diff --git a/src/apex/otf2_listener.cpp b/src/apex/otf2_listener.cpp
index 58e7234c..ae90aefa 100644
--- a/src/apex/otf2_listener.cpp
+++ b/src/apex/otf2_listener.cpp
@@ -49,16 +49,25 @@ using namespace std;
 namespace apex {
-    uint32_t otf2_listener::make_vtid (cuda_thread_node &node) {
+    uint32_t otf2_listener::make_vtid (async_thread_node &node) {
         size_t tid;
         /* There is a potential for overlap here, but not a high potential. The CPU and the GPU
          * would BOTH have to spawn 64k+ threads/streams for this to happen. */
         if (vthread_map.count(node) == 0) {
             // build the thread name for viewers
             std::stringstream ss;
+#ifdef APEX_WITH_CUPTI
             ss << "CUDA[" << node._device;
             ss << ":" << node._context;
             ss << ":" << node._stream;
+#endif
+#ifdef APEX_WITH_HIP
+            ss << "HIP[" << node._device;
+            ss << ":" << node._queue;
+#endif
+#if !defined(APEX_WITH_CUPTI) && !defined(APEX_WITH_HIP)
+            ss << "GPU[" << node._device;
+#endif
             ss << "] " << activity_to_string(node._activity);
             std::string name{ss.str()};
             // lock the archive lock, we need to make an event writer
@@ -77,7 +86,7 @@ namespace apex {
             // done with the set of event threads, so unlock.
             _event_set_mutex.unlock();
             // use the OTF2 thread index (not reversed) for the vthread_map
-            vthread_map.insert(std::pair<cuda_thread_node,size_t>(node,id));
+            vthread_map.insert(std::pair<async_thread_node,size_t>(node,id));
             // construct a globally unique ID for this thread on this rank
             uint64_t my_node_id = my_saved_node_id;
             my_node_id = (my_node_id << 32) + id;
@@ -2601,7 +2610,7 @@
 #endif
 
-    void otf2_listener::on_async_event(cuda_thread_node &node,
+    void otf2_listener::on_async_event(async_thread_node &node,
         std::shared_ptr<profiler> &p) {
         // This could be a callback from a library before APEX is ready
         // Something like OpenMP or CUDA/CUPTI or...?
@@ -2662,7 +2671,7 @@
     }
 
-    void otf2_listener::on_async_metric(cuda_thread_node &node,
+    void otf2_listener::on_async_metric(async_thread_node &node,
         std::shared_ptr<profiler> &p) {
         // This could be a callback from a library before APEX is ready
         // Something like OpenMP or CUDA/CUPTI or...?
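Note on the node types: the hip_thread_node used by hip_trace.cpp and by the listener changes below comes from the new async_thread_node.hpp, alongside the cuda_thread_node shown near the top of that file. Its full definition is not reproduced here; the following is only a minimal sketch consistent with how it is used in this change (constructed from a device ID, a queue ID, and an apex_async_activity_t, exposing _device, _queue, and _activity, and comparable so it can key the listeners' std::map containers). The member widths and the body of operator< are assumptions, not taken from the header.

    // Sketch only: field widths and operator< body are assumed.
    namespace apex {
    class hip_thread_node {
    public:
        uint32_t _device;
        uint32_t _queue;
        apex_async_activity_t _activity;
        hip_thread_node(uint32_t device, uint32_t queue,
            apex_async_activity_t activity) :
            _device(device), _queue(queue), _activity(activity) { }
        bool operator==(const hip_thread_node &rhs) const {
            return (_device == rhs._device &&
                    _queue == rhs._queue &&
                    _activity == rhs._activity);
        }
        bool operator<(const hip_thread_node &rhs) const {
            // strict weak ordering so the node can key a std::map (vthread_map)
            if (_device != rhs._device) return _device < rhs._device;
            if (_queue != rhs._queue) return _queue < rhs._queue;
            return _activity < rhs._activity;
        }
    };
    }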
diff --git a/src/apex/otf2_listener.hpp b/src/apex/otf2_listener.hpp index a22cb315..ce91ffcf 100644 --- a/src/apex/otf2_listener.hpp +++ b/src/apex/otf2_listener.hpp @@ -20,7 +20,7 @@ #include #include "apex_cxx_shared_lock.hpp" #include "profiler.hpp" -#include "cuda_thread_node.hpp" +#include "async_thread_node.hpp" namespace apex { @@ -193,9 +193,9 @@ namespace apex { uint64_t stamp, bool is_enter); #endif std::mutex _vthread_mutex; - std::map vthread_map; + std::map vthread_map; std::map vthread_evt_writer_map; - uint32_t make_vtid (cuda_thread_node &node); + uint32_t make_vtid (async_thread_node &node); std::map last_ts; uint64_t dropped; public: @@ -238,9 +238,9 @@ namespace apex { { APEX_UNUSED(data); }; void on_send(message_event_data &data); void on_recv(message_event_data &data); - void on_async_event(cuda_thread_node &node, + void on_async_event(async_thread_node &node, std::shared_ptr &p); - void on_async_metric(cuda_thread_node &node, + void on_async_metric(async_thread_node &node, std::shared_ptr &p); }; diff --git a/src/apex/trace_event_listener.cpp b/src/apex/trace_event_listener.cpp index 907a35c0..82b6e235 100644 --- a/src/apex/trace_event_listener.cpp +++ b/src/apex/trace_event_listener.cpp @@ -190,7 +190,7 @@ void trace_event_listener::set_metadata(const char * name, const char * value) { APEX_UNUSED(value); } -std::string trace_event_listener::make_tid (cuda_thread_node &node) { +std::string trace_event_listener::make_tid (async_thread_node &node) { size_t tid; /* There is a potential for overlap here, but not a high potential. The CPU and the GPU * would BOTH have to spawn 64k+ threads/streams for this to happen. */ @@ -198,14 +198,23 @@ std::string trace_event_listener::make_tid (cuda_thread_node &node) { size_t id = vthread_map.size()+1; //uint32_t id_reversed = simple_reverse(id); uint32_t id_shifted = id << 16; - vthread_map.insert(std::pair(node,id_shifted)); + vthread_map.insert(std::pair(node,id_shifted)); std::stringstream ss; ss << "{\"name\":\"thread_name\"" << ",\"ph\":\"M\",\"pid\":" << saved_node_id << ",\"tid\":" << id_shifted - << ",\"args\":{\"name\":" - << "\"CUDA [" << node._device << ":" << node._context + << ",\"args\":{\"name\":"; +#ifdef APEX_WITH_CUDA + ss << "\"CUDA [" << node._device << ":" << node._context << ":" << std::setfill('0') << setw(5) << node._stream << "]"; +#endif +#ifdef APEX_WITH_HIP + ss << "\"HIP [" << node._device + << ":" << std::setfill('0') << setw(5) << node._queue << "]"; +#endif +#if !defined(APEX_WITH_CUDA) && !defined(APEX_WITH_HIP) + ss << "\"GPU [" << node._device << "]"; +#endif //ss << "" << activity_to_string(node._activity); ss << "\""; ss << "}},\n"; @@ -224,7 +233,7 @@ std::string trace_event_listener::make_tid (cuda_thread_node &node) { return label; } -void trace_event_listener::on_async_event(cuda_thread_node &node, +void trace_event_listener::on_async_event(async_thread_node &node, std::shared_ptr &p) { if (!_terminate) { std::stringstream ss; @@ -244,7 +253,7 @@ void trace_event_listener::on_async_event(cuda_thread_node &node, } } -void trace_event_listener::on_async_metric(cuda_thread_node &node, +void trace_event_listener::on_async_metric(async_thread_node &node, std::shared_ptr &p) { if (!_terminate) { std::stringstream ss; diff --git a/src/apex/trace_event_listener.hpp b/src/apex/trace_event_listener.hpp index 93861333..32e545f1 100644 --- a/src/apex/trace_event_listener.hpp +++ b/src/apex/trace_event_listener.hpp @@ -9,7 +9,7 @@ #pragma once #include "event_listener.hpp" -#include 
"cuda_thread_node.hpp" +#include "async_thread_node.hpp" #include #include #include @@ -49,8 +49,8 @@ class trace_event_listener : public event_listener { void on_recv(message_event_data &data) { APEX_UNUSED(data); }; void set_node_id(int node_id, int node_count); void set_metadata(const char * name, const char * value); - void on_async_event(cuda_thread_node &node, std::shared_ptr &p); - void on_async_metric(cuda_thread_node &node, std::shared_ptr &p); + void on_async_event(async_thread_node &node, std::shared_ptr &p); + void on_async_metric(async_thread_node &node, std::shared_ptr &p); void end_trace_time(void); private: @@ -60,7 +60,7 @@ class trace_event_listener : public event_listener { void close_trace(void); void flush_trace_if_necessary(void); void _common_stop(std::shared_ptr &p); - std::string make_tid (cuda_thread_node &node); + std::string make_tid (async_thread_node &node); int get_thread_id_metadata(); static bool _initialized; size_t get_thread_index(void); @@ -74,7 +74,7 @@ class trace_event_listener : public event_listener { std::map mutexes; std::map streams; std::mutex _vthread_mutex; - std::map vthread_map; + std::map vthread_map; double _end_time; }; diff --git a/src/apex/utils.cpp b/src/apex/utils.cpp index 66fbb556..564ce1fc 100644 --- a/src/apex/utils.cpp +++ b/src/apex/utils.cpp @@ -530,20 +530,20 @@ uint64_t test_for_MPI_comm_size(uint64_t commsize) { return commsize; } -std::string activity_to_string(apex_cuda_async_activity_t activity) { +std::string activity_to_string(apex_async_activity_t activity) { static std::string kernel{"Compute"}; static std::string memory{"Memory"}; static std::string sync{"Sync"}; static std::string other{"Other"}; static std::string empty{""}; switch (activity) { - case APEX_CUDA_KERNEL: + case APEX_ASYNC_KERNEL: return kernel; - case APEX_CUDA_MEMORY: + case APEX_ASYNC_MEMORY: return memory; - case APEX_CUDA_SYNCHRONIZE: + case APEX_ASYNC_SYNCHRONIZE: return sync; - case APEX_CUDA_OTHER: + case APEX_ASYNC_OTHER: return other; default: return empty; diff --git a/src/apex/utils.hpp b/src/apex/utils.hpp index 84c3ab62..8f33e30c 100644 --- a/src/apex/utils.hpp +++ b/src/apex/utils.hpp @@ -257,7 +257,7 @@ inline char filesystem_separator() uint64_t test_for_MPI_comm_rank(uint64_t commrank); uint64_t test_for_MPI_comm_size(uint64_t commsize); -std::string activity_to_string(apex_cuda_async_activity_t activity); +std::string activity_to_string(apex_async_activity_t activity); class node_color { public: diff --git a/src/unit_tests/HIP/MatrixTranspose.cpp b/src/unit_tests/HIP/MatrixTranspose.cpp index b5980a74..3224266e 100644 --- a/src/unit_tests/HIP/MatrixTranspose.cpp +++ b/src/unit_tests/HIP/MatrixTranspose.cpp @@ -104,6 +104,7 @@ int main() { // Memory transfer from host to device RUNTIME_API_CALL(hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice)); + RUNTIME_API_CALL(hipDeviceSynchronize()); roctracer_mark("before HIP LaunchKernel"); roctxMark("before hipLaunchKernel"); @@ -115,11 +116,13 @@ int main() { gpuMatrix, WIDTH); roctracer_mark("after HIP LaunchKernel"); roctxMark("after hipLaunchKernel"); + RUNTIME_API_CALL(hipDeviceSynchronize()); // Memory transfer from device to host roctxRangePush("hipMemcpy"); RUNTIME_API_CALL(hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost)); + RUNTIME_API_CALL(hipDeviceSynchronize()); roctxRangePop(); // for "hipMemcpy" roctxRangePop(); // for "hipLaunchKernel" @@ -147,6 +150,7 @@ int main() { // free the resources on device side 
     RUNTIME_API_CALL(hipFree(gpuMatrix));
     RUNTIME_API_CALL(hipFree(gpuTransposeMatrix));
+    RUNTIME_API_CALL(hipDeviceSynchronize());
 
     // free the resources on host side
     free(Matrix);
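A closing note on the timestamp handling introduced in hip_trace.cpp: run_once() brackets a single roctracer_get_timestamp() call with two CPU clock reads, averages the two CPU readings, and keeps the signed difference; store_profiler_data() then adds that delta to every GPU begin/end timestamp before handing the profiler to the listeners. A small self-contained illustration of that arithmetic (all values are invented for the example, not measurements):

    #include <cassert>
    #include <cstdint>

    int main() {
        // Pretend the CPU clock runs 1,000,000 ns ahead of the GPU clock.
        uint64_t cpu_before = 5000000;  // CPU read taken before the GPU timestamp
        uint64_t gpu_now    = 4000100;  // value returned by the GPU clock
        uint64_t cpu_after  = 5000200;  // CPU read taken after the GPU timestamp

        // Same averaging as run_once(): estimate the CPU time of the GPU read.
        uint64_t cpu_mid = (cpu_before + cpu_after) / 2;        // 5000100
        int64_t  delta   = (int64_t)cpu_mid - (int64_t)gpu_now; // 1000000

        // Same shift as store_profiler_data(): a GPU activity that ran from
        // 4000500 to 4001500 on the GPU clock lands here on the CPU clock.
        assert(4000500 + delta == 5000500);
        assert(4001500 + delta == 5001500);
        return 0;
    }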