Skip to content

Commit

Permalink
Working HIP with actions
Browse files Browse the repository at this point in the history
  • Loading branch information
khuck committed May 25, 2021
1 parent d3f9375 commit 35e3638
Show file tree
Hide file tree
Showing 12 changed files with 378 additions and 120 deletions.
17 changes: 12 additions & 5 deletions src/apex/apex_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,11 +133,11 @@ typedef enum {APEX_SIMPLE_HYSTERESIS, /*!< optimize using sliding window of
/**
* Typedef for enumerating the different asynchronous activity types
*/
typedef enum {APEX_CUDA_KERNEL, /* CUDA Kernel */
APEX_CUDA_MEMORY, /* CUDA memory copy */
APEX_CUDA_SYNCHRONIZE, /* CUDA Synchronization events */
APEX_CUDA_OTHER /* CUDA OpenACC "other" events */
} apex_cuda_async_activity_t;
typedef enum {APEX_ASYNC_KERNEL, /*!< Kernel execution activity */
APEX_ASYNC_MEMORY, /*!< Memory activity (memory copies, memsets, data transfers) */
APEX_ASYNC_SYNCHRONIZE, /*!< Synchronization events */
APEX_ASYNC_OTHER /*!< OpenACC or "other" events that fit none of the above */
} apex_async_activity_t;

/**
* Structure that holds a profiler ID
Expand Down Expand Up @@ -319,6 +319,13 @@ inline unsigned int sc_nprocessors_onln()
macro (APEX_CUDA_SYNC_ACTIVITY, use_cuda_sync_activity, bool, true) \
macro (APEX_CUDA_MEMORY_ACTIVITY, use_cuda_memory_activity, bool, true) \
macro (APEX_CUDA_KERNEL_ACTIVITY, use_cuda_kernel_activity, bool, true) \
macro (APEX_HIP_COUNTERS, use_hip_counters, int, false) \
macro (APEX_HIP_KERNEL_DETAILS, use_hip_kernel_details, int, false) \
macro (APEX_HIP_RUNTIME_API, use_hip_runtime_api, bool, true) \
macro (APEX_HIP_KFD_API, use_hip_kfd_api, bool, false) \
macro (APEX_HIP_SYNC_ACTIVITY, use_hip_sync_activity, bool, true) \
macro (APEX_HIP_MEMORY_ACTIVITY, use_hip_memory_activity, bool, true) \
macro (APEX_HIP_KERNEL_ACTIVITY, use_hip_kernel_activity, bool, true) \
macro (APEX_JUPYTER_SUPPORT, use_jupyter_support, int, false) \
macro (APEX_KOKKOS_VERBOSE, use_kokkos_verbose, bool, false) \
macro (APEX_KOKKOS_TUNING, use_kokkos_tuning, bool, true) \
Expand Down
105 changes: 105 additions & 0 deletions src/apex/async_thread_node.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
* Copyright (c) 2014-2021 Kevin Huck
* Copyright (c) 2014-2021 University of Oregon
*
* Distributed under the Boost Software License, Version 1.0. (See accompanying
* file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
*/

#pragma once

namespace apex {

/**
 * Identifies the source of an asynchronous CUDA activity record by its
 * device, context and stream identifiers plus the kind of activity.
 *
 * Equality always compares all four fields. Ordering is lexicographic over
 * (device, context, stream); the activity kind is used as the final
 * tie-breaker only when apex_options::use_otf2() returns true. When it
 * returns false, two nodes that differ only in activity are neither less
 * than nor greater than each other.
 */
class cuda_thread_node {
public:
    uint32_t _device;
    uint32_t _context;
    uint32_t _stream;
    apex_async_activity_t _activity;

    /**
     * @param device   device identifier
     * @param context  context identifier
     * @param stream   stream identifier
     * @param activity kind of asynchronous activity
     */
    cuda_thread_node(uint32_t device, uint32_t context, uint32_t stream,
        apex_async_activity_t activity)
        : _device(device)
        , _context(context)
        , _stream(stream)
        , _activity(activity)
    { }

    /** @return true when every field matches, activity included. */
    bool operator==(const cuda_thread_node &rhs) const {
        if (_device != rhs._device) { return false; }
        if (_context != rhs._context) { return false; }
        if (_stream != rhs._stream) { return false; }
        return _activity == rhs._activity;
    }

    /** @return true when this node sorts strictly before rhs. */
    bool operator<(const cuda_thread_node &rhs) const {
        // The first field that differs decides the ordering.
        if (_device != rhs._device) { return _device < rhs._device; }
        if (_context != rhs._context) { return _context < rhs._context; }
        if (_stream != rhs._stream) { return _stream < rhs._stream; }
        // Same device/context/stream: activity only separates nodes when
        // OTF2 output is enabled.
        return _activity < rhs._activity && apex_options::use_otf2();
    }
};

/**
 * Identifies the source of an asynchronous HIP activity record by its
 * device and command-queue identifiers plus the kind of activity.
 *
 * Equality always compares all three fields. Ordering is lexicographic over
 * (device, queue); the activity kind is used as the final tie-breaker only
 * when apex_options::use_otf2() returns true. When it returns false, two
 * nodes that differ only in activity are neither less than nor greater than
 * each other.
 */
class hip_thread_node {
public:
    uint32_t _device;
    uint32_t _queue;
    apex_async_activity_t _activity;

    /**
     * @param device        device identifier
     * @param command_queue command queue identifier (stored in _queue)
     * @param activity      kind of asynchronous activity
     */
    hip_thread_node(uint32_t device, uint32_t command_queue,
        apex_async_activity_t activity)
        : _device(device)
        , _queue(command_queue)
        , _activity(activity)
    { }

    /** @return true when every field matches, activity included. */
    bool operator==(const hip_thread_node &rhs) const {
        if (_device != rhs._device) { return false; }
        if (_queue != rhs._queue) { return false; }
        return _activity == rhs._activity;
    }

    /** @return true when this node sorts strictly before rhs. */
    bool operator<(const hip_thread_node &rhs) const {
        // The first field that differs decides the ordering.
        if (_device != rhs._device) { return _device < rhs._device; }
        if (_queue != rhs._queue) { return _queue < rhs._queue; }
        // Same device/queue: activity only separates nodes when OTF2
        // output is enabled.
        return _activity < rhs._activity && apex_options::use_otf2();
    }
};

/**
 * Minimal node type holding only a device identifier and the kind of
 * activity; selected as async_thread_node when neither APEX_WITH_CUDA nor
 * APEX_WITH_HIP is defined.
 *
 * Equality compares both fields. Ordering is by device; the activity kind
 * is used as a tie-breaker only when apex_options::use_otf2() returns true.
 */
class dummy_thread_node {
public:
    uint32_t _device;
    apex_async_activity_t _activity;

    /**
     * @param device   device identifier
     * @param activity kind of asynchronous activity
     */
    dummy_thread_node(uint32_t device, apex_async_activity_t activity)
        : _device(device)
        , _activity(activity)
    { }

    /** @return true when both device and activity match. */
    bool operator==(const dummy_thread_node &rhs) const {
        if (_device != rhs._device) { return false; }
        return _activity == rhs._activity;
    }

    /** @return true when this node sorts strictly before rhs. */
    bool operator<(const dummy_thread_node &rhs) const {
        if (_device != rhs._device) { return _device < rhs._device; }
        // Same device: activity only separates nodes when OTF2 output is
        // enabled.
        return _activity < rhs._activity && apex_options::use_otf2();
    }
};

}

/*
 * Select the concrete node type behind the backend-neutral name
 * async_thread_node:
 *   - APEX_WITH_CUDA -> apex::cuda_thread_node (device, context, stream)
 *   - APEX_WITH_HIP  -> apex::hip_thread_node  (device, queue)
 *   - neither        -> apex::dummy_thread_node (device only)
 *
 * A single #if/#elif/#else chain guarantees exactly one alias declaration.
 * If both backend macros are defined, CUDA takes precedence rather than
 * producing two conflicting alias declarations.
 */
#if defined(APEX_WITH_CUDA)
using async_thread_node = apex::cuda_thread_node;
#elif defined(APEX_WITH_HIP)
using async_thread_node = apex::hip_thread_node;
#else
using async_thread_node = apex::dummy_thread_node;
#endif
46 changes: 0 additions & 46 deletions src/apex/cuda_thread_node.hpp

This file was deleted.

20 changes: 10 additions & 10 deletions src/apex/cupti_trace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#ifdef APEX_HAVE_OTF2
#include "otf2_listener.hpp"
#endif
#include "cuda_thread_node.hpp"
#include "async_thread_node.hpp"

#include <cuda.h>
#include <cupti.h>
Expand Down Expand Up @@ -570,7 +570,7 @@ static void memcpyActivity2(CUpti_Activity *record) {
<< memcpy->deviceId << "->" << memcpy->dstDeviceId;
std::string name{ss.str()};
apex::cuda_thread_node node(memcpy->deviceId, memcpy->contextId,
memcpy->streamId, APEX_CUDA_MEMORY);
memcpy->streamId, APEX_ASYNC_MEMORY);
store_profiler_data(name, memcpy->correlationId, memcpy->start,
memcpy->end, node);
if (apex::apex_options::use_cuda_counters()) {
Expand All @@ -594,7 +594,7 @@ static void memcpyActivity(CUpti_Activity *record) {
}
std::string name{getMemcpyKindString(memcpy->copyKind)};
apex::cuda_thread_node node(memcpy->deviceId, memcpy->contextId,
memcpy->streamId, APEX_CUDA_MEMORY);
memcpy->streamId, APEX_ASYNC_MEMORY);
store_profiler_data(name, memcpy->correlationId, memcpy->start,
memcpy->end, node);
if (apex::apex_options::use_cuda_counters()) {
Expand All @@ -618,7 +618,7 @@ static void unifiedMemoryActivity(CUpti_Activity *record) {
uint32_t device = getUvmCounterDevice(
(CUpti_ActivityUnifiedMemoryCounterKind) memcpy->counterKind,
memcpy->srcId, memcpy->dstId);
apex::cuda_thread_node node(device, 0, 0, APEX_CUDA_MEMORY);
apex::cuda_thread_node node(device, 0, 0, APEX_ASYNC_MEMORY);
if (memcpy->counterKind ==
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD
|| memcpy->counterKind ==
Expand Down Expand Up @@ -659,7 +659,7 @@ static void memsetActivity(CUpti_Activity *record) {
CUpti_ActivityMemset *memset = (CUpti_ActivityMemset *) record;
static std::string name{"Memset"};
apex::cuda_thread_node node(memset->deviceId, memset->contextId,
memset->streamId, APEX_CUDA_MEMORY);
memset->streamId, APEX_ASYNC_MEMORY);
store_profiler_data(name, memset->correlationId, memset->start,
memset->end, node);
}
Expand All @@ -670,7 +670,7 @@ static void kernelActivity(CUpti_Activity *record) {
std::string tmp = std::string(kernel->name);
//DEBUG_PRINT("Kernel CorrelationId: %u\n", kernel->correlationId);
apex::cuda_thread_node node(kernel->deviceId, kernel->contextId,
kernel->streamId, APEX_CUDA_KERNEL);
kernel->streamId, APEX_ASYNC_KERNEL);
store_profiler_data(tmp, kernel->correlationId, kernel->start,
kernel->end, node);
if (apex::apex_options::use_cuda_counters()) {
Expand Down Expand Up @@ -719,7 +719,7 @@ static void openaccDataActivity(CUpti_Activity *record) {
CUpti_ActivityOpenAccData *data = (CUpti_ActivityOpenAccData *) record;
std::string label{openacc_event_names[data->eventKind]};
apex::cuda_thread_node node(data->cuDeviceId, data->cuContextId,
data->cuStreamId, APEX_CUDA_MEMORY);
data->cuStreamId, APEX_ASYNC_MEMORY);
store_profiler_data(label, data->externalId, data->start, data->end, node);
static std::string bytes{"Bytes Transferred"};
store_counter_data(label.c_str(), bytes, data->end, data->bytes, node);
Expand All @@ -729,7 +729,7 @@ static void openaccKernelActivity(CUpti_Activity *record) {
CUpti_ActivityOpenAccLaunch *data = (CUpti_ActivityOpenAccLaunch *) record;
std::string label{openacc_event_names[data->eventKind]};
apex::cuda_thread_node node(data->cuDeviceId, data->cuContextId,
data->cuStreamId, APEX_CUDA_KERNEL);
data->cuStreamId, APEX_ASYNC_KERNEL);
store_profiler_data(label, data->externalId, data->start,
data->end, node);
static std::string gangs{"Num Gangs"};
Expand All @@ -744,7 +744,7 @@ static void openaccOtherActivity(CUpti_Activity *record) {
CUpti_ActivityOpenAccOther *data = (CUpti_ActivityOpenAccOther *) record;
std::string label{openacc_event_names[data->eventKind]};
apex::cuda_thread_node node(data->cuDeviceId, data->cuContextId,
data->cuStreamId, APEX_CUDA_OTHER);
data->cuStreamId, APEX_ASYNC_OTHER);
store_profiler_data(label, data->externalId, data->start, data->end, node);
}

Expand All @@ -771,7 +771,7 @@ static void syncActivity(CUpti_Activity *record) {
data->type == CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE) {
stream = data->streamId;
}
apex::cuda_thread_node node(device, context, stream, APEX_CUDA_SYNCHRONIZE);
apex::cuda_thread_node node(device, context, stream, APEX_ASYNC_SYNCHRONIZE);
/* Event Synchronize doesn't have a stream ID, and can come from any thread,
* and can overlap. So if we are OTF2 tracing, ignore them. */
if (apex::apex_options::use_otf2() &&
Expand Down
Loading

0 comments on commit 35e3638

Please sign in to comment.