Skip to content

Commit

Permalink
[NPKIT] Adding the NPKIT support for kernel allreduce7 in mscclpp-nccl (
Browse files Browse the repository at this point in the history
  • Loading branch information
PedramAlizadeh authored Jan 3, 2025
1 parent ba0d0d6 commit 97eaca2
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 7 deletions.
5 changes: 3 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ option(MSCCLPP_BUILD_APPS_NCCL "Build NCCL interfaces" ON)
option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF)
option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(MSCCLPP_NPKIT_FLAGS "Enable NPKIT" OFF)

if(MSCCLPP_BYPASS_GPU_CHECK)
if(MSCCLPP_USE_CUDA)
Expand Down Expand Up @@ -122,8 +123,8 @@ endif()
if(MSCCLPP_ENABLE_TRACE)
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_ENABLE_TRACE)
endif()
if(NPKIT_FLAGS)
target_compile_definitions(mscclpp_obj PRIVATE ${NPKIT_FLAGS})
if(MSCCLPP_NPKIT_FLAGS)
target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
endif()

# libmscclpp
Expand Down
4 changes: 3 additions & 1 deletion apps/nccl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ if(MSCCLPP_USE_CUDA)
elseif(MSCCLPP_USE_ROCM)
target_compile_definitions(mscclpp_nccl_obj PRIVATE MSCCLPP_USE_ROCM)
endif()

if(MSCCLPP_NPKIT_FLAGS)
target_compile_definitions(mscclpp_nccl_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
endif()
add_library(mscclpp_nccl SHARED)
target_link_libraries(mscclpp_nccl PUBLIC mscclpp_obj mscclpp_nccl_obj)
set_target_properties(mscclpp_nccl PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
Expand Down
53 changes: 52 additions & 1 deletion apps/nccl/src/allreduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
#include <mscclpp/sm_channel.hpp>
#include <mscclpp/sm_channel_device.hpp>

#if defined(ENABLE_NPKIT)
#include <mscclpp/npkit/npkit.hpp>
#endif

#include "common.hpp"

template <typename To, typename From>
Expand Down Expand Up @@ -238,10 +242,40 @@ template <typename T>
__global__ void __launch_bounds__(1024, 1)
allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize,
size_t nelems, uint32_t flag) {
size_t nelems, uint32_t flag
#if defined(ENABLE_NPKIT)
,
NpKitEventCollectContext* npKitEventCollectContexts, uint64_t* cpuTimestamp) {
#else
) {
#endif
// This version of allreduce only works for single nodes
if (worldSize != nRanksPerNode) return;

#if defined(ENABLE_NPKIT)
extern __shared__ int4 NpkitSharedMem[];
NpKitEvent* event_buffer = (NpKitEvent*)((char*)NpkitSharedMem);
uint64_t event_buffer_head = 0;
#if defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT)
uint64_t npkit_timestamp_entry = 0;
if (threadIdx.x == 0) {
npkit_timestamp_entry = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
#if defined(MSCCLPP_DEVICE_HIP)
NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, NPKIT_LOAD_CPU_TIMESTAMP_PER_BLOCK(cpuTimestamp, blockIdx.x),
#else
NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
#endif
event_buffer, &event_buffer_head);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), event_buffer,
&event_buffer_head);
#endif

if (sizeof(T) == 2)
nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int);
else
Expand Down Expand Up @@ -312,6 +346,16 @@ __global__ void __launch_bounds__(1024, 1)
result[idx].x = data.x;
result[idx].y = data.y;
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && \
defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT)
NpKit::CollectGpuEventShm(NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY, 0, 0, npkit_timestamp_entry, event_buffer,
&event_buffer_head);
NpKit::CollectGpuEventShm(NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), event_buffer,
&event_buffer_head);
#endif
#if defined(ENABLE_NPKIT)
NpKit::StoreGpuEventShm(npKitEventCollectContexts, event_buffer, event_buffer_head);
#endif
}

template <typename T>
Expand Down Expand Up @@ -470,9 +514,16 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
nBlocks = 56;
nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
}
#if defined(ENABLE_NPKIT)
size_t NpkitSharedMemSize = NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent);
allreduce7<<<nBlocks, nThreadsPerBlock, NpkitSharedMemSize, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
channelScratchOffset, rank, nRanksPerNode, worldSize, nelems,
flag++, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
#else
allreduce7<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
channelScratchOffset, rank, nRanksPerNode, worldSize, nelems,
flag++);
#endif
} else {
int nBlocks = 35;
int nThreadsPerBlock = 512;
Expand Down
17 changes: 16 additions & 1 deletion apps/nccl/src/nccl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
#include <sstream>
#include <unordered_map>
#include <vector>

#if defined(ENABLE_NPKIT)
#include <mscclpp/npkit/npkit.hpp>
#endif
#include "allgather.hpp"
#include "allreduce.hpp"
#include "broadcast.hpp"
Expand Down Expand Up @@ -427,6 +429,12 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
}

*comm = commPtr;
#if defined(ENABLE_NPKIT)
const char* npkitDumpDir = getenv("NPKIT_DUMP_DIR");
if (npkitDumpDir != nullptr) {
NpKit::Init(rank);
}
#endif
return ncclSuccess;
}

Expand All @@ -446,6 +454,13 @@ NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) {
WARN("comm is nullptr");
return ncclInvalidArgument;
}
#if defined(ENABLE_NPKIT)
const char* npkitDumpDir = getenv("NPKIT_DUMP_DIR");
if (npkitDumpDir != nullptr) {
NpKit::Dump(npkitDumpDir);
NpKit::Shutdown();
}
#endif
delete comm;
return ncclSuccess;
}
Expand Down
7 changes: 5 additions & 2 deletions include/mscclpp/npkit/npkit_event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@
#define NPKIT_EVENT_EXECUTOR_INIT_ENTRY 0x19
#define NPKIT_EVENT_EXECUTOR_INIT_EXIT 0x1A

#define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x1B
#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x2E
#define NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY 0x1B
#define NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT 0x1C

#define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x1D
#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x30

#endif

0 comments on commit 97eaca2

Please sign in to comment.