From bc897f2080d7cf2957e66c53fa8c2376892bb14c Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 11 Nov 2024 09:11:18 +0000 Subject: [PATCH 1/4] update with syncer --- src/include/execution_common.hpp | 19 ++++++++++++++----- src/include/execution_kernel.hpp | 8 +++++++- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp index 99bf36a4f..fd576e8d2 100644 --- a/src/include/execution_common.hpp +++ b/src/include/execution_common.hpp @@ -68,11 +68,20 @@ struct Operation { uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION]; BufferType outputBufferType; }; - uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION]; - uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION]; - uint32_t srcOffset; - uint32_t dstOffset; - uint32_t size; + union { + // For Barrier operation + struct { + uint32_t deviceSyncerIndex; + uint32_t nThreadBlocks; + }; + struct { + uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION]; + uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION]; + uint32_t srcOffset; + uint32_t dstOffset; + uint32_t size; + }; + }; }; // total size = 1920 + 6400 + 4 + 4(padding) + 12(align) = 8336 bytes diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 0b64da197..b45ec0a1d 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -8,6 +8,7 @@ #if defined(ENABLE_NPKIT) #include #endif +#include #include #include #include @@ -143,6 +144,9 @@ MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__bfloat16>(uint32_t a, uint32_t b) { namespace mscclpp { +#define MAX_DEVICE_SYNCERS 16 +__device__ DeviceSyncer deviceSyncers[MAX_DEVICE_SYNCERS]; + #if defined(MSCCLPP_DEVICE_COMPILE) template @@ -465,7 +469,9 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu #endif if (op.type == OperationType::BARRIER) { - __syncthreads(); + int nThreadBlocks = op.nThreadBlocks; + int syncStateIndex = op.deviceSyncerIndex; + deviceSyncers[syncStateIndex].sync(nThreadBlocks); } else if (op.type == OperationType::SIGNAL) { handleSignal(smChannels, proxyChannels, op.outputChannelIndexes, op.nOutputs, op.channelType); } else if (op.type == OperationType::WAIT) { From 2fe679e11cde72d80a10b5e2ffd47ca97db8f74f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 24 Nov 2024 07:02:13 +0000 Subject: [PATCH 2/4] WIP --- src/executor/execution_plan.cc | 8 ++++++++ src/include/execution_common.hpp | 1 + src/include/execution_kernel.hpp | 4 +++- tools/npkit/npkit_trace_generator.py | 1 + 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 49ceddf0a..b8cd08cf7 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -17,6 +17,8 @@ std::vector filter(const std::vector& vec, Predicate pred) { auto getOpType = [](const std::string& str) { if (str == "nop") { + return mscclpp::OperationType::THREADBLOCK_BARRIER; + } else if (str == "barrier") { return mscclpp::OperationType::BARRIER; } else if (str == "put") { return mscclpp::OperationType::PUT; @@ -456,6 +458,12 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse operation.size = this->getNChunkSize(rank, this->inputSize, this->outputSize, (uint32_t)op["cnt"], chunkIndexes); } + if (op.contains("barrier_id")) { + operation.deviceSyncerIndex = op["barrier_id"]; + } + if (op.contains("nthread_blocks")) { + operation.nThreadBlocks = op["nthread_blocks"]; + } ops.push_back(operation); } this->operations[rank].push_back(ops); diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp index 5fb2dbf90..00073e185 100644 --- a/src/include/execution_common.hpp +++ b/src/include/execution_common.hpp @@ -30,6 +30,7 @@ enum class ChannelType : uint8_t { // NOTE(chhwang): any modification here requires corresponding updates in `tools/npkit/npkit_trace_generator.py`. enum class OperationType : uint8_t { + THREADBLOCK_BARRIER, BARRIER, PUT, PUT_PACKET, diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 6bf7462f4..193651c47 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -530,7 +530,9 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu event_buffer, &event_buffer_head); #endif - if (op.type == OperationType::BARRIER) { + if (op.type == OperationType::THREADBLOCK_BARRIER) { + __syncthreads(); + } else if (op.type == OperationType::BARRIER) { int nThreadBlocks = op.nThreadBlocks; int syncStateIndex = op.deviceSyncerIndex; deviceSyncers[syncStateIndex].sync(nThreadBlocks); diff --git a/tools/npkit/npkit_trace_generator.py b/tools/npkit/npkit_trace_generator.py index 9a5b88b44..5547bd771 100644 --- a/tools/npkit/npkit_trace_generator.py +++ b/tools/npkit/npkit_trace_generator.py @@ -11,6 +11,7 @@ def parse_npkit_event_header(npkit_event_header_path): npkit_event_def = {"id_to_type": {}, "type_to_id": {}} executor_ops = [ + "THREADBLOCK_BARRIER", "BARRIER", "PUT", "PUT_PACKET", From 32db36e3b77bb175f2ceeac843796af0690d0eb0 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 25 Nov 2024 01:19:18 +0000 Subject: [PATCH 3/4] change threadblock_barrier to nop --- src/executor/execution_plan.cc | 2 +- src/include/execution_common.hpp | 2 +- src/include/execution_kernel.hpp | 2 +- test/executor_test.cc | 11 ++++++----- tools/npkit/npkit_trace_generator.py | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index b8cd08cf7..20226b661 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -17,7 +17,7 @@ std::vector filter(const std::vector& vec, Predicate pred) { auto getOpType = [](const std::string& str) { if (str == "nop") { - return mscclpp::OperationType::THREADBLOCK_BARRIER; + return mscclpp::OperationType::NOP; } else if (str == "barrier") { return mscclpp::OperationType::BARRIER; } else if (str == "put") { diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp index 00073e185..d0d0dc30d 100644 --- a/src/include/execution_common.hpp +++ b/src/include/execution_common.hpp @@ -30,7 +30,7 @@ enum class ChannelType : uint8_t { // NOTE(chhwang): any modification here requires corresponding updates in `tools/npkit/npkit_trace_generator.py`. enum class OperationType : uint8_t { - THREADBLOCK_BARRIER, + NOP, BARRIER, PUT, PUT_PACKET, diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 193651c47..1b0490f91 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -530,7 +530,7 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu event_buffer, &event_buffer_head); #endif - if (op.type == OperationType::THREADBLOCK_BARRIER) { + if (op.type == OperationType::NOP) { __syncthreads(); } else if (op.type == OperationType::BARRIER) { int nThreadBlocks = op.nThreadBlocks; diff --git a/test/executor_test.cc b/test/executor_test.cc index 3fc0b1e21..e4ebcc972 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -131,11 +131,12 @@ int main(int argc, char* argv[]) { } mscclpp::ExecutionPlan plan(executionPlanName, executionPlanPath); -#if (CUDA_NVLS_SUPPORTED) - std::shared_ptr sendbuff = mscclpp::allocSharedPhysicalCuda(bufferSize); -#else - std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(bufferSize); -#endif + std::shared_ptr sendbuff; + if (mscclpp::isNvlsSupported()) { + sendbuff = mscclpp::allocSharedPhysicalCuda(bufferSize); + } else { + sendbuff = mscclpp::allocExtSharedCuda(bufferSize); + } std::vector dataHost(bufferSize / sizeof(int), rank); MSCCLPP_CUDATHROW(cudaMemcpy(sendbuff.get(), dataHost.data(), bufferSize, cudaMemcpyHostToDevice)); double deltaSec = benchTime(rank, bootstrap, executor, plan, sendbuff, bufferSize, niters, ngraphIters, packetType); diff --git a/tools/npkit/npkit_trace_generator.py b/tools/npkit/npkit_trace_generator.py index 5547bd771..31c2e1622 100644 --- a/tools/npkit/npkit_trace_generator.py +++ b/tools/npkit/npkit_trace_generator.py @@ -11,7 +11,7 @@ def parse_npkit_event_header(npkit_event_header_path): npkit_event_def = {"id_to_type": {}, "type_to_id": {}} executor_ops = [ - "THREADBLOCK_BARRIER", + "NOP", "BARRIER", "PUT", "PUT_PACKET", From bf45397a8e91ff5679cd2599e7d5ac93ccd293e1 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 26 Nov 2024 03:37:42 +0000 Subject: [PATCH 4/4] fix npkit ut --- include/mscclpp/npkit/npkit_event.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mscclpp/npkit/npkit_event.hpp b/include/mscclpp/npkit/npkit_event.hpp index 1a24b241f..cb1925626 100644 --- a/include/mscclpp/npkit/npkit_event.hpp +++ b/include/mscclpp/npkit/npkit_event.hpp @@ -13,6 +13,6 @@ #define NPKIT_EVENT_EXECUTOR_INIT_EXIT 0x4 #define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x5 -#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x17 +#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x18 #endif