Add cross threadblock barrier #383

Merged · 9 commits · Nov 26, 2024
2 changes: 1 addition & 1 deletion include/mscclpp/npkit/npkit_event.hpp
@@ -13,6 +13,6 @@
#define NPKIT_EVENT_EXECUTOR_INIT_EXIT 0x4

#define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x5
-#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x17
+#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x18

#endif
8 changes: 8 additions & 0 deletions src/executor/execution_plan.cc
@@ -17,6 +17,8 @@ std::vector<T> filter(const std::vector<T>& vec, Predicate pred) {

auto getOpType = [](const std::string& str) {
if (str == "nop") {
+    return mscclpp::OperationType::NOP;
+  } else if (str == "barrier") {
return mscclpp::OperationType::BARRIER;
} else if (str == "put") {
return mscclpp::OperationType::PUT;
@@ -456,6 +458,12 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
operation.size =
this->getNChunkSize(rank, this->inputSize, this->outputSize, (uint32_t)op["cnt"], chunkIndexes);
}
+        if (op.contains("barrier_id")) {
+          operation.deviceSyncerIndex = op["barrier_id"];
+        }
+        if (op.contains("nthread_blocks")) {
+          operation.nThreadBlocks = op["nthread_blocks"];
+        }
ops.push_back(operation);
}
this->operations[rank].push_back(ops);
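For context, the `op.contains(...)` / `op["..."]` access pattern above matches nlohmann::json, which the standalone sketch below assumes. Only the `barrier_id` and `nthread_blocks` field names come from this diff; the surrounding keys (`name`, the literal values) are illustrative guesses about the plan schema, not taken from an actual plan file.

```cpp
#include <cstdint>
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
  // Hypothetical fragment of one operation in a plan; only "barrier_id" and
  // "nthread_blocks" are field names taken from this PR.
  json op = json::parse(R"({"name": "barrier", "barrier_id": 0, "nthread_blocks": 108})");

  uint32_t deviceSyncerIndex = 0;
  uint32_t nThreadBlocks = 0;
  if (op.contains("barrier_id")) deviceSyncerIndex = op["barrier_id"];
  if (op.contains("nthread_blocks")) nThreadBlocks = op["nthread_blocks"];

  std::cout << "syncer " << deviceSyncerIndex << ", blocks " << nThreadBlocks << "\n";
}
```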
20 changes: 15 additions & 5 deletions src/include/execution_common.hpp
@@ -30,6 +30,7 @@ enum class ChannelType : uint8_t {

// NOTE(chhwang): any modification here requires corresponding updates in `tools/npkit/npkit_trace_generator.py`.
enum class OperationType : uint8_t {
+  NOP,
BARRIER,
PUT,
PUT_PACKET,
@@ -78,11 +79,20 @@ struct Operation {
BufferType outputBufferType;
uint8_t nvlsOutputIndex;
};
-  uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION];
-  uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION];
-  uint32_t srcOffset;
-  uint32_t dstOffset;
-  uint32_t size;
+  union {
+    // For Barrier operation
+    struct {
+      uint32_t deviceSyncerIndex;
+      uint32_t nThreadBlocks;
+    };
+    struct {
+      uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION];
+      uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION];
+      uint32_t srcOffset;
+      uint32_t dstOffset;
+      uint32_t size;
+    };
+  };
};

// total size = 2304 + 6400 + 4 + 12(padding) = 8720 bytes
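Since a barrier only needs `deviceSyncerIndex` and `nThreadBlocks`, the new anonymous union overlays them on the storage that data-movement operations already use for offsets, so `sizeof(Operation)` should not grow. A minimal standalone sketch of that layout idea (the struct name and the `MAX_CHANNEL_PER_OPERATION` value are placeholders, not taken from the headers):

```cpp
#include <cstdint>
#include <cstdio>

constexpr int MAX_CHANNEL_PER_OPERATION = 8;  // placeholder value, for illustration only

struct OperationSketch {
  // Anonymous structs inside a union are a widely supported compiler
  // extension (the same pattern the real header uses).
  union {
    // Barrier operations use only these two fields...
    struct {
      uint32_t deviceSyncerIndex;
      uint32_t nThreadBlocks;
    };
    // ...which alias the first 8 bytes of the larger data-movement block,
    // so adding the barrier fields costs no extra space.
    struct {
      uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION];
      uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION];
      uint32_t srcOffset;
      uint32_t dstOffset;
      uint32_t size;
    };
  };
};

int main() {
  OperationSketch op{};
  op.deviceSyncerIndex = 0;  // aliases inputOffsets[0]
  op.nThreadBlocks = 108;    // aliases inputOffsets[1]
  std::printf("sizeof(OperationSketch) = %zu bytes\n", sizeof(OperationSketch));
}
```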
10 changes: 9 additions & 1 deletion src/include/execution_kernel.hpp
@@ -8,6 +8,7 @@
#if defined(ENABLE_NPKIT)
#include <mscclpp/npkit/npkit.hpp>
#endif
+#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/packet_device.hpp>
#include <mscclpp/proxy_channel.hpp>
#include <mscclpp/sm_channel.hpp>
@@ -172,6 +173,9 @@ struct VectorType<float> {

namespace mscclpp {

+#define MAX_DEVICE_SYNCERS 16
+__device__ DeviceSyncer deviceSyncers[MAX_DEVICE_SYNCERS];

#if defined(MSCCLPP_DEVICE_COMPILE)

template <typename T>
@@ -526,8 +530,12 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
event_buffer, &event_buffer_head);
#endif

-    if (op.type == OperationType::BARRIER) {
+    if (op.type == OperationType::NOP) {
       __syncthreads();
+    } else if (op.type == OperationType::BARRIER) {
+      int nThreadBlocks = op.nThreadBlocks;
+      int syncStateIndex = op.deviceSyncerIndex;
+      deviceSyncers[syncStateIndex].sync(nThreadBlocks);
} else if (op.type == OperationType::SIGNAL) {
handleSignal(smChannels, proxyChannels, op.outputChannelIndexes, op.nOutputs, op.channelType);
} else if (op.type == OperationType::WAIT) {
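At run time the new branch simply calls `sync(nThreadBlocks)` on one of the `MAX_DEVICE_SYNCERS` statically allocated `DeviceSyncer`s, indexed by the plan's `barrier_id`. A minimal standalone kernel using the same primitive could look like the sketch below; the kernel and variable names are illustrative and are not part of this PR, which uses the `deviceSyncers` array shown above instead.

```cuda
#include <cstdio>
#include <mscclpp/concurrency_device.hpp>

// One syncer is enough for a single barrier point; __device__ globals are
// zero-initialized, which is the state DeviceSyncer expects.
__device__ mscclpp::DeviceSyncer gSyncer;

__global__ void barrierDemoKernel(int* slots, int nThreadBlocks) {
  // Phase 1: each block writes its own slot.
  if (threadIdx.x == 0) slots[blockIdx.x] = blockIdx.x + 1;

  // Cross-threadblock barrier: no block proceeds until all nThreadBlocks
  // blocks have arrived. The kernel must be launched with exactly
  // nThreadBlocks blocks, all co-resident on the GPU.
  gSyncer.sync(nThreadBlocks);

  // Phase 2: block 0 can now safely read every other block's slot.
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    int sum = 0;
    for (int i = 0; i < nThreadBlocks; ++i) sum += slots[i];
    printf("sum over %d blocks = %d\n", nThreadBlocks, sum);
  }
}
```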
11 changes: 6 additions & 5 deletions test/executor_test.cc
@@ -131,11 +131,12 @@ int main(int argc, char* argv[]) {
}

mscclpp::ExecutionPlan plan(executionPlanName, executionPlanPath);
-#if (CUDA_NVLS_SUPPORTED)
-  std::shared_ptr<char> sendbuff = mscclpp::allocSharedPhysicalCuda<char>(bufferSize);
-#else
-  std::shared_ptr<char> sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
-#endif
+  std::shared_ptr<char> sendbuff;
+  if (mscclpp::isNvlsSupported()) {
+    sendbuff = mscclpp::allocSharedPhysicalCuda<char>(bufferSize);
+  } else {
+    sendbuff = mscclpp::allocExtSharedCuda<char>(bufferSize);
+  }
std::vector<int> dataHost(bufferSize / sizeof(int), rank);
MSCCLPP_CUDATHROW(cudaMemcpy(sendbuff.get(), dataHost.data(), bufferSize, cudaMemcpyHostToDevice));
double deltaSec = benchTime(rank, bootstrap, executor, plan, sendbuff, bufferSize, niters, ngraphIters, packetType);
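The test now picks the allocator at run time instead of at compile time, so one binary covers both NVLS-capable and non-NVLS machines. A hypothetical helper expressing the same choice is sketched below; the wrapper name and the header paths are assumptions, while `isNvlsSupported`, `allocSharedPhysicalCuda`, and `allocExtSharedCuda` are the mscclpp calls already used in the diff above.

```cpp
#include <memory>
#include <mscclpp/gpu_utils.hpp>  // assumed header for the alloc* helpers
#include <mscclpp/nvls.hpp>       // assumed header for isNvlsSupported()

// Pick an NVLS-compatible physical allocation when the platform supports it,
// otherwise fall back to the regular extended shared allocation. Both return
// a shared_ptr that releases the GPU memory on destruction.
template <typename T>
std::shared_ptr<T> allocSendBuffer(size_t nelems) {
  if (mscclpp::isNvlsSupported()) {
    return mscclpp::allocSharedPhysicalCuda<T>(nelems);
  }
  return mscclpp::allocExtSharedCuda<T>(nelems);
}
```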
1 change: 1 addition & 0 deletions tools/npkit/npkit_trace_generator.py
@@ -11,6 +11,7 @@
def parse_npkit_event_header(npkit_event_header_path):
npkit_event_def = {"id_to_type": {}, "type_to_id": {}}
executor_ops = [
"NOP",
"BARRIER",
"PUT",
"PUT_PACKET",