Skip to content

Commit

Permalink
Fix nccl-test failure issue (#421)
Browse files Browse the repository at this point in the history
  • Loading branch information
Binyang2014 authored Dec 19, 2024
1 parent 776f24e commit 6fedb7c
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 10 deletions.
27 changes: 18 additions & 9 deletions src/executor/execution_plan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -510,8 +510,9 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
}
}

std::pair<size_t, u_int32_t> ExecutionPlan::Impl::calcSizePerRank(int rank, size_t inputSize, size_t outputSize) const {
std::pair<size_t, u_int32_t> sizePerRank;
std::pair<size_t, uint32_t> ExecutionPlan::Impl::getSizeAndChunksForRank(int rank, size_t inputSize,
size_t outputSize) const {
std::pair<size_t, uint32_t> sizePerRank;
if (this->inputChunks.at(rank) == 0 && this->outputChunks.at(rank) == 0) {
throw mscclpp::Error("Output or Input chunks must be greater than 0", mscclpp::ErrorCode::ExecutorError);
} else if (this->inputChunks.at(rank) != 0 && this->outputChunks.at(rank) != 0) {
Expand All @@ -534,15 +535,15 @@ size_t ExecutionPlan::Impl::getOffset(int rank, size_t inputSize, size_t outputS
}

const int nGroups = this->chunkGroups.at(rank);
auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize);
uint32_t nInputChunks = sizePerRank.second;
uint32_t nelems = sizePerRank.first / (alignment * sizeof(uint8_t));
auto rankSizeAndChunks = getSizeAndChunksForRank(rank, inputSize, outputSize);
uint32_t nChunks = rankSizeAndChunks.second;
uint32_t nelems = rankSizeAndChunks.first / (alignment * sizeof(uint8_t));
if (nelems % nGroups != 0) {
throw Error("Input size must be a multiple of nGroups", ErrorCode::ExecutorError);
}

int nelemsPerGroup = nelems / nGroups;
int nChunksPerGroup = nInputChunks / nGroups;
int nChunksPerGroup = nChunks / nGroups;
uint32_t minNelems = nelemsPerGroup / nChunksPerGroup;
uint32_t remainder = nelemsPerGroup % nChunksPerGroup;
uint32_t groupIdx = chunkIndex / nChunksPerGroup;
Expand All @@ -568,9 +569,17 @@ size_t ExecutionPlan::Impl::getNChunkSize(int rank, size_t inputSize, size_t out
}

// Computes a per-chunk size for `rank` from `inputSize` and `outputSize`.
// (The name suggests the result is used as an upper bound; callers are not visible here.)
//
// NOTE(review): this body appears to interleave two versions of the function — it reads like
// a rendered diff whose +/- markers were stripped. As literally written, only the first three
// statements execute; everything after the first `return` is unreachable. Confirm which
// version is intended before relying on either.
size_t ExecutionPlan::Impl::getUpperBoundChunkSize(int rank, size_t inputSize, size_t outputSize) const {
// Variant A: take the (size, chunk count) pair from calcSizePerRank and return
// size / nChunks rounded up (ceiling division).
// NOTE(review): elsewhere on this page calcSizePerRank is shown next to
// getSizeAndChunksForRank with the same parameters — verify which name actually exists.
auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize);
uint32_t nChunks = sizePerRank.second;
return (sizePerRank.first + nChunks - 1) / nChunks;
// Variant B (unreachable as written): look up this rank's input and output chunk counts.
size_t nInputChunks = this->inputChunks.at(rank);
size_t nOutputChunks = this->outputChunks.at(rank);
// A side whose chunk count is zero keeps a chunk size of 0, which also avoids dividing by zero.
size_t inputChunkSize = 0;
size_t outputChunkSize = 0;
if (nInputChunks != 0) {
// Integer division: any remainder is truncated, not rounded up.
inputChunkSize = inputSize / nInputChunks;
}
if (nOutputChunks != 0) {
outputChunkSize = outputSize / nOutputChunks;
}
// The result is the larger of the two per-chunk sizes.
return std::max(inputChunkSize, outputChunkSize);
}

void ExecutionPlan::Impl::reset() {
Expand Down
2 changes: 1 addition & 1 deletion src/include/execution_plan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ struct ExecutionPlan::Impl {
bool isInPlace;

private:
std::pair<size_t, u_int32_t> calcSizePerRank(int rank, size_t inputSize, size_t outputSize) const;
std::pair<size_t, uint32_t> getSizeAndChunksForRank(int rank, size_t inputSize, size_t outputSize) const;
size_t getOffset(int rank, size_t inputSize, size_t outputSize, uint32_t chunkIndex, uint32_t alignment = 16) const;
size_t getNChunkSize(int rank, size_t inputSize, size_t outputSize, uint32_t nChunks,
const std::vector<uint32_t> offsets) const;
Expand Down

0 comments on commit 6fedb7c

Please sign in to comment.