From d011ff16a6339d79b611ca5c9bc0d35b71524ecf Mon Sep 17 00:00:00 2001
From: liwenhui-soul <38217397+liwenhui-soul@users.noreply.github.com>
Date: Wed, 24 Nov 2021 00:33:45 +0800
Subject: [PATCH 1/6] new balancer adopted for zone rules

---
 .linters/cpp/checkKeyword.py                  |    1 +
 .../executor/admin/SubmitJobExecutor.cpp      |    3 +-
 src/graph/validator/AdminJobValidator.h       |    1 +
 src/interface/meta.thrift                     |    1 +
 src/meta/CMakeLists.txt                       |    5 +
 .../processors/job/BalanceJobExecutor.cpp     | 1097 +-----------
 src/meta/processors/job/BalanceJobExecutor.h  |  226 +--
 src/meta/processors/job/BalancePlan.cpp       |   17 +-
 src/meta/processors/job/BalancePlan.h         |    5 +-
 src/meta/processors/job/BalanceTask.cpp       |   61 +-
 src/meta/processors/job/BalanceTask.h         |    6 +-
 .../processors/job/DataBalanceJobExecutor.cpp |  190 +++
 .../processors/job/DataBalanceJobExecutor.h   |   42 +
 src/meta/processors/job/JobExecutor.cpp       |   84 +
 src/meta/processors/job/JobExecutor.h         |   70 +
 src/meta/processors/job/JobManager.cpp        |  113 +-
 src/meta/processors/job/JobManager.h          |    9 +-
 .../job/LeaderBalanceJobExecutor.cpp          |  550 ++++++
 .../processors/job/LeaderBalanceJobExecutor.h |   94 ++
 src/meta/processors/job/MetaJobExecutor.cpp   |  248 +--
 src/meta/processors/job/MetaJobExecutor.h     |   69 +-
 src/meta/processors/job/RebuildJobExecutor.h  |    6 +-
 .../job/SimpleConcurrentJobExecutor.cpp       |    2 +-
 .../job/SimpleConcurrentJobExecutor.h         |    4 +-
 src/meta/processors/job/StatsJobExecutor.h    |    6 +-
 .../processors/job/StorageJobExecutor.cpp     |  199 +++
 src/meta/processors/job/StorageJobExecutor.h  |   88 +
 .../processors/job/ZoneBalanceJobExecutor.cpp |  305 ++++
 .../processors/job/ZoneBalanceJobExecutor.h   |   44 +
 src/meta/test/BalancerTest.cpp                | 1470 ++++++-----------
 src/meta/test/GetStatsTest.cpp                |   11 +-
 src/meta/test/JobManagerTest.cpp              |   59 +-
 src/meta/test/TestUtils.h                     |   56 +
 src/parser/AdminSentences.cpp                 |   11 +
 src/parser/parser.yy                          |   24 +-
 src/parser/scanner.lex                        |    1 +
 36 files changed, 2537 insertions(+), 2641 deletions(-)
 create mode 100644 src/meta/processors/job/DataBalanceJobExecutor.cpp
 create mode 100644 src/meta/processors/job/DataBalanceJobExecutor.h
 create mode 100644 src/meta/processors/job/JobExecutor.cpp
 create mode 100644 src/meta/processors/job/JobExecutor.h
 create mode 100644 src/meta/processors/job/LeaderBalanceJobExecutor.cpp
 create mode 100644 src/meta/processors/job/LeaderBalanceJobExecutor.h
 create mode 100644 src/meta/processors/job/StorageJobExecutor.cpp
 create mode 100644 src/meta/processors/job/StorageJobExecutor.h
 create mode 100644 src/meta/processors/job/ZoneBalanceJobExecutor.cpp
 create mode 100644 src/meta/processors/job/ZoneBalanceJobExecutor.h

diff --git a/.linters/cpp/checkKeyword.py b/.linters/cpp/checkKeyword.py
index 06efcee08c6..c8b9083929d 100755
--- a/.linters/cpp/checkKeyword.py
+++ b/.linters/cpp/checkKeyword.py
@@ -118,6 +118,7 @@
     'KW_IGNORE_EXISTED_INDEX',
     'KW_GEOGRAPHY',
     'KW_DURATION',
+    'KW_ACROSS',
 ]
diff --git a/src/graph/executor/admin/SubmitJobExecutor.cpp b/src/graph/executor/admin/SubmitJobExecutor.cpp
index 64946b394b8..0d78361d60f 100644
--- a/src/graph/executor/admin/SubmitJobExecutor.cpp
+++ b/src/graph/executor/admin/SubmitJobExecutor.cpp
@@ -109,7 +109,8 @@ Value SubmitJobExecutor::convertJobTimestampToDateTime(int64_t timestamp) {
 nebula::DataSet SubmitJobExecutor::buildShowResultData(
     const nebula::meta::cpp2::JobDesc &jd, const std::vector<nebula::meta::cpp2::TaskDesc> &td) {
-  if (jd.get_cmd() == meta::cpp2::AdminCmd::DATA_BALANCE) {
+  if (jd.get_cmd() == meta::cpp2::AdminCmd::DATA_BALANCE ||
+      jd.get_cmd() == meta::cpp2::AdminCmd::ZONE_BALANCE) {
     nebula::DataSet v(
         {"Job Id(spaceId:partId)", "Command(src->dst)", "Status", "Start Time", "Stop Time"});
     const auto &paras = jd.get_paras();
diff --git a/src/graph/validator/AdminJobValidator.h b/src/graph/validator/AdminJobValidator.h
index 7d755da88e7..1f8ff4e5a70 100644
--- a/src/graph/validator/AdminJobValidator.h
+++ b/src/graph/validator/AdminJobValidator.h
@@ -38,6 +38,7 @@ class AdminJobValidator final : public Validator {
       case meta::cpp2::AdminCmd::FLUSH:
       case meta::cpp2::AdminCmd::DATA_BALANCE:
       case meta::cpp2::AdminCmd::LEADER_BALANCE:
+      case meta::cpp2::AdminCmd::ZONE_BALANCE:
         return true;
       // TODO: Also space related, but not available in CreateJobExecutor now.
       case meta::cpp2::AdminCmd::DOWNLOAD:
diff --git a/src/interface/meta.thrift b/src/interface/meta.thrift
index 907426e9cea..4e4a02b7e26 100644
--- a/src/interface/meta.thrift
+++ b/src/interface/meta.thrift
@@ -235,6 +235,7 @@ enum AdminCmd {
     DOWNLOAD = 7,
     INGEST = 8,
     LEADER_BALANCE = 9,
+    ZONE_BALANCE = 10,
     UNKNOWN = 99,
 } (cpp.enum_strict)
diff --git a/src/meta/CMakeLists.txt b/src/meta/CMakeLists.txt
index e9e6bf19903..05bed4b1f98 100644
--- a/src/meta/CMakeLists.txt
+++ b/src/meta/CMakeLists.txt
@@ -68,11 +68,16 @@ nebula_add_library(
     processors/job/AdminJobProcessor.cpp
     processors/job/ReportTaskProcessor.cpp
     processors/job/JobUtils.cpp
+    processors/job/StorageJobExecutor.cpp
+    processors/job/JobExecutor.cpp
     processors/job/MetaJobExecutor.cpp
     processors/job/SimpleConcurrentJobExecutor.cpp
     processors/job/CompactJobExecutor.cpp
     processors/job/FlushJobExecutor.cpp
     processors/job/BalanceJobExecutor.cpp
+    processors/job/ZoneBalanceJobExecutor.cpp
+    processors/job/DataBalanceJobExecutor.cpp
+    processors/job/LeaderBalanceJobExecutor.cpp
     processors/job/RebuildJobExecutor.cpp
     processors/job/RebuildTagJobExecutor.cpp
     processors/job/RebuildEdgeJobExecutor.cpp
diff --git a/src/meta/processors/job/BalanceJobExecutor.cpp b/src/meta/processors/job/BalanceJobExecutor.cpp
index d2d74400c39..f7392d068ba 100644
--- a/src/meta/processors/job/BalanceJobExecutor.cpp
+++ b/src/meta/processors/job/BalanceJobExecutor.cpp
@@ -11,25 +11,13 @@
 #include "kvstore/NebulaStore.h"
 #include "meta/processors/job/JobUtils.h"
 
-DEFINE_double(leader_balance_deviation,
-              0.05,
-              "after leader balance, leader count should in range "
-              "[avg * (1 - deviation), avg * (1 + deviation)]");
-
 namespace nebula {
 namespace meta {
-std::atomic_bool BalanceJobExecutor::running_ = false;
-std::atomic_bool LeaderBalanceJobExecutor::inLeaderBalance_ = false;
-std::unique_ptr<BalancePlan> DataBalanceJobExecutor::plan_ = nullptr;
-std::mutex BalanceJobExecutor::lock_;
 BalanceJobExecutor::BalanceJobExecutor(JobID jobId,
                                        kvstore::KVStore* kvstore,
                                        AdminClient* adminClient,
                                        const std::vector<std::string>& paras)
-    : MetaJobExecutor(jobId, kvstore, adminClient, paras) {
-  executor_.reset(new folly::CPUThreadPoolExecutor(1));
-  toHost_ = TargetHosts::NONE;
-}
+    : MetaJobExecutor(jobId, kvstore, adminClient, paras) {}
 
 bool BalanceJobExecutor::check() {
   return !paras_.empty();
@@ -44,22 +32,7 @@ nebula::cpp2::ErrorCode BalanceJobExecutor::stop() {
   return nebula::cpp2::ErrorCode::SUCCEEDED;
 }
 
-folly::Future<Status> BalanceJobExecutor::executeInternal(HostAddr&& address,
-                                                          std::vector<PartitionID>&& parts) {
-  UNUSED(address);
-  UNUSED(parts);
-  return Status::OK();
-}
-
-bool BalanceJobExecutor::runInMeta() {
-  return true;
-}
-
 nebula::cpp2::ErrorCode BalanceJobExecutor::recovery() {
-  return nebula::cpp2::ErrorCode::SUCCEEDED;
-}
-
-nebula::cpp2::ErrorCode DataBalanceJobExecutor::recovery() {
   if (kvstore_ == nullptr) {
     return nebula::cpp2::ErrorCode::SUCCEEDED;
   }
@@ -80,46 +53,25 @@ nebula::cpp2::ErrorCode DataBalanceJobExecutor::recovery() {
   auto optJobRet = JobDescription::makeJobDescription(jobKey, value);
   auto optJob = nebula::value(optJobRet);
   plan_.reset(new BalancePlan(optJob, kvstore_, adminClient_));
-  plan_->onFinished_ = [this]() {
-    std::lock_guard<std::mutex> lg(lock_);
+  plan_->setFinishCallBack([this](meta::cpp2::JobStatus status) {
     if (LastUpdateTimeMan::update(kvstore_, time::WallClock::fastNowInMilliSec()) !=
         nebula::cpp2::ErrorCode::SUCCEEDED) {
       LOG(ERROR) << "Balance plan " << plan_->id() << " update meta failed";
     }
-    finishInternal();
-  };
+    executorOnFinished_(status);
+  });
   auto recRet = plan_->recovery();
   if (recRet != nebula::cpp2::ErrorCode::SUCCEEDED) {
     plan_.reset(nullptr);
     return recRet;
   }
+  plan_->saveInStore();
   return nebula::cpp2::ErrorCode::SUCCEEDED;
 }
 
 nebula::cpp2::ErrorCode BalanceJobExecutor::finish(bool ret) {
   UNUSED(ret);
-  return nebula::cpp2::ErrorCode::SUCCEEDED;
-}
-
-nebula::cpp2::ErrorCode BalanceJobExecutor::getAllSpaces(
-    std::vector<std::tuple<GraphSpaceID, int32_t, bool>>& spaces) {
-  // Get all spaces
-  folly::SharedMutex::ReadHolder rHolder(LockUtils::spaceLock());
-  const auto& prefix = MetaKeyUtils::spacePrefix();
-  std::unique_ptr<kvstore::KVIterator> iter;
-  auto retCode = kvstore_->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter);
-  if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) {
-    LOG(ERROR) << "Get all spaces failed, error: " << apache::thrift::util::enumNameSafe(retCode);
-    return retCode;
-  }
-
-  while (iter->valid()) {
-    auto spaceId = MetaKeyUtils::spaceId(iter->key());
-    auto properties = MetaKeyUtils::parseSpace(iter->val());
-    bool zoned = !properties.get_zone_names().empty();
-    spaces.emplace_back(spaceId, *properties.replica_factor_ref(), zoned);
-    iter->next();
-  }
+  plan_.reset(nullptr);
   return nebula::cpp2::ErrorCode::SUCCEEDED;
 }
@@ -136,1024 +88,81 @@ nebula::cpp2::ErrorCode BalanceJobExecutor::save(const std::string& k, const std
   return rc;
 }
 
-nebula::cpp2::ErrorCode DataBalanceJobExecutor::buildBalancePlan() {
-  if (plan_ != nullptr) {
-    LOG(ERROR) << "Balance plan should be nullptr now";
-    return nebula::cpp2::ErrorCode::E_BALANCED;
-  }
-  std::vector<std::tuple<GraphSpaceID, int32_t, bool>> spaces;
-  auto spacesRet = getAllSpaces(spaces);
-  if (spacesRet != nebula::cpp2::ErrorCode::SUCCEEDED) {
-    LOG(ERROR) << "Can't get all spaces";
-    return spacesRet;
-  }
-
-  plan_.reset(new BalancePlan(jobDescription_, kvstore_, adminClient_));
-  for (const auto& spaceInfo : spaces) {
-    auto spaceId = std::get<0>(spaceInfo);
-    auto spaceReplica = std::get<1>(spaceInfo);
-    auto dependentOnZone = std::get<2>(spaceInfo);
-    LOG(INFO) << "Balance Space " << spaceId;
-    auto taskRet = genTasks(spaceId, spaceReplica, dependentOnZone, lostHosts_);
-    if (!ok(taskRet)) {
-      LOG(ERROR) << "Generate tasks on space " << std::get<0>(spaceInfo) << " failed";
-      return error(taskRet);
-    }
-
-    auto tasks = std::move(value(taskRet));
-    for (auto& task : tasks) {
-      plan_->addTask(std::move(task));
-    }
-  }
-
-  plan_->onFinished_ = [this]() {
-    std::lock_guard<std::mutex> lg(lock_);
-    if (LastUpdateTimeMan::update(kvstore_, time::WallClock::fastNowInMilliSec()) !=
-        nebula::cpp2::ErrorCode::SUCCEEDED) {
-      LOG(ERROR) << "Balance plan " << plan_->id() << " update meta failed";
-    }
-    finishInternal();
-  };
-  if (plan_->tasks_.empty()) {
-    return nebula::cpp2::ErrorCode::E_BALANCED;
-  }
-  return plan_->saveInStore();
-}
-
-ErrorOr<nebula::cpp2::ErrorCode, std::vector<BalanceTask>> DataBalanceJobExecutor::genTasks(
-    GraphSpaceID spaceId,
-    int32_t spaceReplica,
-    bool dependentOnZone,
-    std::vector<HostAddr>& lostHosts) {
-  HostParts hostParts;
-  int32_t totalParts = 0;
-  // hostParts is current part allocation map
-  auto result = getHostParts(spaceId, dependentOnZone, hostParts, totalParts);
-  if (!nebula::ok(result)) {
-    return nebula::error(result);
-  }
-
-  auto retVal = nebula::value(result);
-  if (!retVal || totalParts == 0 || hostParts.empty()) {
-    LOG(ERROR) << "Invalid space " << spaceId;
-    return nebula::cpp2::ErrorCode::E_KEY_NOT_FOUND;
-  }
-
-  auto fetchHostPartsRet = fetchHostParts(spaceId, dependentOnZone, hostParts, lostHosts);
-  if (!nebula::ok(fetchHostPartsRet)) {
-    LOG(ERROR) << "Fetch hosts and parts failed";
-    return nebula::error(fetchHostPartsRet);
-  }
-
-  auto hostPartsRet = nebula::value(fetchHostPartsRet);
-  auto confirmedHostParts = hostPartsRet.first;
-  auto activeHosts = hostPartsRet.second;
-  LOG(INFO) << "Now, try to balance the confirmedHostParts";
-
-  // We have two parts need to balance, the first one is parts on lost hosts and
-  // deleted hosts The seconds one is parts on unbalanced host in
-  // confirmedHostParts.
-  std::vector<BalanceTask> tasks;
-  // 1. Iterate through all hosts that would not be included in
-  // confirmedHostParts,
-  //    move all parts in them to host with minimum part in confirmedHostParts
-  for (auto& lostHost : lostHosts) {
-    auto& lostParts = hostParts[lostHost];
-    for (auto& partId : lostParts) {
-      LOG(INFO) << "Try balance part " << partId << " for lost host " << lostHost;
-      // check whether any peers which is alive
-      auto alive = checkReplica(hostParts, activeHosts, spaceReplica, partId);
-      if (!alive.ok()) {
-        LOG(ERROR) << "Check Replica failed: " << alive << " Part: " << partId;
-        return nebula::cpp2::ErrorCode::E_NO_VALID_HOST;
-      }
-
-      auto retCode =
-          transferLostHost(tasks, confirmedHostParts, lostHost, spaceId, partId, dependentOnZone);
-      if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) {
-        LOG(ERROR) << "Transfer lost host " << lostHost << " failed";
-        return retCode;
-      }
-    }
-  }
-
-  // 2. Make all hosts in confirmedHostParts balanced
-  if (balanceParts(spaceId, confirmedHostParts, totalParts, tasks, dependentOnZone)) {
-    return tasks;
-  } else {
-    return nebula::cpp2::ErrorCode::E_BAD_BALANCE_PLAN;
-  }
-}
-
-nebula::cpp2::ErrorCode DataBalanceJobExecutor::transferLostHost(std::vector<BalanceTask>& tasks,
-                                                                 HostParts& confirmedHostParts,
-                                                                 const HostAddr& source,
-                                                                 GraphSpaceID spaceId,
-                                                                 PartitionID partId,
-                                                                 bool dependentOnZone) {
-  // find a host with minimum parts which doesn't have this part
-  ErrorOr<nebula::cpp2::ErrorCode, HostAddr> result;
-  if (dependentOnZone) {
-    result = hostWithMinimalPartsForZone(source, confirmedHostParts, partId);
-  } else {
-    result = hostWithMinimalParts(confirmedHostParts, partId);
-  }
-
-  if (!nebula::ok(result)) {
-    LOG(ERROR) << "Can't find a host which doesn't have part: " << partId;
-    return nebula::error(result);
-  }
-  const auto& targetHost = nebula::value(result);
-  confirmedHostParts[targetHost].emplace_back(partId);
-  tasks.emplace_back(plan_->id(), spaceId, partId, source, targetHost, kvstore_, adminClient_);
-  zoneParts_[targetHost].second.emplace_back(partId);
-  auto zoneIt =
-      std::find(zoneParts_[source].second.begin(), zoneParts_[source].second.end(), partId);
-  if (zoneIt == zoneParts_[source].second.end()) {
-    LOG(ERROR) << "part not find " << partId << " at " << source;
-  }
-  return nebula::cpp2::ErrorCode::SUCCEEDED;
-}
-
-ErrorOr<nebula::cpp2::ErrorCode, std::pair<HostParts, std::vector<HostAddr>>>
-DataBalanceJobExecutor::fetchHostParts(GraphSpaceID spaceId,
-                                       bool dependentOnZone,
-                                       const HostParts& hostParts,
-                                       std::vector<HostAddr>& lostHosts) {
-  ErrorOr<nebula::cpp2::ErrorCode, std::vector<HostAddr>> activeHostsRet;
-  if (dependentOnZone) {
-    activeHostsRet = ActiveHostsMan::getActiveHostsWithZones(kvstore_, spaceId);
-  } else {
-    activeHostsRet = ActiveHostsMan::getActiveHosts(kvstore_);
-  }
-
-  if (!nebula::ok(activeHostsRet)) {
-    return nebula::error(activeHostsRet);
-  }
-
-  std::vector<HostAddr> expand;
-  auto activeHosts = nebula::value(activeHostsRet);
-  calDiff(hostParts, activeHosts, expand, lostHosts);
-  // confirmedHostParts is new part allocation map after balance, it would
-  // include newlyAdded and exclude lostHosts
-  HostParts confirmedHostParts(hostParts);
-  for (const auto& h : expand) {
-    LOG(INFO) << "Found new host " << h;
-    confirmedHostParts.emplace(h, std::vector<PartitionID>());
-  }
-  for (const auto& h : lostHosts) {
-    LOG(INFO) << "Lost host " << h;
-    confirmedHostParts.erase(h);
-  }
-  return std::make_pair(confirmedHostParts, activeHosts);
-}
-
-bool DataBalanceJobExecutor::balanceParts(GraphSpaceID spaceId,
-                                          HostParts& confirmedHostParts,
-                                          int32_t totalParts,
-                                          std::vector<BalanceTask>& tasks,
-                                          bool dependentOnZone) {
-  auto avgLoad = static_cast<float>(totalParts) / confirmedHostParts.size();
-  VLOG(3) << "The expect avg load is " << avgLoad;
-  int32_t minLoad = std::floor(avgLoad);
-  int32_t maxLoad = std::ceil(avgLoad);
-  VLOG(3) << "The min load is " << minLoad << " max load is " << maxLoad;
-
-  auto sortedHosts = sortedHostsByParts(confirmedHostParts);
-  if (sortedHosts.empty()) {
-    LOG(ERROR) << "Host is empty";
-    return false;
-  }
-
-  auto maxPartsHost = sortedHosts.back();
-  auto minPartsHost = sortedHosts.front();
-  auto& sourceHost = maxPartsHost.first;
-  auto& targetHost = minPartsHost.first;
-  if (innerBalance_) {
-    LOG(INFO) << "maxPartsHost.first " << maxPartsHost.first << " minPartsHost.first "
-              << minPartsHost.first;
-    while (!checkZoneLegal(maxPartsHost.first, minPartsHost.first)) {
-      sortedHosts.pop_back();
-      maxPartsHost = sortedHosts.back();
-    }
-
-    auto& source = maxPartsHost.first;
-    auto iter = std::find_if(zoneParts_.begin(), zoneParts_.end(), [&source](const auto& pair) {
-      return source == pair.first;
-    });
-
-    auto& zoneName = iter->second.first;
-    int32_t hostsSize = zoneHosts_[zoneName].size();
-    int32_t totalPartsZone = 0;
-    for (auto& host : zoneHosts_[zoneName]) {
-      auto it = confirmedHostParts.find(host);
-      if (it == confirmedHostParts.end()) {
-        LOG(ERROR) << "Host " << host << "not in confirmedHostParts";
-        continue;
-      }
-      totalPartsZone += it->second.size();
-    }
-
-    avgLoad = static_cast<float>(totalPartsZone) / hostsSize;
-    minLoad = std::floor(avgLoad);
-    maxLoad = std::ceil(avgLoad);
-    LOG(INFO) << "Update min and max loading Total parts in zone " << totalPartsZone
-              << ", total hosts " << hostsSize << " The expect avg load is " << avgLoad
-              << " The min load is " << minLoad << " max load is " << maxLoad;
-  }
-
-  while (maxPartsHost.second > maxLoad || minPartsHost.second < minLoad) {
-    auto& partsFrom = confirmedHostParts[maxPartsHost.first];
-    auto& partsTo = confirmedHostParts[minPartsHost.first];
-    std::sort(partsFrom.begin(), partsFrom.end());
-    std::sort(partsTo.begin(), partsTo.end());
-
-    LOG(INFO) << maxPartsHost.first << ":" << partsFrom.size() << " -> " << minPartsHost.first
-              << ":" << partsTo.size();
-    std::vector<PartitionID> diff;
-    std::set_difference(partsFrom.begin(),
-                        partsFrom.end(),
-                        partsTo.begin(),
-                        partsTo.end(),
-                        std::inserter(diff, diff.begin()));
-    bool noAction = true;
-    for (auto& partId : diff) {
-      LOG(INFO) << "partsFrom size " << partsFrom.size() << " partsTo size " << partsTo.size()
-                << " minLoad " << minLoad << " maxLoad " << maxLoad;
-      if (partsFrom.size() == partsTo.size() + 1 ||
-          partsFrom.size() == static_cast<size_t>(minLoad) ||
-          partsTo.size() == static_cast<size_t>(maxLoad)) {
-        VLOG(3) << "No need to move any parts from " << maxPartsHost.first << " to "
-                << minPartsHost.first;
-        break;
-      }
-
-      LOG(INFO) << "[space:" << spaceId << ", part:" << partId << "] " << maxPartsHost.first << "->"
-                << minPartsHost.first;
-      auto it = std::find(partsFrom.begin(), partsFrom.end(), partId);
-      if (it == partsFrom.end()) {
-        LOG(ERROR) << "Part " << partId << " not found in partsFrom";
-        return false;
-      }
-
-      if (std::find(partsTo.begin(), partsTo.end(), partId) != partsTo.end()) {
-        LOG(ERROR) << "Part " << partId << " already existed in partsTo";
-        return false;
-      }
-
-      if (dependentOnZone) {
-        if (!checkZoneLegal(sourceHost, targetHost)) {
-          LOG(INFO) << "sourceHost " << sourceHost << " targetHost " << targetHost
-                    << " not same zone";
-
-          auto& parts = relatedParts_[targetHost];
-          auto minIt = std::find(parts.begin(), parts.end(), partId);
-          if (minIt != parts.end()) {
-            LOG(INFO) << "Part " << partId << " have existed";
-            continue;
-          }
-        }
-
-        auto& sourceNoneName = zoneParts_[sourceHost].first;
-        auto sourceHosts = zoneHosts_.find(sourceNoneName);
-        for (auto& sh : sourceHosts->second) {
-          auto& parts = relatedParts_[sh];
-          auto maxIt = std::find(parts.begin(), parts.end(), partId);
-          if (maxIt == parts.end()) {
-            LOG(INFO) << "Part " << partId << " not found on " << sh;
-            continue;
-          }
-          parts.erase(maxIt);
-        }
-
-        auto& targetNoneName = zoneParts_[targetHost].first;
-        auto targetHosts = zoneHosts_.find(targetNoneName);
-        for (auto& th : targetHosts->second) {
-          relatedParts_[th].emplace_back(partId);
-        }
-      }
-
-      partsFrom.erase(it);
-      partsTo.emplace_back(partId);
-      tasks.emplace_back(
-          jobId_, spaceId, partId, maxPartsHost.first, minPartsHost.first, kvstore_, adminClient_);
-      noAction = false;
-    }
-
-    if (noAction) {
-      LOG(INFO) << "Here is no action";
-      break;
-    }
-    sortedHosts = sortedHostsByParts(confirmedHostParts);
-    maxPartsHost = sortedHosts.back();
-    minPartsHost = sortedHosts.front();
-    if (innerBalance_) {
-      while (!checkZoneLegal(maxPartsHost.first, minPartsHost.first)) {
-        sortedHosts.pop_back();
-        maxPartsHost = sortedHosts.back();
-      }
-
-      auto& source = maxPartsHost.first;
-      auto iter = std::find_if(zoneParts_.begin(), zoneParts_.end(), [&source](const auto& pair) {
-        return source == pair.first;
-      });
-
-      auto& zoneName = iter->second.first;
-      int32_t hostsSize = zoneHosts_[zoneName].size();
-      int32_t totalPartsZone = 0;
-      for (auto& host : zoneHosts_[zoneName]) {
-        auto it = confirmedHostParts.find(host);
-        if (it == confirmedHostParts.end()) {
-          LOG(ERROR) << "Host " << host << "not in confirmedHostParts";
-          continue;
-        }
-        totalPartsZone += it->second.size();
-      }
-
-      avgLoad = static_cast<float>(totalPartsZone) / hostsSize;
-      minLoad = std::floor(avgLoad);
-      maxLoad = std::ceil(avgLoad);
-      LOG(INFO) << "Update min and max loading Total parts in zone " << totalPartsZone
-                << ", total hosts " << hostsSize << " The expect avg load is " << avgLoad
-                << " The min load is " << minLoad << " max load is " << maxLoad;
-    }
-  }
-  LOG(INFO) << "Balance tasks num: " << tasks.size();
-  for (auto& task : tasks) {
-    LOG(INFO) << task.taskIdStr();
-  }
-
-  relatedParts_.clear();
-  return true;
-}
-
-nebula::cpp2::ErrorCode DataBalanceJobExecutor::stop() {
-  std::lock_guard<std::mutex> lg(lock_);
-  if (!running_) {
-    return nebula::cpp2::ErrorCode::E_KEY_NOT_FOUND;
-  }
-  stopped_ = true;
-  plan_->stop();
-  return nebula::cpp2::ErrorCode::SUCCEEDED;
-}
-
-ErrorOr<nebula::cpp2::ErrorCode, bool> BalanceJobExecutor::getHostParts(GraphSpaceID spaceId,
-                                                                        bool dependentOnZone,
-                                                                        HostParts& hostParts,
-                                                                        int32_t& totalParts) {
-  folly::SharedMutex::ReadHolder rHolder(LockUtils::spaceLock());
-  const auto& prefix = MetaKeyUtils::partPrefix(spaceId);
-  std::unique_ptr<kvstore::KVIterator> iter;
-  auto retCode = kvstore_->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter);
-  if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) {
-    LOG(ERROR) << "Access kvstore failed, spaceId " << spaceId << " "
-               << apache::thrift::util::enumNameSafe(retCode);
-    return retCode;
-  }
-
-  while (iter->valid()) {
-    auto key = iter->key();
-    PartitionID partId;
-    memcpy(&partId, key.data() + prefix.size(), sizeof(PartitionID));
-    auto partHosts = MetaKeyUtils::parsePartVal(iter->val());
-    for (auto& ph : partHosts) {
-      hostParts[ph].emplace_back(partId);
-    }
-    totalParts++;
-    iter->next();
-  }
-
-  LOG(INFO) << "Host size: " << hostParts.size();
-  auto key = MetaKeyUtils::spaceKey(spaceId);
-  std::string value;
-  retCode = kvstore_->get(kDefaultSpaceId, kDefaultPartId, key, &value);
-  if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) {
-    LOG(ERROR) << "Access kvstore failed, spaceId " << spaceId
-               << apache::thrift::util::enumNameSafe(retCode);
-    return retCode;
-  }
-
-  auto properties = MetaKeyUtils::parseSpace(value);
-  if (totalParts != properties.get_partition_num()) {
-    LOG(ERROR) << "Partition number not equals " << totalParts << " : "
-               << properties.get_partition_num();
-    return false;
-  }
-
-  int32_t replica = properties.get_replica_factor();
-  LOG(INFO) << "Replica " << replica;
-  if (dependentOnZone && !properties.get_zone_names().empty()) {
-    auto zoneNames = properties.get_zone_names();
-    int32_t zoneSize = zoneNames.size();
-    LOG(INFO) << "Zone Size " << zoneSize;
-    innerBalance_ = (replica == zoneSize);
-
-    auto activeHostsRet = ActiveHostsMan::getActiveHostsWithZones(kvstore_, spaceId);
-    if (!nebula::ok(activeHostsRet)) {
-      return nebula::error(activeHostsRet);
-    }
-
-    std::vector<HostAddr> expand;
-    auto activeHosts = nebula::value(activeHostsRet);
-    std::vector<HostAddr> lostHosts;
-    calDiff(hostParts, activeHosts, expand, lostHosts);
-    // confirmedHostParts is new part allocation map after balance, it would include newlyAdded
-    // and exclude lostHosts
-    HostParts confirmedHostParts(hostParts);
-    for (const auto& h : expand) {
-      LOG(INFO) << "Found new host " << h;
-      confirmedHostParts.emplace(h, std::vector<PartitionID>());
-    }
-    for (const auto& h : lostHosts) {
-      LOG(INFO) << "Lost host " << h;
-      confirmedHostParts.erase(h);
-    }
-
-    auto zonePartsRet = assembleZoneParts(zoneNames, confirmedHostParts);
-    if (zonePartsRet != nebula::cpp2::ErrorCode::SUCCEEDED) {
-      LOG(ERROR) << "Assemble Zone Parts failed";
-      return zonePartsRet;
-    }
-  }
-
-  totalParts *= replica;
-  return true;
-}
-
-nebula::cpp2::ErrorCode BalanceJobExecutor::assembleZoneParts(
-    const std::vector<std::string>& zoneNames, HostParts& hostParts) {
-  // zoneHosts use to record this host belong to zone's hosts
-  std::unordered_map<std::pair<HostAddr, std::string>, std::vector<HostAddr>> zoneHosts;
-  for (const auto& zoneName : zoneNames) {
-    LOG(INFO) << "Zone Name: " << zoneName;
-    auto zoneKey = MetaKeyUtils::zoneKey(zoneName);
+nebula::cpp2::ErrorCode SpaceInfo::getInfo(GraphSpaceID spaceId, kvstore::KVStore* kvstore) {
+  spaceId_ = spaceId;
+  std::string spaceKey = MetaKeyUtils::spaceKey(spaceId);
+  std::string spaceVal;
+  kvstore->get(kDefaultSpaceId, kDefaultPartId, spaceKey, &spaceVal);
+  meta::cpp2::SpaceDesc properties = MetaKeyUtils::parseSpace(spaceVal);
+  name_ = properties.get_space_name();
+  replica_ = properties.get_replica_factor();
+  const std::vector<std::string>& zones = properties.get_zone_names();
+  for (const std::string& zoneName : zones) {
     std::string zoneValue;
-    auto retCode = kvstore_->get(kDefaultSpaceId, kDefaultPartId, zoneKey, &zoneValue);
+    auto zoneKey = MetaKeyUtils::zoneKey(zoneName);
+    auto retCode = kvstore->get(kDefaultSpaceId, kDefaultPartId, zoneKey, &zoneValue);
     if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) {
       LOG(ERROR) << "Get zone " << zoneName
-                 << " failed: " << apache::thrift::util::enumNameSafe(retCode);
+                 << " failed, error: " << apache::thrift::util::enumNameSafe(retCode);
       return retCode;
     }
-
-    auto hosts = MetaKeyUtils::parseZoneHosts(std::move(zoneValue));
-    for (const auto& host : hosts) {
-      LOG(INFO) << "Host for zone " << host;
-      auto pair = std::pair<HostAddr, std::string>(std::move(host), zoneName);
-      auto& hs = zoneHosts[std::move(pair)];
-      hs.insert(hs.end(), hosts.begin(), hosts.end());
-    }
-  }
-
-  for (auto it = hostParts.begin(); it != hostParts.end(); it++) {
-    auto host = it->first;
-    LOG(INFO) << "Host: " << host;
-    auto zoneIter =
-        std::find_if(zoneHosts.begin(), zoneHosts.end(), [host](const auto& pair) -> bool {
-          return host == pair.first.first;
-        });
-
-    if (zoneIter == zoneHosts.end()) {
-      LOG(INFO) << it->first << " have lost";
-      continue;
-    }
-
-    auto& hosts = zoneIter->second;
-    auto name = zoneIter->first.second;
-    zoneHosts_[name] = hosts;
-    for (auto hostIter = hosts.begin(); hostIter != hosts.end(); hostIter++) {
-      auto partIter = hostParts.find(*hostIter);
-      LOG(INFO) << "Zone " << name << " have the host " << it->first;
-      if (partIter == hostParts.end()) {
-        zoneParts_[it->first] = ZoneNameAndParts(name, std::vector<PartitionID>());
-      } else {
-        zoneParts_[it->first] = ZoneNameAndParts(name, partIter->second);
-      }
-    }
-  }
-
-  for (auto it = zoneHosts.begin(); it != zoneHosts.end(); it++) {
-    auto host = it->first.first;
-    auto& hosts = it->second;
-    for (auto hostIter = hosts.begin(); hostIter != hosts.end(); hostIter++) {
-      auto h = *hostIter;
-      auto iter = std::find_if(hostParts.begin(), hostParts.end(), [h](const auto& pair) -> bool {
-        return h == pair.first;
-      });
-
-      if (iter == hostParts.end()) {
-        continue;
-      }
-
-      auto& parts = iter->second;
-      auto& hp = relatedParts_[host];
-      hp.insert(hp.end(), parts.begin(), parts.end());
-    }
-  }
-  return nebula::cpp2::ErrorCode::SUCCEEDED;
-}
-
-void BalanceJobExecutor::calDiff(const HostParts& hostParts,
-                                 const std::vector<HostAddr>& activeHosts,
-                                 std::vector<HostAddr>& expand,
-                                 std::vector<HostAddr>& lost) {
-  for (auto it = hostParts.begin(); it != hostParts.end(); it++) {
-    VLOG(1) << "Original Host " << it->first << ", parts " << it->second.size();
-    if (std::find(activeHosts.begin(), activeHosts.end(), it->first) == activeHosts.end() &&
-        std::find(lost.begin(), lost.end(), it->first) == lost.end()) {
-      lost.emplace_back(it->first);
-    }
-  }
-  for (auto& h : activeHosts) {
-    VLOG(1) << "Active host " << h;
-    if (hostParts.find(h) == hostParts.end()) {
-      expand.emplace_back(h);
-    }
-  }
-}
-
-std::vector<std::pair<HostAddr, int32_t>> DataBalanceJobExecutor::sortedHostsByParts(
-    const HostParts& hostParts) {
-  std::vector<std::pair<HostAddr, int32_t>> hosts;
-  for (auto it = hostParts.begin(); it != hostParts.end(); it++) {
-    LOG(INFO) << "Host " << it->first << " parts " << it->second.size();
-    hosts.emplace_back(it->first, it->second.size());
-  }
-  std::sort(hosts.begin(), hosts.end(), [](const auto& l, const auto& r) {
-    if (l.second != r.second) {
-      return l.second < r.second;
-    } else {
-      return l.first.host < r.first.host;
-    }
-  });
-  return hosts;
-}
-
-Status DataBalanceJobExecutor::checkReplica(const HostParts& hostParts,
-                                            const std::vector<HostAddr>& activeHosts,
-                                            int32_t replica,
-                                            PartitionID partId) {
-  // check host hold the part and alive
-  auto checkPart = [&](const auto& entry) {
-    auto& host = entry.first;
-    auto& parts = entry.second;
-    return std::find(parts.begin(), parts.end(), partId) != parts.end() &&
-           std::find(activeHosts.begin(), activeHosts.end(), host) != activeHosts.end();
-  };
-  auto aliveReplica = std::count_if(hostParts.begin(), hostParts.end(), checkPart);
-  if (aliveReplica >= replica / 2 + 1) {
-    return Status::OK();
-  }
-  return Status::Error("Not enough alive host hold the part %d", partId);
-}
-
-ErrorOr<nebula::cpp2::ErrorCode, HostAddr> DataBalanceJobExecutor::hostWithMinimalParts(
-    const HostParts& hostParts, PartitionID partId) {
-  auto hosts = sortedHostsByParts(hostParts);
-  for (auto& h : hosts) {
-    auto it = hostParts.find(h.first);
-    if (it == hostParts.end()) {
-      LOG(ERROR) << "Host " << h.first << " not found";
-      return nebula::cpp2::ErrorCode::E_NO_HOSTS;
-    }
-
-    if (std::find(it->second.begin(), it->second.end(), partId) == it->second.end()) {
-      return h.first;
-    }
-  }
-  return nebula::cpp2::ErrorCode::E_NO_HOSTS;
-}
-
-ErrorOr<nebula::cpp2::ErrorCode, HostAddr> DataBalanceJobExecutor::hostWithMinimalPartsForZone(
-    const HostAddr& source, const HostParts& hostParts, PartitionID partId) {
-  auto hosts = sortedHostsByParts(hostParts);
-  for (auto& h : hosts) {
-    auto it = hostParts.find(h.first);
-    if (it == hostParts.end()) {
-      LOG(ERROR) << "Host " << h.first << " not found";
-      return nebula::cpp2::ErrorCode::E_NO_HOSTS;
-    }
-
-    LOG(INFO) << "source " << source << " h.first " << h.first;
-    if (std::find(it->second.begin(), it->second.end(), partId) == it->second.end()) {
-      return h.first;
-    }
-  }
-  return nebula::cpp2::ErrorCode::E_NO_HOSTS;
-}
-
-bool DataBalanceJobExecutor::checkZoneLegal(const HostAddr& source, const HostAddr& target) {
-  VLOG(3) << "Check " << source << " : " << target;
-  auto sourceIter = std::find_if(zoneParts_.begin(), zoneParts_.end(), [&source](const auto& pair) {
-    return source == pair.first;
-  });
-
-  if (sourceIter == zoneParts_.end()) {
-    LOG(INFO) << "Source " << source << " not found";
-    return false;
-  }
-
-  auto targetIter = std::find_if(zoneParts_.begin(), zoneParts_.end(), [&target](const auto& pair) {
-    return target == pair.first;
-  });
-
-  if (targetIter == zoneParts_.end()) {
-    LOG(INFO) << "Target " << target << " not found";
-    return false;
-  }
-
-  LOG(INFO) << sourceIter->second.first << " : " << targetIter->second.first;
-  return sourceIter->second.first == targetIter->second.first;
-}
-
-nebula::cpp2::ErrorCode DataBalanceJobExecutor::prepare() {
-  auto activeHostsRet = ActiveHostsMan::getActiveHosts(kvstore_);
-  if (!nebula::ok(activeHostsRet)) {
-    auto retCode = nebula::error(activeHostsRet);
-    LOG(ERROR) << "Get active hosts failed, error: " << apache::thrift::util::enumNameSafe(retCode);
-    return retCode;
-  }
-  auto hosts = std::move(nebula::value(activeHostsRet));
-
-  if (hosts.empty()) {
-    LOG(ERROR) << "There is no active hosts";
-    return nebula::cpp2::ErrorCode::E_NO_HOSTS;
-  }
-  lostHosts_.reserve(paras_.size() - 1);
-  for (size_t i = 0; i < paras_.size() - 1; i++) {
-    lostHosts_.emplace_back(HostAddr::fromString(paras_[i]));
-  }
-  return nebula::cpp2::ErrorCode::SUCCEEDED;
-}
-
-nebula::cpp2::ErrorCode DataBalanceJobExecutor::finish(bool ret) {
-  std::lock_guard<std::mutex> lg(lock_);
-  return finishInternal(ret);
-}
-
-nebula::cpp2::ErrorCode DataBalanceJobExecutor::finishInternal(bool ret) {
-  CHECK(!lock_.try_lock());
-  plan_.reset(nullptr);
-  running_ = false;
-  auto rc = onFinished_(ret);
-  if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) {
-    return rc;
-  }
-  return ret ? nebula::cpp2::ErrorCode::SUCCEEDED : nebula::cpp2::ErrorCode::E_BALANCER_FAILURE;
-}
-
-folly::Future<Status> DataBalanceJobExecutor::executeInternal(HostAddr&& address,
-                                                              std::vector<PartitionID>&& parts) {
-  UNUSED(address);
-  UNUSED(parts);
-  std::unique_lock<std::mutex> lg(lock_);
-  if (!running_) {
-    if (plan_ == nullptr) {
-      auto retCode = buildBalancePlan();
-      if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) {
-        if (retCode == nebula::cpp2::ErrorCode::E_BALANCED) {
-          finishInternal(true);
-          return Status::OK();
-        } else {
-          return Status::Error(apache::thrift::util::enumNameSafe(retCode));
-        }
-      }
-    }
-    LOG(INFO) << "Start to invoke balance plan " << plan_->id();
-    running_ = true;
-    auto fut = folly::via(executor_.get(), std::bind(&BalancePlan::invoke, plan_.get()));
-    lg.unlock();
-    fut.wait();
-    return Status::OK();
-  }
-  CHECK(plan_ != nullptr);
-  LOG(INFO) << "Balance job " << plan_->id() << " is still running";
-  return Status::Error(folly::sformat("Balance job {} is still running", plan_->id()));
-}
-
-folly::Future<Status> LeaderBalanceJobExecutor::executeInternal(HostAddr&& address,
-                                                                std::vector<PartitionID>&& parts) {
-  UNUSED(address);
-  UNUSED(parts);
-  if (running_.load(std::memory_order_acquire)) {
-    LOG(INFO) << "Balance process still running";
-    return Status::OK();
-  }
-
-  folly::Promise<Status> promise;
-  auto future = promise.getFuture();
-  // Space ID, Replica Factor and Dependent On Group
-  std::vector<std::tuple<GraphSpaceID, int32_t, bool>> spaces;
-  auto ret = getAllSpaces(spaces);
-  if (ret != nebula::cpp2::ErrorCode::SUCCEEDED) {
-    if (ret != nebula::cpp2::ErrorCode::E_LEADER_CHANGED) {
-      ret = nebula::cpp2::ErrorCode::E_STORE_FAILURE;
+    std::vector<HostAddr> hosts = MetaKeyUtils::parseZoneHosts(std::move(zoneValue));
+    Zone zone(zoneName);
+    for (HostAddr& ha : hosts) {
+      zone.hosts_.emplace(ha, Host(ha));
     }
-    return Status::Error("Can't get spaces");
+    zones_.emplace(zoneName, zone);
   }
-
-  bool expected = false;
-  if (inLeaderBalance_.compare_exchange_strong(expected, true)) {
-    hostLeaderMap_.reset(new HostLeaderMap);
-    auto status = adminClient_->getLeaderDist(hostLeaderMap_.get()).get();
-    if (!status.ok() || hostLeaderMap_->empty()) {
-      inLeaderBalance_ = false;
-      return Status::Error("Get leader distribution failed");
-    }
-
-    std::vector<folly::Future<Status>> futures;
-    for (const auto& spaceInfo : spaces) {
-      auto spaceId = std::get<0>(spaceInfo);
-      auto replicaFactor = std::get<1>(spaceInfo);
-      auto dependentOnZone = std::get<2>(spaceInfo);
-      LeaderBalancePlan plan;
-      auto balanceResult = buildLeaderBalancePlan(
-          hostLeaderMap_.get(), spaceId, replicaFactor, dependentOnZone, plan);
-      if (!nebula::ok(balanceResult) || !nebula::value(balanceResult)) {
-        LOG(ERROR) << "Building leader balance plan failed "
-                   << "Space: " << spaceId;
-        continue;
-      }
-      simplifyLeaderBalnacePlan(spaceId, plan);
-      for (const auto& task : plan) {
-        futures.emplace_back(adminClient_->transLeader(std::get<0>(task),
-                                                       std::get<1>(task),
-                                                       std::move(std::get<2>(task)),
-                                                       std::move(std::get<3>(task))));
-      }
-    }
-
-    int32_t failed = 0;
-    folly::collectAll(futures)
-        .via(executor_.get())
-        .thenTry([&](const auto& result) {
-          auto tries = result.value();
-          for (const auto& t : tries) {
-            if (!t.value().ok()) {
-              ++failed;
-            }
-          }
-        })
-        .wait();
-
-    inLeaderBalance_ = false;
-    if (failed != 0) {
-      LOG(ERROR) << failed << " partiton failed to transfer leader";
-    }
-    onFinished_(false);
-    return Status::Error("partiton failed to transfer leader");
-  }
-  onFinished_(true);
-  return Status::OK();
-}
-
-ErrorOr<nebula::cpp2::ErrorCode, bool> LeaderBalanceJobExecutor::buildLeaderBalancePlan(
-    HostLeaderMap* hostLeaderMap,
-    GraphSpaceID spaceId,
-    int32_t replicaFactor,
-    bool dependentOnZone,
-    LeaderBalancePlan& plan,
-    bool useDeviation) {
-  PartAllocation peersMap;
-  HostParts leaderHostParts;
-  size_t leaderParts = 0;
-  // store peers of all paritions in peerMap
-  folly::SharedMutex::ReadHolder rHolder(LockUtils::spaceLock());
   const auto& prefix = MetaKeyUtils::partPrefix(spaceId);
   std::unique_ptr<kvstore::KVIterator> iter;
-  auto retCode = kvstore_->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter);
+  auto retCode = kvstore->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter);
   if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) {
-    LOG(ERROR) << "Access kvstore failed, spaceId " << spaceId << static_cast<int32_t>(retCode);
+    LOG(ERROR) << "Access kvstore failed, spaceId " << spaceId << " "
+               << apache::thrift::util::enumNameSafe(retCode);
     return retCode;
   }
-
-  while (iter->valid()) {
+  for (; iter->valid(); iter->next()) {
     auto key = iter->key();
     PartitionID partId;
     memcpy(&partId, key.data() + prefix.size(), sizeof(PartitionID));
-    auto peers = MetaKeyUtils::parsePartVal(iter->val());
-    peersMap[partId] = std::move(peers);
-    ++leaderParts;
-    iter->next();
-  }
-
-  int32_t totalParts = 0;
-  HostParts allHostParts;
-  auto result = getHostParts(spaceId, dependentOnZone, allHostParts, totalParts);
-  if (!nebula::ok(result)) {
-    return nebula::error(result);
-  } else {
-    auto retVal = nebula::value(result);
-    if (!retVal || totalParts == 0 || allHostParts.empty()) {
-      LOG(ERROR) << "Invalid space " << spaceId;
-      return false;
-    }
-  }
-
-  std::unordered_set<HostAddr> activeHosts;
-  for (const auto& host : *hostLeaderMap) {
-    // only balance leader between hosts which have valid partition
-    if (!allHostParts[host.first].empty()) {
-      activeHosts.emplace(host.first);
-      leaderHostParts[host.first] = (*hostLeaderMap)[host.first][spaceId];
-    }
-  }
-
-  if (activeHosts.empty()) {
-    LOG(ERROR) << "No active hosts";
-    return false;
-  }
-
-  if (dependentOnZone) {
-    for (auto it = allHostParts.begin(); it != allHostParts.end(); it++) {
-      auto min = it->second.size() / replicaFactor;
-      VLOG(3) << "Host: " << it->first << " Bounds: " << min << " : " << min + 1;
-      hostBounds_[it->first] = std::make_pair(min, min + 1);
-    }
-  } else {
-    size_t activeSize = activeHosts.size();
-    size_t globalAvg = leaderParts / activeSize;
-    size_t globalMin = globalAvg;
-    size_t globalMax = globalAvg;
-    if (leaderParts % activeSize != 0) {
-      globalMax += 1;
-    }
-
-    if (useDeviation) {
-      globalMin = std::ceil(static_cast<double>(leaderParts) / activeSize *
-                            (1 - FLAGS_leader_balance_deviation));
-      globalMax = std::floor(static_cast<double>(leaderParts) / activeSize *
-                             (1 + FLAGS_leader_balance_deviation));
-    }
-    VLOG(3) << "Build leader balance plan, expected min load: " << globalMin
-            << ", max load: " << globalMax << " avg: " << globalAvg;
-
-    for (auto it = allHostParts.begin(); it != allHostParts.end(); it++) {
-      hostBounds_[it->first] = std::make_pair(globalMin, globalMax);
-    }
-  }
-
-  while (true) {
-    int32_t taskCount = 0;
-    bool hasUnbalancedHost = false;
-    for (const auto& hostEntry : leaderHostParts) {
-      auto host = hostEntry.first;
-      auto& hostMinLoad = hostBounds_[host].first;
-      auto& hostMaxLoad = hostBounds_[host].second;
-      int32_t partSize = hostEntry.second.size();
-      if (hostMinLoad <= partSize && partSize <= hostMaxLoad) {
-        VLOG(3) << partSize << " is between min load " << hostMinLoad << " and max load "
-                << hostMaxLoad;
-        continue;
-      }
-
-      hasUnbalancedHost = true;
-      if (partSize < hostMinLoad) {
-        // need to acquire leader from other hosts
-        LOG(INFO) << "Acquire leaders to host: " << host << " loading: " << partSize
-                  << " min loading " << hostMinLoad;
-        taskCount += acquireLeaders(
-            allHostParts, leaderHostParts, peersMap, activeHosts, host, plan, spaceId);
-      } else {
-        // need to transfer leader to other hosts
-        LOG(INFO) << "Giveup leaders from host: " << host << " loading: " << partSize
-                  << " max loading " << hostMaxLoad;
-        taskCount += giveupLeaders(leaderHostParts, peersMap, activeHosts, host, plan, spaceId);
+    std::vector<HostAddr> partHosts = MetaKeyUtils::parsePartVal(iter->val());
+    for (HostAddr& ha : partHosts) {
+      for (auto& [zn, zone] : zones_) {
+        auto it = zone.hosts_.find(ha);
+        if (it != zone.hosts_.end()) {
+          it->second.parts_.emplace(partId);
+        }
       }
     }
-
-    // If every host is balanced or no more task during this loop, then the plan
-    // is done
-    if (!hasUnbalancedHost || taskCount == 0) {
-      LOG(INFO) << "Not need balance";
-      break;
-    }
   }
-  return true;
+  return nebula::cpp2::ErrorCode::SUCCEEDED;
 }
 
-int32_t LeaderBalanceJobExecutor::acquireLeaders(HostParts& allHostParts,
-                                                 HostParts& leaderHostParts,
-                                                 PartAllocation& peersMap,
-                                                 std::unordered_set<HostAddr>& activeHosts,
-                                                 const HostAddr& target,
-                                                 LeaderBalancePlan& plan,
-                                                 GraphSpaceID spaceId) {
-  // host will loop for the partition which is not leader, and try to acuire the
-  // leader
-  int32_t taskCount = 0;
-  std::vector<PartitionID> diff;
-  std::set_difference(allHostParts[target].begin(),
-                      allHostParts[target].end(),
-                      leaderHostParts[target].begin(),
-                      leaderHostParts[target].end(),
-                      std::back_inserter(diff));
-  auto& targetLeaders = leaderHostParts[target];
-  size_t minLoad = hostBounds_[target].first;
-  for (const auto& partId : diff) {
-    VLOG(3) << "Try acquire leader for part " << partId;
-    // find the leader of partId
-    auto sources = peersMap[partId];
-    for (const auto& source : sources) {
-      if (source == target || !activeHosts.count(source)) {
-        continue;
-      }
-
-      // if peer is the leader of partId and can transfer, then transfer it to
-      // host
-      auto& sourceLeaders = leaderHostParts[source];
-      VLOG(3) << "Check peer: " << source << " min load: " << minLoad
-              << " peerLeaders size: " << sourceLeaders.size();
-      auto it = std::find(sourceLeaders.begin(), sourceLeaders.end(), partId);
-      if (it != sourceLeaders.end() && minLoad < sourceLeaders.size()) {
-        sourceLeaders.erase(it);
-        targetLeaders.emplace_back(partId);
-        plan.emplace_back(spaceId, partId, source, target);
-        LOG(INFO) << "acquire plan trans leader space: " << spaceId << " part: " << partId
-                  << " from " << source.host << ":" << source.port << " to " << target.host << ":"
-                  << target.port;
-        ++taskCount;
-        break;
-      }
-    }
-
-    // if host has enough leader, just return
-    if (targetLeaders.size() == minLoad) {
-      LOG(INFO) << "Host: " << target << "'s leader reach " << minLoad;
-      break;
-    }
+int32_t Zone::calPartNum() {
+  int32_t num = 0;
+  for (auto& p : hosts_) {
+    num += p.second.parts_.size();
   }
-  return taskCount;
+  partNum_ = num;
+  return partNum_;
 }
 
-int32_t LeaderBalanceJobExecutor::giveupLeaders(HostParts& leaderParts,
-                                                PartAllocation& peersMap,
-                                                std::unordered_set<HostAddr>& activeHosts,
-                                                const HostAddr& source,
-                                                LeaderBalancePlan& plan,
-                                                GraphSpaceID spaceId) {
-  int32_t taskCount = 0;
-  auto& sourceLeaders = leaderParts[source];
-  size_t maxLoad = hostBounds_[source].second;
-
-  // host will try to transfer the extra leaders to other peers
-  for (auto it = sourceLeaders.begin(); it != sourceLeaders.end();) {
-    // find the leader of partId
-    auto partId = *it;
-    const auto& targets = peersMap[partId];
-    bool isErase = false;
-
-    // leader should move to the peer with lowest loading
-    auto target =
-        std::min_element(targets.begin(), targets.end(), [&](const auto& l, const auto& r) -> bool {
-          if (source == l || !activeHosts.count(l)) {
-            return false;
-          }
-          return leaderParts[l].size() < leaderParts[r].size();
-        });
-
-    // If peer can accept this partition leader, than host will transfer to the
-    // peer
-    if (target != targets.end()) {
-      auto& targetLeaders = leaderParts[*target];
-      int32_t targetLeaderSize = targetLeaders.size();
-      if (targetLeaderSize < hostBounds_[*target].second) {
-        it = sourceLeaders.erase(it);
-        targetLeaders.emplace_back(partId);
-        plan.emplace_back(spaceId, partId, source, *target);
-        LOG(INFO) << "giveup plan trans leader space: " << spaceId << " part: " << partId
-                  << " from " << source.host << ":" << source.port << " to " << target->host << ":"
-                  << target->port;
-        ++taskCount;
-        isErase = true;
-      }
-    }
-
-    // if host has enough leader, just return
-    if (sourceLeaders.size() == maxLoad) {
-      LOG(INFO) << "Host: " << source << "'s leader reach " << maxLoad;
-      break;
-    }
-
-    if (!isErase) {
-      ++it;
+bool Zone::partExist(PartitionID partId) {
+  for (auto& p : hosts_) {
+    if (p.second.parts_.count(partId)) {
+      return true;
     }
   }
-  return taskCount;
+  return false;
 }
 
-void LeaderBalanceJobExecutor::simplifyLeaderBalnacePlan(GraphSpaceID spaceId,
-                                                         LeaderBalancePlan& plan) {
-  std::unordered_map<PartitionID, LeaderBalancePlan> buckets;
-  for (auto& task : plan) {
-    buckets[std::get<1>(task)].emplace_back(task);
-  }
-  plan.clear();
-  for (const auto& partEntry : buckets) {
-    plan.emplace_back(spaceId,
-                      partEntry.first,
-                      std::get<2>(partEntry.second.front()),
-                      std::get<3>(partEntry.second.back()));
+bool SpaceInfo::hasHost(const HostAddr& ha) {
+  for (auto& p : zones_) {
+    if (p.second.hasHost(ha)) {
+      return true;
+    }
   }
+  return false;
 }
 
 }  // namespace meta
diff --git a/src/meta/processors/job/BalanceJobExecutor.h b/src/meta/processors/job/BalanceJobExecutor.h
index e7f1653d6a5..ec9ac8f0de1 100644
--- a/src/meta/processors/job/BalanceJobExecutor.h
+++ b/src/meta/processors/job/BalanceJobExecutor.h
@@ -8,20 +8,43 @@
 
 #include "meta/processors/job/BalancePlan.h"
 #include "meta/processors/job/BalanceTask.h"
+#include "meta/processors/job/MetaJobExecutor.h"
 #include "meta/processors/job/SimpleConcurrentJobExecutor.h"
 
 namespace nebula {
 namespace meta {
-
-using ZoneParts = std::pair<std::string, std::vector<PartitionID>>;
 using HostParts = std::unordered_map<HostAddr, std::vector<PartitionID>>;
-using PartAllocation = std::unordered_map<PartitionID, std::vector<HostAddr>>;
 using LeaderBalancePlan = std::vector<std::tuple<GraphSpaceID, PartitionID, HostAddr, HostAddr>>;
-using ZoneNameAndParts = std::pair<std::string, std::vector<PartitionID>>;
 
-class BalanceJobExecutor : public MetaJobExecutor {
-  friend void testRestBlancer();
+struct Host {
+  explicit Host(const HostAddr& ha) : ha_(ha) {}
+  Host() = default;
+
+  HostAddr ha_;
+  std::set<PartitionID> parts_;
+};
+struct Zone {
+  Zone() = default;
+  explicit Zone(const std::string& name) : zoneName_(name) {}
+  bool hasHost(const HostAddr& ha) { return hosts_.find(ha) != hosts_.end(); }
+  int32_t calPartNum();
+  bool partExist(PartitionID partId);
+
+  std::string zoneName_;
+  std::map<HostAddr, Host> hosts_;
+  int32_t partNum_ = 0;
+};
+struct SpaceInfo {
+  nebula::cpp2::ErrorCode getInfo(GraphSpaceID spaceId, kvstore::KVStore* kvstore);
+  bool hasHost(const HostAddr& ha);
+
+  std::string name_;
+  GraphSpaceID spaceId_;
+  int32_t replica_;
+  std::map<std::string, Zone> zones_;
+};
+class BalanceJobExecutor : public MetaJobExecutor {
 public:
   BalanceJobExecutor(JobID jobId,
                      kvstore::KVStore* kvstore,
@@ -36,200 +59,17 @@ class BalanceJobExecutor : public MetaJobExecutor {
 
   nebula::cpp2::ErrorCode finish(bool ret = true) override;
 
-  folly::Future<Status> executeInternal(HostAddr&& address,
-                                        std::vector<PartitionID>&& parts) override;
-
-  bool runInMeta() override;
-
   nebula::cpp2::ErrorCode recovery() override;
 
 protected:
-  nebula::cpp2::ErrorCode getAllSpaces(
-      std::vector<std::tuple<GraphSpaceID, int32_t, bool>>& spaces);
-
-  ErrorOr<nebula::cpp2::ErrorCode, bool> getHostParts(GraphSpaceID spaceId,
-                                                      bool dependentOnGroup,
-                                                      HostParts& hostParts,
-                                                      int32_t& totalParts);
-
-  void calDiff(const HostParts& hostParts,
-               const std::vector<HostAddr>& activeHosts,
-               std::vector<HostAddr>& expand,
-               std::vector<HostAddr>& lost);
-
-  nebula::cpp2::ErrorCode assembleZoneParts(const std::vector<std::string>& zoneNames,
-                                            HostParts& hostParts);
-
   nebula::cpp2::ErrorCode save(const std::string& k, const std::string& v);
 
- protected:
-  static std::atomic_bool running_;
-  static std::mutex lock_;
-  bool innerBalance_ = false;
-  std::unique_ptr<folly::CPUThreadPoolExecutor> executor_;
-  std::unordered_map<HostAddr, ZoneNameAndParts> zoneParts_;
-  std::unordered_map<std::string, std::vector<HostAddr>> zoneHosts_;
-  std::unordered_map<HostAddr, std::vector<PartitionID>> relatedParts_;
-};
-
-class DataBalanceJobExecutor : public BalanceJobExecutor {
-  FRIEND_TEST(BalanceTest, BalancePartsTest);
-  FRIEND_TEST(BalanceTest, NormalTest);
-  FRIEND_TEST(BalanceTest, SimpleTestWithZone);
-  FRIEND_TEST(BalanceTest, SpecifyHostTest);
-  FRIEND_TEST(BalanceTest, SpecifyMultiHostTest);
-  FRIEND_TEST(BalanceTest, MockReplaceMachineTest);
-  FRIEND_TEST(BalanceTest, SingleReplicaTest);
-  FRIEND_TEST(BalanceTest, TryToRecoveryTest);
-  FRIEND_TEST(BalanceTest, RecoveryTest);
-  FRIEND_TEST(BalanceTest, StopPlanTest);
-  FRIEND_TEST(BalanceTest, CleanLastInvalidBalancePlanTest);
-  FRIEND_TEST(BalanceTest, LeaderBalancePlanTest);
-  FRIEND_TEST(BalanceTest, SimpleLeaderBalancePlanTest);
-  FRIEND_TEST(BalanceTest, IntersectHostsLeaderBalancePlanTest);
-  FRIEND_TEST(BalanceTest, LeaderBalanceTest);
-  FRIEND_TEST(BalanceTest, ManyHostsLeaderBalancePlanTest);
-  FRIEND_TEST(BalanceTest, LeaderBalanceWithZoneTest);
-  FRIEND_TEST(BalanceTest, LeaderBalanceWithLargerZoneTest);
-  FRIEND_TEST(BalanceTest, DISABLED_LeaderBalanceWithComplexZoneTest);
-  FRIEND_TEST(BalanceTest, ExpansionZoneTest);
-  FRIEND_TEST(BalanceTest, ExpansionHostIntoZoneTest);
-  FRIEND_TEST(BalanceTest, ShrinkZoneTest);
-  FRIEND_TEST(BalanceTest, ShrinkHostFromZoneTest);
-  FRIEND_TEST(BalanceTest, DISABLED_BalanceWithComplexZoneTest);
-  FRIEND_TEST(BalanceIntegrationTest, LeaderBalanceTest);
-  FRIEND_TEST(BalanceIntegrationTest, BalanceTest);
-  friend void testRestBlancer();
-
- public:
-  DataBalanceJobExecutor(JobDescription jobDescription,
-                         kvstore::KVStore* kvstore,
-                         AdminClient* adminClient,
-                         const std::vector<std::string>& params)
-      : BalanceJobExecutor(jobDescription.getJobId(), kvstore, adminClient, params),
-        jobDescription_(jobDescription) {}
-  nebula::cpp2::ErrorCode recovery() override;
-  nebula::cpp2::ErrorCode prepare() override;
-  nebula::cpp2::ErrorCode finish(bool ret = true) override;
-  nebula::cpp2::ErrorCode stop() override;
+  virtual Status buildBalancePlan() { return Status::OK(); }
 
 protected:
-  folly::Future<Status> executeInternal(HostAddr&& address,
-                                        std::vector<PartitionID>&& parts) override;
-  nebula::cpp2::ErrorCode buildBalancePlan();
-  ErrorOr<nebula::cpp2::ErrorCode, std::vector<BalanceTask>> genTasks(
-      GraphSpaceID spaceId,
-      int32_t spaceReplica,
-      bool dependentOnZone,
-      std::vector<HostAddr>& lostHosts);
-  ErrorOr<nebula::cpp2::ErrorCode, HostAddr> hostWithMinimalParts(const HostParts& hostParts,
-                                                                  PartitionID partId);
-
-  ErrorOr<nebula::cpp2::ErrorCode, HostAddr> hostWithMinimalPartsForZone(
-      const HostAddr& source,
-      const HostParts& hostParts,
-      PartitionID partId);
-  bool balanceParts(GraphSpaceID spaceId,
-                    HostParts& confirmedHostParts,
-                    int32_t totalParts,
-                    std::vector<BalanceTask>& tasks,
-                    bool dependentOnZone);
-
-  ErrorOr<nebula::cpp2::ErrorCode, std::pair<HostParts, std::vector<HostAddr>>> fetchHostParts(
-      GraphSpaceID spaceId,
-      bool dependentOnGroup,
-      const HostParts& hostParts,
-      std::vector<HostAddr>& lostHosts);
-
-  nebula::cpp2::ErrorCode transferLostHost(std::vector<BalanceTask>& tasks,
-                                           HostParts& confirmedHostParts,
-                                           const HostAddr& source,
-                                           GraphSpaceID spaceId,
-                                           PartitionID partId,
-                                           bool dependentOnZone);
-
-  Status checkReplica(const HostParts& hostParts,
-                      const std::vector<HostAddr>& activeHosts,
-                      int32_t replica,
-                      PartitionID partId);
-
-  std::vector<std::pair<HostAddr, int32_t>> sortedHostsByParts(const HostParts& hostParts);
-  bool checkZoneLegal(const HostAddr& source, const HostAddr& target);
-  nebula::cpp2::ErrorCode finishInternal(bool ret = true);
-
- private:
-  static std::unique_ptr<BalancePlan> plan_;
-  std::vector<HostAddr> lostHosts_;
-  JobDescription jobDescription_;
-};
-
-class LeaderBalanceJobExecutor : public BalanceJobExecutor {
-  FRIEND_TEST(BalanceTest, BalancePartsTest);
-  FRIEND_TEST(BalanceTest, NormalTest);
-  FRIEND_TEST(BalanceTest, SimpleTestWithZone);
-  FRIEND_TEST(BalanceTest, SpecifyHostTest);
-  FRIEND_TEST(BalanceTest, SpecifyMultiHostTest);
-  FRIEND_TEST(BalanceTest, MockReplaceMachineTest);
-  FRIEND_TEST(BalanceTest, SingleReplicaTest);
-  FRIEND_TEST(BalanceTest, TryToRecoveryTest);
-  FRIEND_TEST(BalanceTest, RecoveryTest);
-  FRIEND_TEST(BalanceTest, StopPlanTest);
-  FRIEND_TEST(BalanceTest, CleanLastInvalidBalancePlanTest);
-  FRIEND_TEST(BalanceTest, LeaderBalancePlanTest);
-  FRIEND_TEST(BalanceTest, SimpleLeaderBalancePlanTest);
-  FRIEND_TEST(BalanceTest, IntersectHostsLeaderBalancePlanTest);
-  FRIEND_TEST(BalanceTest, LeaderBalanceTest);
-  FRIEND_TEST(BalanceTest, ManyHostsLeaderBalancePlanTest);
-  FRIEND_TEST(BalanceTest, LeaderBalanceWithZoneTest);
-  FRIEND_TEST(BalanceTest, LeaderBalanceWithLargerZoneTest);
-  FRIEND_TEST(BalanceTest, DISABLED_LeaderBalanceWithComplexZoneTest);
-  FRIEND_TEST(BalanceTest, ExpansionZoneTest);
-  FRIEND_TEST(BalanceTest, ExpansionHostIntoZoneTest);
-  FRIEND_TEST(BalanceTest, ShrinkZoneTest);
-  FRIEND_TEST(BalanceTest, ShrinkHostFromZoneTest);
-  FRIEND_TEST(BalanceTest, DISABLED_BalanceWithComplexZoneTest);
-  FRIEND_TEST(BalanceIntegrationTest, LeaderBalanceTest);
-  FRIEND_TEST(BalanceIntegrationTest, BalanceTest);
-  friend void testRestBlancer();
-
- public:
-  LeaderBalanceJobExecutor(JobID jobId,
-                           kvstore::KVStore* kvstore,
-                           AdminClient* adminClient,
-                           const std::vector<std::string>& params)
-      : BalanceJobExecutor(jobId, kvstore, adminClient, params) {}
-
- protected:
-  folly::Future<Status> executeInternal(HostAddr&& address,
-                                        std::vector<PartitionID>&& parts) override;
-
-  ErrorOr<nebula::cpp2::ErrorCode, bool> buildLeaderBalancePlan(HostLeaderMap* hostLeaderMap,
-                                                                GraphSpaceID spaceId,
-                                                                int32_t replicaFactor,
-                                                                bool dependentOnZone,
-                                                                LeaderBalancePlan& plan,
-                                                                bool useDeviation = true);
-
-  int32_t acquireLeaders(HostParts& allHostParts,
-                         HostParts& leaderHostParts,
-                         PartAllocation& peersMap,
-                         std::unordered_set<HostAddr>& activeHosts,
-                         const HostAddr& target,
-                         LeaderBalancePlan& plan,
-                         GraphSpaceID spaceId);
-
-  int32_t giveupLeaders(HostParts& leaderParts,
-                        PartAllocation& peersMap,
-                        std::unordered_set<HostAddr>& activeHosts,
-                        const HostAddr& source,
-                        LeaderBalancePlan& plan,
-                        GraphSpaceID spaceId);
-
-  void simplifyLeaderBalnacePlan(GraphSpaceID spaceId, LeaderBalancePlan& plan);
-
- private:
-  static std::atomic_bool inLeaderBalance_;
-  std::unique_ptr<HostLeaderMap> hostLeaderMap_;
-  std::unordered_map<HostAddr, std::pair<int32_t, int32_t>> hostBounds_;
+  std::unique_ptr<BalancePlan> plan_;
+  std::unique_ptr<folly::Executor> executor_;
+  SpaceInfo spaceInfo_;
 };
 
 }  // namespace meta
diff --git a/src/meta/processors/job/BalancePlan.cpp b/src/meta/processors/job/BalancePlan.cpp
index 397e142a82a..45ebd0278aa 100644
--- a/src/meta/processors/job/BalancePlan.cpp
+++ b/src/meta/processors/job/BalancePlan.cpp
@@ -68,7 +68,9 @@ void BalancePlan::invoke() {
         if (finished) {
           CHECK_EQ(j, this->buckets_[i].size() - 1);
           saveInStore(true);
-          onFinished_();
+          onFinished_(stopped ? meta::cpp2::JobStatus::STOPPED
+                              : (failed_ ? meta::cpp2::JobStatus::FAILED
+                                         : meta::cpp2::JobStatus::FINISHED));
         } else if (j + 1 < this->buckets_[i].size()) {
           auto& task = this->tasks_[this->buckets_[i][j + 1]];
           if (stopped) {
@@ -83,6 +85,7 @@ void BalancePlan::invoke() {
       {
         std::lock_guard<std::mutex> lg(lock_);
         finishedTaskNum_++;
+        failed_ = true;
         VLOG(1) << "Balance " << id() << " has completed " << finishedTaskNum_ << " task";
         setStatus(meta::cpp2::JobStatus::FAILED);
         if (finishedTaskNum_ == tasks_.size()) {
@@ -93,7 +96,7 @@ void BalancePlan::invoke() {
       }
       if (finished) {
         CHECK_EQ(j, this->buckets_[i].size() - 1);
-        onFinished_();
+        onFinished_(stopped ? meta::cpp2::JobStatus::STOPPED : meta::cpp2::JobStatus::FAILED);
       } else if (j + 1 < this->buckets_[i].size()) {
         auto& task = this->tasks_[this->buckets_[i][j + 1]];
         if (tasks_[taskIndex].spaceId_ == task.spaceId_ &&
@@ -121,7 +124,6 @@ void BalancePlan::invoke() {
 nebula::cpp2::ErrorCode BalancePlan::saveInStore(bool onlyPlan) {
   CHECK_NOTNULL(kv_);
   std::vector<kvstore::KV> data;
-  data.emplace_back(jobDescription_.jobKey(), jobDescription_.jobVal());
   if (!onlyPlan) {
     for (auto& task : tasks_) {
       data.emplace_back(MetaKeyUtils::balanceTaskKey(
@@ -149,7 +151,7 @@ nebula::cpp2::ErrorCode BalancePlan::saveInStore(bool onlyPlan) {
 
 ErrorOr<nebula::cpp2::ErrorCode, std::vector<cpp2::BalanceTask>> BalancePlan::show(
     JobID jobId, kvstore::KVStore* kv, AdminClient* client) {
-  auto ret = getBalanceTasks(jobId, kv, client, true);
+  auto ret = getBalanceTasks(jobId, kv, client, false);
   if (!ok(ret)) {
     return error(ret);
   }
@@ -180,6 +182,10 @@ ErrorOr<nebula::cpp2::ErrorCode, std::vector<cpp2::BalanceTask>> BalancePlan::sh
   return thriftTasks;
 }
 
+void BalancePlan::setFinishCallBack(std::function<void(meta::cpp2::JobStatus)> func) {
+  onFinished_ = func;
+}
+
 ErrorOr<nebula::cpp2::ErrorCode, std::vector<BalanceTask>> BalancePlan::getBalanceTasks(
     JobID jobId, kvstore::KVStore* kv, AdminClient* client, bool resume) {
   CHECK_NOTNULL(kv);
@@ -213,7 +219,7 @@ ErrorOr<nebula::cpp2::ErrorCode, std::vector<BalanceTask>> BalancePlan::getBalan
         task.endTimeMs_ = std::get<3>(tup);
         if (resume && task.ret_ != BalanceTaskResult::SUCCEEDED) {
           // Resume the failed task, skip the in-progress and invalid tasks
-          if (task.ret_ == BalanceTaskResult::FAILED) {
+          if (task.ret_ == BalanceTaskResult::FAILED || task.ret_ == BalanceTaskResult::INVALID) {
             task.ret_ = BalanceTaskResult::IN_PROGRESS;
           }
           task.status_ = BalanceTaskStatus::START;
@@ -229,6 +235,7 @@ ErrorOr<nebula::cpp2::ErrorCode, std::vector<BalanceTask>> BalancePlan::getBalan
               task.ret_ = BalanceTaskResult::INVALID;
             }
           }
+          task.endTimeMs_ = 0;
         }
       }
       tasks.emplace_back(std::move(task));
diff --git a/src/meta/processors/job/BalancePlan.h b/src/meta/processors/job/BalancePlan.h
index 6f506600e52..711a7e1e814 100644
--- a/src/meta/processors/job/BalancePlan.h
+++ b/src/meta/processors/job/BalancePlan.h
@@ -94,6 +94,8 @@ class BalancePlan {
       kvstore::KVStore* kv,
       AdminClient* client);
 
+  void setFinishCallBack(std::function<void(meta::cpp2::JobStatus)> func);
+
 private:
   JobDescription jobDescription_;
   kvstore::KVStore* kv_ = nullptr;
@@ -101,8 +103,9 @@ class BalancePlan {
   std::vector<BalanceTask> tasks_;
   std::mutex lock_;
   size_t finishedTaskNum_ = 0;
-  std::function<void()> onFinished_;
+  std::function<void(meta::cpp2::JobStatus)> onFinished_;
   bool stopped_ = false;
+  bool failed_ = false;
 
   // List of task index in tasks_;
   using Bucket = std::vector<int32_t>;
diff --git a/src/meta/processors/job/BalanceTask.cpp b/src/meta/processors/job/BalanceTask.cpp
index 49653bf8b78..fabf2c25344 100644
--- a/src/meta/processors/job/BalanceTask.cpp
+++ b/src/meta/processors/job/BalanceTask.cpp
@@ -26,7 +26,8 @@ void BalanceTask::invoke() {
   if (ret_ == BalanceTaskResult::INVALID) {
     endTimeMs_ = time::WallClock::fastNowInSec();
     saveInStore();
-    LOG(ERROR) << taskIdStr_ << " Task invalid, status " << static_cast<int32_t>(status_);
+    LOG(ERROR) << taskIdStr_ + "," + commandStr_ << " Task invalid, status "
+               << static_cast<int32_t>(status_);
     // When a plan is stopped or dst is not alive any more, a task will be
     // marked as INVALID, the task will not be executed again. Balancer will
     // start a new plan instead.
@@ -35,11 +36,12 @@ void BalanceTask::invoke() { } else if (ret_ == BalanceTaskResult::FAILED) { endTimeMs_ = time::WallClock::fastNowInSec(); saveInStore(); - LOG(ERROR) << taskIdStr_ << " Task failed, status " << static_cast(status_); + LOG(ERROR) << taskIdStr_ + "," + commandStr_ << " Task failed, status " + << static_cast(status_); onError_(); return; } else { - VLOG(3) << taskIdStr_ << " still in processing"; + VLOG(3) << taskIdStr_ + "," + commandStr_ << " still in processing"; } switch (status_) { @@ -50,7 +52,8 @@ void BalanceTask::invoke() { SAVE_STATE(); client_->checkPeers(spaceId_, partId_).thenValue([this](auto&& resp) { if (!resp.ok()) { - LOG(ERROR) << taskIdStr_ << " Check the peers failed, status " << resp; + LOG(ERROR) << taskIdStr_ + "," + commandStr_ << " Check the peers failed, status " + << resp; ret_ = BalanceTaskResult::FAILED; } else { status_ = BalanceTaskStatus::CHANGE_LEADER; @@ -61,7 +64,7 @@ void BalanceTask::invoke() { } // fallthrough case BalanceTaskStatus::CHANGE_LEADER: { - LOG(INFO) << taskIdStr_ << " Ask the src to give up the leadership."; + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " Ask the src to give up the leadership."; SAVE_STATE(); auto srcLivedRet = ActiveHostsMan::isLived(kv_, src_); if (nebula::ok(srcLivedRet) && nebula::value(srcLivedRet)) { @@ -73,7 +76,8 @@ void BalanceTask::invoke() { LOG(WARNING) << "Can't find part " << partId_ << " on " << src_; status_ = BalanceTaskStatus::ADD_PART_ON_DST; } else { - LOG(ERROR) << taskIdStr_ << " Transfer leader failed, status " << resp; + LOG(ERROR) << taskIdStr_ + "," + commandStr_ << " Transfer leader failed, status " + << resp; ret_ = BalanceTaskResult::FAILED; } } else { @@ -83,17 +87,18 @@ void BalanceTask::invoke() { }); break; } else { - LOG(INFO) << taskIdStr_ << " Src host has been lost, so no need to transfer leader"; + LOG(INFO) << taskIdStr_ + "," + commandStr_ + << " Src host has been lost, so no need to transfer leader"; status_ = BalanceTaskStatus::ADD_PART_ON_DST; } } // fallthrough case BalanceTaskStatus::ADD_PART_ON_DST: { - LOG(INFO) << taskIdStr_ << " Open the part as learner on dst."; + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " Open the part as learner on dst."; SAVE_STATE(); client_->addPart(spaceId_, partId_, dst_, true).thenValue([this](auto&& resp) { if (!resp.ok()) { - LOG(ERROR) << taskIdStr_ << " Open part failed, status " << resp; + LOG(ERROR) << taskIdStr_ + "," + commandStr_ << " Open part failed, status " << resp; ret_ = BalanceTaskResult::FAILED; } else { status_ = BalanceTaskStatus::ADD_LEARNER; @@ -103,11 +108,11 @@ void BalanceTask::invoke() { break; } case BalanceTaskStatus::ADD_LEARNER: { - LOG(INFO) << taskIdStr_ << " Add learner dst."; + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " Add learner dst."; SAVE_STATE(); client_->addLearner(spaceId_, partId_, dst_).thenValue([this](auto&& resp) { if (!resp.ok()) { - LOG(ERROR) << taskIdStr_ << " Add learner failed, status " << resp; + LOG(ERROR) << taskIdStr_ + "," + commandStr_ << " Add learner failed, status " << resp; ret_ = BalanceTaskResult::FAILED; } else { status_ = BalanceTaskStatus::CATCH_UP_DATA; @@ -117,11 +122,11 @@ void BalanceTask::invoke() { break; } case BalanceTaskStatus::CATCH_UP_DATA: { - LOG(INFO) << taskIdStr_ << " Waiting for the data catch up."; + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " Waiting for the data catch up."; SAVE_STATE(); client_->waitingForCatchUpData(spaceId_, partId_, dst_).thenValue([this](auto&& resp) { if (!resp.ok()) { - LOG(ERROR) << taskIdStr_ << " 
Catchup data failed, status " << resp; + LOG(ERROR) << taskIdStr_ + "," + commandStr_ << " Catchup data failed, status " << resp; ret_ = BalanceTaskResult::FAILED; } else { status_ = BalanceTaskStatus::MEMBER_CHANGE_ADD; @@ -131,12 +136,12 @@ void BalanceTask::invoke() { break; } case BalanceTaskStatus::MEMBER_CHANGE_ADD: { - LOG(INFO) << taskIdStr_ << " Send member change request to the leader" + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " Send member change request to the leader" << ", it will add the new member on dst host"; SAVE_STATE(); client_->memberChange(spaceId_, partId_, dst_, true).thenValue([this](auto&& resp) { if (!resp.ok()) { - LOG(ERROR) << taskIdStr_ << " Add peer failed, status " << resp; + LOG(ERROR) << taskIdStr_ + "," + commandStr_ << " Add peer failed, status " << resp; ret_ = BalanceTaskResult::FAILED; } else { status_ = BalanceTaskStatus::MEMBER_CHANGE_REMOVE; @@ -146,12 +151,12 @@ void BalanceTask::invoke() { break; } case BalanceTaskStatus::MEMBER_CHANGE_REMOVE: { - LOG(INFO) << taskIdStr_ << " Send member change request to the leader" + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " Send member change request to the leader" << ", it will remove the old member on src host"; SAVE_STATE(); client_->memberChange(spaceId_, partId_, src_, false).thenValue([this](auto&& resp) { if (!resp.ok()) { - LOG(ERROR) << taskIdStr_ << " Remove peer failed, status " << resp; + LOG(ERROR) << taskIdStr_ + "," + commandStr_ << " Remove peer failed, status " << resp; ret_ = BalanceTaskResult::FAILED; } else { status_ = BalanceTaskStatus::UPDATE_PART_META; @@ -161,16 +166,16 @@ void BalanceTask::invoke() { break; } case BalanceTaskStatus::UPDATE_PART_META: { - LOG(INFO) << taskIdStr_ << " Update meta for part."; + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " Update meta for part."; SAVE_STATE(); client_->updateMeta(spaceId_, partId_, src_, dst_).thenValue([this](auto&& resp) { // The callback will be called inside raft set value. So don't call // invoke directly here. 
if (!resp.ok()) { - LOG(ERROR) << taskIdStr_ << " Update meta failed, status " << resp; + LOG(ERROR) << taskIdStr_ + "," + commandStr_ << " Update meta failed, status " << resp; ret_ = BalanceTaskResult::FAILED; } else { - LOG(INFO) << taskIdStr_ << " Update meta succeeded!"; + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " Update meta succeeded!"; status_ = BalanceTaskStatus::REMOVE_PART_ON_SRC; } invoke(); @@ -179,12 +184,12 @@ void BalanceTask::invoke() { } case BalanceTaskStatus::REMOVE_PART_ON_SRC: { auto srcLivedRet = ActiveHostsMan::isLived(kv_, src_); - LOG(INFO) << taskIdStr_ << " Close part on src host, srcLived."; + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " Close part on src host, srcLived."; SAVE_STATE(); if (nebula::ok(srcLivedRet) && nebula::value(srcLivedRet)) { client_->removePart(spaceId_, partId_, src_).thenValue([this](auto&& resp) { if (!resp.ok()) { - LOG(ERROR) << taskIdStr_ << " Remove part failed, status " << resp; + LOG(ERROR) << taskIdStr_ + "," + commandStr_ << " Remove part failed, status " << resp; ret_ = BalanceTaskResult::FAILED; } else { status_ = BalanceTaskStatus::CHECK; @@ -193,17 +198,18 @@ void BalanceTask::invoke() { }); break; } else { - LOG(INFO) << taskIdStr_ << " Don't remove part on src " << src_; + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " Don't remove part on src " << src_; status_ = BalanceTaskStatus::CHECK; } } // fallthrough case BalanceTaskStatus::CHECK: { - LOG(INFO) << taskIdStr_ << " Check the peers..."; + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " Check the peers..."; SAVE_STATE(); client_->checkPeers(spaceId_, partId_).thenValue([this](auto&& resp) { if (!resp.ok()) { - LOG(ERROR) << taskIdStr_ << " Check the peers failed, status " << resp; + LOG(ERROR) << taskIdStr_ + "," + commandStr_ << " Check the peers failed, status " + << resp; ret_ = BalanceTaskResult::FAILED; } else { status_ = BalanceTaskStatus::END; @@ -213,7 +219,7 @@ void BalanceTask::invoke() { break; } case BalanceTaskStatus::END: { - LOG(INFO) << taskIdStr_ << " Part has been moved successfully!"; + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " Part has been moved successfully!"; endTimeMs_ = time::WallClock::fastNowInSec(); ret_ = BalanceTaskResult::SUCCEEDED; SAVE_STATE(); @@ -250,6 +256,9 @@ bool BalanceTask::saveInStore() { baton.post(); }); baton.wait(); + if (ret_ == BalanceTaskResult::INVALID) + LOG(INFO) << taskIdStr_ + "," + commandStr_ << " save task: " << static_cast(status_) + << " " << static_cast(ret_); return ret; } diff --git a/src/meta/processors/job/BalanceTask.h b/src/meta/processors/job/BalanceTask.h index 84be7b33ae3..b9fbc36acfd 100644 --- a/src/meta/processors/job/BalanceTask.h +++ b/src/meta/processors/job/BalanceTask.h @@ -46,9 +46,11 @@ class BalanceTask { partId_(partId), src_(src), dst_(dst), - taskIdStr_(buildTaskId()), kv_(kv), - client_(client) {} + client_(client) { + taskIdStr_ = buildTaskId(); + commandStr_ = buildCommand(); + } const std::string& taskIdStr() const { return taskIdStr_; diff --git a/src/meta/processors/job/DataBalanceJobExecutor.cpp b/src/meta/processors/job/DataBalanceJobExecutor.cpp new file mode 100644 index 00000000000..c3ffc0ac3a7 --- /dev/null +++ b/src/meta/processors/job/DataBalanceJobExecutor.cpp @@ -0,0 +1,190 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. 
+ */ + +#include "meta/processors/job/DataBalanceJobExecutor.h" + +#include + +#include "common/utils/MetaKeyUtils.h" +#include "kvstore/NebulaStore.h" +#include "meta/processors/job/JobUtils.h" + +namespace nebula { +namespace meta { + +folly::Future DataBalanceJobExecutor::executeInternal() { + if (plan_ == nullptr) { + Status status = buildBalancePlan(); + if (status != Status::OK()) { + return status; + } + } + plan_->setFinishCallBack([this](meta::cpp2::JobStatus status) { + if (LastUpdateTimeMan::update(kvstore_, time::WallClock::fastNowInMilliSec()) != + nebula::cpp2::ErrorCode::SUCCEEDED) { + LOG(ERROR) << "Balance plan " << plan_->id() << " update meta failed"; + } + executorOnFinished_(status); + }); + plan_->invoke(); + return Status::OK(); +} + +Status DataBalanceJobExecutor::buildBalancePlan() { + std::map> lostZoneHost; + std::map> activeSortedHost; + for (auto& p : spaceInfo_.zones_) { + for (auto& ph : p.second.hosts_) { + activeSortedHost[p.first].push_back(&ph.second); + } + } + for (HostAddr ha : lostHosts_) { + if (!spaceInfo_.hasHost(ha)) { + return Status::Error( + "Host %s does not belong to space %d", ha.toString().c_str(), spaceInfo_.spaceId_); + } + for (auto& zoneMapEntry : spaceInfo_.zones_) { + auto it = zoneMapEntry.second.hosts_.find(ha); + if (it != zoneMapEntry.second.hosts_.end()) { + lostZoneHost[zoneMapEntry.first].push_back(&it->second); + std::vector& hvec = activeSortedHost[zoneMapEntry.first]; + hvec.erase(std::find(hvec.begin(), hvec.end(), &it->second)); + break; + } + } + } + for (auto& hostMapEntry : activeSortedHost) { + std::vector& hvec = hostMapEntry.second; + std::sort(hvec.begin(), hvec.end(), [](Host*& l, Host*& r) -> bool { + return l->parts_.size() < r->parts_.size(); + }); + } + plan_.reset(new BalancePlan(jobDescription_, kvstore_, adminClient_)); + for (auto& p : lostZoneHost) { + std::vector& hvec = activeSortedHost[p.first]; + for (Host* h : p.second) { + for (PartitionID partId : h->parts_) { + Host* dstHost = hvec.front(); + dstHost->parts_.insert(partId); + plan_->addTask(BalanceTask( + jobId_, spaceInfo_.spaceId_, partId, h->ha_, dstHost->ha_, kvstore_, adminClient_)); + for (size_t i = 0; i < hvec.size() - 1; i++) { + if (hvec[i]->parts_.size() > hvec[i + 1]->parts_.size()) { + std::swap(hvec[i], hvec[i + 1]); + } else { + break; + } + } + } + h->parts_.clear(); + } + } + lostZoneHost.clear(); + auto balanceHostVec = [this](std::vector& hostVec) -> std::vector { + size_t totalPartNum = 0; + size_t avgPartNum = 0; + for (Host* h : hostVec) { + totalPartNum += h->parts_.size(); + } + avgPartNum = totalPartNum / hostVec.size(); + size_t remainder = totalPartNum - avgPartNum * hostVec.size(); + size_t leftBegin = 0; + size_t leftEnd = 0; + size_t rightBegin = 0; + size_t rightEnd = hostVec.size(); + std::vector tasks; + for (size_t i = 0; i < hostVec.size(); i++) { + if (avgPartNum <= hostVec[i]->parts_.size()) { + leftEnd = i; + break; + } + } + for (size_t i = 0; i < hostVec.size(); i++) { + if (avgPartNum < hostVec[i]->parts_.size()) { + rightBegin = i; + break; + } + } + for (size_t right = rightBegin; right < rightEnd;) { + Host* srcHost = hostVec[right]; + if (srcHost->parts_.size() == avgPartNum + 1 && remainder) { + right++; + remainder--; + continue; + } + if (srcHost->parts_.size() == avgPartNum) { + right++; + continue; + } + PartitionID partId = *(srcHost->parts_.begin()); + hostVec[leftBegin]->parts_.insert(partId); + srcHost->parts_.erase(partId); + tasks.emplace_back(jobId_, + spaceInfo_.spaceId_, + partId, + 
srcHost->ha_, + hostVec[leftBegin]->ha_, + kvstore_, + adminClient_); + size_t leftIndex = leftBegin; + for (; leftIndex < leftEnd - 1; leftIndex++) { + if (hostVec[leftIndex]->parts_.size() > hostVec[leftIndex + 1]->parts_.size()) { + std::swap(hostVec[leftIndex], hostVec[leftIndex + 1]); + } else { + break; + } + } + if (leftIndex == leftEnd - 1 && hostVec[leftIndex]->parts_.size() >= avgPartNum) { + leftEnd--; + } + if (leftBegin == leftEnd) { + leftEnd = rightBegin; + } + } + return tasks; + }; + for (auto& p : activeSortedHost) { + std::vector& hvec = p.second; + std::vector tasks = balanceHostVec(hvec); + for (BalanceTask& task : tasks) { + plan_->addTask(std::move(task)); + } + } + if (plan_->tasks().empty()) { + return Status::Balanced(); + } + nebula::cpp2::ErrorCode rc = plan_->saveInStore(); + if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) { + return Status::Error("save balance zone plan failed"); + } + return Status::OK(); +} + +nebula::cpp2::ErrorCode DataBalanceJobExecutor::stop() { + stopped_ = true; + plan_->stop(); + return nebula::cpp2::ErrorCode::SUCCEEDED; +} + +nebula::cpp2::ErrorCode DataBalanceJobExecutor::prepare() { + auto spaceRet = getSpaceIdFromName(paras_.back()); + if (!nebula::ok(spaceRet)) { + LOG(ERROR) << "Can't find the space: " << paras_.back(); + return nebula::error(spaceRet); + } + GraphSpaceID spaceId = nebula::value(spaceRet); + nebula::cpp2::ErrorCode rc = spaceInfo_.getInfo(spaceId, kvstore_); + if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) { + return rc; + } + lostHosts_.reserve(paras_.size() - 1); + for (size_t i = 0; i < paras_.size() - 1; i++) { + lostHosts_.emplace_back(HostAddr::fromString(paras_[i])); + } + return nebula::cpp2::ErrorCode::SUCCEEDED; +} + +} // namespace meta +} // namespace nebula diff --git a/src/meta/processors/job/DataBalanceJobExecutor.h b/src/meta/processors/job/DataBalanceJobExecutor.h new file mode 100644 index 00000000000..f7759320227 --- /dev/null +++ b/src/meta/processors/job/DataBalanceJobExecutor.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +#ifndef META_DATABALANCEJOBEXECUTOR_H_ +#define META_DATABALANCEJOBEXECUTOR_H_ + +#include "meta/processors/job/BalanceJobExecutor.h" + +namespace nebula { +namespace meta { + +class DataBalanceJobExecutor : public BalanceJobExecutor { + FRIEND_TEST(BalanceTest, BalanceDataPlanTest); + FRIEND_TEST(BalanceTest, NormalDataTest); + FRIEND_TEST(BalanceTest, RecoveryTest); + FRIEND_TEST(BalanceTest, StopPlanTest); + + public: + DataBalanceJobExecutor(JobDescription jobDescription, + kvstore::KVStore* kvstore, + AdminClient* adminClient, + const std::vector& params) + : BalanceJobExecutor(jobDescription.getJobId(), kvstore, adminClient, params), + jobDescription_(jobDescription) {} + nebula::cpp2::ErrorCode prepare() override; + nebula::cpp2::ErrorCode stop() override; + + protected: + folly::Future executeInternal() override; + Status buildBalancePlan() override; + + private: + std::vector lostHosts_; + JobDescription jobDescription_; +}; + +} // namespace meta +} // namespace nebula + +#endif // META_BALANCEJOBEXECUTOR_H_ diff --git a/src/meta/processors/job/JobExecutor.cpp b/src/meta/processors/job/JobExecutor.cpp new file mode 100644 index 00000000000..d940afdc7c2 --- /dev/null +++ b/src/meta/processors/job/JobExecutor.cpp @@ -0,0 +1,84 @@ +/* Copyright (c) 2019 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. 
+ */ + +#include "common/network/NetworkUtils.h" +#include "common/utils/MetaKeyUtils.h" +#include "common/utils/Utils.h" +#include "interface/gen-cpp2/common_types.h" +#include "meta/ActiveHostsMan.h" +#include "meta/common/MetaCommon.h" +#include "meta/processors/Common.h" +#include "meta/processors/admin/AdminClient.h" +#include "meta/processors/job/CompactJobExecutor.h" +#include "meta/processors/job/DataBalanceJobExecutor.h" +#include "meta/processors/job/FlushJobExecutor.h" +#include "meta/processors/job/LeaderBalanceJobExecutor.h" +#include "meta/processors/job/RebuildEdgeJobExecutor.h" +#include "meta/processors/job/RebuildFTJobExecutor.h" +#include "meta/processors/job/RebuildTagJobExecutor.h" +#include "meta/processors/job/StatsJobExecutor.h" +#include "meta/processors/job/StorageJobExecutor.h" +#include "meta/processors/job/TaskDescription.h" +#include "meta/processors/job/ZoneBalanceJobExecutor.h" + +DECLARE_int32(heartbeat_interval_secs); +DECLARE_uint32(expired_time_factor); + +namespace nebula { +namespace meta { + +ErrorOr JobExecutor::getSpaceIdFromName( + const std::string& spaceName) { + auto indexKey = MetaKeyUtils::indexSpaceKey(spaceName); + std::string val; + auto retCode = kvstore_->get(kDefaultSpaceId, kDefaultPartId, indexKey, &val); + if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) { + LOG(ERROR) << "Get space failed, space name: " << spaceName + << " error: " << apache::thrift::util::enumNameSafe(retCode); + return retCode; + } + return *reinterpret_cast(val.c_str()); +} + +std::unique_ptr JobExecutorFactory::createJobExecutor(const JobDescription& jd, + kvstore::KVStore* store, + AdminClient* client) { + std::unique_ptr ret; + switch (jd.getCmd()) { + case cpp2::AdminCmd::COMPACT: + ret.reset(new CompactJobExecutor(jd.getJobId(), store, client, jd.getParas())); + break; + case cpp2::AdminCmd::DATA_BALANCE: + ret.reset(new DataBalanceJobExecutor(jd, store, client, jd.getParas())); + break; + case cpp2::AdminCmd::ZONE_BALANCE: + ret.reset(new ZoneBalanceJobExecutor(jd, store, client, jd.getParas())); + break; + case cpp2::AdminCmd::LEADER_BALANCE: + ret.reset(new LeaderBalanceJobExecutor(jd.getJobId(), store, client, jd.getParas())); + break; + case cpp2::AdminCmd::FLUSH: + ret.reset(new FlushJobExecutor(jd.getJobId(), store, client, jd.getParas())); + break; + case cpp2::AdminCmd::REBUILD_TAG_INDEX: + ret.reset(new RebuildTagJobExecutor(jd.getJobId(), store, client, jd.getParas())); + break; + case cpp2::AdminCmd::REBUILD_EDGE_INDEX: + ret.reset(new RebuildEdgeJobExecutor(jd.getJobId(), store, client, jd.getParas())); + break; + case cpp2::AdminCmd::REBUILD_FULLTEXT_INDEX: + ret.reset(new RebuildFTJobExecutor(jd.getJobId(), store, client, jd.getParas())); + break; + case cpp2::AdminCmd::STATS: + ret.reset(new StatsJobExecutor(jd.getJobId(), store, client, jd.getParas())); + break; + default: + break; + } + return ret; +} + +} // namespace meta +} // namespace nebula diff --git a/src/meta/processors/job/JobExecutor.h b/src/meta/processors/job/JobExecutor.h new file mode 100644 index 00000000000..9c482dd272b --- /dev/null +++ b/src/meta/processors/job/JobExecutor.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. 
+ */ + +#ifndef META_JOBEXECUTOR_H_ +#define META_JOBEXECUTOR_H_ + +#include + +#include "common/base/ErrorOr.h" +#include "kvstore/KVStore.h" +#include "meta/processors/admin/AdminClient.h" +#include "meta/processors/job/JobDescription.h" + +namespace nebula { +namespace meta { + +class JobExecutor { + public: + JobExecutor() = default; + explicit JobExecutor(kvstore::KVStore* kv) : kvstore_(kv) {} + virtual ~JobExecutor() = default; + + // Check the arguments about the job. + virtual bool check() = 0; + + // Prepare the Job info from the arguments. + virtual nebula::cpp2::ErrorCode prepare() = 0; + + // The skeleton to run the job. + // You should rewrite the executeInternal to trigger the calling. + virtual nebula::cpp2::ErrorCode execute() = 0; + + // Stop the job when the user cancel it. + virtual nebula::cpp2::ErrorCode stop() = 0; + + virtual nebula::cpp2::ErrorCode finish(bool) = 0; + + virtual nebula::cpp2::ErrorCode recovery() = 0; + + virtual void setSpaceId(GraphSpaceID spaceId) = 0; + + virtual bool isMetaJob() = 0; + + virtual void setFinishCallBack( + std::function func) { + UNUSED(func); + } + + virtual nebula::cpp2::ErrorCode saveSpecialTaskStatus(const cpp2::ReportTaskReq&) = 0; + + protected: + ErrorOr getSpaceIdFromName(const std::string& spaceName); + + protected: + kvstore::KVStore* kvstore_{nullptr}; +}; + +class JobExecutorFactory { + public: + static std::unique_ptr createJobExecutor(const JobDescription& jd, + kvstore::KVStore* store, + AdminClient* client); +}; + +} // namespace meta +} // namespace nebula + +#endif // META_JOBEXECUTOR_H_ diff --git a/src/meta/processors/job/JobManager.cpp b/src/meta/processors/job/JobManager.cpp index 69247726e0b..e833b735126 100644 --- a/src/meta/processors/job/JobManager.cpp +++ b/src/meta/processors/job/JobManager.cpp @@ -45,8 +45,7 @@ bool JobManager::init(nebula::kvstore::KVStore* store) { if (store == nullptr) { return false; } - std::lock_guard lk(statusGuard_); - if (status_ != JbmgrStatus::NOT_START) { + if (status_.load(std::memory_order_acquire) != JbmgrStatus::NOT_START) { return false; } kvStore_ = store; @@ -54,7 +53,7 @@ bool JobManager::init(nebula::kvstore::KVStore* store) { lowPriorityQueue_ = std::make_unique, true>>(); highPriorityQueue_ = std::make_unique, true>>(); - status_ = JbmgrStatus::IDLE; + status_.store(JbmgrStatus::IDLE, std::memory_order_release); if (handleRemainingJobs() != nebula::cpp2::ErrorCode::SUCCEEDED) { return false; } @@ -86,10 +85,10 @@ nebula::cpp2::ErrorCode JobManager::handleRemainingJobs() { auto optJobRet = JobDescription::makeJobDescription(iter->key(), iter->val()); if (nebula::ok(optJobRet)) { auto optJob = nebula::value(optJobRet); - std::unique_ptr je = - MetaJobExecutorFactory::createMetaJobExecutor(optJob, kvStore_, adminClient_); + std::unique_ptr je = + JobExecutorFactory::createJobExecutor(optJob, kvStore_, adminClient_); // Only balance has been recovered - if (optJob.getStatus() == cpp2::JobStatus::RUNNING && je->runInMeta()) { + if (optJob.getStatus() == cpp2::JobStatus::RUNNING && je->isMetaJob()) { jds.emplace_back(optJob); } } @@ -103,24 +102,22 @@ nebula::cpp2::ErrorCode JobManager::handleRemainingJobs() { void JobManager::shutDown() { LOG(INFO) << "JobManager::shutDown() begin"; - if (status_ == JbmgrStatus::STOPPED) { // in case of shutdown more than once + if (status_.load(std::memory_order_acquire) == + JbmgrStatus::STOPPED) { // in case of shutdown more than once LOG(INFO) << "JobManager not running, exit"; return; } - { - std::lock_guard lk(statusGuard_); - 
status_ = JbmgrStatus::STOPPED; - } + status_.store(JbmgrStatus::STOPPED, std::memory_order_release); bgThread_.join(); LOG(INFO) << "JobManager::shutDown() end"; } void JobManager::scheduleThread() { LOG(INFO) << "JobManager::runJobBackground() enter"; - while (status_ != JbmgrStatus::STOPPED) { + while (status_.load(std::memory_order_acquire) != JbmgrStatus::STOPPED) { std::pair opJobId; - while (status_ == JbmgrStatus::BUSY || !try_dequeue(opJobId)) { - if (status_ == JbmgrStatus::STOPPED) { + while (status_.load(std::memory_order_acquire) == JbmgrStatus::BUSY || !try_dequeue(opJobId)) { + if (status_.load(std::memory_order_acquire) == JbmgrStatus::STOPPED) { LOG(INFO) << "[JobManager] detect shutdown called, exit"; break; } @@ -133,17 +130,12 @@ void JobManager::scheduleThread() { continue; // leader change or archive happened } auto jobDesc = nebula::value(jobDescRet); - if (!jobDesc.setStatus(cpp2::JobStatus::RUNNING)) { + if (!jobDesc.setStatus(cpp2::JobStatus::RUNNING, opJobId.first == JbOp::RECOVER)) { LOG(INFO) << "[JobManager] skip job " << opJobId.second; continue; } save(jobDesc.jobKey(), jobDesc.jobVal()); - { - std::lock_guard lk(statusGuard_); - if (status_ == JbmgrStatus::IDLE) { - status_ = JbmgrStatus::BUSY; - } - } + compareChangeStatus(JbmgrStatus::IDLE, JbmgrStatus::BUSY); if (!runJobInternal(jobDesc, opJobId.first)) { jobFinished(opJobId.second, cpp2::JobStatus::FAILED); } @@ -152,8 +144,11 @@ void JobManager::scheduleThread() { // @return: true if all task dispatched, else false bool JobManager::runJobInternal(const JobDescription& jobDesc, JbOp op) { - std::unique_ptr jobExec = - MetaJobExecutorFactory::createMetaJobExecutor(jobDesc, kvStore_, adminClient_); + std::lock_guard lk(muJobFinished_); + std::unique_ptr je = + JobExecutorFactory::createJobExecutor(jobDesc, kvStore_, adminClient_); + JobExecutor* jobExec = je.get(); + runningJobs_.emplace(jobDesc.getJobId(), std::move(je)); if (jobExec == nullptr) { LOG(ERROR) << "unreconized job cmd " << apache::thrift::util::enumNameSafe(jobDesc.getCmd()); return false; @@ -176,24 +171,14 @@ bool JobManager::runJobInternal(const JobDescription& jobDesc, JbOp op) { if (op == JbOp::RECOVER) { jobExec->recovery(); } - if (jobExec->runInMeta()) { - jobExec->setFinishCallBack([this, &jobDesc](bool ret) { - SCOPE_EXIT { + if (jobExec->isMetaJob()) { + jobExec->setFinishCallBack([this, jobDesc](meta::cpp2::JobStatus status) { + if (status == meta::cpp2::JobStatus::STOPPED) { + std::lock_guard lkg(muJobFinished_); cleanJob(jobDesc.getJobId()); - }; - if (ret) { - JobDescription jd = jobDesc; - if (!jd.setStatus(cpp2::JobStatus::FINISHED)) { - return nebula::cpp2::ErrorCode::E_SAVE_JOB_FAILURE; - } - statusGuard_.lock(); - if (status_ == JbmgrStatus::BUSY) { - status_ = JbmgrStatus::IDLE; - } - statusGuard_.unlock(); - return save(jd.jobKey(), jd.jobVal()); - } else { return nebula::cpp2::ErrorCode::SUCCEEDED; + } else { + return jobFinished(jobDesc.getJobId(), status); } }); } @@ -211,6 +196,10 @@ void JobManager::cleanJob(JobID jobId) { if (it != inFlightJobs_.end()) { inFlightJobs_.erase(it); } + auto itr = runningJobs_.find(jobId); + if (itr != runningJobs_.end()) { + runningJobs_.erase(itr); + } } nebula::cpp2::ErrorCode JobManager::jobFinished(JobID jobId, cpp2::JobStatus jobStatus) { @@ -218,9 +207,6 @@ nebula::cpp2::ErrorCode JobManager::jobFinished(JobID jobId, cpp2::JobStatus job "{}, jobId={}, result={}", __func__, jobId, apache::thrift::util::enumNameSafe(jobStatus)); // normal job finish may race to job stop 
std::lock_guard lk(muJobFinished_); - SCOPE_EXIT { - cleanJob(jobId); - }; auto optJobDescRet = JobDescription::loadJobDescription(jobId, kvStore_); if (!nebula::ok(optJobDescRet)) { LOG(WARNING) << folly::sformat("can't load job, jobId={}", jobId); @@ -228,10 +214,7 @@ nebula::cpp2::ErrorCode JobManager::jobFinished(JobID jobId, cpp2::JobStatus job // there is a rare condition, that when job finished, // the job description is deleted(default more than a week) // but stop an invalid job should not set status to idle. - std::lock_guard statusLk(statusGuard_); - if (status_ == JbmgrStatus::BUSY) { - status_ = JbmgrStatus::IDLE; - } + compareChangeStatus(JbmgrStatus::BUSY, JbmgrStatus::IDLE); } return nebula::error(optJobDescRet); } @@ -242,23 +225,18 @@ nebula::cpp2::ErrorCode JobManager::jobFinished(JobID jobId, cpp2::JobStatus job // job already been set as finished, failed or stopped return nebula::cpp2::ErrorCode::E_SAVE_JOB_FAILURE; } - { - std::lock_guard statusLk(statusGuard_); - if (status_ == JbmgrStatus::BUSY) { - status_ = JbmgrStatus::IDLE; - } - } + compareChangeStatus(JbmgrStatus::BUSY, JbmgrStatus::IDLE); auto rc = save(optJobDesc.jobKey(), optJobDesc.jobVal()); if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) { return rc; } - auto jobExec = MetaJobExecutorFactory::createMetaJobExecutor(optJobDesc, kvStore_, adminClient_); - - if (!jobExec) { - LOG(WARNING) << folly::sformat("unable to create jobExecutor, jobId={}", jobId); + auto it = runningJobs_.find(jobId); + if (it == runningJobs_.end()) { + LOG(WARNING) << folly::sformat("can't find jobExecutor, jobId={}", jobId); return nebula::cpp2::ErrorCode::E_UNKNOWN; } + std::unique_ptr& jobExec = it->second; if (!optJobDesc.getParas().empty()) { auto spaceName = optJobDesc.getParas().back(); auto spaceIdRet = getSpaceId(spaceName); @@ -276,9 +254,13 @@ nebula::cpp2::ErrorCode JobManager::jobFinished(JobID jobId, cpp2::JobStatus job jobExec->setSpaceId(spaceId); } if (jobStatus == cpp2::JobStatus::STOPPED) { - return jobExec->stop(); + jobExec->stop(); + if (!jobExec->isMetaJob()) { + cleanJob(jobId); + } } else { jobExec->finish(jobStatus == cpp2::JobStatus::FINISHED); + cleanJob(jobId); } return nebula::cpp2::ErrorCode::SUCCEEDED; @@ -301,7 +283,7 @@ nebula::cpp2::ErrorCode JobManager::saveTaskStatus(TaskDescription& td, } auto optJobDesc = nebula::value(optJobDescRet); - auto jobExec = MetaJobExecutorFactory::createMetaJobExecutor(optJobDesc, kvStore_, adminClient_); + auto jobExec = JobExecutorFactory::createJobExecutor(optJobDesc, kvStore_, adminClient_); if (!jobExec) { LOG(WARNING) << folly::sformat("createMetaJobExecutor failed(), jobId={}", jobId); @@ -330,6 +312,11 @@ nebula::cpp2::ErrorCode JobManager::saveTaskStatus(TaskDescription& td, return jobExec->saveSpecialTaskStatus(req); } +void JobManager::compareChangeStatus(JbmgrStatus expected, JbmgrStatus despire) { + JbmgrStatus ex = expected; + status_.compare_exchange_strong(ex, despire, std::memory_order_acq_rel); +} + /** * @brief * client should retry if any persist attempt @@ -341,7 +328,8 @@ nebula::cpp2::ErrorCode JobManager::reportTaskFinish(const cpp2::ReportTaskReq& auto jobId = req.get_job_id(); auto taskId = req.get_task_id(); // only an active job manager will accept task finish report - if (status_ == JbmgrStatus::STOPPED || status_ == JbmgrStatus::NOT_START) { + if (status_.load(std::memory_order_acquire) == JbmgrStatus::STOPPED || + status_.load(std::memory_order_acquire) == JbmgrStatus::NOT_START) { LOG(INFO) << folly::sformat( "report to an in-active job 
manager, job={}, task={}", jobId, taskId); return nebula::cpp2::ErrorCode::E_UNKNOWN; @@ -579,7 +567,8 @@ JobManager::showJob(JobID iJob, const std::string& spaceName) { ret.second.emplace_back(td.toTaskDesc()); } } - if (ret.first.get_cmd() == meta::cpp2::AdminCmd::DATA_BALANCE) { + if (ret.first.get_cmd() == meta::cpp2::AdminCmd::DATA_BALANCE || + ret.first.get_cmd() == meta::cpp2::AdminCmd::ZONE_BALANCE) { auto res = BalancePlan::show(iJob, kvStore_, adminClient_); if (ok(res)) { std::vector thriftTasks = value(res); @@ -659,7 +648,9 @@ ErrorOr JobManager::recoverJob( if (optJob.getParas().back() != spaceName) { continue; } - if (optJob.getStatus() == cpp2::JobStatus::QUEUE) { + if (optJob.getStatus() == cpp2::JobStatus::QUEUE || + (jobIds.size() && (optJob.getStatus() == cpp2::JobStatus::FAILED || + optJob.getStatus() == cpp2::JobStatus::STOPPED))) { // Check if the job exists JobID jId = 0; auto jobExist = checkJobExist(optJob.getCmd(), optJob.getParas(), jId); diff --git a/src/meta/processors/job/JobManager.h b/src/meta/processors/job/JobManager.h index 59eac5d2dff..2b4e0056a81 100644 --- a/src/meta/processors/job/JobManager.h +++ b/src/meta/processors/job/JobManager.h @@ -19,7 +19,7 @@ #include "kvstore/NebulaStore.h" #include "meta/processors/job/JobDescription.h" #include "meta/processors/job/JobStatus.h" -#include "meta/processors/job/MetaJobExecutor.h" +#include "meta/processors/job/StorageJobExecutor.h" #include "meta/processors/job/TaskDescription.h" namespace nebula { @@ -46,6 +46,7 @@ class JobManager : public nebula::cpp::NonCopyable, public nebula::cpp::NonMovab FRIEND_TEST(GetStatsTest, StatsJob); FRIEND_TEST(GetStatsTest, MockSingleMachineTest); FRIEND_TEST(GetStatsTest, MockMultiMachineTest); + friend struct JobCallBack; public: ~JobManager(); @@ -140,23 +141,25 @@ class JobManager : public nebula::cpp::NonCopyable, public nebula::cpp::NonMovab nebula::cpp2::ErrorCode saveTaskStatus(TaskDescription& td, const cpp2::ReportTaskReq& req); + void compareChangeStatus(JbmgrStatus expected, JbmgrStatus despire); + private: // Todo(pandasheep) // When folly is upgraded, PriorityUMPSCQueueSet can be used // Use two queues to simulate priority queue, Divide by job cmd std::unique_ptr, true>> lowPriorityQueue_; std::unique_ptr, true>> highPriorityQueue_; + std::map> runningJobs_; // The job in running or queue folly::ConcurrentHashMap inFlightJobs_; std::thread bgThread_; - std::mutex statusGuard_; - JbmgrStatus status_{JbmgrStatus::NOT_START}; nebula::kvstore::KVStore* kvStore_{nullptr}; AdminClient* adminClient_{nullptr}; std::mutex muReportFinish_; std::mutex muJobFinished_; + std::atomic status_ = JbmgrStatus::NOT_START; }; } // namespace meta diff --git a/src/meta/processors/job/LeaderBalanceJobExecutor.cpp b/src/meta/processors/job/LeaderBalanceJobExecutor.cpp new file mode 100644 index 00000000000..cd17cda99ed --- /dev/null +++ b/src/meta/processors/job/LeaderBalanceJobExecutor.cpp @@ -0,0 +1,550 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. 
+ */ + +#include "meta/processors/job/LeaderBalanceJobExecutor.h" + +#include + +#include "common/utils/MetaKeyUtils.h" +#include "kvstore/NebulaStore.h" +#include "meta/processors/job/JobUtils.h" + +DEFINE_double(leader_balance_deviation, + 0.05, + "after leader balance, leader count should in range " + "[avg * (1 - deviation), avg * (1 + deviation)]"); + +namespace nebula { +namespace meta { + +nebula::cpp2::ErrorCode LeaderBalanceJobExecutor::getAllSpaces( + std::vector>& spaces) { + // Get all spaces + folly::SharedMutex::ReadHolder rHolder(LockUtils::spaceLock()); + const auto& prefix = MetaKeyUtils::spacePrefix(); + std::unique_ptr iter; + auto retCode = kvstore_->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter); + if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) { + LOG(ERROR) << "Get all spaces failed, error: " << apache::thrift::util::enumNameSafe(retCode); + return retCode; + } + + while (iter->valid()) { + auto spaceId = MetaKeyUtils::spaceId(iter->key()); + auto properties = MetaKeyUtils::parseSpace(iter->val()); + bool zoned = !properties.get_zone_names().empty(); + spaces.emplace_back(spaceId, *properties.replica_factor_ref(), zoned); + iter->next(); + } + return nebula::cpp2::ErrorCode::SUCCEEDED; +} + +ErrorOr LeaderBalanceJobExecutor::getHostParts(GraphSpaceID spaceId, + bool dependentOnZone, + HostParts& hostParts, + int32_t& totalParts) { + folly::SharedMutex::ReadHolder rHolder(LockUtils::spaceLock()); + const auto& prefix = MetaKeyUtils::partPrefix(spaceId); + std::unique_ptr iter; + auto retCode = kvstore_->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter); + if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) { + LOG(ERROR) << "Access kvstore failed, spaceId " << spaceId << " " + << apache::thrift::util::enumNameSafe(retCode); + return retCode; + } + + while (iter->valid()) { + auto key = iter->key(); + PartitionID partId; + memcpy(&partId, key.data() + prefix.size(), sizeof(PartitionID)); + auto partHosts = MetaKeyUtils::parsePartVal(iter->val()); + for (auto& ph : partHosts) { + hostParts[ph].emplace_back(partId); + } + totalParts++; + iter->next(); + } + + LOG(INFO) << "Host size: " << hostParts.size(); + auto key = MetaKeyUtils::spaceKey(spaceId); + std::string value; + retCode = kvstore_->get(kDefaultSpaceId, kDefaultPartId, key, &value); + if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) { + LOG(ERROR) << "Access kvstore failed, spaceId " << spaceId + << apache::thrift::util::enumNameSafe(retCode); + return retCode; + } + + auto properties = MetaKeyUtils::parseSpace(value); + if (totalParts != properties.get_partition_num()) { + LOG(ERROR) << "Partition number not equals " << totalParts << " : " + << properties.get_partition_num(); + return false; + } + + int32_t replica = properties.get_replica_factor(); + LOG(INFO) << "Replica " << replica; + if (dependentOnZone && !properties.get_zone_names().empty()) { + auto zoneNames = properties.get_zone_names(); + int32_t zoneSize = zoneNames.size(); + LOG(INFO) << "Zone Size " << zoneSize; + auto activeHostsRet = ActiveHostsMan::getActiveHostsWithZones(kvstore_, spaceId); + if (!nebula::ok(activeHostsRet)) { + return nebula::error(activeHostsRet); + } + + std::vector expand; + auto activeHosts = nebula::value(activeHostsRet); + std::vector lostHosts; + calDiff(hostParts, activeHosts, expand, lostHosts); + // confirmedHostParts is new part allocation map after balance, it would include newlyAdded + // and exclude lostHosts + HostParts confirmedHostParts(hostParts); + for (const auto& h : expand) { + 
LOG(INFO) << "Found new host " << h; + confirmedHostParts.emplace(h, std::vector()); + } + for (const auto& h : lostHosts) { + LOG(INFO) << "Lost host " << h; + confirmedHostParts.erase(h); + } + + auto zonePartsRet = assembleZoneParts(zoneNames, confirmedHostParts); + if (zonePartsRet != nebula::cpp2::ErrorCode::SUCCEEDED) { + LOG(ERROR) << "Assemble Zone Parts failed"; + return zonePartsRet; + } + } + + totalParts *= replica; + return true; +} + +nebula::cpp2::ErrorCode LeaderBalanceJobExecutor::assembleZoneParts( + const std::vector& zoneNames, HostParts& hostParts) { + // zoneHosts use to record this host belong to zone's hosts + std::unordered_map, std::vector> zoneHosts; + for (const auto& zoneName : zoneNames) { + LOG(INFO) << "Zone Name: " << zoneName; + auto zoneKey = MetaKeyUtils::zoneKey(zoneName); + std::string zoneValue; + auto retCode = kvstore_->get(kDefaultSpaceId, kDefaultPartId, zoneKey, &zoneValue); + if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) { + LOG(ERROR) << "Get zone " << zoneName + << " failed: " << apache::thrift::util::enumNameSafe(retCode); + return retCode; + } + + auto hosts = MetaKeyUtils::parseZoneHosts(std::move(zoneValue)); + for (const auto& host : hosts) { + LOG(INFO) << "Host for zone " << host; + auto pair = std::pair(std::move(host), zoneName); + auto& hs = zoneHosts[std::move(pair)]; + hs.insert(hs.end(), hosts.begin(), hosts.end()); + } + } + + for (auto it = hostParts.begin(); it != hostParts.end(); it++) { + auto host = it->first; + LOG(INFO) << "Host: " << host; + auto zoneIter = + std::find_if(zoneHosts.begin(), zoneHosts.end(), [host](const auto& pair) -> bool { + return host == pair.first.first; + }); + + if (zoneIter == zoneHosts.end()) { + LOG(INFO) << it->first << " have lost"; + continue; + } + + auto& hosts = zoneIter->second; + auto name = zoneIter->first.second; + zoneHosts_[name] = hosts; + for (auto hostIter = hosts.begin(); hostIter != hosts.end(); hostIter++) { + auto partIter = hostParts.find(*hostIter); + LOG(INFO) << "Zone " << name << " have the host " << it->first; + if (partIter == hostParts.end()) { + zoneParts_[it->first] = ZoneNameAndParts(name, std::vector()); + } else { + zoneParts_[it->first] = ZoneNameAndParts(name, partIter->second); + } + } + } + + for (auto it = zoneHosts.begin(); it != zoneHosts.end(); it++) { + auto host = it->first.first; + auto& hosts = it->second; + for (auto hostIter = hosts.begin(); hostIter != hosts.end(); hostIter++) { + auto h = *hostIter; + auto iter = std::find_if(hostParts.begin(), hostParts.end(), [h](const auto& pair) -> bool { + return h == pair.first; + }); + + if (iter == hostParts.end()) { + continue; + } + + auto& parts = iter->second; + auto& hp = relatedParts_[host]; + hp.insert(hp.end(), parts.begin(), parts.end()); + } + } + return nebula::cpp2::ErrorCode::SUCCEEDED; +} + +void LeaderBalanceJobExecutor::calDiff(const HostParts& hostParts, + const std::vector& activeHosts, + std::vector& expand, + std::vector& lost) { + for (auto it = hostParts.begin(); it != hostParts.end(); it++) { + VLOG(1) << "Original Host " << it->first << ", parts " << it->second.size(); + if (std::find(activeHosts.begin(), activeHosts.end(), it->first) == activeHosts.end() && + std::find(lost.begin(), lost.end(), it->first) == lost.end()) { + lost.emplace_back(it->first); + } + } + for (auto& h : activeHosts) { + VLOG(1) << "Active host " << h; + if (hostParts.find(h) == hostParts.end()) { + expand.emplace_back(h); + } + } +} + +LeaderBalanceJobExecutor::LeaderBalanceJobExecutor(JobID jobId, + 
kvstore::KVStore* kvstore, + AdminClient* adminClient, + const std::vector& params) + : MetaJobExecutor(jobId, kvstore, adminClient, params), + inLeaderBalance_(false), + hostLeaderMap_(nullptr) { + executor_.reset(new folly::CPUThreadPoolExecutor(1)); +} + +nebula::cpp2::ErrorCode LeaderBalanceJobExecutor::finish(bool ret) { + UNUSED(ret); + return nebula::cpp2::ErrorCode::SUCCEEDED; +} + +folly::Future LeaderBalanceJobExecutor::executeInternal() { + folly::Promise promise; + auto future = promise.getFuture(); + // Space ID, Replica Factor and Dependent On Group + std::vector> spaces; + auto ret = getAllSpaces(spaces); + if (ret != nebula::cpp2::ErrorCode::SUCCEEDED) { + if (ret != nebula::cpp2::ErrorCode::E_LEADER_CHANGED) { + ret = nebula::cpp2::ErrorCode::E_STORE_FAILURE; + } + return Status::Error("Can't get spaces"); + } + + bool expected = false; + if (inLeaderBalance_.compare_exchange_strong(expected, true)) { + hostLeaderMap_.reset(new HostLeaderMap); + auto status = adminClient_->getLeaderDist(hostLeaderMap_.get()).get(); + if (!status.ok() || hostLeaderMap_->empty()) { + inLeaderBalance_ = false; + return Status::Error("Get leader distribution failed"); + } + + std::vector> futures; + for (const auto& spaceInfo : spaces) { + auto spaceId = std::get<0>(spaceInfo); + auto replicaFactor = std::get<1>(spaceInfo); + auto dependentOnZone = std::get<2>(spaceInfo); + LeaderBalancePlan plan; + auto balanceResult = buildLeaderBalancePlan( + hostLeaderMap_.get(), spaceId, replicaFactor, dependentOnZone, plan); + if (!nebula::ok(balanceResult) || !nebula::value(balanceResult)) { + LOG(ERROR) << "Building leader balance plan failed " + << "Space: " << spaceId; + continue; + } + simplifyLeaderBalnacePlan(spaceId, plan); + for (const auto& task : plan) { + futures.emplace_back(adminClient_->transLeader(std::get<0>(task), + std::get<1>(task), + std::move(std::get<2>(task)), + std::move(std::get<3>(task)))); + } + } + + int32_t failed = 0; + folly::collectAll(futures) + .via(executor_.get()) + .thenTry([&](const auto& result) { + auto tries = result.value(); + for (const auto& t : tries) { + if (!t.value().ok()) { + ++failed; + } + } + }) + .wait(); + + inLeaderBalance_ = false; + if (failed != 0) { + return Status::Error("partiton failed to transfer leader"); + } + executorOnFinished_(meta::cpp2::JobStatus::FINISHED); + return Status::OK(); + } + executorOnFinished_(meta::cpp2::JobStatus::FINISHED); + return Status::OK(); +} + +ErrorOr LeaderBalanceJobExecutor::buildLeaderBalancePlan( + HostLeaderMap* hostLeaderMap, + GraphSpaceID spaceId, + int32_t replicaFactor, + bool dependentOnZone, + LeaderBalancePlan& plan, + bool useDeviation) { + PartAllocation peersMap; + HostParts leaderHostParts; + size_t leaderParts = 0; + // store peers of all paritions in peerMap + folly::SharedMutex::ReadHolder rHolder(LockUtils::spaceLock()); + const auto& prefix = MetaKeyUtils::partPrefix(spaceId); + std::unique_ptr iter; + auto retCode = kvstore_->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter); + if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) { + LOG(ERROR) << "Access kvstore failed, spaceId " << spaceId << static_cast(retCode); + return retCode; + } + + while (iter->valid()) { + auto key = iter->key(); + PartitionID partId; + memcpy(&partId, key.data() + prefix.size(), sizeof(PartitionID)); + auto peers = MetaKeyUtils::parsePartVal(iter->val()); + peersMap[partId] = std::move(peers); + ++leaderParts; + iter->next(); + } + + int32_t totalParts = 0; + HostParts allHostParts; + auto result = 
getHostParts(spaceId, dependentOnZone, allHostParts, totalParts); + if (!nebula::ok(result)) { + return nebula::error(result); + } else { + auto retVal = nebula::value(result); + if (!retVal || totalParts == 0 || allHostParts.empty()) { + LOG(ERROR) << "Invalid space " << spaceId; + return false; + } + } + + std::unordered_set activeHosts; + for (const auto& host : *hostLeaderMap) { + // only balance leader between hosts which have valid partition + if (!allHostParts[host.first].empty()) { + activeHosts.emplace(host.first); + leaderHostParts[host.first] = (*hostLeaderMap)[host.first][spaceId]; + } + } + + if (activeHosts.empty()) { + LOG(ERROR) << "No active hosts"; + return false; + } + + if (dependentOnZone) { + for (auto it = allHostParts.begin(); it != allHostParts.end(); it++) { + auto min = it->second.size() / replicaFactor; + VLOG(3) << "Host: " << it->first << " Bounds: " << min << " : " << min + 1; + hostBounds_[it->first] = std::make_pair(min, min + 1); + } + } else { + size_t activeSize = activeHosts.size(); + size_t globalAvg = leaderParts / activeSize; + size_t globalMin = globalAvg; + size_t globalMax = globalAvg; + if (leaderParts % activeSize != 0) { + globalMax += 1; + } + + if (useDeviation) { + globalMin = std::ceil(static_cast(leaderParts) / activeSize * + (1 - FLAGS_leader_balance_deviation)); + globalMax = std::floor(static_cast(leaderParts) / activeSize * + (1 + FLAGS_leader_balance_deviation)); + } + VLOG(3) << "Build leader balance plan, expected min load: " << globalMin + << ", max load: " << globalMax << " avg: " << globalAvg; + + for (auto it = allHostParts.begin(); it != allHostParts.end(); it++) { + hostBounds_[it->first] = std::make_pair(globalMin, globalMax); + } + } + + while (true) { + int32_t taskCount = 0; + bool hasUnbalancedHost = false; + for (const auto& hostEntry : leaderHostParts) { + auto host = hostEntry.first; + auto& hostMinLoad = hostBounds_[host].first; + auto& hostMaxLoad = hostBounds_[host].second; + int32_t partSize = hostEntry.second.size(); + if (hostMinLoad <= partSize && partSize <= hostMaxLoad) { + VLOG(3) << partSize << " is between min load " << hostMinLoad << " and max load " + << hostMaxLoad; + continue; + } + + hasUnbalancedHost = true; + if (partSize < hostMinLoad) { + // need to acquire leader from other hosts + LOG(INFO) << "Acquire leaders to host: " << host << " loading: " << partSize + << " min loading " << hostMinLoad; + taskCount += acquireLeaders( + allHostParts, leaderHostParts, peersMap, activeHosts, host, plan, spaceId); + } else { + // need to transfer leader to other hosts + LOG(INFO) << "Giveup leaders from host: " << host << " loading: " << partSize + << " max loading " << hostMaxLoad; + taskCount += giveupLeaders(leaderHostParts, peersMap, activeHosts, host, plan, spaceId); + } + } + + // If every host is balanced or no more task during this loop, then the plan + // is done + if (!hasUnbalancedHost || taskCount == 0) { + LOG(INFO) << "Not need balance"; + break; + } + } + return true; +} + +int32_t LeaderBalanceJobExecutor::acquireLeaders(HostParts& allHostParts, + HostParts& leaderHostParts, + PartAllocation& peersMap, + std::unordered_set& activeHosts, + const HostAddr& target, + LeaderBalancePlan& plan, + GraphSpaceID spaceId) { + // host will loop for the partition which is not leader, and try to acuire the + // leader + int32_t taskCount = 0; + std::vector diff; + std::set_difference(allHostParts[target].begin(), + allHostParts[target].end(), + leaderHostParts[target].begin(), + 
leaderHostParts[target].end(), + std::back_inserter(diff)); + auto& targetLeaders = leaderHostParts[target]; + size_t minLoad = hostBounds_[target].first; + for (const auto& partId : diff) { + VLOG(3) << "Try acquire leader for part " << partId; + // find the leader of partId + auto sources = peersMap[partId]; + for (const auto& source : sources) { + if (source == target || !activeHosts.count(source)) { + continue; + } + + // if peer is the leader of partId and can transfer, then transfer it to + // host + auto& sourceLeaders = leaderHostParts[source]; + VLOG(3) << "Check peer: " << source << " min load: " << minLoad + << " peerLeaders size: " << sourceLeaders.size(); + auto it = std::find(sourceLeaders.begin(), sourceLeaders.end(), partId); + if (it != sourceLeaders.end() && minLoad < sourceLeaders.size()) { + sourceLeaders.erase(it); + targetLeaders.emplace_back(partId); + plan.emplace_back(spaceId, partId, source, target); + LOG(INFO) << "acquire plan trans leader space: " << spaceId << " part: " << partId + << " from " << source.host << ":" << source.port << " to " << target.host << ":" + << target.port; + ++taskCount; + break; + } + } + + // if host has enough leader, just return + if (targetLeaders.size() == minLoad) { + LOG(INFO) << "Host: " << target << "'s leader reach " << minLoad; + break; + } + } + return taskCount; +} + +int32_t LeaderBalanceJobExecutor::giveupLeaders(HostParts& leaderParts, + PartAllocation& peersMap, + std::unordered_set& activeHosts, + const HostAddr& source, + LeaderBalancePlan& plan, + GraphSpaceID spaceId) { + int32_t taskCount = 0; + auto& sourceLeaders = leaderParts[source]; + size_t maxLoad = hostBounds_[source].second; + + // host will try to transfer the extra leaders to other peers + for (auto it = sourceLeaders.begin(); it != sourceLeaders.end();) { + // find the leader of partId + auto partId = *it; + const auto& targets = peersMap[partId]; + bool isErase = false; + + // leader should move to the peer with lowest loading + auto target = + std::min_element(targets.begin(), targets.end(), [&](const auto& l, const auto& r) -> bool { + if (source == l || !activeHosts.count(l)) { + return false; + } + return leaderParts[l].size() < leaderParts[r].size(); + }); + + // If peer can accept this partition leader, than host will transfer to the + // peer + if (target != targets.end()) { + auto& targetLeaders = leaderParts[*target]; + int32_t targetLeaderSize = targetLeaders.size(); + if (targetLeaderSize < hostBounds_[*target].second) { + it = sourceLeaders.erase(it); + targetLeaders.emplace_back(partId); + plan.emplace_back(spaceId, partId, source, *target); + LOG(INFO) << "giveup plan trans leader space: " << spaceId << " part: " << partId + << " from " << source.host << ":" << source.port << " to " << target->host << ":" + << target->port; + ++taskCount; + isErase = true; + } + } + + // if host has enough leader, just return + if (sourceLeaders.size() == maxLoad) { + LOG(INFO) << "Host: " << source << "'s leader reach " << maxLoad; + break; + } + + if (!isErase) { + ++it; + } + } + return taskCount; +} + +void LeaderBalanceJobExecutor::simplifyLeaderBalnacePlan(GraphSpaceID spaceId, + LeaderBalancePlan& plan) { + std::unordered_map buckets; + for (auto& task : plan) { + buckets[std::get<1>(task)].emplace_back(task); + } + plan.clear(); + for (const auto& partEntry : buckets) { + plan.emplace_back(spaceId, + partEntry.first, + std::get<2>(partEntry.second.front()), + std::get<3>(partEntry.second.back())); + } +} + +} // namespace meta +} // namespace 
nebula diff --git a/src/meta/processors/job/LeaderBalanceJobExecutor.h b/src/meta/processors/job/LeaderBalanceJobExecutor.h new file mode 100644 index 00000000000..113257d8093 --- /dev/null +++ b/src/meta/processors/job/LeaderBalanceJobExecutor.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2020 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +#ifndef META_LEADERBALANCEJOBEXECUTOR_H_ +#define META_LEADERBALANCEJOBEXECUTOR_H_ + +#include "meta/processors/job/BalancePlan.h" +#include "meta/processors/job/BalanceTask.h" +#include "meta/processors/job/MetaJobExecutor.h" +#include "meta/processors/job/SimpleConcurrentJobExecutor.h" + +namespace nebula { +namespace meta { +using HostParts = std::unordered_map>; +using PartAllocation = std::unordered_map>; +using LeaderBalancePlan = std::vector>; +using ZoneNameAndParts = std::pair>; + +class LeaderBalanceJobExecutor : public MetaJobExecutor { + FRIEND_TEST(BalanceTest, SimpleLeaderBalancePlanTest); + FRIEND_TEST(BalanceTest, IntersectHostsLeaderBalancePlanTest); + FRIEND_TEST(BalanceTest, ManyHostsLeaderBalancePlanTest); + FRIEND_TEST(BalanceTest, LeaderBalanceTest); + FRIEND_TEST(BalanceTest, LeaderBalanceWithZoneTest); + FRIEND_TEST(BalanceTest, LeaderBalanceWithLargerZoneTest); + FRIEND_TEST(BalanceTest, LeaderBalanceWithComplexZoneTest); + + public: + LeaderBalanceJobExecutor(JobID jobId, + kvstore::KVStore* kvstore, + AdminClient* adminClient, + const std::vector& params); + + nebula::cpp2::ErrorCode finish(bool ret = true) override; + + protected: + folly::Future executeInternal() override; + + ErrorOr buildLeaderBalancePlan(HostLeaderMap* hostLeaderMap, + GraphSpaceID spaceId, + int32_t replicaFactor, + bool dependentOnZone, + LeaderBalancePlan& plan, + bool useDeviation = true); + + int32_t acquireLeaders(HostParts& allHostParts, + HostParts& leaderHostParts, + PartAllocation& peersMap, + std::unordered_set& activeHosts, + const HostAddr& target, + LeaderBalancePlan& plan, + GraphSpaceID spaceId); + + int32_t giveupLeaders(HostParts& leaderParts, + PartAllocation& peersMap, + std::unordered_set& activeHosts, + const HostAddr& source, + LeaderBalancePlan& plan, + GraphSpaceID spaceId); + + void simplifyLeaderBalnacePlan(GraphSpaceID spaceId, LeaderBalancePlan& plan); + + nebula::cpp2::ErrorCode getAllSpaces( + std::vector>& spaces); + + ErrorOr getHostParts(GraphSpaceID spaceId, + bool dependentOnGroup, + HostParts& hostParts, + int32_t& totalParts); + + void calDiff(const HostParts& hostParts, + const std::vector& activeHosts, + std::vector& expand, + std::vector& lost); + + nebula::cpp2::ErrorCode assembleZoneParts(const std::vector& zoneNames, + HostParts& hostParts); + + private: + std::atomic_bool inLeaderBalance_; + std::unique_ptr hostLeaderMap_; + std::unordered_map> hostBounds_; + std::unordered_map zoneParts_; + std::unordered_map> zoneHosts_; + std::unordered_map> relatedParts_; + std::unique_ptr executor_; +}; + +} // namespace meta +} // namespace nebula + +#endif // META_BALANCEJOBEXECUTOR_H_ diff --git a/src/meta/processors/job/MetaJobExecutor.cpp b/src/meta/processors/job/MetaJobExecutor.cpp index 4a1c648ed6a..dc3cc4e8d7d 100644 --- a/src/meta/processors/job/MetaJobExecutor.cpp +++ b/src/meta/processors/job/MetaJobExecutor.cpp @@ -5,248 +5,52 @@ #include "meta/processors/job/MetaJobExecutor.h" -#include "common/network/NetworkUtils.h" -#include "common/utils/MetaKeyUtils.h" #include "common/utils/Utils.h" -#include "interface/gen-cpp2/common_types.h" -#include 
"meta/ActiveHostsMan.h" -#include "meta/common/MetaCommon.h" -#include "meta/processors/Common.h" -#include "meta/processors/admin/AdminClient.h" -#include "meta/processors/job/BalanceJobExecutor.h" -#include "meta/processors/job/CompactJobExecutor.h" -#include "meta/processors/job/FlushJobExecutor.h" -#include "meta/processors/job/RebuildEdgeJobExecutor.h" -#include "meta/processors/job/RebuildFTJobExecutor.h" -#include "meta/processors/job/RebuildTagJobExecutor.h" -#include "meta/processors/job/StatsJobExecutor.h" -#include "meta/processors/job/TaskDescription.h" DECLARE_int32(heartbeat_interval_secs); DECLARE_uint32(expired_time_factor); namespace nebula { namespace meta { +bool MetaJobExecutor::check() { return true; } -std::unique_ptr MetaJobExecutorFactory::createMetaJobExecutor( - const JobDescription& jd, kvstore::KVStore* store, AdminClient* client) { - std::unique_ptr ret; - switch (jd.getCmd()) { - case cpp2::AdminCmd::COMPACT: - ret.reset(new CompactJobExecutor(jd.getJobId(), store, client, jd.getParas())); - break; - case cpp2::AdminCmd::DATA_BALANCE: - ret.reset(new DataBalanceJobExecutor(jd, store, client, jd.getParas())); - break; - case cpp2::AdminCmd::LEADER_BALANCE: - ret.reset(new LeaderBalanceJobExecutor(jd.getJobId(), store, client, jd.getParas())); - break; - case cpp2::AdminCmd::FLUSH: - ret.reset(new FlushJobExecutor(jd.getJobId(), store, client, jd.getParas())); - break; - case cpp2::AdminCmd::REBUILD_TAG_INDEX: - ret.reset(new RebuildTagJobExecutor(jd.getJobId(), store, client, jd.getParas())); - break; - case cpp2::AdminCmd::REBUILD_EDGE_INDEX: - ret.reset(new RebuildEdgeJobExecutor(jd.getJobId(), store, client, jd.getParas())); - break; - case cpp2::AdminCmd::REBUILD_FULLTEXT_INDEX: - ret.reset(new RebuildFTJobExecutor(jd.getJobId(), store, client, jd.getParas())); - break; - case cpp2::AdminCmd::STATS: - ret.reset(new StatsJobExecutor(jd.getJobId(), store, client, jd.getParas())); - break; - default: - break; - } - return ret; -} - -ErrorOr MetaJobExecutor::getSpaceIdFromName( - const std::string& spaceName) { - auto indexKey = MetaKeyUtils::indexSpaceKey(spaceName); - std::string val; - auto retCode = kvstore_->get(kDefaultSpaceId, kDefaultPartId, indexKey, &val); - if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) { - LOG(ERROR) << "Get space failed, space name: " << spaceName - << " error: " << apache::thrift::util::enumNameSafe(retCode); - return retCode; - } - return *reinterpret_cast(val.c_str()); -} - -ErrOrHosts MetaJobExecutor::getTargetHost(GraphSpaceID spaceId) { - std::unique_ptr iter; - const auto& partPrefix = MetaKeyUtils::partPrefix(spaceId); - auto retCode = kvstore_->prefix(kDefaultSpaceId, kDefaultPartId, partPrefix, &iter); - if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) { - LOG(ERROR) << "Fetch Parts Failed, error: " << apache::thrift::util::enumNameSafe(retCode); - return retCode; - } +// Prepare the Job info from the arguments. 
+nebula::cpp2::ErrorCode MetaJobExecutor::prepare() { return nebula::cpp2::ErrorCode::SUCCEEDED; } - // use vector instead of set because this can convenient for next step - std::unordered_map> hostAndPart; - std::vector>> hosts; - while (iter->valid()) { - auto part = MetaKeyUtils::parsePartKeyPartId(iter->key()); - auto targets = MetaKeyUtils::parsePartVal(iter->val()); - for (auto& target : targets) { - hostAndPart[target].emplace_back(part); - } - iter->next(); - } - for (auto it = hostAndPart.begin(); it != hostAndPart.end(); it++) { - hosts.emplace_back(std::pair(it->first, it->second)); +// The skeleton to run the job. +// You should rewrite the executeInternal to trigger the calling. +nebula::cpp2::ErrorCode MetaJobExecutor::execute() { + folly::SemiFuture future = executeInternal(); + auto rc = nebula::cpp2::ErrorCode::SUCCEEDED; + future.wait(); + if (!future.value().ok()) { + LOG(ERROR) << future.value().toString(); + rc = nebula::cpp2::ErrorCode::E_ADD_JOB_FAILURE; } - return hosts; + return rc; } -ErrOrHosts MetaJobExecutor::getLeaderHost(GraphSpaceID space) { - const auto& hostPrefix = MetaKeyUtils::leaderPrefix(space); - std::unique_ptr leaderIter; - auto retCode = kvstore_->prefix(kDefaultSpaceId, kDefaultPartId, hostPrefix, &leaderIter); - if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) { - LOG(ERROR) << "Get space " << space - << "'s part failed, error: " << apache::thrift::util::enumNameSafe(retCode); - return retCode; - } +// Stop the job when the user cancel it. +nebula::cpp2::ErrorCode MetaJobExecutor::stop() { return nebula::cpp2::ErrorCode::SUCCEEDED; } - std::vector>> hosts; - HostAddr host; - nebula::cpp2::ErrorCode code; - for (; leaderIter->valid(); leaderIter->next()) { - auto spaceAndPart = MetaKeyUtils::parseLeaderKeyV3(leaderIter->key()); - auto partId = spaceAndPart.second; - std::tie(host, std::ignore, code) = MetaKeyUtils::parseLeaderValV3(leaderIter->val()); - if (code != nebula::cpp2::ErrorCode::SUCCEEDED) { - continue; - } - auto it = - std::find_if(hosts.begin(), hosts.end(), [&](auto& item) { return item.first == host; }); - if (it == hosts.end()) { - hosts.emplace_back(std::make_pair(host, std::vector{partId})); - } else { - it->second.emplace_back(partId); - } - } - return hosts; -} +nebula::cpp2::ErrorCode MetaJobExecutor::finish(bool) { return nebula::cpp2::ErrorCode::SUCCEEDED; } -ErrOrHosts MetaJobExecutor::getListenerHost(GraphSpaceID space, cpp2::ListenerType type) { - const auto& prefix = MetaKeyUtils::listenerPrefix(space, type); - std::unique_ptr iter; - auto ret = kvstore_->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter); - if (ret != nebula::cpp2::ErrorCode::SUCCEEDED) { - LOG(ERROR) << "Get space " << space - << "'s listener failed, error: " << apache::thrift::util::enumNameSafe(ret); - return ret; - } +void MetaJobExecutor::setSpaceId(GraphSpaceID spaceId) { space_ = spaceId; } - auto activeHostsRet = - ActiveHostsMan::getActiveHosts(kvstore_, - FLAGS_heartbeat_interval_secs * FLAGS_expired_time_factor, - cpp2::HostRole::LISTENER); - if (!nebula::ok(activeHostsRet)) { - return nebula::error(activeHostsRet); - } +bool MetaJobExecutor::isMetaJob() { return true; } - auto activeHosts = std::move(nebula::value(activeHostsRet)); - std::vector>> hosts; +nebula::cpp2::ErrorCode MetaJobExecutor::recovery() { return nebula::cpp2::ErrorCode::SUCCEEDED; } - while (iter->valid()) { - auto host = MetaKeyUtils::deserializeHostAddr(iter->val()); - auto part = MetaKeyUtils::parseListenerPart(iter->key()); - if (std::find(activeHosts.begin(), 
activeHosts.end(), host) == activeHosts.end()) { - LOG(ERROR) << "Invalid host : " << network::NetworkUtils::toHostsStr({host}); - return nebula::cpp2::ErrorCode::E_INVALID_HOST; - } - auto it = std::find_if( - hosts.begin(), hosts.end(), [&host](auto& item) { return item.first == host; }); - if (it == hosts.end()) { - hosts.emplace_back(std::make_pair(host, std::vector{part})); - } else { - it->second.emplace_back(part); - } - iter->next(); - } - if (hosts.empty()) { - return nebula::cpp2::ErrorCode::E_LISTENER_NOT_FOUND; - } - return hosts; +void MetaJobExecutor::setFinishCallBack( + std::function func) { + executorOnFinished_ = func; } -nebula::cpp2::ErrorCode MetaJobExecutor::execute() { - ErrOrHosts addressesRet; - switch (toHost_) { - case TargetHosts::LEADER: { - addressesRet = getLeaderHost(space_); - break; - } - case TargetHosts::LISTENER: { - addressesRet = getListenerHost(space_, cpp2::ListenerType::ELASTICSEARCH); - break; - } - case TargetHosts::NONE: { - addressesRet = {{HostAddr(), {}}}; - break; - } - case TargetHosts::DEFAULT: { - addressesRet = getTargetHost(space_); - break; - } - } - - if (!nebula::ok(addressesRet)) { - LOG(ERROR) << "Can't get hosts"; - return nebula::error(addressesRet); - } - - std::vector parts; - auto addresses = nebula::value(addressesRet); - - // write all tasks first. - if (toHost_ != TargetHosts::NONE) { - for (auto i = 0U; i != addresses.size(); ++i) { - TaskDescription task(jobId_, i, addresses[i].first); - std::vector data{{task.taskKey(), task.taskVal()}}; - folly::Baton baton; - auto rc = nebula::cpp2::ErrorCode::SUCCEEDED; - kvstore_->asyncMultiPut( - kDefaultSpaceId, kDefaultPartId, std::move(data), [&](nebula::cpp2::ErrorCode code) { - rc = code; - baton.post(); - }); - baton.wait(); - if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) { - LOG(INFO) << "write to kv store failed, error: " << apache::thrift::util::enumNameSafe(rc); - return rc; - } - } - } - - std::vector> futures; - for (auto& address : addresses) { - // transform to the admin host - auto h = Utils::getAdminAddrFromStoreAddr(address.first); - futures.emplace_back(executeInternal(std::move(h), std::move(address.second))); - } - - auto rc = nebula::cpp2::ErrorCode::SUCCEEDED; - auto tries = folly::collectAll(std::move(futures)).get(); - for (auto& t : tries) { - if (t.hasException()) { - LOG(ERROR) << t.exception().what(); - rc = nebula::cpp2::ErrorCode::E_RPC_FAILURE; - continue; - } - if (!t.value().ok()) { - LOG(ERROR) << t.value().toString(); - rc = nebula::cpp2::ErrorCode::E_RPC_FAILURE; - continue; - } - } - return rc; +nebula::cpp2::ErrorCode MetaJobExecutor::saveSpecialTaskStatus(const cpp2::ReportTaskReq&) { + return nebula::cpp2::ErrorCode::SUCCEEDED; } +folly::Future MetaJobExecutor::executeInternal() { return Status::OK(); } + } // namespace meta } // namespace nebula diff --git a/src/meta/processors/job/MetaJobExecutor.h b/src/meta/processors/job/MetaJobExecutor.h index 8b75c51c0a5..7c461488aff 100644 --- a/src/meta/processors/job/MetaJobExecutor.h +++ b/src/meta/processors/job/MetaJobExecutor.h @@ -12,98 +12,63 @@ #include "kvstore/KVStore.h" #include "meta/processors/admin/AdminClient.h" #include "meta/processors/job/JobDescription.h" +#include "meta/processors/job/JobExecutor.h" namespace nebula { namespace meta { -using PartsOfHost = std::pair>; -using ErrOrHosts = ErrorOr>; - -class MetaJobExecutor { +class MetaJobExecutor : public JobExecutor { public: - enum class TargetHosts { LEADER = 0, LISTENER, NONE, DEFAULT }; - MetaJobExecutor(JobID jobId, 
kvstore::KVStore* kvstore, AdminClient* adminClient, const std::vector& paras) - : jobId_(jobId), kvstore_(kvstore), adminClient_(adminClient), paras_(paras) { - onFinished_ = [](bool) { return nebula::cpp2::ErrorCode::SUCCEEDED; }; + : JobExecutor(kvstore), jobId_(jobId), adminClient_(adminClient), paras_(paras) { + executorOnFinished_ = [](meta::cpp2::JobStatus) { return nebula::cpp2::ErrorCode::SUCCEEDED; }; } virtual ~MetaJobExecutor() = default; // Check the arguments about the job. - virtual bool check() = 0; + bool check() override; // Prepare the Job info from the arguments. - virtual nebula::cpp2::ErrorCode prepare() = 0; + nebula::cpp2::ErrorCode prepare() override; // The skeleton to run the job. // You should rewrite the executeInternal to trigger the calling. - nebula::cpp2::ErrorCode execute(); - - void interruptExecution(JobID jobId); + nebula::cpp2::ErrorCode execute() override; // Stop the job when the user cancel it. - virtual nebula::cpp2::ErrorCode stop() = 0; + nebula::cpp2::ErrorCode stop() override; - virtual nebula::cpp2::ErrorCode finish(bool) { - return nebula::cpp2::ErrorCode::SUCCEEDED; - } + nebula::cpp2::ErrorCode finish(bool) override; - void setSpaceId(GraphSpaceID spaceId) { - space_ = spaceId; - } + void setSpaceId(GraphSpaceID spaceId) override; - virtual nebula::cpp2::ErrorCode saveSpecialTaskStatus(const cpp2::ReportTaskReq&) { - return nebula::cpp2::ErrorCode::SUCCEEDED; - } + bool isMetaJob() override; - virtual bool runInMeta() { - return false; - } + nebula::cpp2::ErrorCode recovery() override; - virtual nebula::cpp2::ErrorCode recovery() { - return nebula::cpp2::ErrorCode::SUCCEEDED; - } + void setFinishCallBack( + std::function func) override; - void setFinishCallBack(std::function func) { - onFinished_ = func; - } + nebula::cpp2::ErrorCode saveSpecialTaskStatus(const cpp2::ReportTaskReq&) override; protected: - ErrorOr getSpaceIdFromName(const std::string& spaceName); - - ErrOrHosts getTargetHost(GraphSpaceID space); - - ErrOrHosts getLeaderHost(GraphSpaceID space); - - ErrOrHosts getListenerHost(GraphSpaceID space, cpp2::ListenerType type); - - virtual folly::Future executeInternal(HostAddr&& address, - std::vector&& parts) = 0; + virtual folly::Future executeInternal(); protected: JobID jobId_{INT_MIN}; TaskID taskId_{0}; - kvstore::KVStore* kvstore_{nullptr}; AdminClient* adminClient_{nullptr}; GraphSpaceID space_; std::vector paras_; - TargetHosts toHost_{TargetHosts::DEFAULT}; int32_t concurrency_{INT_MAX}; volatile bool stopped_{false}; std::mutex muInterrupt_; std::condition_variable condInterrupt_; - std::function onFinished_; -}; - -class MetaJobExecutorFactory { - public: - static std::unique_ptr createMetaJobExecutor(const JobDescription& jd, - kvstore::KVStore* store, - AdminClient* client); + std::function executorOnFinished_; }; } // namespace meta diff --git a/src/meta/processors/job/RebuildJobExecutor.h b/src/meta/processors/job/RebuildJobExecutor.h index 8488f6f5f2a..4832b724c67 100644 --- a/src/meta/processors/job/RebuildJobExecutor.h +++ b/src/meta/processors/job/RebuildJobExecutor.h @@ -8,18 +8,18 @@ #include "interface/gen-cpp2/common_types.h" #include "meta/processors/admin/AdminClient.h" -#include "meta/processors/job/MetaJobExecutor.h" +#include "meta/processors/job/StorageJobExecutor.h" namespace nebula { namespace meta { -class RebuildJobExecutor : public MetaJobExecutor { +class RebuildJobExecutor : public StorageJobExecutor { public: RebuildJobExecutor(JobID jobId, kvstore::KVStore* kvstore, AdminClient* adminClient, 
                     const std::vector<std::string>& paras)
-      : MetaJobExecutor(jobId, kvstore, adminClient, paras) {
+      : StorageJobExecutor(jobId, kvstore, adminClient, paras) {
     toHost_ = TargetHosts::LEADER;
   }
 
diff --git a/src/meta/processors/job/SimpleConcurrentJobExecutor.cpp b/src/meta/processors/job/SimpleConcurrentJobExecutor.cpp
index 4de1254f247..a69ea88c9e1 100644
--- a/src/meta/processors/job/SimpleConcurrentJobExecutor.cpp
+++ b/src/meta/processors/job/SimpleConcurrentJobExecutor.cpp
@@ -15,7 +15,7 @@ SimpleConcurrentJobExecutor::SimpleConcurrentJobExecutor(JobID jobId,
                                                          kvstore::KVStore* kvstore,
                                                          AdminClient* adminClient,
                                                          const std::vector<std::string>& paras)
-    : MetaJobExecutor(jobId, kvstore, adminClient, paras) {}
+    : StorageJobExecutor(jobId, kvstore, adminClient, paras) {}
 
 bool SimpleConcurrentJobExecutor::check() {
   auto parasNum = paras_.size();
diff --git a/src/meta/processors/job/SimpleConcurrentJobExecutor.h b/src/meta/processors/job/SimpleConcurrentJobExecutor.h
index b6301b0090e..6400379bb01 100644
--- a/src/meta/processors/job/SimpleConcurrentJobExecutor.h
+++ b/src/meta/processors/job/SimpleConcurrentJobExecutor.h
@@ -7,12 +7,12 @@
 #define META_SIMPLECONCURRENTJOBEXECUTOR_H_
 
 #include "interface/gen-cpp2/common_types.h"
-#include "meta/processors/job/MetaJobExecutor.h"
+#include "meta/processors/job/StorageJobExecutor.h"
 
 namespace nebula {
 namespace meta {
 
-class SimpleConcurrentJobExecutor : public MetaJobExecutor {
+class SimpleConcurrentJobExecutor : public StorageJobExecutor {
  public:
   SimpleConcurrentJobExecutor(JobID jobId,
                               kvstore::KVStore* kvstore,
diff --git a/src/meta/processors/job/StatsJobExecutor.h b/src/meta/processors/job/StatsJobExecutor.h
index a7e3e23ab2f..bfbc3d84478 100644
--- a/src/meta/processors/job/StatsJobExecutor.h
+++ b/src/meta/processors/job/StatsJobExecutor.h
@@ -8,18 +8,18 @@
 
 #include "interface/gen-cpp2/meta_types.h"
 #include "meta/processors/admin/AdminClient.h"
-#include "meta/processors/job/MetaJobExecutor.h"
+#include "meta/processors/job/StorageJobExecutor.h"
 
 namespace nebula {
 namespace meta {
 
-class StatsJobExecutor : public MetaJobExecutor {
+class StatsJobExecutor : public StorageJobExecutor {
  public:
   StatsJobExecutor(JobID jobId,
                    kvstore::KVStore* kvstore,
                    AdminClient* adminClient,
                    const std::vector<std::string>& paras)
-      : MetaJobExecutor(jobId, kvstore, adminClient, paras) {
+      : StorageJobExecutor(jobId, kvstore, adminClient, paras) {
     toHost_ = TargetHosts::LEADER;
   }
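The executors above migrate from MetaJobExecutor to StorageJobExecutor mostly by swapping the
base class and choosing a fan-out target. As a rough sketch of what a subclass now has to
provide (illustrative only, not part of this patch; the class name is hypothetical, and a real
executor would issue its admin RPC through adminClient_ where the placeholder comment sits;
the interfaces used here are the ones declared in StorageJobExecutor.h further down):

class EchoJobExecutor : public StorageJobExecutor {
 public:
  EchoJobExecutor(JobID jobId,
                  kvstore::KVStore* kvstore,
                  AdminClient* adminClient,
                  const std::vector<std::string>& paras)
      : StorageJobExecutor(jobId, kvstore, adminClient, paras) {
    toHost_ = TargetHosts::LEADER;  // one request per leader host, as in the diffs above
  }

 protected:
  // Called once per target host with the parts that host is responsible for.
  folly::Future<Status> executeInternal(HostAddr&& address,
                                        std::vector<PartitionID>&& parts) override {
    LOG(INFO) << "would send a task to " << address << " covering " << parts.size() << " parts";
    // Issue the job's admin RPC via adminClient_ here.
    return Status::OK();
  }
};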
diff --git a/src/meta/processors/job/StorageJobExecutor.cpp b/src/meta/processors/job/StorageJobExecutor.cpp
new file mode 100644
index 00000000000..40922456dc3
--- /dev/null
+++ b/src/meta/processors/job/StorageJobExecutor.cpp
@@ -0,0 +1,199 @@
+/* Copyright (c) 2019 vesoft inc. All rights reserved.
+ *
+ * This source code is licensed under Apache 2.0 License.
+ */
+
+#include "meta/processors/job/StorageJobExecutor.h"
+
+#include "common/network/NetworkUtils.h"
+#include "common/utils/MetaKeyUtils.h"
+#include "common/utils/Utils.h"
+#include "interface/gen-cpp2/common_types.h"
+#include "meta/ActiveHostsMan.h"
+#include "meta/common/MetaCommon.h"
+#include "meta/processors/Common.h"
+#include "meta/processors/admin/AdminClient.h"
+#include "meta/processors/job/BalanceJobExecutor.h"
+#include "meta/processors/job/CompactJobExecutor.h"
+#include "meta/processors/job/FlushJobExecutor.h"
+#include "meta/processors/job/RebuildEdgeJobExecutor.h"
+#include "meta/processors/job/RebuildFTJobExecutor.h"
+#include "meta/processors/job/RebuildTagJobExecutor.h"
+#include "meta/processors/job/StatsJobExecutor.h"
+#include "meta/processors/job/TaskDescription.h"
+
+DECLARE_int32(heartbeat_interval_secs);
+DECLARE_uint32(expired_time_factor);
+
+namespace nebula {
+namespace meta {
+
+ErrOrHosts StorageJobExecutor::getTargetHost(GraphSpaceID spaceId) {
+  std::unique_ptr<kvstore::KVIterator> iter;
+  const auto& partPrefix = MetaKeyUtils::partPrefix(spaceId);
+  auto retCode = kvstore_->prefix(kDefaultSpaceId, kDefaultPartId, partPrefix, &iter);
+  if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) {
+    LOG(ERROR) << "Fetch Parts Failed, error: " << apache::thrift::util::enumNameSafe(retCode);
+    return retCode;
+  }
+
+  // Use a vector instead of a set: it is more convenient for the next step.
+  std::unordered_map<HostAddr, std::vector<PartitionID>> hostAndPart;
+  std::vector<std::pair<HostAddr, std::vector<PartitionID>>> hosts;
+  while (iter->valid()) {
+    auto part = MetaKeyUtils::parsePartKeyPartId(iter->key());
+    auto targets = MetaKeyUtils::parsePartVal(iter->val());
+    for (auto& target : targets) {
+      hostAndPart[target].emplace_back(part);
+    }
+    iter->next();
+  }
+  for (auto it = hostAndPart.begin(); it != hostAndPart.end(); it++) {
+    hosts.emplace_back(std::pair(it->first, it->second));
+  }
+  return hosts;
+}
+
+ErrOrHosts StorageJobExecutor::getLeaderHost(GraphSpaceID space) {
+  const auto& hostPrefix = MetaKeyUtils::leaderPrefix(space);
+  std::unique_ptr<kvstore::KVIterator> leaderIter;
+  auto retCode = kvstore_->prefix(kDefaultSpaceId, kDefaultPartId, hostPrefix, &leaderIter);
+  if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) {
+    LOG(ERROR) << "Get space " << space
+               << "'s part failed, error: " << apache::thrift::util::enumNameSafe(retCode);
+    return retCode;
+  }
+
+  std::vector<std::pair<HostAddr, std::vector<PartitionID>>> hosts;
+  HostAddr host;
+  nebula::cpp2::ErrorCode code;
+  for (; leaderIter->valid(); leaderIter->next()) {
+    auto spaceAndPart = MetaKeyUtils::parseLeaderKeyV3(leaderIter->key());
+    auto partId = spaceAndPart.second;
+    std::tie(host, std::ignore, code) = MetaKeyUtils::parseLeaderValV3(leaderIter->val());
+    if (code != nebula::cpp2::ErrorCode::SUCCEEDED) {
+      continue;
+    }
+    auto it =
+        std::find_if(hosts.begin(), hosts.end(), [&](auto& item) { return item.first == host; });
+    if (it == hosts.end()) {
+      hosts.emplace_back(std::make_pair(host, std::vector<PartitionID>{partId}));
+    } else {
+      it->second.emplace_back(partId);
+    }
+  }
+  return hosts;
+}
+
+ErrOrHosts StorageJobExecutor::getListenerHost(GraphSpaceID space, cpp2::ListenerType type) {
+  const auto& prefix = MetaKeyUtils::listenerPrefix(space, type);
+  std::unique_ptr<kvstore::KVIterator> iter;
+  auto ret = kvstore_->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter);
+  if (ret != nebula::cpp2::ErrorCode::SUCCEEDED) {
+    LOG(ERROR) << "Get space " << space
+               << "'s listener failed, error: " << apache::thrift::util::enumNameSafe(ret);
+    return ret;
+  }
+
+  auto activeHostsRet =
+      ActiveHostsMan::getActiveHosts(kvstore_,
+                                     FLAGS_heartbeat_interval_secs * FLAGS_expired_time_factor,
+                                     cpp2::HostRole::LISTENER);
+  if (!nebula::ok(activeHostsRet)) {
+    return nebula::error(activeHostsRet);
+  }
+
+  auto activeHosts = std::move(nebula::value(activeHostsRet));
+  std::vector<std::pair<HostAddr, std::vector<PartitionID>>> hosts;
+
+  while (iter->valid()) {
+    auto host = MetaKeyUtils::deserializeHostAddr(iter->val());
+    auto part = MetaKeyUtils::parseListenerPart(iter->key());
+    if (std::find(activeHosts.begin(), activeHosts.end(), host) == activeHosts.end()) {
+      LOG(ERROR) << "Invalid host : " << network::NetworkUtils::toHostsStr({host});
+      return nebula::cpp2::ErrorCode::E_INVALID_HOST;
+    }
+    auto it = std::find_if(
+        hosts.begin(), hosts.end(), [&host](auto& item) { return item.first == host; });
+    if (it == hosts.end()) {
+      hosts.emplace_back(std::make_pair(host, std::vector<PartitionID>{part}));
+    } else {
+      it->second.emplace_back(part);
+    }
+    iter->next();
+  }
+  if (hosts.empty()) {
+    return nebula::cpp2::ErrorCode::E_LISTENER_NOT_FOUND;
+  }
+  return hosts;
+}
+
+nebula::cpp2::ErrorCode StorageJobExecutor::execute() {
+  ErrOrHosts addressesRet;
+  switch (toHost_) {
+    case TargetHosts::LEADER: {
+      addressesRet = getLeaderHost(space_);
+      break;
+    }
+    case TargetHosts::LISTENER: {
+      addressesRet = getListenerHost(space_, cpp2::ListenerType::ELASTICSEARCH);
+      break;
+    }
+    case TargetHosts::DEFAULT: {
+      addressesRet = getTargetHost(space_);
+      break;
+    }
+  }
+
+  if (!nebula::ok(addressesRet)) {
+    LOG(ERROR) << "Can't get hosts";
+    return nebula::error(addressesRet);
+  }
+
+  std::vector<PartitionID> parts;
+  auto addresses = nebula::value(addressesRet);
+
+  // Write all the task records first.
+  for (auto i = 0U; i != addresses.size(); ++i) {
+    TaskDescription task(jobId_, i, addresses[i].first);
+    std::vector<kvstore::KV> data{{task.taskKey(), task.taskVal()}};
+    folly::Baton<true, std::atomic> baton;
+    auto rc = nebula::cpp2::ErrorCode::SUCCEEDED;
+    kvstore_->asyncMultiPut(
+        kDefaultSpaceId, kDefaultPartId, std::move(data), [&](nebula::cpp2::ErrorCode code) {
+          rc = code;
+          baton.post();
+        });
+    baton.wait();
+    if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) {
+      LOG(INFO) << "write to kv store failed, error: " << apache::thrift::util::enumNameSafe(rc);
+      return rc;
+    }
+  }
+
+  std::vector<folly::Future<Status>> futures;
+  for (auto& address : addresses) {
+    // Transform to the admin host.
+    auto h = Utils::getAdminAddrFromStoreAddr(address.first);
+    futures.emplace_back(executeInternal(std::move(h), std::move(address.second)));
+  }
+
+  auto rc = nebula::cpp2::ErrorCode::SUCCEEDED;
+  auto tries = folly::collectAll(std::move(futures)).get();
+  for (auto& t : tries) {
+    if (t.hasException()) {
+      LOG(ERROR) << t.exception().what();
+      rc = nebula::cpp2::ErrorCode::E_RPC_FAILURE;
+      continue;
+    }
+    if (!t.value().ok()) {
+      LOG(ERROR) << t.value().toString();
+      rc = nebula::cpp2::ErrorCode::E_RPC_FAILURE;
+      continue;
+    }
+  }
+  return rc;
+}
+
+}  // namespace meta
+}  // namespace nebula
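The execute() skeleton above persists one task record per target host, fans out one future per
host, and then folds every failure into a single error code. A condensed, self-contained sketch
of that collect-and-fold pattern (illustrative only; plain folly with toy values, no meta types):

#include <folly/futures/Future.h>

#include <iostream>
#include <vector>

// Fan out one future per target host, wait for all of them, and fold every
// failure into a single result -- the same shape as execute() above.
int main() {
  std::vector<folly::Future<bool>> futures;
  for (int host = 0; host < 3; host++) {
    // Pretend host 1 fails its task; real code would issue an admin RPC here.
    futures.emplace_back(folly::makeFuture().thenValue([host](auto&&) { return host != 1; }));
  }
  bool ok = true;
  for (auto& t : folly::collectAll(std::move(futures)).get()) {
    // hasException() catches transport errors; value() is the task status.
    if (t.hasException() || !t.value()) {
      ok = false;  // record the failure but keep draining, like execute() does
    }
  }
  std::cout << (ok ? "all tasks succeeded" : "at least one task failed") << std::endl;
}

The point of draining instead of returning early is the same as in execute(): every host's
outcome gets logged, and the job still reports a single aggregate error code.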
diff --git a/src/meta/processors/job/StorageJobExecutor.h b/src/meta/processors/job/StorageJobExecutor.h
new file mode 100644
index 00000000000..9bb9c5bf07e
--- /dev/null
+++ b/src/meta/processors/job/StorageJobExecutor.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2020 vesoft inc. All rights reserved.
+ *
+ * This source code is licensed under Apache 2.0 License.
+ */
+
+#ifndef META_STORAGEJOBEXECUTOR_H_
+#define META_STORAGEJOBEXECUTOR_H_
+
+#include 
+
+#include "common/base/ErrorOr.h"
+#include "kvstore/KVStore.h"
+#include "meta/processors/admin/AdminClient.h"
+#include "meta/processors/job/JobDescription.h"
+#include "meta/processors/job/JobExecutor.h"
+
+namespace nebula {
+namespace meta {
+
+using PartsOfHost = std::pair<HostAddr, std::vector<PartitionID>>;
+using ErrOrHosts = ErrorOr<nebula::cpp2::ErrorCode, std::vector<PartsOfHost>>;
+
+class StorageJobExecutor : public JobExecutor {
+ public:
+  enum class TargetHosts { LEADER = 0, LISTENER, DEFAULT };
+
+  StorageJobExecutor(JobID jobId,
+                     kvstore::KVStore* kvstore,
+                     AdminClient* adminClient,
+                     const std::vector<std::string>& paras)
+      : JobExecutor(kvstore), jobId_(jobId), adminClient_(adminClient), paras_(paras) {}
+
+  virtual ~StorageJobExecutor() = default;
+
+  // Check the arguments about the job.
+  bool check() override { return true; }
+
+  // Prepare the Job info from the arguments.
+  nebula::cpp2::ErrorCode prepare() override { return nebula::cpp2::ErrorCode::SUCCEEDED; }
+
+  // The skeleton to run the job.
+  // Subclasses override executeInternal to send the actual requests.
+  nebula::cpp2::ErrorCode execute() override;
+
+  void interruptExecution(JobID jobId);
+
+  // Stop the job when the user cancels it.
+  nebula::cpp2::ErrorCode stop() override { return nebula::cpp2::ErrorCode::SUCCEEDED; }
+
+  nebula::cpp2::ErrorCode finish(bool) override { return nebula::cpp2::ErrorCode::SUCCEEDED; }
+
+  void setSpaceId(GraphSpaceID spaceId) override { space_ = spaceId; }
+
+  nebula::cpp2::ErrorCode saveSpecialTaskStatus(const cpp2::ReportTaskReq&) override {
+    return nebula::cpp2::ErrorCode::SUCCEEDED;
+  }
+
+  bool isMetaJob() override { return false; }
+
+  nebula::cpp2::ErrorCode recovery() override { return nebula::cpp2::ErrorCode::SUCCEEDED; }
+
+ protected:
+  ErrOrHosts getTargetHost(GraphSpaceID space);
+
+  ErrOrHosts getLeaderHost(GraphSpaceID space);
+
+  ErrOrHosts getListenerHost(GraphSpaceID space, cpp2::ListenerType type);
+
+  virtual folly::Future<Status> executeInternal(HostAddr&& address,
+                                                std::vector<PartitionID>&& parts) = 0;
+
+ protected:
+  JobID jobId_{INT_MIN};
+  TaskID taskId_{0};
+  AdminClient* adminClient_{nullptr};
+  GraphSpaceID space_;
+  std::vector<std::string> paras_;
+  TargetHosts toHost_{TargetHosts::DEFAULT};
+  int32_t concurrency_{INT_MAX};
+  volatile bool stopped_{false};
+  std::mutex muInterrupt_;
+  std::condition_variable condInterrupt_;
+};
+
+}  // namespace meta
+}  // namespace nebula
+
+#endif  // META_STORAGEJOBEXECUTOR_H_
diff --git a/src/meta/processors/job/ZoneBalanceJobExecutor.cpp b/src/meta/processors/job/ZoneBalanceJobExecutor.cpp
new file mode 100644
index 00000000000..7c6feabde96
--- /dev/null
+++ b/src/meta/processors/job/ZoneBalanceJobExecutor.cpp
@@ -0,0 +1,305 @@
+/* Copyright (c) 2020 vesoft inc. All rights reserved.
+ *
+ * This source code is licensed under Apache 2.0 License.
+ */
+
+#include "meta/processors/job/ZoneBalanceJobExecutor.h"
+
+#include 
+
+#include "common/utils/MetaKeyUtils.h"
+#include "kvstore/NebulaStore.h"
+#include "meta/processors/job/JobUtils.h"
+
+namespace nebula {
+namespace meta {
+
+nebula::cpp2::ErrorCode ZoneBalanceJobExecutor::prepare() {
+  auto spaceRet = getSpaceIdFromName(paras_.back());
+  if (!nebula::ok(spaceRet)) {
+    LOG(ERROR) << "Can't find the space: " << paras_.back();
+    return nebula::error(spaceRet);
+  }
+  GraphSpaceID spaceId = nebula::value(spaceRet);
+  nebula::cpp2::ErrorCode rc = spaceInfo_.getInfo(spaceId, kvstore_);
+  if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) {
+    return rc;
+  }
+  lostZones_.reserve(paras_.size() - 1);
+  for (size_t i = 0; i < paras_.size() - 1; i++) {
+    lostZones_.emplace_back(paras_[i]);
+  }
+  return nebula::cpp2::ErrorCode::SUCCEEDED;
+}
+
+nebula::cpp2::ErrorCode ZoneBalanceJobExecutor::stop() {
+  plan_->stop();
+  return nebula::cpp2::ErrorCode::SUCCEEDED;
+}
+
+folly::Future<Status> ZoneBalanceJobExecutor::executeInternal() {
+  if (plan_ == nullptr) {
+    Status status = buildBalancePlan();
+    if (status != Status::OK()) {
+      return status;
+    }
+  }
+  plan_->setFinishCallBack([this](meta::cpp2::JobStatus status) {
+    if (LastUpdateTimeMan::update(kvstore_, time::WallClock::fastNowInMilliSec()) !=
+        nebula::cpp2::ErrorCode::SUCCEEDED) {
+      LOG(ERROR) << "Balance plan " << plan_->id() << " update meta failed";
+    }
+    if (status == meta::cpp2::JobStatus::FINISHED) {
+      nebula::cpp2::ErrorCode ret = updateMeta();
+      if (ret != nebula::cpp2::ErrorCode::SUCCEEDED) {
+        status = meta::cpp2::JobStatus::FAILED;
+      }
+    }
+    executorOnFinished_(status);
+  });
+  plan_->invoke();
+  return Status::OK();
+}
+
+nebula::cpp2::ErrorCode ZoneBalanceJobExecutor::updateMeta() {
+  std::string spaceKey = MetaKeyUtils::spaceKey(spaceInfo_.spaceId_);
+  std::string spaceVal;
+  kvstore_->get(kDefaultSpaceId, kDefaultPartId, spaceKey, &spaceVal);
+  meta::cpp2::SpaceDesc properties = MetaKeyUtils::parseSpace(spaceVal);
+  std::vector<std::string> zones;
+  for (std::string& zn : lostZones_) {
+    spaceInfo_.zones_.erase(zn);
+  }
+  for (auto& p : spaceInfo_.zones_) {
+    zones.push_back(p.first);
+  }
+  properties.set_zone_names(std::move(zones));
+  std::vector<kvstore::KV> data;
+  data.emplace_back(MetaKeyUtils::spaceKey(spaceInfo_.spaceId_),
+                    MetaKeyUtils::spaceVal(properties));
+  folly::Baton<true, std::atomic> baton;
+  auto ret = nebula::cpp2::ErrorCode::SUCCEEDED;
+  kvstore_->asyncMultiPut(kDefaultSpaceId,
+                          kDefaultPartId,
+                          std::move(data),
+                          [&baton, &ret](nebula::cpp2::ErrorCode code) {
+                            if (nebula::cpp2::ErrorCode::SUCCEEDED != code) {
+                              ret = code;
+                              LOG(ERROR) << "Can't write the kvstore, ret = "
+                                         << static_cast<int32_t>(code);
+                            }
+                            baton.post();
+                          });
+  baton.wait();
+  return ret;
+}
+
+/* First move the lost zones' parts to the active zones, then rebalance the
+ * active zones. */
+Status ZoneBalanceJobExecutor::buildBalancePlan() {
+  for (std::string& zn : lostZones_) {
+    if (!spaceInfo_.zones_.count(zn)) {
+      return Status::Error("space %s does not have zone %s", spaceInfo_.name_.c_str(), zn.c_str());
+    }
+  }
+
+  std::map<std::string, Zone*> activeZones;
+  std::map<std::string, Zone*> lostZones;
+  for (auto& zoneMapEntry : spaceInfo_.zones_) {
+    activeZones.emplace(zoneMapEntry.first, &zoneMapEntry.second);
+  }
+  for (std::string& zn : lostZones_) {
+    auto it = activeZones.find(zn);
+    if (it != activeZones.end()) {
+      lostZones.emplace(it->first, it->second);
+      activeZones.erase(it);
+    }
+  }
+  int32_t activeSize = activeZones.size();
+  if (activeSize < spaceInfo_.replica_) {
+    return Status::Error("Not enough alive zones to hold replica");
+  }
+  std::vector<BalanceTask> tasks;
+
+  std::vector<Zone*> sortedActiveZones;
+  sortedActiveZones.reserve(activeZones.size());
+  std::map<std::string, std::vector<Host*>> sortedZoneHosts;
+  std::for_each(activeZones.begin(),
+                activeZones.end(),
+                [&sortedActiveZones,
+                 &sortedZoneHosts](std::pair<const std::string, Zone*>& activeZonesEntry) {
+                  sortedActiveZones.push_back(activeZonesEntry.second);
+                  std::vector<Host*>& hosts = sortedZoneHosts[activeZonesEntry.first];
+                  for (auto& hostMapEntry : activeZonesEntry.second->hosts_) {
+                    hosts.push_back(&hostMapEntry.second);
+                  }
+                  std::sort(hosts.begin(), hosts.end(), [](Host*& l, Host*& r) -> bool {
+                    return l->parts_.size() < r->parts_.size();
+                  });
+                  sortedActiveZones.back()->calPartNum();
+                });
+  std::sort(sortedActiveZones.begin(), sortedActiveZones.end(), [](Zone*& l, Zone*& r) -> bool {
+    return l->partNum_ < r->partNum_;
+  });
+
+  auto insertPartIntoZone = [&sortedZoneHosts](Zone* zone, PartitionID partId) -> HostAddr {
+    std::vector<Host*>& sortedHosts = sortedZoneHosts[zone->zoneName_];
+    sortedHosts.front()->parts_.emplace(partId);
+    zone->partNum_++;
+    HostAddr ha = sortedHosts.front()->ha_;
+    for (size_t i = 0; i < sortedHosts.size() - 1; i++) {
+      if (sortedHosts[i]->parts_.size() >= sortedHosts[i + 1]->parts_.size()) {
+        std::swap(sortedHosts[i], sortedHosts[i + 1]);
+      } else {
+        break;
+      }
+    }
+    return ha;
+  };
+
+  auto chooseZoneToInsert = [&insertPartIntoZone,
+                             &sortedActiveZones](PartitionID partId) -> HostAddr {
+    size_t index = 0;
+    for (size_t i = 0; i < sortedActiveZones.size(); i++) {
+      if (!sortedActiveZones[i]->partExist(partId)) {
+        index = i;
+        break;
+      }
+    }
+    HostAddr ha = insertPartIntoZone(sortedActiveZones[index], partId);
+    for (size_t i = index; i < sortedActiveZones.size() - 1; i++) {
+      if (sortedActiveZones[i]->partNum_ >= sortedActiveZones[i + 1]->partNum_) {
+        std::swap(sortedActiveZones[i], sortedActiveZones[i + 1]);
+      } else {
+        break;
+      }
+    }
+    return ha;
+  };
+
+  for (auto& zoneMapEntry : lostZones) {
+    Zone* zone = zoneMapEntry.second;
+    for (auto& hostMapEntry : zone->hosts_) {
+      for (PartitionID partId : hostMapEntry.second.parts_) {
+        HostAddr dst = chooseZoneToInsert(partId);
+        tasks.emplace_back(
+            jobId_, spaceInfo_.spaceId_, partId, hostMapEntry.first, dst, kvstore_, adminClient_);
+      }
+      hostMapEntry.second.parts_.clear();
+    }
+    zone->calPartNum();
+  }
+
+  int32_t totalPartNum = 0;
+  int32_t avgPartNum = 0;
+  for (auto& z : sortedActiveZones) {
+    totalPartNum += z->partNum_;
+  }
+  avgPartNum = totalPartNum / sortedActiveZones.size();
+  int32_t remainder = totalPartNum - avgPartNum * sortedActiveZones.size();
+  int32_t leftBegin = 0;
+  int32_t leftEnd = 0;
+  int32_t rightBegin = 0;
+  int32_t rightEnd = sortedActiveZones.size();
+  for (size_t i = 0; i < sortedActiveZones.size(); i++) {
+    if (avgPartNum <= sortedActiveZones[i]->partNum_) {
+      leftEnd = i;
+      break;
+    }
+  }
+  for (size_t i = leftEnd; i < sortedActiveZones.size(); i++) {
+    if (avgPartNum < sortedActiveZones[i]->partNum_) {
+      rightBegin = i;
+      break;
+    }
+  }
+  for (int32_t right = rightBegin; right < rightEnd;) {
+    Zone* srcZone = sortedActiveZones[right];
+    // If remainder > 0, some zones end up holding avgPartNum + 1 parts; we prefer
+    // the zones on the right side to hold them.
+    if (srcZone->partNum_ == avgPartNum + 1 && remainder) {
+      right++;
+      remainder--;
+      continue;
+    }
+    if (srcZone->partNum_ == avgPartNum) {
+      right++;
+      continue;
+    }
+    std::vector<Host*>& sortedHosts = sortedZoneHosts[srcZone->zoneName_];
+    int32_t hostIndex = sortedHosts.size() - 1;
+    // To pick a part to move, prefer the host that currently holds the most parts.
+    for (; hostIndex >= 0; hostIndex--) {
+      std::set<PartitionID>& hostParts = sortedHosts[hostIndex]->parts_;
+      PartitionID movePart = -1;
+      for (PartitionID partId : hostParts) {
+        bool matched = false;
+        // Find a zone on the left side that does not already hold this part.
+        for (int32_t leftIndex = leftBegin; leftIndex < leftEnd; leftIndex++) {
+          if (!sortedActiveZones[leftIndex]->partExist(partId)) {
+            HostAddr dst = insertPartIntoZone(sortedActiveZones[leftIndex], partId);
+            tasks.emplace_back(jobId_,
+                               spaceInfo_.spaceId_,
+                               partId,
+                               sortedHosts[hostIndex]->ha_,
+                               dst,
+                               kvstore_,
+                               adminClient_);
+            movePart = partId;
+            int32_t newLeftIndex = leftIndex;
+            for (; newLeftIndex < leftEnd - 1; newLeftIndex++) {
+              if (sortedActiveZones[newLeftIndex]->partNum_ >
+                  sortedActiveZones[newLeftIndex + 1]->partNum_) {
+                std::swap(sortedActiveZones[newLeftIndex], sortedActiveZones[newLeftIndex + 1]);
+              } else {
+                break;
+              }
+            }
+            // If the zone's part count reaches avgPartNum, it can't receive parts any more.
+            if (newLeftIndex == leftEnd - 1 &&
+                sortedActiveZones[newLeftIndex]->partNum_ >= avgPartNum) {
+              leftEnd--;
+            }
+            // All zones on the left side have reached avgPartNum; if there is still a
+            // remainder, some of them will take avgPartNum + 1.
+            if (leftBegin == leftEnd) {
+              leftEnd = rightBegin;
+            }
+            matched = true;
+            break;
+          }
+        }
+        if (matched) {
+          break;
+        }
+      }
+      if (movePart != -1) {
+        hostParts.erase(movePart);
+        srcZone->partNum_--;
+        break;
+      }
+    }
+    for (int32_t i = hostIndex; i > 0; i--) {
+      if (sortedHosts[i]->parts_.size() <= sortedHosts[i - 1]->parts_.size()) {
+        std::swap(sortedHosts[i], sortedHosts[i - 1]);
+      } else {
+        break;
+      }
+    }
+  }
+  if (tasks.empty()) {
+    return Status::Balanced();
+  }
+  plan_.reset(new BalancePlan(jobDescription_, kvstore_, adminClient_));
+  for (BalanceTask& task : tasks) {
+    plan_->addTask(std::move(task));
+  }
+  nebula::cpp2::ErrorCode rc = plan_->saveInStore();
+  if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) {
+    return Status::Error("save balance zone plan failed");
+  }
+  return Status::OK();
+}
+
+}  // namespace meta
+}  // namespace nebula
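buildBalancePlan above drains the lost zones first and then levels the remaining zones around
avgPartNum, always keeping at most one replica of a part per zone. A self-contained toy of that
placement rule (illustrative only; ToyZone stands in for the executor's Zone/Host structures,
and the data is made up):

#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

// Zones are kept sorted by part count, and each part from a drained zone goes
// to the least-loaded zone that does not already hold a replica of it.
struct ToyZone {
  std::string name;
  std::set<int> parts;  // partIds hosted anywhere in this zone
};

int main() {
  std::vector<ToyZone> active = {{"z1", {1, 2}}, {"z2", {2, 3}}, {"z3", {1, 4}}};
  std::set<int> lost = {1, 3, 5};  // parts held by the zone being removed

  auto byLoad = [](const ToyZone& l, const ToyZone& r) {
    return l.parts.size() < r.parts.size();
  };
  std::sort(active.begin(), active.end(), byLoad);

  for (int part : lost) {
    // First zone in sorted order (i.e. least loaded) without a replica of this part.
    for (auto& zone : active) {
      if (zone.parts.count(part) == 0) {
        zone.parts.insert(part);
        std::cout << "part " << part << " -> " << zone.name << "\n";
        break;
      }
    }
    // Re-sort so the next placement still sees zones ordered by load; the real
    // code bubbles only the single changed zone, which is what the swap loops
    // in insertPartIntoZone and chooseZoneToInsert do.
    std::sort(active.begin(), active.end(), byLoad);
  }
}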
diff --git a/src/meta/processors/job/ZoneBalanceJobExecutor.h b/src/meta/processors/job/ZoneBalanceJobExecutor.h
new file mode 100644
index 00000000000..9c96d066a91
--- /dev/null
+++ b/src/meta/processors/job/ZoneBalanceJobExecutor.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2020 vesoft inc. All rights reserved.
+ *
+ * This source code is licensed under Apache 2.0 License.
+ */
+
+#ifndef META_ZONEBALANCEJOBEXECUTOR_H_
+#define META_ZONEBALANCEJOBEXECUTOR_H_
+
+#include "meta/processors/job/BalanceJobExecutor.h"
+
+namespace nebula {
+namespace meta {
+
+class ZoneBalanceJobExecutor : public BalanceJobExecutor {
+  FRIEND_TEST(BalanceTest, RemoveZonePlanTest);
+  FRIEND_TEST(BalanceTest, BalanceZonePlanTest);
+  FRIEND_TEST(BalanceTest, BalanceZoneRemainderPlanTest);
+  FRIEND_TEST(BalanceTest, NormalZoneTest);
+  FRIEND_TEST(BalanceTest, StopPlanTest);
+
+ public:
+  ZoneBalanceJobExecutor(JobDescription jobDescription,
+                         kvstore::KVStore* kvstore,
+                         AdminClient* adminClient,
+                         const std::vector<std::string>& params)
+      : BalanceJobExecutor(jobDescription.getJobId(), kvstore, adminClient, params),
+        jobDescription_(jobDescription) {}
+
+  nebula::cpp2::ErrorCode prepare() override;
+  nebula::cpp2::ErrorCode stop() override;
+
+ protected:
+  folly::Future<Status> executeInternal() override;
+  Status buildBalancePlan() override;
+  nebula::cpp2::ErrorCode updateMeta();
+
+ private:
+  std::vector<std::string> lostZones_;
+  JobDescription jobDescription_;
+};
+
+}  // namespace meta
+}  // namespace nebula
+
+#endif  // META_ZONEBALANCEJOBEXECUTOR_H_
diff --git a/src/meta/test/BalancerTest.cpp b/src/meta/test/BalancerTest.cpp
index ff0504d22b1..54e918b0535 100644
--- a/src/meta/test/BalancerTest.cpp
+++ b/src/meta/test/BalancerTest.cpp
@@ -10,6 +10,9 @@
 #include "common/base/Base.h"
 #include "common/fs/TempDir.h"
 #include "meta/processors/job/BalanceJobExecutor.h"
+#include "meta/processors/job/DataBalanceJobExecutor.h"
+#include "meta/processors/job/LeaderBalanceJobExecutor.h"
+#include "meta/processors/job/ZoneBalanceJobExecutor.h"
 #include "meta/processors/parts/CreateSpaceProcessor.h"
 #include "meta/test/MockAdminClient.h"
 #include "meta/test/TestUtils.h"
@@ -90,695 +93,333 @@ TEST(BalanceTest, BalanceTaskTest) {
   LOG(INFO) << "Test finished!";
 }
 
-void showHostLoading(kvstore::KVStore* kv, GraphSpaceID spaceId) {
-  auto prefix = MetaKeyUtils::partPrefix(spaceId);
-  std::unique_ptr<kvstore::KVIterator> iter;
-  auto ret = kv->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter);
-  ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, ret);
-  HostParts hostPart;
-  while (iter->valid()) {
-    auto key = iter->key();
-    PartitionID partId;
-    memcpy(&partId, key.data() + prefix.size(), sizeof(PartitionID));
-    auto hs = MetaKeyUtils::parsePartVal(iter->val());
-    for (auto h : hs) {
-      hostPart[h].emplace_back(partId);
-    }
-    iter->next();
-  }
-
-  for (auto it = hostPart.begin(); it != hostPart.end(); it++) {
-    std::stringstream ss;
-    for (auto part : it->second) {
-      ss << part << " ";
-    }
-    LOG(INFO) << "Host: " << it->first << " parts: " << ss.str();
-  }
-}
-
-HostParts assignHostParts(kvstore::KVStore* kv, GraphSpaceID spaceId) {
-  auto prefix = MetaKeyUtils::partPrefix(spaceId);
-  std::unique_ptr<kvstore::KVIterator> iter;
-  kv->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter);
-  HostParts hostPart;
-  while (iter->valid()) {
-    auto key = iter->key();
-    PartitionID partId;
-    memcpy(&partId, key.data() + prefix.size(), sizeof(PartitionID));
-    auto hs = MetaKeyUtils::parsePartVal(iter->val());
-    for (auto h : hs) {
-      hostPart[h].emplace_back(partId);
-    }
-    iter->next();
-  }
-  return hostPart;
-}
-
-void testRestBlancer() {
-  DataBalanceJobExecutor::plan_.reset(nullptr);
-  BalanceJobExecutor::lock_.unlock();
-  BalanceJobExecutor::running_ = false;
-  LeaderBalanceJobExecutor::inLeaderBalance_ = false;
-}
-
-TEST(BalanceTest, SimpleTestWithZone) {
-  fs::TempDir rootPath("/tmp/SimpleTestWithZone.XXXXXX");
-  auto store = MockCluster::initMetaKV(rootPath.path());
-
auto* kv = dynamic_cast(store.get()); - FLAGS_heartbeat_interval_secs = 1; - { - std::vector hosts; - for (int i = 0; i < 4; i++) { - hosts.emplace_back(std::to_string(i), i); - } - TestUtils::createSomeHosts(kv, hosts); - TestUtils::registerHB(kv, hosts); - - // create zone and group - ZoneInfo zoneInfo = {{"zone_0", {{"0", 0}}}, - {"zone_1", {{"1", 1}}}, - {"zone_2", {{"2", 2}}}, - {"zone_3", {{"3", 3}}}}; - TestUtils::assembleZone(kv, zoneInfo); - } - { - cpp2::SpaceDesc properties; - properties.space_name_ref() = "default_space"; - properties.partition_num_ref() = 4; - properties.replica_factor_ref() = 3; - std::vector zones = {"zone_0", "zone_1", "zone_2", "zone_3"}; - properties.zone_names_ref() = std::move(zones); - cpp2::CreateSpaceReq req; - req.properties_ref() = std::move(properties); - auto* processor = CreateSpaceProcessor::instance(kv); - auto f = processor->getFuture(); - processor->process(req); - auto resp = std::move(f).get(); - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, resp.get_code()); - ASSERT_EQ(1, resp.get_id().get_space_id()); - } - sleep(1); - { - HostParts hostParts; - hostParts.emplace(HostAddr("0", 0), std::vector{1, 2, 3, 4}); - hostParts.emplace(HostAddr("1", 1), std::vector{1, 2, 3, 4}); - hostParts.emplace(HostAddr("2", 2), std::vector{1, 2, 3, 4}); - hostParts.emplace(HostAddr("3", 3), std::vector{}); - int32_t totalParts = 12; - std::vector tasks; - NiceMock client; - JobDescription jd( - testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {}); - DataBalanceJobExecutor balancer(jd, kv, &client, {}); - std::vector zones = {"zone_0", "zone_1", "zone_2", "zone_3"}; - auto code = balancer.assembleZoneParts(zones, hostParts); - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, code); - balancer.balanceParts(0, hostParts, totalParts, tasks, true); - for (auto it = hostParts.begin(); it != hostParts.end(); it++) { - EXPECT_EQ(3, it->second.size()); - } - EXPECT_EQ(3, tasks.size()); - } - testRestBlancer(); -} - -TEST(BalanceTest, ExpansionZoneTest) { - fs::TempDir rootPath("/tmp/ExpansionZoneTest.XXXXXX"); - auto store = MockCluster::initMetaKV(rootPath.path()); - auto* kv = dynamic_cast(store.get()); - FLAGS_heartbeat_interval_secs = 1; - { - std::vector hosts; - for (int i = 0; i < 3; i++) { - hosts.emplace_back(std::to_string(i), i); - } - TestUtils::createSomeHosts(kv, hosts); - TestUtils::registerHB(kv, hosts); - - // create zone and group - ZoneInfo zoneInfo = {{"zone_0", {{"0", 0}}}, {"zone_1", {{"1", 1}}}, {"zone_2", {{"2", 2}}}}; - TestUtils::assembleZone(kv, zoneInfo); - } - { - cpp2::SpaceDesc properties; - properties.space_name_ref() = "default_space"; - properties.partition_num_ref() = 4; - properties.replica_factor_ref() = 3; - std::vector zones = {"zone_0", "zone_1", "zone_2"}; - properties.zone_names_ref() = std::move(zones); - cpp2::CreateSpaceReq req; - req.properties_ref() = std::move(properties); - auto* processor = CreateSpaceProcessor::instance(kv); - auto f = processor->getFuture(); - processor->process(req); - auto resp = std::move(f).get(); - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, resp.get_code()); - ASSERT_EQ(1, resp.get_id().get_space_id()); - } - - DefaultValue>::SetFactory( - [] { return folly::Future(Status::OK()); }); - NiceMock client; - JobDescription jd( - testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {}); - DataBalanceJobExecutor balancer(jd, kv, &client, {}); - auto ret = balancer.executeInternal(HostAddr(), {}); - ASSERT_EQ(Status::OK(), ret.value()); - { - 
std::vector hosts; - for (int i = 0; i < 4; i++) { - hosts.emplace_back(std::to_string(i), i); - } - TestUtils::createSomeHosts(kv, hosts); - TestUtils::registerHB(kv, hosts); - ZoneInfo zoneInfo = {{"zone_0", {{"0", 0}}}, - {"zone_1", {{"1", 1}}}, - {"zone_2", {{"2", 2}}}, - {"zone_3", {{"3", 3}}}}; - TestUtils::assembleZone(kv, zoneInfo); - } - { - cpp2::SpaceDesc properties; - properties.space_name_ref() = "default_space"; - properties.partition_num_ref() = 4; - properties.replica_factor_ref() = 3; - std::vector zones = {"zone_0", "zone_1", "zone_2", "zone_3"}; - properties.zone_names_ref() = std::move(zones); - std::vector data; - data.emplace_back(MetaKeyUtils::spaceKey(1), MetaKeyUtils::spaceVal(properties)); - folly::Baton baton; - kv->asyncMultiPut(0, 0, std::move(data), [&](nebula::cpp2::ErrorCode code) { - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, code); - baton.post(); - }); - baton.wait(); - } - { - cpp2::SpaceDesc properties; - properties.space_name_ref() = "default_space"; - properties.partition_num_ref() = 4; - properties.replica_factor_ref() = 3; - std::vector zones = {"zone_0", "zone_1", "zone_2", "zone_3"}; - properties.zone_names_ref() = std::move(zones); - std::vector data; - data.emplace_back(MetaKeyUtils::spaceKey(1), MetaKeyUtils::spaceVal(properties)); - folly::Baton baton; - kv->asyncMultiPut(0, 0, std::move(data), [&](nebula::cpp2::ErrorCode code) { - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, code); - baton.post(); - }); - baton.wait(); - } - { - HostParts hostParts; - int32_t totalParts = 0; - auto result = balancer.getHostParts(1, true, hostParts, totalParts); - ASSERT_TRUE(nebula::ok(result)); - std::vector tasks; - hostParts.emplace(HostAddr("3", 3), std::vector{}); - balancer.balanceParts(0, hostParts, totalParts, tasks, true); - for (auto it = hostParts.begin(); it != hostParts.end(); it++) { - EXPECT_EQ(3, it->second.size()); - } - EXPECT_EQ(3, tasks.size()); - } - testRestBlancer(); -} - -TEST(BalanceTest, ExpansionHostIntoZoneTest) { - fs::TempDir rootPath("/tmp/ExpansionHostIntoZoneTest.XXXXXX"); - auto store = MockCluster::initMetaKV(rootPath.path()); - auto* kv = dynamic_cast(store.get()); - FLAGS_heartbeat_interval_secs = 1; - { - std::vector hosts; - for (int i = 0; i < 6; i++) { - hosts.emplace_back(std::to_string(i), i); - } - TestUtils::createSomeHosts(kv, hosts); - TestUtils::registerHB(kv, hosts); - - // create zone and group - ZoneInfo zoneInfo = {{"zone_0", {{"0", 0}}}, {"zone_1", {{"1", 1}}}, {"zone_2", {{"2", 2}}}}; - TestUtils::assembleZone(kv, zoneInfo); - } - { - cpp2::SpaceDesc properties; - properties.space_name_ref() = "default_space"; - properties.partition_num_ref() = 4; - properties.replica_factor_ref() = 3; - std::vector zones = {"zone_0", "zone_1", "zone_2"}; - properties.zone_names_ref() = std::move(zones); - cpp2::CreateSpaceReq req; - req.properties_ref() = std::move(properties); - auto* processor = CreateSpaceProcessor::instance(kv); - auto f = processor->getFuture(); - processor->process(req); - auto resp = std::move(f).get(); - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, resp.get_code()); - ASSERT_EQ(1, resp.get_id().get_space_id()); - } - - DefaultValue>::SetFactory( - [] { return folly::Future(Status::OK()); }); - NiceMock client; - JobDescription jd( - testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {}); - DataBalanceJobExecutor balancer(jd, kv, &client, {}); - auto ret = balancer.executeInternal(HostAddr(), {}); - ASSERT_EQ(Status::OK(), ret.value()); - { - std::vector hosts; - 
for (int i = 0; i < 6; i++) { - hosts.emplace_back(std::to_string(i), i); - } - TestUtils::createSomeHosts(kv, hosts); - TestUtils::registerHB(kv, hosts); - ZoneInfo zoneInfo = {{"zone_0", {{"0", 0}, {"3", 3}}}, - {"zone_1", {{"1", 1}, {"4", 4}}}, - {"zone_2", {{"2", 2}, {"5", 5}}}}; - TestUtils::assembleZone(kv, zoneInfo); - } - { - HostParts hostParts; - int32_t totalParts = 0; - auto result = balancer.getHostParts(1, true, hostParts, totalParts); - ASSERT_TRUE(nebula::ok(result)); - - std::vector tasks; - hostParts.emplace(HostAddr("3", 3), std::vector{}); - hostParts.emplace(HostAddr("4", 4), std::vector{}); - hostParts.emplace(HostAddr("5", 5), std::vector{}); - - balancer.balanceParts(0, hostParts, totalParts, tasks, true); - for (auto it = hostParts.begin(); it != hostParts.end(); it++) { - EXPECT_EQ(2, it->second.size()); - } - EXPECT_EQ(6, tasks.size()); - } - testRestBlancer(); -} - -TEST(BalanceTest, ShrinkZoneTest) { - fs::TempDir rootPath("/tmp/ShrinkZoneTest.XXXXXX"); - auto store = MockCluster::initMetaKV(rootPath.path()); - auto* kv = dynamic_cast(store.get()); - FLAGS_heartbeat_interval_secs = 1; - { - std::vector hosts; - for (int i = 0; i < 4; i++) { - hosts.emplace_back(std::to_string(i), i); - } - - TestUtils::createSomeHosts(kv, hosts); - TestUtils::registerHB(kv, hosts); - // create zone and group - ZoneInfo zoneInfo = {{"zone_0", {{"0", 0}}}, - {"zone_1", {{"1", 1}}}, - {"zone_2", {{"2", 2}}}, - {"zone_3", {{"3", 3}}}}; - TestUtils::assembleZone(kv, zoneInfo); - } - { - cpp2::SpaceDesc properties; - properties.space_name_ref() = "default_space"; - properties.partition_num_ref() = 4; - properties.replica_factor_ref() = 3; - std::vector zones = {"zone_0", "zone_1", "zone_2", "zone_3"}; - properties.zone_names_ref() = std::move(zones); - cpp2::CreateSpaceReq req; - req.properties_ref() = std::move(properties); - auto* processor = CreateSpaceProcessor::instance(kv); - auto f = processor->getFuture(); - processor->process(req); - auto resp = std::move(f).get(); - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, resp.get_code()); - ASSERT_EQ(1, resp.get_id().get_space_id()); - } - - DefaultValue>::SetFactory( - [] { return folly::Future(Status::OK()); }); - NiceMock client; - JobDescription jd( - testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {}); - DataBalanceJobExecutor balancer(jd, kv, &client, {}); - auto ret = balancer.executeInternal(HostAddr(), {}); - ASSERT_EQ(Status::OK(), ret.value()); - balancer.lostHosts_ = {{"3", 3}}; - ret = balancer.executeInternal(HostAddr(), {}); - ASSERT_EQ(Status::OK(), ret.value()); - testRestBlancer(); -} - -TEST(BalanceTest, ShrinkHostFromZoneTest) { - fs::TempDir rootPath("/tmp/ShrinkHostFromZoneTest.XXXXXX"); - auto store = MockCluster::initMetaKV(rootPath.path()); - auto* kv = dynamic_cast(store.get()); - FLAGS_heartbeat_interval_secs = 1; - { - std::vector hosts; - for (int i = 0; i < 6; i++) { - hosts.emplace_back(std::to_string(i), i); - } - TestUtils::createSomeHosts(kv, hosts); - TestUtils::registerHB(kv, hosts); - - // create zone and group - ZoneInfo zoneInfo = {{"zone_0", {{"0", 0}, {"3", 3}}}, - {"zone_1", {{"1", 1}, {"4", 4}}}, - {"zone_2", {{"2", 2}, {"5", 5}}}}; - TestUtils::assembleZone(kv, zoneInfo); - } - { - cpp2::SpaceDesc properties; - properties.space_name_ref() = "default_space"; - properties.partition_num_ref() = 4; - properties.replica_factor_ref() = 3; - std::vector zones = {"zone_0", "zone_1", "zone_2"}; - properties.zone_names_ref() = std::move(zones); - cpp2::CreateSpaceReq 
req; - req.properties_ref() = std::move(properties); - auto* processor = CreateSpaceProcessor::instance(kv); - auto f = processor->getFuture(); - processor->process(req); - auto resp = std::move(f).get(); - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, resp.get_code()); - ASSERT_EQ(1, resp.get_id().get_space_id()); - } - - DefaultValue>::SetFactory( - [] { return folly::Future(Status::OK()); }); - NiceMock client; - JobDescription jd(0L, cpp2::AdminCmd::DATA_BALANCE, {}); - DataBalanceJobExecutor balancer(jd, kv, &client, {}); - auto ret = balancer.executeInternal(HostAddr(), {}); - ASSERT_EQ(Status::OK(), ret.value()); - testRestBlancer(); - showHostLoading(kv, 1); - - { - ZoneInfo zoneInfo = { - {"zone_0", {{"0", 0}}}, {"zone_1", {{"1", 1}, {"4", 4}}}, {"zone_2", {{"2", 2}, {"5", 5}}}}; - TestUtils::assembleZone(kv, zoneInfo); - } - balancer.lostHosts_ = {{"3", 3}}; - ret = balancer.executeInternal(HostAddr(), {}); - ASSERT_EQ(Status::OK(), ret.value()); -} - -TEST(BalanceTest, DISABLED_BalanceWithComplexZoneTest) { - fs::TempDir rootPath("/tmp/LeaderBalanceWithComplexZoneTest.XXXXXX"); - auto store = MockCluster::initMetaKV(rootPath.path()); - auto* kv = dynamic_cast(store.get()); - FLAGS_heartbeat_interval_secs = 1; - std::vector hosts; - for (int i = 0; i < 18; i++) { - hosts.emplace_back(std::to_string(i), i); - } - TestUtils::createSomeHosts(kv, hosts); - TestUtils::registerHB(kv, hosts); - - { - ZoneInfo zoneInfo = { - {"zone_0", {HostAddr("0", 0), HostAddr("1", 1)}}, - {"zone_1", {HostAddr("2", 2), HostAddr("3", 3)}}, - {"zone_2", {HostAddr("4", 4), HostAddr("5", 5)}}, - {"zone_3", {HostAddr("6", 6), HostAddr("7", 7)}}, - {"zone_4", {HostAddr("8", 8), HostAddr("9", 9)}}, - {"zone_5", {HostAddr("10", 10), HostAddr("11", 11)}}, - {"zone_6", {HostAddr("12", 12), HostAddr("13", 13)}}, - {"zone_7", {HostAddr("14", 14), HostAddr("15", 15)}}, - {"zone_8", {HostAddr("16", 16), HostAddr("17", 17)}}, - }; - TestUtils::assembleZone(kv, zoneInfo); - } - { - { - cpp2::SpaceDesc properties; - properties.space_name_ref() = "default_space"; - properties.partition_num_ref() = 18; - properties.replica_factor_ref() = 3; - cpp2::CreateSpaceReq req; - req.properties_ref() = std::move(properties); - auto* processor = CreateSpaceProcessor::instance(kv); - auto f = processor->getFuture(); - processor->process(req); - auto resp = std::move(f).get(); - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, resp.get_code()); - ASSERT_EQ(1, resp.get_id().get_space_id()); - LOG(INFO) << "Show host about space " << resp.get_id().get_space_id(); - showHostLoading(kv, resp.get_id().get_space_id()); - } - { - cpp2::SpaceDesc properties; - properties.space_name_ref() = "space_on_group_0"; - properties.partition_num_ref() = 64; - properties.replica_factor_ref() = 3; - std::vector zones = {"zone_0", "zone_1", "zone_2", "zone_3", "zone_4"}; - properties.zone_names_ref() = std::move(zones); - cpp2::CreateSpaceReq req; - req.properties_ref() = std::move(properties); - auto* processor = CreateSpaceProcessor::instance(kv); - auto f = processor->getFuture(); - processor->process(req); - auto resp = std::move(f).get(); - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, resp.get_code()); - ASSERT_EQ(2, resp.get_id().get_space_id()); - LOG(INFO) << "Show host about space " << resp.get_id().get_space_id(); - showHostLoading(kv, resp.get_id().get_space_id()); - } - { - cpp2::SpaceDesc properties; - properties.space_name_ref() = "space_on_group_1"; - properties.partition_num_ref() = 81; - properties.replica_factor_ref() = 3; - std::vector 
zones = { - "zone_0", "zone_1", "zone_2", "zone_3", "zone_4", "zone_5", "zone_6", "zone_7", "zone_8"}; - properties.zone_names_ref() = std::move(zones); - cpp2::CreateSpaceReq req; - req.properties_ref() = std::move(properties); - auto* processor = CreateSpaceProcessor::instance(kv); - auto f = processor->getFuture(); - processor->process(req); - auto resp = std::move(f).get(); - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, resp.get_code()); - ASSERT_EQ(3, resp.get_id().get_space_id()); - LOG(INFO) << "Show host about space " << resp.get_id().get_space_id(); - showHostLoading(kv, resp.get_id().get_space_id()); +SpaceInfo createSpaceInfo( + const std::string& name, + GraphSpaceID spaceId, + int32_t replica, + const std::vector< + std::pair>>>>& + zones) { + SpaceInfo spaceInfo; + spaceInfo.name_ = name; + spaceInfo.spaceId_ = spaceId; + spaceInfo.replica_ = replica; + for (const auto& z : zones) { + Zone zone(z.first); + for (const auto& h : z.second) { + Host host(h.first); + for (const auto& p : h.second) { + host.parts_.insert(p); + } + zone.hosts_.emplace(host.ha_, host); } + spaceInfo.zones_.emplace(zone.zoneName_, zone); } - sleep(1); + return spaceInfo; +} - DefaultValue>::SetFactory( - [] { return folly::Future(Status::OK()); }); - NiceMock client; - JobDescription jd( - testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {}); - DataBalanceJobExecutor balancer(jd, kv, &client, {}); - { - int32_t totalParts = 18 * 3; - std::vector tasks; - auto hostParts = assignHostParts(kv, 1); - balancer.balanceParts(1, hostParts, totalParts, tasks, true); - } - { - int32_t totalParts = 64 * 3; - std::vector tasks; - auto hostParts = assignHostParts(kv, 2); - std::vector zones = {"zone_0", "zone_1", "zone_2", "zone_3", "zone_4"}; - auto code = balancer.assembleZoneParts(zones, hostParts); - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, code); - balancer.balanceParts(2, hostParts, totalParts, tasks, true); +void checkZoneAvg(const Zone& zone) { + int32_t avg = zone.partNum_ / zone.hosts_.size(); + for (const auto& p : zone.hosts_) { + EXPECT_EQ(p.second.parts_.size() - avg <= 1, true); } - { - auto dump = [](const HostParts& hostParts, const std::vector& tasks) { - for (auto it = hostParts.begin(); it != hostParts.end(); it++) { - std::stringstream ss; - ss << it->first << ": "; - for (auto partId : it->second) { - ss << partId << ", "; - } - LOG(INFO) << ss.str() << " size " << it->second.size(); - } - for (const auto& task : tasks) { - LOG(INFO) << task.taskIdStr(); - } - }; - - HostParts hostParts; - std::vector parts; - for (int32_t i = 1; i <= 81; i++) { - parts.emplace_back(i); - } - - for (int32_t i = 0; i < 18; i++) { - if (i == 10 || i == 12 || i == 14) { - hostParts.emplace(HostAddr(std::to_string(i), i), parts); - } else { - hostParts.emplace(HostAddr(std::to_string(i), i), std::vector{}); - } - } +} - LOG(INFO) << "=== original map ===="; - int32_t totalParts = 243; - std::vector tasks; - dump(hostParts, tasks); - std::vector zones = { - "zone_0", "zone_1", "zone_2", "zone_3", "zone_4", "zone_5", "zone_6", "zone_7", "zone_8"}; - auto code = balancer.assembleZoneParts(zones, hostParts); - ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, code); - balancer.balanceParts(3, hostParts, totalParts, tasks, true); +void checkConflic(const Zone& zone) { + int32_t totalNum = 0; + for (const auto& p : zone.hosts_) { + totalNum += p.second.parts_.size(); + } + EXPECT_EQ(totalNum, zone.partNum_); +} - LOG(INFO) << "=== new map ===="; - dump(hostParts, tasks); - for (auto it 
= hostParts.begin(); it != hostParts.end(); it++) { - EXPECT_GE(it->second.size(), 5); - EXPECT_LE(it->second.size(), 24); +TEST(BalanceTest, RemoveZonePlanTest) { + fs::TempDir rootPath("/tmp/RemoveZoneTest.XXXXXX"); + std::unique_ptr store = MockCluster::initMetaKV(rootPath.path()); + SpaceInfo spaceInfo = createSpaceInfo( + "space1", + 1, + 3, + {{"zone1", + {{{"127.0.0.1", 11}, {5}}, + {{"127.0.0.1", 12}, {10, 15}}, + {{"127.0.0.1", 13}, {12, 13, 14}}}}, + {"zone2", + {{{"127.0.0.1", 21}, {3, 4}}, {{"127.0.0.1", 22}, {8}}, {{"127.0.0.1", 23}, {15}}}}, + {"zone3", + {{{"127.0.0.1", 31}, {1, 2}}, + {{"127.0.0.1", 32}, {6, 7, 8, 9, 10}}, + {{"127.0.0.1", 33}, {11, 12}}}}, + {"zone4", + {{{"127.0.0.1", 41}, {1, 2, 3}}, + {{"127.0.0.1", 42}, {6, 7, 11}}, + {{"127.0.0.1", 43}, {12, 13, 14}}}}, + {"zone5", + {{{"127.0.0.1", 51}, {3, 4, 5}}, + {{"127.0.0.1", 52}, {9, 10, 11}}, + {{"127.0.0.1", 53}, {13, 14, 15}}}}, + {"zone6", {{{"127.0.0.1", 61}, {4}}, {{"127.0.0.1", 62}, {8, 9}}}}, + {"zone7", + {{{"127.0.0.1", 71}, {1, 2}}, {{"127.0.0.1", 72}, {6, 7}}, {{"127.0.0.1", 73}, {5}}}}}); + ZoneBalanceJobExecutor balancer(JobDescription(), store.get(), nullptr, {}); + balancer.lostZones_ = {"zone6", "zone7"}; + balancer.spaceInfo_ = spaceInfo; + Status status = balancer.buildBalancePlan(); + EXPECT_EQ(status, Status::OK()); + checkZoneAvg(balancer.spaceInfo_.zones_["zone1"]); + checkZoneAvg(balancer.spaceInfo_.zones_["zone2"]); + + EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].partNum_, 9); + EXPECT_EQ(balancer.spaceInfo_.zones_["zone2"].partNum_, 9); + EXPECT_EQ(balancer.spaceInfo_.zones_["zone3"].partNum_, 9); + EXPECT_EQ(balancer.spaceInfo_.zones_["zone4"].partNum_, 9); + EXPECT_EQ(balancer.spaceInfo_.zones_["zone5"].partNum_, 9); + EXPECT_EQ(balancer.spaceInfo_.zones_["zone6"].partNum_, 0); + EXPECT_EQ(balancer.spaceInfo_.zones_["zone7"].partNum_, 0); + checkConflic(balancer.spaceInfo_.zones_["zone1"]); + checkConflic(balancer.spaceInfo_.zones_["zone2"]); + checkConflic(balancer.spaceInfo_.zones_["zone3"]); + checkConflic(balancer.spaceInfo_.zones_["zone4"]); + checkConflic(balancer.spaceInfo_.zones_["zone5"]); + checkConflic(balancer.spaceInfo_.zones_["zone6"]); + checkConflic(balancer.spaceInfo_.zones_["zone7"]); +} - LOG(INFO) << "Host " << it->first << " Part Size " << it->second.size(); - } - showHostLoading(kv, 3); - } +TEST(BalanceTest, BalanceZonePlanTest) { + fs::TempDir rootPath("/tmp/BalanceZoneTest.XXXXXX"); + std::unique_ptr store = MockCluster::initMetaKV(rootPath.path()); + SpaceInfo spaceInfo = createSpaceInfo( + "space1", + 1, + 3, + { + {"zone1", + {{{"127.0.0.1", 11}, {5}}, + {{"127.0.0.1", 12}, {10, 15}}, + {{"127.0.0.1", 13}, {12, 13, 14}}}}, + {"zone2", + {{{"127.0.0.1", 21}, {3, 4}}, {{"127.0.0.1", 22}, {8}}, {{"127.0.0.1", 23}, {15}}}}, + {"zone3", + {{{"127.0.0.1", 31}, {1, 2}}, + {{"127.0.0.1", 32}, {6, 7, 8, 9, 10}}, + {{"127.0.0.1", 33}, {11, 12}}}}, + {"zone4", + {{{"127.0.0.1", 41}, {1, 2, 3, 4, 5}}, + {{"127.0.0.1", 42}, {6, 7, 8, 9, 11}}, + {{"127.0.0.1", 43}, {12, 13, 14}}}}, + {"zone5", + {{{"127.0.0.1", 51}, {1, 2, 3, 4, 5}}, + {{"127.0.0.1", 52}, {6, 7, 9, 10, 11}}, + {{"127.0.0.1", 53}, {13, 14, 15}}}}, + }); + ZoneBalanceJobExecutor balancer(JobDescription(), store.get(), nullptr, {}); + balancer.spaceInfo_ = spaceInfo; + Status status = balancer.buildBalancePlan(); + EXPECT_EQ(status, Status::OK()); + EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].partNum_, 9); + EXPECT_EQ(balancer.spaceInfo_.zones_["zone2"].partNum_, 9); + 
+TEST(BalanceTest, BalanceZonePlanTest) {
+  fs::TempDir rootPath("/tmp/BalanceZoneTest.XXXXXX");
+  std::unique_ptr<kvstore::KVStore> store = MockCluster::initMetaKV(rootPath.path());
+  SpaceInfo spaceInfo = createSpaceInfo(
+      "space1",
+      1,
+      3,
+      {
+          {"zone1",
+           {{{"127.0.0.1", 11}, {5}},
+            {{"127.0.0.1", 12}, {10, 15}},
+            {{"127.0.0.1", 13}, {12, 13, 14}}}},
+          {"zone2",
+           {{{"127.0.0.1", 21}, {3, 4}}, {{"127.0.0.1", 22}, {8}}, {{"127.0.0.1", 23}, {15}}}},
+          {"zone3",
+           {{{"127.0.0.1", 31}, {1, 2}},
+            {{"127.0.0.1", 32}, {6, 7, 8, 9, 10}},
+            {{"127.0.0.1", 33}, {11, 12}}}},
+          {"zone4",
+           {{{"127.0.0.1", 41}, {1, 2, 3, 4, 5}},
+            {{"127.0.0.1", 42}, {6, 7, 8, 9, 11}},
+            {{"127.0.0.1", 43}, {12, 13, 14}}}},
+          {"zone5",
+           {{{"127.0.0.1", 51}, {1, 2, 3, 4, 5}},
+            {{"127.0.0.1", 52}, {6, 7, 9, 10, 11}},
+            {{"127.0.0.1", 53}, {13, 14, 15}}}},
+      });
+  ZoneBalanceJobExecutor balancer(JobDescription(), store.get(), nullptr, {});
+  balancer.spaceInfo_ = spaceInfo;
+  Status status = balancer.buildBalancePlan();
+  EXPECT_EQ(status, Status::OK());
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].partNum_, 9);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone2"].partNum_, 9);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone3"].partNum_, 9);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone4"].partNum_, 9);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone5"].partNum_, 9);
+  checkZoneAvg(balancer.spaceInfo_.zones_["zone1"]);
+  checkZoneAvg(balancer.spaceInfo_.zones_["zone2"]);
+  checkZoneAvg(balancer.spaceInfo_.zones_["zone4"]);
+  checkZoneAvg(balancer.spaceInfo_.zones_["zone5"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone1"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone2"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone3"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone4"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone5"]);
+  balancer.lostZones_ = {"zone4", "zone5"};
+  status = balancer.buildBalancePlan();
+  EXPECT_EQ(status, Status::OK());
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].partNum_, 15);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone2"].partNum_, 15);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone3"].partNum_, 15);
+  balancer.lostZones_ = {};
+  status = balancer.buildBalancePlan();
+  EXPECT_EQ(status, Status::OK());
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].partNum_, 9);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone2"].partNum_, 9);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone3"].partNum_, 9);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone4"].partNum_, 9);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone5"].partNum_, 9);
 }

-TEST(BalanceTest, BalancePartsTest) {
-  fs::TempDir rootPath("/tmp/BalancePartsTest.XXXXXX");
-  auto store = MockCluster::initMetaKV(rootPath.path());
-  auto* kv = dynamic_cast<kvstore::NebulaStore*>(store.get());
+TEST(BalanceTest, BalanceZoneRemainderPlanTest) {
+  fs::TempDir rootPath("/tmp/BalanceZoneTest.XXXXXX");
+  std::unique_ptr<kvstore::KVStore> store = MockCluster::initMetaKV(rootPath.path());
+  SpaceInfo spaceInfo = createSpaceInfo(
+      "space1",
+      1,
+      3,
+      {
+          {"zone1",
+           {{{"127.0.0.1", 11}, {5}},
+            {{"127.0.0.1", 12}, {10, 15}},
+            {{"127.0.0.1", 13}, {12, 13, 14}}}},
+          {"zone2",
+           {{{"127.0.0.1", 21}, {3, 4}}, {{"127.0.0.1", 22}, {8, 16}}, {{"127.0.0.1", 23}, {15}}}},
+          {"zone3",
+           {{{"127.0.0.1", 31}, {1, 2}},
+            {{"127.0.0.1", 32}, {6, 7, 8, 9, 10}},
+            {{"127.0.0.1", 33}, {11, 12}}}},
+          {"zone4",
+           {{{"127.0.0.1", 41}, {1, 2, 3, 4, 5}},
+            {{"127.0.0.1", 42}, {6, 7, 8, 9, 11}},
+            {{"127.0.0.1", 43}, {12, 13, 14, 16}}}},
+          {"zone5",
+           {{{"127.0.0.1", 51}, {1, 2, 3, 4, 5}},
+            {{"127.0.0.1", 52}, {6, 7, 9, 10, 11}},
+            {{"127.0.0.1", 53}, {13, 14, 15, 16}}}},
+      });
+  ZoneBalanceJobExecutor balancer(JobDescription(), store.get(), nullptr, {});
+  balancer.spaceInfo_ = spaceInfo;
+  Status status = balancer.buildBalancePlan();
+  EXPECT_EQ(status, Status::OK());
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].partNum_, 9);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone2"].partNum_, 10);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone3"].partNum_, 9);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone4"].partNum_, 10);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone5"].partNum_, 10);
+  checkZoneAvg(balancer.spaceInfo_.zones_["zone1"]);
+  checkZoneAvg(balancer.spaceInfo_.zones_["zone2"]);
+  checkZoneAvg(balancer.spaceInfo_.zones_["zone4"]);
+  checkZoneAvg(balancer.spaceInfo_.zones_["zone5"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone1"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone2"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone3"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone4"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone5"]);
+
+  spaceInfo = createSpaceInfo(
+      "space1",
+      1,
+      3,
+      {
+          {"zone1",
+           {{{"127.0.0.1", 11}, {5}},
+            {{"127.0.0.1", 12}, {10, 15}},
+            {{"127.0.0.1", 13}, {12, 13, 14}}}},
+          {"zone2",
+           {{{"127.0.0.1", 21}, {3, 4}}, {{"127.0.0.1", 22}, {8}}, {{"127.0.0.1", 23}, {15}}}},
+          {"zone3",
+           {{{"127.0.0.1", 31}, {1, 2}},
+            {{"127.0.0.1", 32}, {6, 7, 8, 9, 10}},
+            {{"127.0.0.1", 33}, {11, 12, 16}}}},
+          {"zone4",
+           {{{"127.0.0.1", 41}, {1, 2, 3, 4, 5}},
+            {{"127.0.0.1", 42}, {6, 7, 8, 9, 11}},
+            {{"127.0.0.1", 43}, {12, 13, 14, 16}}}},
+          {"zone5",
+           {{{"127.0.0.1", 51}, {1, 2, 3, 4, 5}},
+            {{"127.0.0.1", 52}, {6, 7, 9, 10, 11}},
+            {{"127.0.0.1", 53}, {13, 14, 15, 16}}}},
+      });
+  balancer.spaceInfo_ = spaceInfo;
+  status = balancer.buildBalancePlan();
+  EXPECT_EQ(status, Status::OK());
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].partNum_, 9);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone2"].partNum_, 9);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone3"].partNum_, 10);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone4"].partNum_, 10);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone5"].partNum_, 10);
+  checkZoneAvg(balancer.spaceInfo_.zones_["zone1"]);
+  checkZoneAvg(balancer.spaceInfo_.zones_["zone2"]);
+  checkZoneAvg(balancer.spaceInfo_.zones_["zone4"]);
+  checkZoneAvg(balancer.spaceInfo_.zones_["zone5"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone1"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone2"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone3"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone4"]);
+  checkConflict(balancer.spaceInfo_.zones_["zone5"]);
+  status = balancer.buildBalancePlan();
+  EXPECT_EQ(status, Status::Balanced());
+}

-  DefaultValue<folly::Future<Status>>::SetFactory(
-      [] { return folly::Future<Status>(Status::OK()); });
-  NiceMock<MockAdminClient> client;
+TEST(BalanceTest, BalanceDataPlanTest) {
+  fs::TempDir rootPath("/tmp/BalanceZoneTest.XXXXXX");
+  std::unique_ptr<kvstore::KVStore> store = MockCluster::initMetaKV(rootPath.path());
+  SpaceInfo spaceInfo = createSpaceInfo(
+      "space1",
+      1,
+      3,
+      {
+          {"zone1",
+           {{{"127.0.0.1", 11}, {1, 2, 3, 53, 54}},
+            {{"127.0.0.1", 12}, {4, 5}},
+            {{"127.0.0.1", 13}, {6, 7, 8, 9, 10, 11, 12, 13, 14, 15}},
+            {{"127.0.0.1", 14}, {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}},
+            {{"127.0.0.1", 15}, {31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
+                                 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52}}}},
+      });
+  DataBalanceJobExecutor balancer(JobDescription(), store.get(), nullptr, {});
+  balancer.spaceInfo_ = spaceInfo;
+  Status status = balancer.buildBalancePlan();
+  EXPECT_EQ(status, Status::OK());
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].hosts_[HostAddr("127.0.0.1", 11)].parts_.size(),
+            11);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].hosts_[HostAddr("127.0.0.1", 12)].parts_.size(),
+            11);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].hosts_[HostAddr("127.0.0.1", 13)].parts_.size(),
+            10);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].hosts_[HostAddr("127.0.0.1", 14)].parts_.size(),
+            11);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].hosts_[HostAddr("127.0.0.1", 15)].parts_.size(),
+            11);
+
+  spaceInfo = createSpaceInfo("space1",
+                              1,
+                              3,
+                              {{"zone1",
+                                {{{"127.0.0.1", 11}, {5, 6, 7, 8, 9, 10}},
+                                 {{"127.0.0.1", 12}, {11, 12, 13, 17, 18, 19, 20}},
+                                 {{"127.0.0.1", 13}, {21, 22, 23, 28, 29, 30}},
+                                 {{"127.0.0.1", 14}, {31, 32, 33, 34, 35, 36, 37, 38, 39, 40}},
+                                 {{"127.0.0.1", 15}, {41, 42, 43, 44, 45, 46, 47, 48, 49, 50}},
+                                 {{"127.0.0.1", 16}, {51, 52, 53, 54, 14, 15, 16}},
+                                 {{"127.0.0.1", 17}, {1, 2, 3, 4, 24, 25, 26, 27}}}}});
+  balancer.spaceInfo_ = spaceInfo;
+  balancer.lostHosts_ = {{"127.0.0.1", 16}, {"127.0.0.1", 17}};
+  status = balancer.buildBalancePlan();
+  EXPECT_EQ(status, Status::OK());
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].hosts_[HostAddr("127.0.0.1", 11)].parts_.size(),
+            11);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].hosts_[HostAddr("127.0.0.1", 12)].parts_.size(),
+            11);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].hosts_[HostAddr("127.0.0.1", 13)].parts_.size(),
+            11);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].hosts_[HostAddr("127.0.0.1", 14)].parts_.size(),
+            11);
+  EXPECT_EQ(balancer.spaceInfo_.zones_["zone1"].hosts_[HostAddr("127.0.0.1", 15)].parts_.size(),
+            10);
+  status = balancer.buildBalancePlan();
+  EXPECT_EQ(status, Status::Balanced());
+}

-  auto dump = [](const HostParts& hostParts, const std::vector<BalanceTask>& tasks) {
-    for (auto it = hostParts.begin(); it != hostParts.end(); it++) {
-      std::stringstream ss;
-      ss << it->first << ": ";
-      for (auto partId : it->second) {
-        ss << partId << ", ";
-      }
-      VLOG(1) << ss.str();
-    }
-    for (const auto& task : tasks) {
-      VLOG(1) << task.taskIdStr();
-    }
-  };
-  {
-    HostParts hostParts;
-    hostParts.emplace(HostAddr("0", 0), std::vector<PartitionID>{1, 2, 3, 4});
-    hostParts.emplace(HostAddr("1", 0), std::vector<PartitionID>{1, 2, 3, 4});
-    hostParts.emplace(HostAddr("2", 0), std::vector<PartitionID>{1, 2, 3, 4});
-    hostParts.emplace(HostAddr("3", 0), std::vector<PartitionID>{});
-    int32_t totalParts = 12;
-    std::vector<BalanceTask> tasks;
-    VLOG(1) << "=== original map ====";
-    dump(hostParts, tasks);
-    JobDescription jd(
-        testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
-    DataBalanceJobExecutor balancer(jd, kv, &client, {});
-    balancer.balanceParts(0, hostParts, totalParts, tasks, false);
-    VLOG(1) << "=== new map ====";
-    dump(hostParts, tasks);
-    for (auto it = hostParts.begin(); it != hostParts.end(); it++) {
-      EXPECT_EQ(3, it->second.size());
-    }
-    EXPECT_EQ(3, tasks.size());
-  }
-  {
-    HostParts hostParts;
-    hostParts.emplace(HostAddr("0", 0), std::vector<PartitionID>{1, 2, 3, 4, 5});
-    hostParts.emplace(HostAddr("1", 0), std::vector<PartitionID>{1, 2, 4, 5});
-    hostParts.emplace(HostAddr("2", 0), std::vector<PartitionID>{2, 3, 4, 5});
-    hostParts.emplace(HostAddr("3", 0), std::vector<PartitionID>{1, 3});
-    int32_t totalParts = 15;
-    std::vector<BalanceTask> tasks;
-    VLOG(1) << "=== original map ====";
-    dump(hostParts, tasks);
-    JobDescription jd(
-        testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
-    DataBalanceJobExecutor balancer(jd, kv, &client, {});
-    balancer.balanceParts(0, hostParts, totalParts, tasks, false);
-    VLOG(1) << "=== new map ====";
-    dump(hostParts, tasks);
-    EXPECT_EQ(4, hostParts[HostAddr("0", 0)].size());
-    EXPECT_EQ(4, hostParts[HostAddr("1", 0)].size());
-    EXPECT_EQ(4, hostParts[HostAddr("2", 0)].size());
-    EXPECT_EQ(3, hostParts[HostAddr("3", 0)].size());
-    EXPECT_EQ(1, tasks.size());
-  }
-  {
-    HostParts hostParts;
-    hostParts.emplace(HostAddr("0", 0), std::vector<PartitionID>{1, 2, 3, 4});
-    hostParts.emplace(HostAddr("1", 0), std::vector<PartitionID>{1, 2, 4, 5});
-    hostParts.emplace(HostAddr("2", 0), std::vector<PartitionID>{2, 3, 4, 5});
-    hostParts.emplace(HostAddr("3", 0), std::vector<PartitionID>{1, 3, 5});
-    int32_t totalParts = 15;
-    std::vector<BalanceTask> tasks;
-    VLOG(1) << "=== original map ====";
-    dump(hostParts, tasks);
-    JobDescription jd(
-        testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
-    DataBalanceJobExecutor balancer(jd, kv, &client, {});
-    balancer.balanceParts(0, hostParts, totalParts, tasks, false);
-    VLOG(1) << "=== new map ====";
-    dump(hostParts, tasks);
-    EXPECT_EQ(4, hostParts[HostAddr("0", 0)].size());
-    EXPECT_EQ(4, hostParts[HostAddr("1", 0)].size());
-    EXPECT_EQ(4, hostParts[HostAddr("2", 0)].size());
-    EXPECT_EQ(3, hostParts[HostAddr("3", 0)].size());
-    EXPECT_EQ(0, tasks.size());
-  }
-  {
-    HostParts hostParts;
-    hostParts.emplace(HostAddr("0", 0), std::vector<PartitionID>{1, 2, 3, 4, 5, 6, 7, 8, 9});
-    hostParts.emplace(HostAddr("1", 0), std::vector<PartitionID>{1, 2, 3, 4, 5, 6, 7, 8, 9});
-    hostParts.emplace(HostAddr("2", 0), std::vector<PartitionID>{1, 2, 3, 4, 5, 6, 7, 8, 9});
-    hostParts.emplace(HostAddr("3", 0), std::vector<PartitionID>{});
-    hostParts.emplace(HostAddr("4", 0), std::vector<PartitionID>{});
-    hostParts.emplace(HostAddr("5", 0), std::vector<PartitionID>{});
-    hostParts.emplace(HostAddr("6", 0), std::vector<PartitionID>{});
-    hostParts.emplace(HostAddr("7", 0), std::vector<PartitionID>{});
-    hostParts.emplace(HostAddr("8", 0), std::vector<PartitionID>{});
-    int32_t totalParts = 27;
-    std::vector<BalanceTask> tasks;
-    VLOG(1) << "=== original map ====";
-    dump(hostParts, tasks);
-    JobDescription jd(
-        testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
-    DataBalanceJobExecutor balancer(jd, kv, &client, {});
-    balancer.balanceParts(0, hostParts, totalParts, tasks, false);
-    VLOG(1) << "=== new map ====";
-    dump(hostParts, tasks);
-    for (auto it = hostParts.begin(); it != hostParts.end(); it++) {
-      EXPECT_EQ(3, it->second.size());
+void showHostLoading(kvstore::KVStore* kv, GraphSpaceID spaceId) {
+  auto prefix = MetaKeyUtils::partPrefix(spaceId);
+  std::unique_ptr<kvstore::KVIterator> iter;
+  auto ret = kv->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter);
+  ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, ret);
+  HostParts hostPart;
+  while (iter->valid()) {
+    auto key = iter->key();
+    PartitionID partId;
+    memcpy(&partId, key.data() + prefix.size(), sizeof(PartitionID));
+    auto hs = MetaKeyUtils::parsePartVal(iter->val());
+    for (auto h : hs) {
+      hostPart[h].emplace_back(partId);
     }
-    EXPECT_EQ(18, tasks.size());
+    iter->next();
   }
-  {
-    HostParts hostParts;
-    hostParts.emplace(HostAddr("0", 0), std::vector<PartitionID>{1, 2, 3, 4, 5, 6, 7, 8, 9});
-    hostParts.emplace(HostAddr("1", 0), std::vector<PartitionID>{1, 2, 3, 4, 5, 6, 7, 8, 9});
-    hostParts.emplace(HostAddr("2", 0), std::vector<PartitionID>{1, 2, 3, 4, 5, 6, 7, 8, 9});
-    hostParts.emplace(HostAddr("3", 0), std::vector<PartitionID>{});
-    hostParts.emplace(HostAddr("4", 0), std::vector<PartitionID>{});
-    hostParts.emplace(HostAddr("5", 0), std::vector<PartitionID>{});
-    hostParts.emplace(HostAddr("6", 0), std::vector<PartitionID>{});
-    hostParts.emplace(HostAddr("7", 0), std::vector<PartitionID>{});
-    int32_t totalParts = 27;
-    std::vector<BalanceTask> tasks;
-    VLOG(1) << "=== original map ====";
-    dump(hostParts, tasks);
-    JobDescription jd(
-        testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
-    DataBalanceJobExecutor balancer(jd, kv, &client, {});
-    balancer.balanceParts(0, hostParts, totalParts, tasks, false);
-    VLOG(1) << "=== new map ====";
-    dump(hostParts, tasks);
-    for (auto it = hostParts.begin(); it != hostParts.end(); it++) {
-      EXPECT_GE(4, it->second.size());
-      EXPECT_LE(3, it->second.size());
+
+  for (auto it = hostPart.begin(); it != hostPart.end(); it++) {
+    std::stringstream ss;
+    for (auto part : it->second) {
+      ss << part << " ";
     }
-    EXPECT_EQ(18, tasks.size());
+    LOG(INFO) << "Host: " << it->first << " parts: " << ss.str();
   }
 }

@@ -894,7 +535,7 @@ TEST(BalanceTest, BalancePlanTest) {
       plan.addTask(std::move(task));
     }
     folly::Baton<true, std::atomic> b;
-    plan.onFinished_ = [&plan, &b]() {
+    plan.onFinished_ = [&plan, &b](meta::cpp2::JobStatus) {
      ASSERT_EQ(meta::cpp2::JobStatus::FINISHED, plan.status());
       ASSERT_EQ(10, plan.finishedTaskNum_);
       b.post();
@@ -925,7 +566,7 @@ TEST(BalanceTest, BalancePlanTest) {
       plan.addTask(std::move(task));
     }
     folly::Baton<true, std::atomic> b;
-    plan.onFinished_ = [&plan, &b]() {
+    plan.onFinished_ = [&plan, &b](meta::cpp2::JobStatus) {
       ASSERT_EQ(meta::cpp2::JobStatus::FINISHED, plan.status());
       ASSERT_EQ(10, plan.finishedTaskNum_);
       b.post();
@@ -966,7 +607,7 @@ TEST(BalanceTest, BalancePlanTest) {
     }
     TestUtils::registerHB(kv, hosts);
     folly::Baton<true, std::atomic> b;
-    plan.onFinished_ = [&plan, &b]() {
+    plan.onFinished_ = [&plan, &b](meta::cpp2::JobStatus) {
       ASSERT_EQ(meta::cpp2::JobStatus::FAILED, plan.status());
       ASSERT_EQ(10, plan.finishedTaskNum_);
       b.post();
@@ -976,17 +617,6 @@ TEST(BalanceTest, BalancePlanTest) {
   }
 }

-void verifyBalancePlan(kvstore::KVStore* kv, JobID jobId, meta::cpp2::JobStatus jobStatus) {
-  std::string key = JobDescription::makeJobKey(jobId);
-  std::string value;
-  auto retcode = kv->get(kDefaultSpaceId, kDefaultPartId, key, &value);
-  EXPECT_EQ(retcode, nebula::cpp2::ErrorCode::SUCCEEDED);
-  auto optJobRet = JobDescription::makeJobDescription(key, value);
-  EXPECT_TRUE(nebula::ok(optJobRet));
-  auto optJob = nebula::value(optJobRet);
-  EXPECT_EQ(jobStatus, optJob.getStatus());
-}
-
 void verifyBalanceTask(kvstore::KVStore* kv,
                        JobID jobId,
                        BalanceTaskStatus status,
@@ -996,227 +626,132 @@ void verifyBalanceTask(kvstore::KVStore* kv,
   const auto& prefix = MetaKeyUtils::balanceTaskPrefix(jobId);
   std::unique_ptr<kvstore::KVIterator> iter;
   auto code = kv->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter);
-  ASSERT_EQ(code, nebula::cpp2::ErrorCode::SUCCEEDED);
+  EXPECT_EQ(code, nebula::cpp2::ErrorCode::SUCCEEDED);
   int32_t num = 0;
   while (iter->valid()) {
     auto keyTuple = MetaKeyUtils::parseBalanceTaskKey(iter->key());
-    ASSERT_EQ(jobId, std::get<0>(keyTuple));
-    ASSERT_EQ(1, std::get<1>(keyTuple));
+    EXPECT_EQ(jobId, std::get<0>(keyTuple));
+    EXPECT_EQ(1, std::get<1>(keyTuple));
     partCount[std::get<3>(keyTuple)]--;
     partCount[std::get<4>(keyTuple)]++;
     auto valueTuple = MetaKeyUtils::parseBalanceTaskVal(iter->val());
-    ASSERT_EQ(status, std::get<0>(valueTuple));
-    ASSERT_EQ(result, std::get<1>(valueTuple));
-    ASSERT_LT(0, std::get<2>(valueTuple));
-    ASSERT_LT(0, std::get<3>(valueTuple));
+    EXPECT_EQ(status, std::get<0>(valueTuple));
+    EXPECT_EQ(result, std::get<1>(valueTuple));
+    EXPECT_LT(0, std::get<2>(valueTuple));
+    if (result != BalanceTaskResult::IN_PROGRESS) {
+      EXPECT_LT(0, std::get<3>(valueTuple));
+    }
     num++;
     iter->next();
   }
   if (exceptNumber != 0) {
-    ASSERT_EQ(exceptNumber, num);
+    EXPECT_EQ(exceptNumber, num);
   }
 }

-TEST(BalanceTest, NormalTest) {
-  fs::TempDir rootPath("/tmp/NormalTest.XXXXXX");
-  auto store = MockCluster::initMetaKV(rootPath.path());
-  auto* kv = dynamic_cast<kvstore::NebulaStore*>(store.get());
-  FLAGS_heartbeat_interval_secs = 1;
-  TestUtils::createSomeHosts(kv);
-  TestUtils::assembleSpace(kv, 1, 8, 3, 4);
-  std::unordered_map<HostAddr, int32_t> partCount;
-
-  DefaultValue<folly::Future<Status>>::SetFactory(
-      [] { return folly::Future<Status>(Status::OK()); });
-  NiceMock<MockAdminClient> client;
-  JobDescription jd(
-      testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
-  DataBalanceJobExecutor balancer(jd, kv, &client, {});
-  auto ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::OK(), ret.value());
-  testRestBlancer();
-  sleep(FLAGS_heartbeat_interval_secs * FLAGS_expired_time_factor + 1);
-  LOG(INFO) << "Now, we lost host " << HostAddr("3", 3);
-  TestUtils::registerHB(kv, {{"0", 0}, {"1", 1}, {"2", 2}});
-  ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::OK(), ret.value());
-  sleep(1);
-  LOG(INFO) << "Rebalance finished!";
-  verifyBalanceTask(
-      kv, balancer.jobId_, BalanceTaskStatus::END, BalanceTaskResult::SUCCEEDED, partCount, 6);
+void verifyMetaZone(kvstore::KVStore* kv,
+                    GraphSpaceID spaceId,
+                    const std::vector<std::string>& zones) {
+  std::string spaceKey = MetaKeyUtils::spaceKey(spaceId);
+  std::string spaceVal;
+  kv->get(kDefaultSpaceId, kDefaultPartId, spaceKey, &spaceVal);
+  meta::cpp2::SpaceDesc properties = MetaKeyUtils::parseSpace(spaceVal);
+  const std::vector<std::string>& zns = properties.get_zone_names();
+  std::set<std::string> zoneSet;
+  for (const std::string& zoneName : zns) {
+    zoneSet.emplace(zoneName);
+  }
+  std::set<std::string> expectZones;
+  for (const std::string& zoneName : zones) {
+    expectZones.emplace(zoneName);
+  }
+  EXPECT_EQ(zoneSet, expectZones);
+}

-TEST(BalanceTest, SpecifyHostTest) {
-  fs::TempDir rootPath("/tmp/SpecifyHostTest.XXXXXX");
-  auto store = MockCluster::initMetaKV(rootPath.path());
-  auto* kv = dynamic_cast<kvstore::NebulaStore*>(store.get());
-  FLAGS_heartbeat_interval_secs = 1;
-  TestUtils::createSomeHosts(kv, {{"0", 0}, {"1", 1}, {"2", 2}, {"3", 3}});
-  TestUtils::assembleSpace(kv, 1, 8, 3, 4);
-  std::unordered_map<HostAddr, int32_t> partCount;
-
-  DefaultValue<folly::Future<Status>>::SetFactory(
-      [] { return folly::Future<Status>(Status::OK()); });
-  NiceMock<MockAdminClient> client;
-  JobDescription jd(
-      testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
-  DataBalanceJobExecutor balancer(jd, kv, &client, {});
-
-  sleep(1);
-  LOG(INFO) << "Now, we remove host {3, 3}";
-  TestUtils::registerHB(kv, {{"0", 0}, {"1", 1}, {"2", 2}, {"3", 3}});
-  balancer.lostHosts_ = {{"3", 3}};
-  auto ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::OK(), ret.value());
-  testRestBlancer();
-  LOG(INFO) << "Rebalance finished!";
-  verifyBalanceTask(
-      kv, balancer.jobId_, BalanceTaskStatus::END, BalanceTaskResult::SUCCEEDED, partCount, 6);
+JobDescription makeJobDescription(kvstore::KVStore* kv, cpp2::AdminCmd cmd) {
+  JobDescription jd(testJobId.fetch_add(1, std::memory_order_relaxed), cmd, {});
+  std::vector<kvstore::KV> data;
+  data.emplace_back(jd.jobKey(), jd.jobVal());
+  folly::Baton<true, std::atomic> baton;
+  kv->asyncMultiPut(0, 0, std::move(data), [&](nebula::cpp2::ErrorCode code) {
+    ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, code);
+    baton.post();
+  });
+  baton.wait();
+  return jd;
+}

-TEST(BalanceTest, SpecifyMultiHostTest) {
-  fs::TempDir rootPath("/tmp/SpecifyMultiHostTest.XXXXXX");
+TEST(BalanceTest, NormalZoneTest) {
+  fs::TempDir rootPath("/tmp/NormalZoneTest.XXXXXX");
   auto store = MockCluster::initMetaKV(rootPath.path());
   auto* kv = dynamic_cast<kvstore::NebulaStore*>(store.get());
   FLAGS_heartbeat_interval_secs = 1;
-  TestUtils::createSomeHosts(kv, {{"0", 0}, {"1", 1}, {"2", 2}, {"3", 3}, {"4", 4}, {"5", 5}});
-  TestUtils::assembleSpace(kv, 1, 12, 3, 6);
+  TestUtils::assembleSpaceWithZone(kv, 1, 8, 3, 8, 24);
   std::unordered_map<HostAddr, int32_t> partCount;
-  for (int32_t i = 0; i < 6; i++) {
-    partCount[HostAddr(std::to_string(i), i)] = 6;
-  }
-
-  DefaultValue<folly::Future<Status>>::SetFactory(
-      [] { return folly::Future<Status>(Status::OK()); });
-  NiceMock<MockAdminClient> client;
-  JobDescription jd(
-      testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
-  DataBalanceJobExecutor balancer(jd, kv, &client, {});
-
-  sleep(FLAGS_heartbeat_interval_secs * FLAGS_expired_time_factor + 1);
-  LOG(INFO) << "Now, we want to remove host {2, 2}/{3, 3}";
-  // If {"2", 2} and {"3", 3} are both dead, minority hosts for some part are
-  // alive, it would lead to a fail
-  TestUtils::registerHB(kv, {{"0", 0}, {"1", 1}, {"4", 4}, {"5", 5}});
-  balancer.lostHosts_ = {{"2", 2}, {"3", 3}};
-  auto ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(apache::thrift::util::enumNameSafe(nebula::cpp2::ErrorCode::E_NO_VALID_HOST),
-            ret.value().message());
-  // If {"2", 2} is dead, {"3", 3} still alive, each part has majority hosts
-  // alive
-  testRestBlancer();
-  TestUtils::registerHB(kv, {{"0", 0}, {"1", 1}, {"3", 3}, {"4", 4}, {"5", 5}});
-  balancer.lostHosts_ = {{"2", 2}, {"3", 3}};
-  ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::OK(), ret.value());
-  testRestBlancer();
-  sleep(1);
-  LOG(INFO) << "Rebalance finished!";
-
-  // In theory, there should be only 12 tasks, but in some environment, 13 tasks
-  // is generated. A partition is moved more than once from A -> B -> C, actually
-  // A -> C is enough.
-  verifyBalanceTask(
-      kv, balancer.jobId_, BalanceTaskStatus::END, BalanceTaskResult::SUCCEEDED, partCount);
-  ASSERT_EQ(9, partCount[HostAddr("0", 0)]);
-  ASSERT_EQ(9, partCount[HostAddr("1", 1)]);
-  ASSERT_EQ(0, partCount[HostAddr("2", 2)]);
-  ASSERT_EQ(0, partCount[HostAddr("3", 3)]);
-  ASSERT_EQ(9, partCount[HostAddr("4", 4)]);
-  ASSERT_EQ(9, partCount[HostAddr("5", 5)]);
-}
-
-TEST(BalanceTest, MockReplaceMachineTest) {
-  fs::TempDir rootPath("/tmp/MockReplaceMachineTest.XXXXXX");
-  auto store = MockCluster::initMetaKV(rootPath.path());
-  auto* kv = dynamic_cast<kvstore::NebulaStore*>(store.get());
-  FLAGS_heartbeat_interval_secs = 1;
-  TestUtils::createSomeHosts(kv, {{"0", 0}, {"1", 1}, {"2", 2}});
-  TestUtils::assembleSpace(kv, 1, 12, 3, 3);

   DefaultValue<folly::Future<Status>>::SetFactory(
       [] { return folly::Future<Status>(Status::OK()); });
   NiceMock<MockAdminClient> client;
-  JobDescription jd(
-      testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
-  DataBalanceJobExecutor balancer(jd, kv, &client, {});
-
-  // add a new machine
-  TestUtils::createSomeHosts(kv, {{"0", 0}, {"1", 1}, {"2", 2}, {"3", 3}});
-  LOG(INFO) << "Now, we want to replace host {2, 2} with {3, 3}";
-  // Because for all parts majority hosts still alive, we could balance
-  sleep(FLAGS_heartbeat_interval_secs * FLAGS_expired_time_factor + 1);
-  // {2, 2} should be offline now
-  TestUtils::registerHB(kv, {{"0", 0}, {"1", 1}, {"3", 3}});
-  auto ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::OK(), ret.value());
-  testRestBlancer();
-  sleep(1);
-  LOG(INFO) << "Rebalance finished!";
-  std::unordered_map<HostAddr, int32_t> partCount;
+  JobDescription jd = makeJobDescription(kv, cpp2::AdminCmd::ZONE_BALANCE);
+  ZoneBalanceJobExecutor balancer(jd, kv, &client, {});
+  balancer.spaceInfo_.getInfo(1, kv);
+  auto ret = balancer.executeInternal();
+  EXPECT_EQ(Status::Balanced(), ret.value());
+  balancer.finish();
+  balancer.lostZones_ = {"5", "6", "7", "8"};
+  folly::Baton<true, std::atomic> baton;
+  balancer.setFinishCallBack([&](meta::cpp2::JobStatus) {
+    baton.post();
+    return nebula::cpp2::ErrorCode::SUCCEEDED;
+  });
+  ret = balancer.executeInternal();
+  baton.wait();
+  EXPECT_EQ(Status::OK(), ret.value());
+  verifyMetaZone(kv, balancer.spaceInfo_.spaceId_, {"1", "2", "3", "4"});
   verifyBalanceTask(
       kv, balancer.jobId_, BalanceTaskStatus::END, BalanceTaskResult::SUCCEEDED, partCount, 12);
 }

-TEST(BalanceTest, SingleReplicaTest) {
-  fs::TempDir rootPath("/tmp/SingleReplicaTest.XXXXXX");
+TEST(BalanceTest, NormalDataTest) {
+  fs::TempDir rootPath("/tmp/NormalDataTest.XXXXXX");
   auto store = MockCluster::initMetaKV(rootPath.path());
   auto* kv = dynamic_cast<kvstore::NebulaStore*>(store.get());
   FLAGS_heartbeat_interval_secs = 1;
-  TestUtils::createSomeHosts(kv, {{"0", 0}, {"1", 1}, {"2", 2}, {"3", 3}, {"4", 4}, {"5", 5}});
-  TestUtils::assembleSpace(kv, 1, 12, 1, 6);
+  TestUtils::assembleSpaceWithZone(kv, 1, 8, 3, 1, 8);
   std::unordered_map<HostAddr, int32_t> partCount;
-  for (int32_t i = 0; i < 6; i++) {
-    partCount[HostAddr(std::to_string(i), i)] = 2;
-  }

   DefaultValue<folly::Future<Status>>::SetFactory(
       [] { return folly::Future<Status>(Status::OK()); });
   NiceMock<MockAdminClient> client;
-  JobDescription jd(
-      testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
+  JobDescription jd = makeJobDescription(kv, cpp2::AdminCmd::DATA_BALANCE);
   DataBalanceJobExecutor balancer(jd, kv, &client, {});
-
-  sleep(1);
-  LOG(INFO) << "Now, we want to remove host {2, 2} and {3, 3}";
-  TestUtils::registerHB(kv, {{"0", 0}, {"1", 1}, {"2", 2}, {"3", 3}, {"4", 4}, {"5", 5}});
-
-  balancer.lostHosts_ = {{"2", 2}, {"3", 3}};
-  auto ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::OK(), ret.value());
-  testRestBlancer();
-  sleep(1);
-  LOG(INFO) << "Rebalance finished!";
-
+  balancer.spaceInfo_.getInfo(1, kv);
+  auto ret = balancer.executeInternal();
+  EXPECT_EQ(Status::Balanced(), ret.value());
+  balancer.finish();
+  balancer.lostHosts_ = {{"127.0.0.1", 1}, {"127.0.0.1", 8}};
+  folly::Baton<true, std::atomic> baton;
+  balancer.setFinishCallBack([&](meta::cpp2::JobStatus) {
+    baton.post();
+    return nebula::cpp2::ErrorCode::SUCCEEDED;
+  });
+  ret = balancer.executeInternal();
+  baton.wait();
+  EXPECT_EQ(Status::OK(), ret.value());
   verifyBalanceTask(
-      kv, balancer.jobId_, BalanceTaskStatus::END, BalanceTaskResult::SUCCEEDED, partCount, 4);
-  ASSERT_EQ(3, partCount[HostAddr("0", 0)]);
-  ASSERT_EQ(3, partCount[HostAddr("1", 1)]);
-  ASSERT_EQ(0, partCount[HostAddr("2", 2)]);
-  ASSERT_EQ(0, partCount[HostAddr("3", 3)]);
-  ASSERT_EQ(3, partCount[HostAddr("4", 4)]);
-  ASSERT_EQ(3, partCount[HostAddr("5", 5)]);
+      kv, balancer.jobId_, BalanceTaskStatus::END, BalanceTaskResult::SUCCEEDED, partCount, 6);
 }

-TEST(BalanceTest, TryToRecoveryTest) {
+TEST(BalanceTest, RecoveryTest) {
   fs::TempDir rootPath("/tmp/TryToRecoveryTest.XXXXXX");
   auto store = MockCluster::initMetaKV(rootPath.path());
   auto* kv = dynamic_cast<kvstore::NebulaStore*>(store.get());
-  FLAGS_heartbeat_interval_secs = 1;
-  TestUtils::createSomeHosts(kv);
-  TestUtils::assembleSpace(kv, 1, 8, 3, 4);
-
-  sleep(FLAGS_heartbeat_interval_secs * FLAGS_expired_time_factor + 1);
-  LOG(INFO) << "Now, we lost host " << HostAddr("3", 3);
-  TestUtils::registerHB(kv, {{"0", 0}, {"1", 1}, {"2", 2}});
-
+  TestUtils::assembleSpaceWithZone(kv, 1, 24, 1, 1, 8);
   DefaultValue<folly::Future<Status>>::SetFactory(
       [] { return folly::Future<Status>(Status::OK()); });
   NiceMock<MockAdminClient> client;
-  // first 6 call is the failed case, since we can't recover the plan, so only 6
-  // call
   EXPECT_CALL(client, waitingForCatchUpData(_, _, _))
-      .Times(6)
+      .Times(12)
      .WillOnce(Return(ByMove(folly::Future<Status>(Status::Error("catch up failed")))))
       .WillOnce(Return(ByMove(folly::Future<Status>(Status::Error("catch up failed")))))
       .WillOnce(Return(ByMove(folly::Future<Status>(Status::Error("catch up failed")))))
@@ -1224,13 +759,18 @@ TEST(BalanceTest, TryToRecoveryTest) {
       .WillOnce(Return(ByMove(folly::Future<Status>(Status::Error("catch up failed")))))
       .WillOnce(Return(ByMove(folly::Future<Status>(Status::Error("catch up failed")))))
       .WillOnce(Return(ByMove(folly::Future<Status>(Status::Error("catch up failed")))));

-  JobDescription jd(
-      testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
+  JobDescription jd = makeJobDescription(kv, cpp2::AdminCmd::DATA_BALANCE);
   DataBalanceJobExecutor balancer(jd, kv, &client, {});
-  auto ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::OK(), ret.value());
-  testRestBlancer();
-  sleep(1);
+  balancer.spaceInfo_.getInfo(1, kv);
+  balancer.lostHosts_ = {{"127.0.0.1", 1}, {"127.0.0.1", 8}};
+  folly::Baton<true, std::atomic> baton;
+  balancer.setFinishCallBack([&](meta::cpp2::JobStatus) {
+    baton.post();
+    return nebula::cpp2::ErrorCode::SUCCEEDED;
+  });
+  auto ret = balancer.executeInternal();
+  baton.wait();
+  EXPECT_EQ(Status::OK(), ret.value());
   std::unordered_map<HostAddr, int32_t> partCount;
   verifyBalanceTask(kv,
                     balancer.jobId_,
@@ -1238,148 +778,102 @@ TEST(BalanceTest, TryToRecoveryTest) {
                     BalanceTaskResult::FAILED,
                     partCount,
                     6);
-
-  sleep(FLAGS_heartbeat_interval_secs * FLAGS_expired_time_factor + 1);
-  LOG(INFO) << "Now let's try to recovery it. Since all host would be regarded "
-               "as offline, "
-            << "so all task will be invalid";
   balancer.recovery();
-  ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::OK(), ret.value());
-  testRestBlancer();
-  sleep(1);
   verifyBalanceTask(
-      kv, balancer.jobId_, BalanceTaskStatus::START, BalanceTaskResult::INVALID, partCount, 6);
-}
-
-TEST(BalanceTest, RecoveryTest) {
-  FLAGS_task_concurrency = 1;
-  fs::TempDir rootPath("/tmp/RecoveryTest.XXXXXX");
-  auto store = MockCluster::initMetaKV(rootPath.path());
-  auto* kv = dynamic_cast<kvstore::NebulaStore*>(store.get());
-  FLAGS_heartbeat_interval_secs = 1;
-  TestUtils::createSomeHosts(kv);
-  TestUtils::assembleSpace(kv, 1, 8, 3, 4);
-
-  DefaultValue<folly::Future<Status>>::SetFactory(
-      [] { return folly::Future<Status>(Status::OK()); });
-  NiceMock<MockAdminClient> client;
-  // first 6 call is the failed case, the later call will return default value
-  // In gtest release 1.8.0 we can only write as follows:
-  EXPECT_CALL(client, waitingForCatchUpData(_, _, _))
-      .Times(AtLeast(12))
-      .WillOnce(Return(ByMove(folly::Future<Status>(Status::Error("catch up failed")))))
-      .WillOnce(Return(ByMove(folly::Future<Status>(Status::Error("catch up failed")))))
-      .WillOnce(Return(ByMove(folly::Future<Status>(Status::Error("catch up failed")))))
-      .WillOnce(Return(ByMove(folly::Future<Status>(Status::Error("catch up failed")))))
-      .WillOnce(Return(ByMove(folly::Future<Status>(Status::Error("catch up failed")))))
-      .WillOnce(Return(ByMove(folly::Future<Status>(Status::Error("catch up failed")))));
-
-  sleep(FLAGS_heartbeat_interval_secs * FLAGS_expired_time_factor + 1);
-  LOG(INFO) << "Now, we lost host " << HostAddr("3", 3);
-  TestUtils::registerHB(kv, {{"0", 0}, {"1", 1}, {"2", 2}});
-  JobDescription jd(
-      testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
-  DataBalanceJobExecutor balancer(jd, kv, &client, {});
-  auto ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::OK(), ret.value());
-  testRestBlancer();
-  sleep(1);
-  std::unordered_map<HostAddr, int32_t> partCount;
-  verifyBalanceTask(kv,
-                    balancer.jobId_,
-                    BalanceTaskStatus::CATCH_UP_DATA,
-                    BalanceTaskResult::FAILED,
-                    partCount,
-                    6);
-
-  // register hb again to prevent from regarding src as offline
-  TestUtils::registerHB(kv, {{"0", 0}, {"1", 1}, {"2", 2}});
-  LOG(INFO) << "Now let's try to recovery it.";
-  balancer.recovery();
-  ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::OK(), ret.value());
-  sleep(1);
+      kv, balancer.jobId_, BalanceTaskStatus::START, BalanceTaskResult::IN_PROGRESS, partCount, 6);
+  baton.reset();
+  balancer.setFinishCallBack([&](meta::cpp2::JobStatus) {
+    baton.post();
+    return nebula::cpp2::ErrorCode::SUCCEEDED;
+  });
+  ret = balancer.executeInternal();
+  baton.wait();
   verifyBalanceTask(
       kv, balancer.jobId_, BalanceTaskStatus::END, BalanceTaskResult::SUCCEEDED, partCount, 6);
 }

 TEST(BalanceTest, StopPlanTest) {
-  FLAGS_task_concurrency = 1;
   fs::TempDir rootPath("/tmp/StopAndRecoverTest.XXXXXX");
   auto store = MockCluster::initMetaKV(rootPath.path());
   auto* kv = dynamic_cast<kvstore::NebulaStore*>(store.get());
   FLAGS_heartbeat_interval_secs = 1;
   TestUtils::createSomeHosts(kv);
-  TestUtils::assembleSpace(kv, 1, 8, 3, 4);
-
-  // {3, 3} is lost for now
-  sleep(FLAGS_heartbeat_interval_secs * FLAGS_expired_time_factor + 1);
-  TestUtils::registerHB(kv, {{"0", 0}, {"1", 1}, {"2", 2}});
-
+  TestUtils::assembleSpaceWithZone(kv, 1, 24, 3, 5, 5);
   DefaultValue<folly::Future<Status>>::SetFactory(
       [] { return folly::Future<Status>(Status::OK()); });
   NiceMock<MockAdminClient> delayClient;
   EXPECT_CALL(delayClient, waitingForCatchUpData(_, _, _))
-      // first task in first plan will be blocked, all other tasks will be
-      // skipped,
-      .Times(1)
+      .Times(8)
+      .WillOnce(
+          Return(ByMove(folly::makeFuture<Status>(Status::OK()).delayed(std::chrono::seconds(3)))))
+      .WillOnce(
+          Return(ByMove(folly::makeFuture<Status>(Status::OK()).delayed(std::chrono::seconds(3)))))
+      .WillOnce(
+          Return(ByMove(folly::makeFuture<Status>(Status::OK()).delayed(std::chrono::seconds(3)))))
+      .WillOnce(
+          Return(ByMove(folly::makeFuture<Status>(Status::OK()).delayed(std::chrono::seconds(3)))))
+      .WillOnce(
+          Return(ByMove(folly::makeFuture<Status>(Status::OK()).delayed(std::chrono::seconds(3)))))
+      .WillOnce(
+          Return(ByMove(folly::makeFuture<Status>(Status::OK()).delayed(std::chrono::seconds(3)))))
+      .WillOnce(
+          Return(ByMove(folly::makeFuture<Status>(Status::OK()).delayed(std::chrono::seconds(3)))))
       .WillOnce(
          Return(ByMove(folly::makeFuture<Status>(Status::OK()).delayed(std::chrono::seconds(3)))));
-
-  JobDescription jd(
-      testJobId.fetch_add(1, std::memory_order_relaxed), cpp2::AdminCmd::DATA_BALANCE, {});
-  DataBalanceJobExecutor balancer(jd, kv, &delayClient, {});
-  auto ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::OK(), ret.value());
-  sleep(1);
-  LOG(INFO) << "Rebalance should still in progress";
-
-  TestUtils::registerHB(kv, {{"0", 0}, {"1", 1}, {"2", 2}});
+  FLAGS_task_concurrency = 8;
+  JobDescription jd = makeJobDescription(kv, cpp2::AdminCmd::DATA_BALANCE);
+  ZoneBalanceJobExecutor balancer(jd, kv, &delayClient, {});
+  balancer.spaceInfo_.getInfo(1, kv);
+  balancer.lostZones_ = {"4", "5"};
+  folly::Baton<true, std::atomic> baton;
+  balancer.setFinishCallBack([&](meta::cpp2::JobStatus) {
+    baton.post();
+    return nebula::cpp2::ErrorCode::SUCCEEDED;
+  });
+  auto ret = balancer.executeInternal();
+  EXPECT_EQ(Status::OK(), ret.value());
   auto stopRet = balancer.stop();
   EXPECT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, stopRet);
+  baton.wait();
+  const auto& prefix = MetaKeyUtils::balanceTaskPrefix(balancer.jobId_);
+  std::unique_ptr<kvstore::KVIterator> iter;
+  auto retcode = kv->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter);
+  ASSERT_EQ(retcode, nebula::cpp2::ErrorCode::SUCCEEDED);
+  int32_t taskEnded = 0;
+  int32_t taskStopped = 0;
+  int32_t invalid = 0;
+  int32_t success = 0;
+  int32_t progress = 0;
+  while (iter->valid()) {
+    BalanceTask task;
+    {
+      auto tup = MetaKeyUtils::parseBalanceTaskVal(iter->val());
+      task.status_ = std::get<0>(tup);
+      task.ret_ = std::get<1>(tup);
+      task.startTimeMs_ = std::get<2>(tup);
+      task.endTimeMs_ = std::get<3>(tup);
+
+      if (task.ret_ == BalanceTaskResult::SUCCEEDED) {
+        success++;
+      } else if (task.ret_ == BalanceTaskResult::INVALID) {
+        invalid++;
+      } else if (task.ret_ == BalanceTaskResult::IN_PROGRESS) {
+        progress++;
+      }

-  // wait until the only IN_PROGRESS task finished;
-  sleep(3);
-  {
-    const auto& prefix = MetaKeyUtils::balanceTaskPrefix(balancer.jobId_);
-    std::unique_ptr<kvstore::KVIterator> iter;
-    auto retcode = kv->prefix(kDefaultSpaceId, kDefaultPartId, prefix, &iter);
-    ASSERT_EQ(retcode, nebula::cpp2::ErrorCode::SUCCEEDED);
-    int32_t taskEnded = 0;
-    int32_t taskStopped = 0;
-    while (iter->valid()) {
-      BalanceTask task;
-      // PartitionID partId =
-      //     std::get<2>(BalanceTask::MetaServiceUtils(iter->key()));
-      {
-        auto tup = MetaKeyUtils::parseBalanceTaskVal(iter->val());
-        task.status_ = std::get<0>(tup);
-        task.ret_ = std::get<1>(tup);
-        task.startTimeMs_ = std::get<2>(tup);
-        task.endTimeMs_ = std::get<3>(tup);
-
-        if (task.status_ == BalanceTaskStatus::END) {
-          taskEnded++;
-        } else {
-          taskStopped++;
-        }
+      if (task.status_ == BalanceTaskStatus::END) {
+        taskEnded++;
+      } else {
+        taskStopped++;
       }
-      iter->next();
     }
-    ASSERT_EQ(1, taskEnded);
-    ASSERT_EQ(5, taskStopped);
+    iter->next();
   }
-
-  TestUtils::registerHB(kv, {{"0", 0}, {"1", 1}, {"2", 2}});
-  NiceMock<MockAdminClient> normalClient;
-
-  balancer.adminClient_ = &normalClient;
-  testRestBlancer();
-  ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::OK(), ret.value());
-  testRestBlancer();
-  sleep(1);
+  EXPECT_EQ(8, taskEnded);
+  EXPECT_EQ(22, taskStopped);
+  EXPECT_EQ(22, invalid);
+  EXPECT_EQ(8, success);
+  EXPECT_EQ(0, progress);
 }

 void verifyLeaderBalancePlan(HostLeaderMap& hostLeaderMap,
@@ -1647,8 +1141,8 @@ TEST(BalanceTest, LeaderBalanceTest) {
   LeaderBalanceJobExecutor balancer(
       testJobId.fetch_add(1, std::memory_order_relaxed), kv, &client, {});
-  auto ret = balancer.executeInternal(HostAddr(), {});
-  ASSERT_EQ(Status::Error("partiton failed to transfer leader"), ret.value());
+  auto ret = balancer.executeInternal();
+  ASSERT_EQ(Status::OK(), ret.value());
 }

 TEST(BalanceTest, LeaderBalanceWithZoneTest) {
diff --git a/src/meta/test/GetStatsTest.cpp b/src/meta/test/GetStatsTest.cpp
index a58e001c365..8cb7a3e1454 100644
--- a/src/meta/test/GetStatsTest.cpp
+++ b/src/meta/test/GetStatsTest.cpp
@@ -62,11 +62,12 @@ struct JobCallBack {
     req.task_id_ref() = taskId_;

     cpp2::StatsItem item;
-    item.tag_vertices_ref() = {{"t1", n_}, {"t2", n_}};
-    item.edges_ref() = {{"e1", n_}, {"e2", n_}};
-    item.space_vertices_ref() = 2 * n_;
-    item.space_edges_ref() = 2 * n_;
-    req.stats_ref() = item;
+    item.set_tag_vertices({{"t1", n_}, {"t2", n_}});
+    item.set_edges({{"e1", n_}, {"e2", n_}});
+    item.set_space_vertices(2 * n_);
+    item.set_space_edges(2 * n_);
+    req.set_stats(item);
+    jobMgr_->muJobFinished_.unlock();
     jobMgr_->reportTaskFinish(req);
     return folly::Future<Status>(Status::OK());
   }
diff --git a/src/meta/test/JobManagerTest.cpp b/src/meta/test/JobManagerTest.cpp
index 55ae1b5a155..92f45d34be3 100644
--- a/src/meta/test/JobManagerTest.cpp
+++ b/src/meta/test/JobManagerTest.cpp
@@ -25,8 +25,6 @@ namespace meta {
 using ::testing::DefaultValue;
 using ::testing::NiceMock;

-bool gInitialized = false;
-
 class JobManagerTest : public ::testing::Test {
  protected:
   void SetUp() override {
@@ -45,25 +43,27 @@ class JobManagerTest : public ::testing::Test {
     adminClient_ = std::make_unique<NiceMock<MockAdminClient>>();
     DefaultValue<folly::Future<Status>>::SetFactory(
         [] { return folly::Future<Status>(Status::OK()); });
+  }

-    jobMgr = JobManager::getInstance();
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> getJobManager() {
+    std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr(
+        new JobManager(), [](JobManager* p) {
+          std::pair<JobManager::JbOp, JobID> pair;
+          while (!p->lowPriorityQueue_->empty()) {
+            p->lowPriorityQueue_->dequeue(pair);
+          }
+          while (!p->highPriorityQueue_->empty()) {
+            p->highPriorityQueue_->dequeue(pair);
+          }
+          delete p;
+        });
     jobMgr->status_ = JobManager::JbmgrStatus::NOT_START;
     jobMgr->kvStore_ = kv_.get();
-    if (!gInitialized) {
-      jobMgr->init(kv_.get());
-      gInitialized = true;
-    }
+    jobMgr->init(kv_.get());
+    return jobMgr;
   }

   void TearDown() override {
-    auto cleanUnboundQueue = [](auto& q) {
-      std::pair<JobManager::JbOp, JobID> pair;
-      while (!q.empty()) {
-        q.dequeue(pair);
-      }
-    };
-    cleanUnboundQueue(*jobMgr->lowPriorityQueue_);
-    cleanUnboundQueue(*jobMgr->highPriorityQueue_);
     kv_.reset();
     rootPath_.reset();
   }
@@ -71,10 +71,10 @@ class JobManagerTest : public ::testing::Test {
   std::unique_ptr<fs::TempDir> rootPath_{nullptr};
   std::unique_ptr<kvstore::KVStore> kv_{nullptr};
   std::unique_ptr<AdminClient> adminClient_{nullptr};
-  JobManager* jobMgr{nullptr};
 };

 TEST_F(JobManagerTest, addJob) {
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr = getJobManager();
   std::vector<std::string> paras{"test"};
   JobDescription job(1, cpp2::AdminCmd::COMPACT, paras);
   auto rc = jobMgr->addJob(job, adminClient_.get());
@@ -82,9 +82,10 @@ TEST_F(JobManagerTest, addJob) {
 }

 TEST_F(JobManagerTest, AddRebuildTagIndexJob) {
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr = getJobManager();
   // For preventing job schedule in JobManager
   jobMgr->status_ = JobManager::JbmgrStatus::STOPPED;
-
+  jobMgr->bgThread_.join();
   std::vector<std::string> paras{"tag_index_name", "test_space"};
   JobDescription job(11, cpp2::AdminCmd::REBUILD_TAG_INDEX, paras);
   auto rc = jobMgr->addJob(job, adminClient_.get());
@@ -94,9 +95,10 @@ TEST_F(JobManagerTest, AddRebuildTagIndexJob) {
 }

 TEST_F(JobManagerTest, AddRebuildEdgeIndexJob) {
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr = getJobManager();
   // For preventing job schedule in JobManager
   jobMgr->status_ = JobManager::JbmgrStatus::STOPPED;
-
+  jobMgr->bgThread_.join();
   std::vector<std::string> paras{"edge_index_name", "test_space"};
   JobDescription job(11, cpp2::AdminCmd::REBUILD_EDGE_INDEX, paras);
   auto rc = jobMgr->addJob(job, adminClient_.get());
@@ -106,9 +108,10 @@ TEST_F(JobManagerTest, AddRebuildEdgeIndexJob) {
 }

 TEST_F(JobManagerTest, StatsJob) {
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr = getJobManager();
   // For preventing job schedule in JobManager
   jobMgr->status_ = JobManager::JbmgrStatus::STOPPED;
-
+  jobMgr->bgThread_.join();
   std::vector<std::string> paras{"test_space"};
   JobDescription job(12, cpp2::AdminCmd::STATS, paras);
   auto rc = jobMgr->addJob(job, adminClient_.get());
@@ -127,9 +130,10 @@ TEST_F(JobManagerTest, StatsJob) {
 }

 TEST_F(JobManagerTest, JobPriority) {
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr = getJobManager();
   // For preventing job schedule in JobManager
   jobMgr->status_ = JobManager::JbmgrStatus::STOPPED;
-
+  jobMgr->bgThread_.join();
   ASSERT_EQ(0, jobMgr->jobSize());

   std::vector<std::string> paras{"test"};
@@ -157,14 +161,13 @@ TEST_F(JobManagerTest, JobPriority) {

   result = jobMgr->try_dequeue(opJobId);
   ASSERT_FALSE(result);
-
-  jobMgr->status_ = JobManager::JbmgrStatus::IDLE;
 }

 TEST_F(JobManagerTest, JobDeduplication) {
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr = getJobManager();
   // For preventing job schedule in JobManager
   jobMgr->status_ = JobManager::JbmgrStatus::STOPPED;
-
+  jobMgr->bgThread_.join();
   ASSERT_EQ(0, jobMgr->jobSize());

   std::vector<std::string> paras{"test"};
@@ -209,10 +212,10 @@ TEST_F(JobManagerTest, JobDeduplication) {

   result = jobMgr->try_dequeue(opJobId);
   ASSERT_FALSE(result);
-  jobMgr->status_ = JobManager::JbmgrStatus::IDLE;
 }

 TEST_F(JobManagerTest, loadJobDescription) {
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr = getJobManager();
   std::vector<std::string> paras{"test_space"};
   JobDescription job1(1, cpp2::AdminCmd::COMPACT, paras);
   job1.setStatus(cpp2::JobStatus::RUNNING);
@@ -240,6 +243,7 @@ TEST(JobUtilTest, dummy) {
 }

 TEST_F(JobManagerTest, showJobs) {
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr = getJobManager();
   std::vector<std::string> paras1{"test_space"};
   JobDescription jd1(1, cpp2::AdminCmd::COMPACT, paras1);
   jd1.setStatus(cpp2::JobStatus::RUNNING);
@@ -273,6 +277,7 @@ TEST_F(JobManagerTest, showJobs) {
 }

 TEST_F(JobManagerTest, showJobsFromMultiSpace) {
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr = getJobManager();
   std::vector<std::string> paras1{"test_space"};
   JobDescription jd1(1, cpp2::AdminCmd::COMPACT, paras1);
   jd1.setStatus(cpp2::JobStatus::RUNNING);
@@ -305,6 +310,7 @@ HostAddr toHost(std::string strIp) {
 }

 TEST_F(JobManagerTest, showJob) {
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr = getJobManager();
   std::vector<std::string> paras{"test_space"};

   JobDescription jd(1, cpp2::AdminCmd::COMPACT, paras);
@@ -358,6 +364,7 @@ TEST_F(JobManagerTest, showJob) {
 }

 TEST_F(JobManagerTest, showJobInOtherSpace) {
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr = getJobManager();
   std::vector<std::string> paras{"test_space"};

   JobDescription jd(1, cpp2::AdminCmd::COMPACT, paras);
@@ -389,8 +396,10 @@ TEST_F(JobManagerTest, showJobInOtherSpace) {
 }

 TEST_F(JobManagerTest, recoverJob) {
+  std::unique_ptr<JobManager, std::function<void(JobManager*)>> jobMgr = getJobManager();
   // set status to prevent running the job since AdminClient is a injector
-  jobMgr->status_ = JobManager::JbmgrStatus::NOT_START;
+  jobMgr->status_ = JobManager::JbmgrStatus::STOPPED;
+  jobMgr->bgThread_.join();
   auto spaceName = "test_space";
   int32_t nJob = 3;
   for (auto i = 0; i != nJob; ++i) {
diff --git a/src/meta/test/TestUtils.h b/src/meta/test/TestUtils.h
index 5021e630c34..7c09ab0646d 100644
--- a/src/meta/test/TestUtils.h
+++ b/src/meta/test/TestUtils.h
@@ -208,6 +208,62 @@ class TestUtils {
     baton.wait();
   }

+  static void assembleSpaceWithZone(kvstore::KVStore* kv,
+                                    GraphSpaceID id,
+                                    int32_t partitionNum,
+                                    int32_t replica,
+                                    int32_t zoneNum,
+                                    int32_t totalHost) {
+    cpp2::SpaceDesc properties;
+    properties.set_space_name("test_space");
+    properties.set_partition_num(partitionNum);
+    properties.set_replica_factor(replica);
+    auto spaceVal = MetaKeyUtils::spaceVal(properties);
+    std::vector<kvstore::KV> data;
+    data.emplace_back(MetaKeyUtils::indexSpaceKey("test_space"),
+                      std::string(reinterpret_cast<const char*>(&id), sizeof(GraphSpaceID)));
+    std::vector<std::pair<std::string, std::vector<HostAddr>>> zones;
+    std::vector<std::string> zoneNames;
+    std::map<std::string, int32_t> zonePartNum;
+    for (int32_t i = 0; i < zoneNum; i++) {
+      zones.push_back({std::to_string(i + 1), {}});
+      zonePartNum[std::to_string(i + 1)] = 0;
+      zoneNames.push_back(std::to_string(i + 1));
+    }
+    properties.set_zone_names(zoneNames);
+    data.emplace_back(MetaKeyUtils::spaceKey(id), MetaKeyUtils::spaceVal(properties));
+    std::vector<HostAddr> allHosts;
+    for (int32_t i = 0; i < totalHost; i++) {
+      zones[i % zoneNum].second.emplace_back("127.0.0.1", i + 1);
+      allHosts.emplace_back("127.0.0.1", i + 1);
+      data.emplace_back(nebula::MetaKeyUtils::machineKey("127.0.0.1", i + 1), "");
+      data.emplace_back(nebula::MetaKeyUtils::hostKey("127.0.0.1", i + 1),
+                        HostInfo::encodeV2(HostInfo(
+                            time::WallClock::fastNowInMilliSec(), cpp2::HostRole::STORAGE, "")));
+    }
+    for (auto& p : zones) {
+      data.emplace_back(MetaKeyUtils::zoneKey(p.first), MetaKeyUtils::zoneVal(p.second));
+    }
+    for (auto partId = 1; partId <= partitionNum; partId++) {
+      std::vector<HostAddr> hosts;
+      size_t idx = partId;
+      for (int32_t i = 0; i < replica; i++, idx++) {
+        std::string zoneName = zones[idx % zoneNum].first;
+        std::vector<HostAddr>& zoneHosts = zones[idx % zoneNum].second;
+        int32_t hostIndex = zonePartNum[zoneName] % zoneHosts.size();
+        hosts.push_back(zoneHosts[hostIndex]);
+        zonePartNum[zoneName]++;
+      }
+      data.emplace_back(MetaKeyUtils::partKey(id, partId), MetaKeyUtils::partVal(hosts));
+    }
+    folly::Baton<true, std::atomic> baton;
+    kv->asyncMultiPut(0, 0, std::move(data), [&](nebula::cpp2::ErrorCode code) {
+      ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, code);
+      baton.post();
+    });
+    baton.wait();
+  }
+
   static void mockTag(kvstore::KVStore* kv,
                       int32_t tagNum,
                       SchemaVer version = 0,
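The replica placement in assembleSpaceWithZone above is easiest to follow on concrete numbers. Each part starts at idx = partId and walks replica consecutive zones (modulo zoneNum), and inside a zone the host is chosen round-robin through zonePartNum[zoneName] % zoneHosts.size(). A short trace for partId = 1 with replica = 3 and zoneNum = 8, the NormalZoneTest arguments (illustrative comment only, not part of the patch):

  // partId = 1: i = 0 -> idx = 1 -> zones[1] (named "2")
  //             i = 1 -> idx = 2 -> zones[2] (named "3")
  //             i = 2 -> idx = 3 -> zones[3] (named "4")
  // Each zone then picks its next host via zonePartNum[zoneName] % zoneHosts.size().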
"SUBMIT JOB BALANCE ZONE"; + } else { + std::string str = "SUBMIT JOB BALANCE ZONE REMOVE"; + for (size_t i = 0; i < paras_.size(); i++) { + auto &s = paras_[i]; + str += i == 0 ? " " + s : ", " + s; + } + return str; + } case meta::cpp2::AdminCmd::LEADER_BALANCE: return "SUBMIT JOB BALANCE LEADER"; case meta::cpp2::AdminCmd::UNKNOWN: diff --git a/src/parser/parser.yy b/src/parser/parser.yy index 66679935d31..74e22facff0 100644 --- a/src/parser/parser.yy +++ b/src/parser/parser.yy @@ -191,7 +191,7 @@ static constexpr size_t kCommentLengthLimit = 256; %token KW_USER KW_USERS KW_ACCOUNT %token KW_PASSWORD KW_CHANGE KW_ROLE KW_ROLES %token KW_GOD KW_ADMIN KW_DBA KW_GUEST KW_GRANT KW_REVOKE KW_ON -%token KW_OUT KW_BOTH KW_SUBGRAPH +%token KW_OUT KW_BOTH KW_SUBGRAPH KW_ACROSS %token KW_EXPLAIN KW_PROFILE KW_FORMAT %token KW_CONTAINS %token KW_STARTS KW_ENDS @@ -3281,15 +3281,15 @@ admin_job_sentence meta::cpp2::AdminCmd::LEADER_BALANCE); $$ = sentence; } - | KW_SUBMIT KW_JOB KW_BALANCE KW_DATA { + | KW_SUBMIT KW_JOB KW_BALANCE KW_IN KW_ZONE { auto sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::ADD, meta::cpp2::AdminCmd::DATA_BALANCE); $$ = sentence; } - | KW_SUBMIT KW_JOB KW_BALANCE KW_DATA KW_REMOVE host_list { + | KW_SUBMIT KW_JOB KW_BALANCE KW_IN KW_ZONE KW_REMOVE host_list { auto sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::ADD, meta::cpp2::AdminCmd::DATA_BALANCE); - HostList* hl = $6; + HostList* hl = $7; std::vector has = hl->hosts(); for (HostAddr& ha: has) { sentence->addPara(ha.toString()); @@ -3297,6 +3297,22 @@ admin_job_sentence delete hl; $$ = sentence; } + | KW_SUBMIT KW_JOB KW_BALANCE KW_ACROSS KW_ZONE { + auto sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::ADD, + meta::cpp2::AdminCmd::ZONE_BALANCE); + $$ = sentence; + } + | KW_SUBMIT KW_JOB KW_BALANCE KW_ACROSS KW_ZONE KW_REMOVE zone_name_list { + auto sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::ADD, + meta::cpp2::AdminCmd::ZONE_BALANCE); + ZoneNameList* nl = $7; + std::vector names = nl->zoneNames(); + for (std::string& name: names) { + sentence->addPara(name); + } + delete nl; + $$ = sentence; + } ; job_concurrency diff --git a/src/parser/scanner.lex b/src/parser/scanner.lex index b25fa898211..451a6a63f8a 100644 --- a/src/parser/scanner.lex +++ b/src/parser/scanner.lex @@ -162,6 +162,7 @@ LABEL_FULL_WIDTH {CN_EN_FULL_WIDTH}{CN_EN_NUM_FULL_WIDTH}* "PROFILE" { return TokenType::KW_PROFILE; } "FORMAT" { return TokenType::KW_FORMAT; } "CASE" { return TokenType::KW_CASE; } +"ACROSS" { return TokenType::KW_ACROSS; } /** * TODO(dutor) Manage the dynamic allocated objects with an object pool, From 2adc92b63f3200c5f3523d11f07fe6a07a7980ee Mon Sep 17 00:00:00 2001 From: liwenhui-soul <38217397+liwenhui-soul@users.noreply.github.com> Date: Wed, 8 Dec 2021 20:19:00 +0800 Subject: [PATCH 2/6] alter space add zone --- src/clients/meta/MetaClient.cpp | 19 ++++ src/clients/meta/MetaClient.h | 4 + src/graph/executor/Executor.cpp | 3 + src/graph/executor/admin/SpaceExecutor.cpp | 17 ++++ src/graph/executor/admin/SpaceExecutor.h | 8 ++ src/graph/planner/plan/Admin.h | 32 +++++++ src/graph/planner/plan/PlanNode.cpp | 2 + src/graph/planner/plan/PlanNode.h | 1 + src/graph/service/PermissionCheck.cpp | 1 + src/graph/validator/AdminValidator.cpp | 11 ++- src/graph/validator/AdminValidator.h | 12 +++ src/graph/validator/Validator.cpp | 2 + src/interface/meta.thrift | 11 +++ src/meta/CMakeLists.txt | 1 + src/meta/MetaServiceHandler.cpp | 7 ++ src/meta/MetaServiceHandler.h | 2 + 
From 2adc92b63f3200c5f3523d11f07fe6a07a7980ee Mon Sep 17 00:00:00 2001
From: liwenhui-soul <38217397+liwenhui-soul@users.noreply.github.com>
Date: Wed, 8 Dec 2021 20:19:00 +0800
Subject: [PATCH 2/6] alter space add zone

---
 src/clients/meta/MetaClient.cpp               | 19 ++++
 src/clients/meta/MetaClient.h                 |  4 +
 src/graph/executor/Executor.cpp               |  3 +
 src/graph/executor/admin/SpaceExecutor.cpp    | 17 ++++
 src/graph/executor/admin/SpaceExecutor.h      |  8 ++
 src/graph/planner/plan/Admin.h                | 32 +++++++
 src/graph/planner/plan/PlanNode.cpp           |  2 +
 src/graph/planner/plan/PlanNode.h             |  1 +
 src/graph/service/PermissionCheck.cpp         |  1 +
 src/graph/validator/AdminValidator.cpp        | 11 ++-
 src/graph/validator/AdminValidator.h          | 12 +++
 src/graph/validator/Validator.cpp             |  2 +
 src/interface/meta.thrift                     | 11 +++
 src/meta/CMakeLists.txt                       |  1 +
 src/meta/MetaServiceHandler.cpp               |  7 ++
 src/meta/MetaServiceHandler.h                 |  2 +
 .../processors/parts/AlterSpaceProcessor.cpp  | 92 +++++++++++++++++++
 .../processors/parts/AlterSpaceProcessor.h    | 33 +++++++
 src/meta/test/ProcessorTest.cpp               | 65 +++++++++++++
 src/parser/AdminSentences.cpp                 |  9 ++
 src/parser/AdminSentences.h                   | 22 +++++
 src/parser/Sentence.h                         |  1 +
 src/parser/parser.yy                          | 15 ++-
 23 files changed, 368 insertions(+), 2 deletions(-)
 create mode 100644 src/meta/processors/parts/AlterSpaceProcessor.cpp
 create mode 100644 src/meta/processors/parts/AlterSpaceProcessor.h

diff --git a/src/clients/meta/MetaClient.cpp b/src/clients/meta/MetaClient.cpp
index aab1d30cf9b..7a6760aff39 100644
--- a/src/clients/meta/MetaClient.cpp
+++ b/src/clients/meta/MetaClient.cpp
@@ -1213,6 +1213,25 @@ folly::Future<StatusOr<std::vector<cpp2::HostItem>>> MetaClient::listHosts(cpp2:
   return future;
 }

+folly::Future<StatusOr<bool>> MetaClient::alterSpace(const std::string& spaceName,
+                                                     meta::cpp2::AlterSpaceOp op,
+                                                     const std::vector<std::string>& paras) {
+  cpp2::AlterSpaceReq req;
+  req.set_op(op);
+  req.set_space_name(spaceName);
+  req.set_paras(paras);
+  folly::Promise<StatusOr<bool>> promise;
+  auto future = promise.getFuture();
+  getResponse(
+      std::move(req),
+      [](auto client, auto request) { return client->future_alterSpace(request); },
+      [](cpp2::ExecResp&& resp) -> bool {
+        return resp.get_code() == nebula::cpp2::ErrorCode::SUCCEEDED;
+      },
+      std::move(promise));
+  return future;
+}
+
 folly::Future<StatusOr<std::vector<cpp2::PartItem>>> MetaClient::listParts(
     GraphSpaceID spaceId, std::vector<PartitionID> partIds) {
   cpp2::ListPartsReq req;
diff --git a/src/clients/meta/MetaClient.h b/src/clients/meta/MetaClient.h
index f81701c8feb..5f1d5022c63 100644
--- a/src/clients/meta/MetaClient.h
+++ b/src/clients/meta/MetaClient.h
@@ -266,6 +266,10 @@ class MetaClient {
   folly::Future<StatusOr<std::vector<cpp2::HostItem>>> listHosts(
       cpp2::ListHostType type = cpp2::ListHostType::ALLOC);

+  folly::Future<StatusOr<bool>> alterSpace(const std::string& spaceName,
+                                           meta::cpp2::AlterSpaceOp op,
+                                           const std::vector<std::string>& paras);
+
   folly::Future<StatusOr<std::vector<cpp2::PartItem>>> listParts(GraphSpaceID spaceId,
                                                                  std::vector<PartitionID> partIds);
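For orientation, this is how the new client entry point would be driven; a hypothetical call site (only the alterSpace signature comes from the patch, the surrounding names are illustrative):

  // Attach zone "z1" to space "test_space" via the meta client.
  auto fut = metaClient->alterSpace("test_space",
                                    meta::cpp2::AlterSpaceOp::ADD_ZONE,
                                    {"z1"});
  auto ok = std::move(fut).get();  // StatusOr<bool>: ok() and value() on success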
b/src/graph/executor/admin/SpaceExecutor.h @@ -60,6 +60,14 @@ class ShowCreateSpaceExecutor final : public Executor { folly::Future execute() override; }; + +class AlterSpaceExecutor final : public Executor { + public: + AlterSpaceExecutor(const PlanNode *node, QueryContext *qctx) + : Executor("AlterSpaceExecutor", node, qctx) {} + + folly::Future execute() override; +}; } // namespace graph } // namespace nebula diff --git a/src/graph/planner/plan/Admin.h b/src/graph/planner/plan/Admin.h index 74943f7fb2d..bd1299e555d 100644 --- a/src/graph/planner/plan/Admin.h +++ b/src/graph/planner/plan/Admin.h @@ -216,6 +216,38 @@ class DropSpace final : public SingleDependencyNode { bool ifExists_; }; +class AlterSpace final : public SingleDependencyNode { + public: + static AlterSpace* make(QueryContext* qctx, + PlanNode* input, + const std::string& spaceName, + meta::cpp2::AlterSpaceOp op, + const std::vector& paras) { + return qctx->objPool()->add(new AlterSpace(qctx, input, spaceName, op, paras)); + } + const std::string& getSpaceName() const { return spaceName_; } + + meta::cpp2::AlterSpaceOp getAlterSpaceOp() const { return op_; } + + const std::vector& getParas() const { return paras_; } + + private: + AlterSpace(QueryContext* qctx, + PlanNode* input, + const std::string& spaceName, + meta::cpp2::AlterSpaceOp op, + const std::vector& paras) + : SingleDependencyNode(qctx, Kind::kAlterSpace, input), + spaceName_(spaceName), + op_(op), + paras_(paras) {} + + private: + std::string spaceName_; + meta::cpp2::AlterSpaceOp op_; + std::vector paras_; +}; + class DescSpace final : public SingleDependencyNode { public: static DescSpace* make(QueryContext* qctx, PlanNode* input, std::string spaceName) { diff --git a/src/graph/planner/plan/PlanNode.cpp b/src/graph/planner/plan/PlanNode.cpp index 8f65ba9d08c..75964f3565f 100644 --- a/src/graph/planner/plan/PlanNode.cpp +++ b/src/graph/planner/plan/PlanNode.cpp @@ -174,6 +174,8 @@ const char* PlanNode::toString(PlanNode::Kind kind) { return "DropEdge"; case Kind::kShowSpaces: return "ShowSpaces"; + case Kind::kAlterSpace: + return "AlterSpaces"; case Kind::kShowTags: return "ShowTags"; case Kind::kShowEdges: diff --git a/src/graph/planner/plan/PlanNode.h b/src/graph/planner/plan/PlanNode.h index d477f96ca66..fd42c3439f9 100644 --- a/src/graph/planner/plan/PlanNode.h +++ b/src/graph/planner/plan/PlanNode.h @@ -96,6 +96,7 @@ class PlanNode { kDropSpace, kDropTag, kDropEdge, + kAlterSpace, // index related kCreateTagIndex, diff --git a/src/graph/service/PermissionCheck.cpp b/src/graph/service/PermissionCheck.cpp index bf9cbadef34..0e6c2e9bbca 100644 --- a/src/graph/service/PermissionCheck.cpp +++ b/src/graph/service/PermissionCheck.cpp @@ -51,6 +51,7 @@ Status PermissionCheck::permissionCheck(ClientSession *session, return Status::OK(); } case Sentence::Kind::kCreateSpace: + case Sentence::Kind::kAlterSpace: case Sentence::Kind::kCreateSpaceAs: case Sentence::Kind::kDropSpace: case Sentence::Kind::kCreateSnapshot: diff --git a/src/graph/validator/AdminValidator.cpp b/src/graph/validator/AdminValidator.cpp index 07346057c20..ff2f578e438 100644 --- a/src/graph/validator/AdminValidator.cpp +++ b/src/graph/validator/AdminValidator.cpp @@ -163,10 +163,19 @@ Status CreateSpaceAsValidator::toPlan() { return Status::OK(); } -Status DescSpaceValidator::validateImpl() { +Status AlterSpaceValidator::validateImpl() { return Status::OK(); } + +Status AlterSpaceValidator::toPlan() { + auto sentence = static_cast(sentence_); + auto *doNode = AlterSpace::make( + qctx_, 
nullptr, sentence->spaceName(), sentence->alterSpaceOp(), sentence->paras()); + root_ = doNode; + tail_ = root_; return Status::OK(); } +Status DescSpaceValidator::validateImpl() { return Status::OK(); } + Status DescSpaceValidator::toPlan() { auto sentence = static_cast(sentence_); auto *doNode = DescSpace::make(qctx_, nullptr, *sentence->spaceName()); diff --git a/src/graph/validator/AdminValidator.h b/src/graph/validator/AdminValidator.h index a7b33aef779..93d5d17ac09 100644 --- a/src/graph/validator/AdminValidator.h +++ b/src/graph/validator/AdminValidator.h @@ -48,6 +48,18 @@ class CreateSpaceAsValidator final : public Validator { std::string newSpaceName_; }; +class AlterSpaceValidator final : public Validator { + public: + AlterSpaceValidator(Sentence* sentence, QueryContext* context) : Validator(sentence, context) { + noSpaceRequired_ = true; + } + + private: + Status validateImpl() override; + + Status toPlan() override; +}; + class DescSpaceValidator final : public Validator { public: DescSpaceValidator(Sentence* sentence, QueryContext* context) : Validator(sentence, context) { diff --git a/src/graph/validator/Validator.cpp b/src/graph/validator/Validator.cpp index 65df184d700..f38965f4b73 100644 --- a/src/graph/validator/Validator.cpp +++ b/src/graph/validator/Validator.cpp @@ -252,6 +252,8 @@ std::unique_ptr Validator::makeValidator(Sentence* sentence, QueryCon return std::make_unique(sentence, context); case Sentence::Kind::kKillQuery: return std::make_unique(sentence, context); + case Sentence::Kind::kAlterSpace: + return std::make_unique(sentence, context); case Sentence::Kind::kUnknown: case Sentence::Kind::kReturn: { // nothing diff --git a/src/interface/meta.thrift b/src/interface/meta.thrift index 4e4a02b7e26..c7291171200 100644 --- a/src/interface/meta.thrift +++ b/src/interface/meta.thrift @@ -209,6 +209,16 @@ struct ExecResp { 3: common.HostAddr leader, } +enum AlterSpaceOp { + ADD_ZONE = 0x01, +} (cpp.enum_strict) + +struct AlterSpaceReq { + 1: binary space_name, + 2: AlterSpaceOp op, + 3: list paras, +} + // Job related data structures enum AdminJobOp { ADD = 0x01, @@ -1169,6 +1179,7 @@ service MetaService { ExecResp dropSpace(1: DropSpaceReq req); GetSpaceResp getSpace(1: GetSpaceReq req); ListSpacesResp listSpaces(1: ListSpacesReq req); + ExecResp alterSpace(1: AlterSpaceReq req); ExecResp createSpaceAs(1: CreateSpaceAsReq req); diff --git a/src/meta/CMakeLists.txt b/src/meta/CMakeLists.txt index 05bed4b1f98..c3051cb4068 100644 --- a/src/meta/CMakeLists.txt +++ b/src/meta/CMakeLists.txt @@ -15,6 +15,7 @@ nebula_add_library( processors/parts/ListSpacesProcessor.cpp processors/parts/DropSpaceProcessor.cpp processors/parts/GetPartsAllocProcessor.cpp + processors/parts/AlterSpaceProcessor.cpp processors/schema/CreateTagProcessor.cpp processors/schema/AlterTagProcessor.cpp processors/schema/GetTagProcessor.cpp diff --git a/src/meta/MetaServiceHandler.cpp b/src/meta/MetaServiceHandler.cpp index ec5b00ce754..ba29d149291 100644 --- a/src/meta/MetaServiceHandler.cpp +++ b/src/meta/MetaServiceHandler.cpp @@ -42,6 +42,7 @@ #include "meta/processors/kv/RemoveRangeProcessor.h" #include "meta/processors/kv/ScanProcessor.h" #include "meta/processors/listener/ListenerProcessor.h" +#include "meta/processors/parts/AlterSpaceProcessor.h" #include "meta/processors/parts/CreateSpaceAsProcessor.h" #include "meta/processors/parts/CreateSpaceProcessor.h" #include "meta/processors/parts/DropSpaceProcessor.h" @@ -86,6 +87,12 @@ folly::Future MetaServiceHandler::future_createSpace( 
RETURN_FUTURE(processor); } +folly::Future MetaServiceHandler::future_alterSpace( + const cpp2::AlterSpaceReq& req) { + auto* processor = AlterSpaceProcessor::instance(kvstore_); + RETURN_FUTURE(processor); +} + folly::Future MetaServiceHandler::future_createSpaceAs( const cpp2::CreateSpaceAsReq& req) { auto* processor = CreateSpaceAsProcessor::instance(kvstore_); diff --git a/src/meta/MetaServiceHandler.h b/src/meta/MetaServiceHandler.h index e0040c93ffe..84fbade153b 100644 --- a/src/meta/MetaServiceHandler.h +++ b/src/meta/MetaServiceHandler.h @@ -34,6 +34,8 @@ class MetaServiceHandler final : public cpp2::MetaServiceSvIf { * */ folly::Future future_createSpace(const cpp2::CreateSpaceReq& req) override; + folly::Future future_alterSpace(const cpp2::AlterSpaceReq& req) override; + folly::Future future_createSpaceAs(const cpp2::CreateSpaceAsReq& req) override; folly::Future future_dropSpace(const cpp2::DropSpaceReq& req) override; diff --git a/src/meta/processors/parts/AlterSpaceProcessor.cpp b/src/meta/processors/parts/AlterSpaceProcessor.cpp new file mode 100644 index 00000000000..d0a9bbfe7e8 --- /dev/null +++ b/src/meta/processors/parts/AlterSpaceProcessor.cpp @@ -0,0 +1,92 @@ +/* Copyright (c) 2018 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +#include "meta/processors/parts/AlterSpaceProcessor.h" + +namespace nebula { +namespace meta { +void AlterSpaceProcessor::process(const cpp2::AlterSpaceReq& req) { + const std::vector& zones = req.get_paras(); + const std::string& spaceName = req.get_space_name(); + cpp2::AlterSpaceOp op = req.get_op(); + switch (op) { + case cpp2::AlterSpaceOp::ADD_ZONE: { + nebula::cpp2::ErrorCode ret = addZones(spaceName, zones); + if (ret != nebula::cpp2::ErrorCode::SUCCEEDED) { + handleErrorCode(ret); + onFinished(); + return; + } + break; + } + default: + break; + } + handleErrorCode(nebula::cpp2::ErrorCode::SUCCEEDED); + onFinished(); +} + +nebula::cpp2::ErrorCode AlterSpaceProcessor::addZones(const std::string& spaceName, + const std::vector& zones) { + auto spaceRet = getSpaceId(spaceName); + if (!nebula::ok(spaceRet)) { + auto retCode = nebula::error(spaceRet); + return retCode; + } + auto spaceId = nebula::value(spaceRet); + std::string spaceKey = MetaKeyUtils::spaceKey(spaceId); + std::string spaceVal; + nebula::cpp2::ErrorCode retCode = + kvstore_->get(kDefaultSpaceId, kDefaultPartId, spaceKey, &spaceVal); + if (retCode != nebula::cpp2::ErrorCode::SUCCEEDED) { + return retCode; + } + meta::cpp2::SpaceDesc properties = MetaKeyUtils::parseSpace(spaceVal); + const std::vector& curZones = properties.get_zone_names(); + std::set zm; + for (const std::string& z : curZones) { + zm.insert(z); + } + std::vector newZones = curZones; + newZones.reserve(curZones.size() + zones.size()); + for (const std::string& z : zones) { + std::string zoneKey = MetaKeyUtils::zoneKey(z); + std::string zoneVal; + nebula::cpp2::ErrorCode zoneRet = + kvstore_->get(kDefaultSpaceId, kDefaultPartId, zoneKey, &zoneVal); + if (zoneRet != nebula::cpp2::ErrorCode::SUCCEEDED) { + return zoneRet == nebula::cpp2::ErrorCode::E_KEY_NOT_FOUND + ? 
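The handler above follows the meta service's one-shot processor pattern: allocate a processor, grab its future, run process(req), return the future; the processor fulfils the promise and then frees itself when it finishes. A reduced standalone model of that handshake (OneShotProcessor and ExecRespSketch are invented stand-ins, and the self-delete is shown in process() for brevity, whereas the real processors do it in onFinished()):

#include <folly/futures/Future.h>

struct ExecRespSketch {
  int code = 0;
};

class OneShotProcessor {
 public:
  folly::Future<ExecRespSketch> getFuture() {
    return promise_.getFuture();
  }
  void process() {
    // ... do the work, fill the response ...
    promise_.setValue(ExecRespSketch{0});
    delete this;  // one-shot: the processor owns and ends its own lifetime
  }

 private:
  folly::Promise<ExecRespSketch> promise_;
};

folly::Future<ExecRespSketch> handleRpcSketch() {
  auto* processor = new OneShotProcessor();
  auto fut = processor->getFuture();  // must be taken before process() runs
  processor->process();
  return fut;
}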
nebula::cpp2::ErrorCode::E_ZONE_NOT_FOUND + : zoneRet; + } + if (zm.count(z)) { + return nebula::cpp2::ErrorCode::E_CONFLICT; + } + newZones.emplace_back(z); + } + properties.set_zone_names(newZones); + std::vector data; + data.emplace_back(MetaKeyUtils::spaceKey(spaceId), MetaKeyUtils::spaceVal(properties)); + folly::Baton baton; + auto ret = nebula::cpp2::ErrorCode::SUCCEEDED; + kvstore_->asyncMultiPut(kDefaultSpaceId, + kDefaultPartId, + std::move(data), + [&ret, &baton](nebula::cpp2::ErrorCode code) { + if (nebula::cpp2::ErrorCode::SUCCEEDED != code) { + ret = code; + LOG(INFO) << "Put data error on meta server"; + } + baton.post(); + }); + baton.wait(); + if (ret != nebula::cpp2::ErrorCode::SUCCEEDED) { + return ret; + } + return nebula::cpp2::ErrorCode::SUCCEEDED; +} + +} // namespace meta +} // namespace nebula diff --git a/src/meta/processors/parts/AlterSpaceProcessor.h b/src/meta/processors/parts/AlterSpaceProcessor.h new file mode 100644 index 00000000000..90c8bb9799b --- /dev/null +++ b/src/meta/processors/parts/AlterSpaceProcessor.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 vesoft inc. All rights reserved. + * + * This source code is licensed under Apache 2.0 License. + */ + +#ifndef NEBULA_GRAPH_ALTERSPACEPROCESSOR_H +#define NEBULA_GRAPH_ALTERSPACEPROCESSOR_H + +#include "meta/processors/BaseProcessor.h" + +namespace nebula { +namespace meta { +class AlterSpaceProcessor : public BaseProcessor { + public: + static AlterSpaceProcessor* instance(kvstore::KVStore* kvstore) { + return new AlterSpaceProcessor(kvstore); + } + + void process(const cpp2::AlterSpaceReq& req); + + private: + nebula::cpp2::ErrorCode addZones(const std::string& spaceName, + const std::vector& zones); + + private: + explicit AlterSpaceProcessor(kvstore::KVStore* kvstore) + : BaseProcessor(kvstore) {} +}; + +} // namespace meta +} // namespace nebula + +#endif // NEBULA_GRAPH_ALTERSPACEPROCESSOR_H diff --git a/src/meta/test/ProcessorTest.cpp b/src/meta/test/ProcessorTest.cpp index f2f5d918e3d..b942e98afdd 100644 --- a/src/meta/test/ProcessorTest.cpp +++ b/src/meta/test/ProcessorTest.cpp @@ -13,6 +13,7 @@ #include "meta/processors/kv/RemoveProcessor.h" #include "meta/processors/kv/RemoveRangeProcessor.h" #include "meta/processors/kv/ScanProcessor.h" +#include "meta/processors/parts/AlterSpaceProcessor.h" #include "meta/processors/parts/CreateSpaceProcessor.h" #include "meta/processors/parts/DropSpaceProcessor.h" #include "meta/processors/parts/GetPartsAllocProcessor.h" @@ -4362,6 +4363,70 @@ TEST(ProcessorTest, DropZoneTest) { } } +TEST(ProcessorTest, AlterSpaceTest) { + fs::TempDir rootPath("/tmp/RenameZoneTest.XXXXXX"); + auto store = MockCluster::initMetaKV(rootPath.path()); + auto* kv = dynamic_cast(store.get()); + TestUtils::assembleSpaceWithZone(kv, 1, 8, 1, 8, 8); + TestUtils::assembleZone(kv, + {{"9", {HostAddr("127.0.0.1", 9)}}, + {"10", {HostAddr("127.0.0.1", 10)}}, + {"11", {HostAddr("127.0.0.1", 11)}}}); + { + AlterSpaceProcessor* processor = AlterSpaceProcessor::instance(kv); + meta::cpp2::AlterSpaceReq req; + req.space_name_ref() = "test_space"; + req.op_ref() = meta::cpp2::AlterSpaceOp::ADD_ZONE; + req.paras_ref() = {"12"}; + auto f = processor->getFuture(); + processor->process(req); + auto resp = std::move(f).get(); + ASSERT_EQ(nebula::cpp2::ErrorCode::E_ZONE_NOT_FOUND, resp.get_code()); + } + { + AlterSpaceProcessor* processor = AlterSpaceProcessor::instance(kv); + meta::cpp2::AlterSpaceReq req; + req.space_name_ref() = "aaa"; + req.op_ref() = meta::cpp2::AlterSpaceOp::ADD_ZONE; + 
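The write path in addZones uses a recurring NebulaGraph idiom: block a synchronous caller on an asynchronous KV write with a folly::Baton. A minimal runnable sketch of the same pattern; Code and asyncWrite are stand-ins, not the real kvstore API:

#include <folly/synchronization/Baton.h>

#include <functional>
#include <thread>

enum class Code { SUCCEEDED, E_STORE_FAILURE };

// Stand-in for KVStore::asyncMultiPut: runs the callback on another thread.
void asyncWrite(std::function<void(Code)> cb) {
  std::thread([cb = std::move(cb)] { cb(Code::SUCCEEDED); }).detach();
}

Code syncWrite() {
  folly::Baton<> baton;
  auto ret = Code::SUCCEEDED;
  asyncWrite([&ret, &baton](Code code) {
    ret = code;    // record the result for the blocked caller
    baton.post();  // wake it up
  });
  baton.wait();  // block until the callback has fired exactly once
  return ret;
}

The baton's post/wait pair also orders the write to ret before the read, so no extra synchronization is needed around the result.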
req.paras_ref() = {"9"}; + auto f = processor->getFuture(); + processor->process(req); + auto resp = std::move(f).get(); + ASSERT_EQ(nebula::cpp2::ErrorCode::E_SPACE_NOT_FOUND, resp.get_code()); + } + { + AlterSpaceProcessor* processor = AlterSpaceProcessor::instance(kv); + meta::cpp2::AlterSpaceReq req; + req.space_name_ref() = "test_space"; + req.op_ref() = meta::cpp2::AlterSpaceOp::ADD_ZONE; + req.paras_ref() = {"8"}; + auto f = processor->getFuture(); + processor->process(req); + auto resp = std::move(f).get(); + ASSERT_EQ(nebula::cpp2::ErrorCode::E_CONFLICT, resp.get_code()); + } + { + AlterSpaceProcessor* processor = AlterSpaceProcessor::instance(kv); + meta::cpp2::AlterSpaceReq req; + req.space_name_ref() = "test_space"; + req.op_ref() = meta::cpp2::AlterSpaceOp::ADD_ZONE; + req.paras_ref() = {"9", "10", "11"}; + auto f = processor->getFuture(); + processor->process(req); + auto resp = std::move(f).get(); + ASSERT_EQ(nebula::cpp2::ErrorCode::SUCCEEDED, resp.get_code()); + std::string spaceKey = MetaKeyUtils::spaceKey(1); + std::string spaceVal; + kv->get(kDefaultSpaceId, kDefaultPartId, spaceKey, &spaceVal); + meta::cpp2::SpaceDesc properties = MetaKeyUtils::parseSpace(spaceVal); + const std::vector& zones = properties.get_zone_names(); + const std::vector& res = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"}; + std::set result(zones.begin(), zones.end()); + std::set expected(res.begin(), res.end()); + ASSERT_EQ(result, expected); + } +} + } // namespace meta } // namespace nebula diff --git a/src/parser/AdminSentences.cpp b/src/parser/AdminSentences.cpp index 64dbdb9a646..28c9092a9af 100644 --- a/src/parser/AdminSentences.cpp +++ b/src/parser/AdminSentences.cpp @@ -132,6 +132,15 @@ std::string DropSpaceSentence::toString() const { return folly::stringPrintf("DROP SPACE %s", spaceName_.get()->c_str()); } +std::string AlterSpaceSentence::toString() const { + std::string zones = paras_.front(); + for (size_t i = 1; i < paras_.size(); i++) { + zones += "," + paras_[i]; + } + return folly::stringPrintf( + "ALTER SPACE %s ADD ZONE %s", spaceName_.get()->c_str(), zones.c_str()); +} + std::string DescribeSpaceSentence::toString() const { return folly::stringPrintf("DESCRIBE SPACE %s", spaceName_.get()->c_str()); } diff --git a/src/parser/AdminSentences.h b/src/parser/AdminSentences.h index 60df905244a..3e125b83026 100644 --- a/src/parser/AdminSentences.h +++ b/src/parser/AdminSentences.h @@ -439,6 +439,28 @@ class DropSpaceSentence final : public DropSentence { std::unique_ptr clusterName_; }; +class AlterSpaceSentence final : public Sentence { + public: + AlterSpaceSentence(std::string* spaceName, meta::cpp2::AlterSpaceOp op) + : op_(op), spaceName_(spaceName) { + kind_ = Kind::kAlterSpace; + } + void addPara(const std::string& para) { paras_.push_back(para); } + + std::string spaceName() const { return *spaceName_; } + + const std::vector& paras() const { return paras_; } + + meta::cpp2::AlterSpaceOp alterSpaceOp() const { return op_; } + + std::string toString() const override; + + private: + meta::cpp2::AlterSpaceOp op_; + std::unique_ptr spaceName_; + std::vector paras_; +}; + class DescribeSpaceSentence final : public Sentence { public: explicit DescribeSpaceSentence(std::string* spaceName) { diff --git a/src/parser/Sentence.h b/src/parser/Sentence.h index 14183641853..17995bfba3f 100644 --- a/src/parser/Sentence.h +++ b/src/parser/Sentence.h @@ -132,6 +132,7 @@ class Sentence { kShowQueries, kKillQuery, kShowMetaLeader, + kAlterSpace, }; Kind kind() const { diff --git 
a/src/parser/parser.yy b/src/parser/parser.yy
index 74e22facff0..55e62f19b67 100644
--- a/src/parser/parser.yy
+++ b/src/parser/parser.yy
@@ -356,7 +356,7 @@ static constexpr size_t kCommentLengthLimit = 256;
 %type query_unique_identifier
 
 %type <sentence> maintain_sentence
-%type <sentence> create_space_sentence describe_space_sentence drop_space_sentence
+%type <sentence> create_space_sentence describe_space_sentence drop_space_sentence alter_space_sentence
 %type <sentence> create_tag_sentence create_edge_sentence
 %type <sentence> alter_tag_sentence alter_edge_sentence
 %type <sentence> drop_tag_sentence drop_edge_sentence
@@ -3495,6 +3495,19 @@ zone_name_list
     }
     ;
 
+alter_space_sentence
+    : KW_ALTER KW_SPACE name_label KW_ADD KW_ZONE name_label_list {
+        auto sentence = new AlterSpaceSentence($3, meta::cpp2::AlterSpaceOp::ADD_ZONE);
+        NameLabelList* nl = $6;
+        std::vector<const std::string*> vec = nl->labels();
+        for (const std::string* para : vec) {
+            sentence->addPara(*para);
+        }
+        delete nl;
+        $$ = sentence;
+    }
+    ;
+
 create_space_sentence
     : KW_CREATE KW_SPACE opt_if_not_exists name_label {
         auto sentence = new CreateSpaceSentence($4, $3);
@@ -3833,6 +3845,7 @@ mutate_sentence
 maintain_sentence
     : create_space_sentence { $$ = $1; }
     | describe_space_sentence { $$ = $1; }
+    | alter_space_sentence { $$ = $1; }
    | drop_space_sentence { $$ = $1; }
    | create_tag_sentence { $$ = $1; }
    | create_edge_sentence { $$ = $1; }

From be15d2458f965f5c4f53b8e81d7e7566e9e888fd Mon Sep 17 00:00:00 2001
From: liwenhui-soul <38217397+liwenhui-soul@users.noreply.github.com>
Date: Fri, 17 Dec 2021 21:42:47 +0800
Subject: [PATCH 3/6] make balanceTask bucket size be equal to distinct part number

---
 .../processors/job/BalanceJobExecutor.cpp     |   9 +-
 src/meta/processors/job/BalanceJobExecutor.h  |   7 +-
 src/meta/processors/job/BalancePlan.cpp       |  90 +++++++++-------
 src/meta/processors/job/BalancePlan.h         |   4 +-
 .../processors/job/DataBalanceJobExecutor.cpp |  55 +++++-----
 .../processors/job/DataBalanceJobExecutor.h   |   4 +-
 src/meta/processors/job/JobExecutor.cpp       |   4 +-
 src/meta/processors/job/JobExecutor.h         |   2 +-
 src/meta/processors/job/JobManager.cpp        |   4 +-
 src/meta/processors/job/JobManager.h          |   2 +-
 .../job/LeaderBalanceJobExecutor.cpp          |   6 +-
 .../processors/job/LeaderBalanceJobExecutor.h |   6 +-
 .../processors/job/StorageJobExecutor.cpp     |   2 +-
 .../processors/job/ZoneBalanceJobExecutor.cpp |  20 +++-
 .../processors/job/ZoneBalanceJobExecutor.h   |   4 +-
 .../processors/parts/AlterSpaceProcessor.cpp  |  11 ++-
 src/meta/test/BalancerTest.cpp                |  10 +--
 src/parser/AdminSentences.cpp                 |   8 +-
 src/parser/test/ParserTest.cpp                |   6 +-
 19 files changed, 145 insertions(+), 109 deletions(-)

diff --git a/src/meta/processors/job/BalanceJobExecutor.cpp b/src/meta/processors/job/BalanceJobExecutor.cpp
index f7392d068ba..bb52f199319 100644
--- a/src/meta/processors/job/BalanceJobExecutor.cpp
+++ b/src/meta/processors/job/BalanceJobExecutor.cpp
@@ -88,11 +88,16 @@ nebula::cpp2::ErrorCode BalanceJobExecutor::save(const std::string& k, const std
   return rc;
 }
 
-nebula::cpp2::ErrorCode SpaceInfo::getInfo(GraphSpaceID spaceId, kvstore::KVStore* kvstore) {
+nebula::cpp2::ErrorCode SpaceInfo::loadInfo(GraphSpaceID spaceId, kvstore::KVStore* kvstore) {
   spaceId_ = spaceId;
   std::string spaceKey = MetaKeyUtils::spaceKey(spaceId);
   std::string spaceVal;
-  kvstore->get(kDefaultSpaceId, kDefaultPartId, spaceKey, &spaceVal);
+  auto rc = kvstore->get(kDefaultSpaceId, kDefaultPartId, spaceKey, &spaceVal);
+  if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) {
+    LOG(ERROR) << "Get space info " << spaceId
+               << " failed, error: " <<
apache::thrift::util::enumNameSafe(rc); + return rc; + } meta::cpp2::SpaceDesc properties = MetaKeyUtils::parseSpace(spaceVal); name_ = properties.get_space_name(); replica_ = properties.get_replica_factor(); diff --git a/src/meta/processors/job/BalanceJobExecutor.h b/src/meta/processors/job/BalanceJobExecutor.h index ec9ac8f0de1..639b6a6f41c 100644 --- a/src/meta/processors/job/BalanceJobExecutor.h +++ b/src/meta/processors/job/BalanceJobExecutor.h @@ -17,10 +17,10 @@ using HostParts = std::unordered_map>; using LeaderBalancePlan = std::vector>; struct Host { - explicit Host(const HostAddr& ha) : ha_(ha) {} + explicit Host(const HostAddr& ha) : host_(ha) {} Host() = default; - HostAddr ha_; + HostAddr host_; std::set parts_; }; struct Zone { @@ -35,12 +35,13 @@ struct Zone { int32_t partNum_; }; struct SpaceInfo { - nebula::cpp2::ErrorCode getInfo(GraphSpaceID spaceId, kvstore::KVStore* kvstore); + nebula::cpp2::ErrorCode loadInfo(GraphSpaceID spaceId, kvstore::KVStore* kvstore); bool hasHost(const HostAddr& ha); std::string name_; GraphSpaceID spaceId_; int32_t replica_; + // zone_name -> zone std::map zones_; }; diff --git a/src/meta/processors/job/BalancePlan.cpp b/src/meta/processors/job/BalancePlan.cpp index 45ebd0278aa..4378802dd88 100644 --- a/src/meta/processors/job/BalancePlan.cpp +++ b/src/meta/processors/job/BalancePlan.cpp @@ -23,20 +23,13 @@ void BalancePlan::dispatchTasks() { for (auto& task : tasks_) { partTasks[std::make_pair(task.spaceId_, task.partId_)].emplace_back(index++); } - buckets_.resize(std::min(partTasks.size(), (size_t)FLAGS_task_concurrency)); + buckets_.resize(partTasks.size()); + int32_t bucketIndex = 0; for (auto it = partTasks.begin(); it != partTasks.end(); it++) { - size_t minNum = tasks_.size(); - int32_t i = 0, minIndex = 0; - for (auto& bucket : buckets_) { - if (bucket.size() < minNum) { - minNum = bucket.size(); - minIndex = i; - } - i++; - } for (auto taskIndex : it->second) { - buckets_[minIndex].emplace_back(taskIndex); + buckets_[bucketIndex].emplace_back(taskIndex); } + bucketIndex++; } } @@ -66,20 +59,32 @@ void BalancePlan::invoke() { stopped = stopped_; } if (finished) { - CHECK_EQ(j, this->buckets_[i].size() - 1); - saveInStore(true); + CHECK_EQ(j, buckets_[i].size() - 1); + saveInStore(); onFinished_(stopped ? meta::cpp2::JobStatus::STOPPED : (failed_ ? meta::cpp2::JobStatus::FAILED : meta::cpp2::JobStatus::FINISHED)); - } else if (j + 1 < this->buckets_[i].size()) { - auto& task = this->tasks_[this->buckets_[i][j + 1]]; + } else if (j + 1 < buckets_[i].size()) { + auto& task = tasks_[buckets_[i][j + 1]]; if (stopped) { task.ret_ = BalanceTaskResult::INVALID; } task.invoke(); + } else { + size_t index = curIndex_.fetch_add(1, std::memory_order_relaxed); + if (index < buckets_.size()) { + Bucket& bucket = buckets_[index]; + if (!bucket.empty()) { + auto& task = tasks_[bucket[0]]; + if (stopped) { + task.ret_ = BalanceTaskResult::INVALID; + } + task.invoke(); + } + } } }; // onFinished - tasks_[taskIndex].onError_ = [this, i, j, taskIndex]() { + tasks_[taskIndex].onError_ = [this, i, j]() { bool finished = false; bool stopped = false; { @@ -95,42 +100,49 @@ void BalancePlan::invoke() { stopped = stopped_; } if (finished) { - CHECK_EQ(j, this->buckets_[i].size() - 1); + CHECK_EQ(j, buckets_[i].size() - 1); onFinished_(stopped ? 
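The reworked dispatchTasks() above gives every distinct (space, part) pair its own bucket, so all tasks that touch one part run strictly one after another, while different parts may proceed in parallel (the parallelism cap is applied later, in invoke()). The grouping step in isolation, as a runnable sketch with plain pairs standing in for BalanceTask:

#include <cstddef>
#include <map>
#include <utility>
#include <vector>

using SpacePart = std::pair<int, int>;  // (spaceId, partId)

// Group task indices so that all tasks touching one part share a bucket.
std::vector<std::vector<size_t>> dispatch(const std::vector<SpacePart>& tasks) {
  std::map<SpacePart, std::vector<size_t>> partTasks;
  for (size_t i = 0; i < tasks.size(); i++) {
    partTasks[tasks[i]].push_back(i);
  }
  std::vector<std::vector<size_t>> buckets;
  buckets.reserve(partTasks.size());  // one bucket per distinct (space, part)
  for (auto& entry : partTasks) {
    buckets.push_back(std::move(entry.second));
  }
  return buckets;
}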
meta::cpp2::JobStatus::STOPPED : meta::cpp2::JobStatus::FAILED);
-    } else if (j + 1 < this->buckets_[i].size()) {
-      auto& task = this->tasks_[this->buckets_[i][j + 1]];
-      if (tasks_[taskIndex].spaceId_ == task.spaceId_ &&
-          tasks_[taskIndex].partId_ == task.partId_) {
-        LOG(INFO) << "Skip the task for the same partId " << task.partId_;
-        task.ret_ = BalanceTaskResult::FAILED;
-      }
-      if (stopped) {
-        task.ret_ = BalanceTaskResult::INVALID;
-      }
+    } else if (j + 1 < buckets_[i].size()) {
+      auto& task = tasks_[buckets_[i][j + 1]];
+      LOG(INFO) << "Skip the task for the same partId " << task.partId_;
+      task.ret_ = BalanceTaskResult::FAILED;
       task.invoke();
+    } else {
+      size_t index = curIndex_.fetch_add(1, std::memory_order_relaxed);
+      if (index < buckets_.size()) {
+        Bucket& bucket = buckets_[index];
+        if (!bucket.empty()) {
+          auto& task = tasks_[bucket[0]];
+          if (stopped) {
+            task.ret_ = BalanceTaskResult::INVALID;
+          }
+          task.invoke();
+        }
+      }
     }
   };  // onError
   }  // for (auto j = 0; j < buckets_[i].size(); j++)
   }  // for (auto i = 0; i < buckets_.size(); i++)
-  saveInStore(true);
-  for (auto& bucket : buckets_) {
-    if (!bucket.empty()) {
-      tasks_[bucket[0]].invoke();
+  saveInStore();
+  uint32_t bucketSize = static_cast<uint32_t>(buckets_.size());
+  uint32_t concurrency = std::min(FLAGS_task_concurrency, bucketSize);
+  curIndex_.store(concurrency, std::memory_order_relaxed);
+  for (uint32_t i = 0; i < concurrency; i++) {
+    if (!buckets_[i].empty()) {
+      tasks_[buckets_[i][0]].invoke();
     }
   }
 }
 
-nebula::cpp2::ErrorCode BalancePlan::saveInStore(bool onlyPlan) {
+nebula::cpp2::ErrorCode BalancePlan::saveInStore() {
   CHECK_NOTNULL(kv_);
   std::vector<kvstore::KV> data;
-  if (!onlyPlan) {
-    for (auto& task : tasks_) {
-      data.emplace_back(MetaKeyUtils::balanceTaskKey(
-                            task.jobId_, task.spaceId_, task.partId_, task.src_, task.dst_),
-                        MetaKeyUtils::balanceTaskVal(
-                            task.status_, task.ret_, task.startTimeMs_, task.endTimeMs_));
-    }
+  for (auto& task : tasks_) {
+    data.emplace_back(
+        MetaKeyUtils::balanceTaskKey(
+            task.jobId_, task.spaceId_, task.partId_, task.src_, task.dst_),
+        MetaKeyUtils::balanceTaskVal(task.status_, task.ret_, task.startTimeMs_, task.endTimeMs_));
   }
   folly::Baton<true, std::atomic> baton;
   auto ret = nebula::cpp2::ErrorCode::SUCCEEDED;
diff --git a/src/meta/processors/job/BalancePlan.h b/src/meta/processors/job/BalancePlan.h
index 711a7e1e814..8aed704c9a9 100644
--- a/src/meta/processors/job/BalancePlan.h
+++ b/src/meta/processors/job/BalancePlan.h
@@ -17,7 +17,6 @@ namespace nebula {
 namespace meta {
 
 class BalancePlan {
-  friend class Balancer;
   friend class DataBalanceJobExecutor;
   FRIEND_TEST(BalanceTest, BalancePlanTest);
   FRIEND_TEST(BalanceTest, NormalTest);
@@ -64,7 +63,7 @@ class BalancePlan {
     jobDescription_.setStatus(status);
   }
 
-  nebula::cpp2::ErrorCode saveInStore(bool onlyPlan = false);
+  nebula::cpp2::ErrorCode saveInStore();
 
   JobID id() const {
     return jobDescription_.getJobId();
@@ -110,6 +109,7 @@ class BalancePlan {
   // List of task index in tasks_;
   using Bucket = std::vector<int32_t>;
   std::vector<Bucket> buckets_;
+  std::atomic<size_t> curIndex_;
 };
 
 }  // namespace meta
diff --git a/src/meta/processors/job/DataBalanceJobExecutor.cpp b/src/meta/processors/job/DataBalanceJobExecutor.cpp
index c3ffc0ac3a7..8826b656e61 100644
--- a/src/meta/processors/job/DataBalanceJobExecutor.cpp
+++ b/src/meta/processors/job/DataBalanceJobExecutor.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2020 vesoft inc. All rights reserved.
+/* Copyright (c) 2021 vesoft inc. All rights reserved.
  *
  * This source code is licensed under Apache 2.0 License.
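The scheduling scheme above in miniature: invoke() starts the first FLAGS_task_concurrency buckets, and whenever a bucket drains, the finishing task atomically claims the next unstarted bucket with fetch_add, so each bucket index is handed out exactly once. A simplified standalone model (RollingScheduler is invented for the example, and the real code drives runBucket via task callbacks rather than recursion):

#include <algorithm>
#include <atomic>
#include <cstddef>
#include <vector>

struct RollingScheduler {
  std::vector<std::vector<int>> buckets;
  std::atomic<size_t> curIndex{0};

  void runBucket(size_t i) {
    // The real code walks buckets[i] serially through per-task callbacks;
    // here we pretend the bucket finished and immediately claim another.
    (void)buckets[i];
    onBucketDone();
  }

  void onBucketDone() {
    // fetch_add gives this finisher a fresh index; ties are impossible.
    size_t next = curIndex.fetch_add(1, std::memory_order_relaxed);
    if (next < buckets.size()) {
      runBucket(next);
    }
  }

  void start(size_t concurrency) {
    size_t n = std::min(concurrency, buckets.size());
    curIndex.store(n, std::memory_order_relaxed);
    for (size_t i = 0; i < n; i++) {
      runBucket(i);  // the first n buckets run concurrently
    }
  }
};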
*/ @@ -35,18 +35,18 @@ folly::Future DataBalanceJobExecutor::executeInternal() { Status DataBalanceJobExecutor::buildBalancePlan() { std::map> lostZoneHost; std::map> activeSortedHost; - for (auto& p : spaceInfo_.zones_) { - for (auto& ph : p.second.hosts_) { - activeSortedHost[p.first].push_back(&ph.second); + for (auto& zoneMapEntry : spaceInfo_.zones_) { + for (auto& hostMapEntry : zoneMapEntry.second.hosts_) { + activeSortedHost[zoneMapEntry.first].push_back(&hostMapEntry.second); } } - for (HostAddr ha : lostHosts_) { - if (!spaceInfo_.hasHost(ha)) { + for (HostAddr host : lostHosts_) { + if (!spaceInfo_.hasHost(host)) { return Status::Error( - "Host %s does not belong to space %d", ha.toString().c_str(), spaceInfo_.spaceId_); + "Host %s does not belong to space %d", host.toString().c_str(), spaceInfo_.spaceId_); } for (auto& zoneMapEntry : spaceInfo_.zones_) { - auto it = zoneMapEntry.second.hosts_.find(ha); + auto it = zoneMapEntry.second.hosts_.find(host); if (it != zoneMapEntry.second.hosts_.end()) { lostZoneHost[zoneMapEntry.first].push_back(&it->second); std::vector& hvec = activeSortedHost[zoneMapEntry.first]; @@ -62,14 +62,20 @@ Status DataBalanceJobExecutor::buildBalancePlan() { }); } plan_.reset(new BalancePlan(jobDescription_, kvstore_, adminClient_)); - for (auto& p : lostZoneHost) { - std::vector& hvec = activeSortedHost[p.first]; - for (Host* h : p.second) { - for (PartitionID partId : h->parts_) { + // move parts of lost hosts to active hosts in the same zone + for (auto& zoneHostEntry : lostZoneHost) { + std::vector& hvec = activeSortedHost[zoneHostEntry.first]; + for (Host* host : zoneHostEntry.second) { + for (PartitionID partId : host->parts_) { Host* dstHost = hvec.front(); dstHost->parts_.insert(partId); - plan_->addTask(BalanceTask( - jobId_, spaceInfo_.spaceId_, partId, h->ha_, dstHost->ha_, kvstore_, adminClient_)); + plan_->addTask(BalanceTask(jobId_, + spaceInfo_.spaceId_, + partId, + host->host_, + dstHost->host_, + kvstore_, + adminClient_)); for (size_t i = 0; i < hvec.size() - 1; i++) { if (hvec[i]->parts_.size() > hvec[i + 1]->parts_.size()) { std::swap(hvec[i], hvec[i + 1]); @@ -78,16 +84,21 @@ Status DataBalanceJobExecutor::buildBalancePlan() { } } } - h->parts_.clear(); + host->parts_.clear(); } } lostZoneHost.clear(); + // rebalance for hosts in a zone auto balanceHostVec = [this](std::vector& hostVec) -> std::vector { size_t totalPartNum = 0; size_t avgPartNum = 0; for (Host* h : hostVec) { totalPartNum += h->parts_.size(); } + if (hostVec.size() == 0) { + LOG(ERROR) << "rebalance error: zone has no host"; + return {}; + } avgPartNum = totalPartNum / hostVec.size(); size_t remainder = totalPartNum - avgPartNum * hostVec.size(); size_t leftBegin = 0; @@ -101,9 +112,9 @@ Status DataBalanceJobExecutor::buildBalancePlan() { break; } } - for (size_t i = 0; i < hostVec.size(); i++) { + for (size_t i = leftEnd; i < hostVec.size(); i++) { + rightBegin = i; if (avgPartNum < hostVec[i]->parts_.size()) { - rightBegin = i; break; } } @@ -124,8 +135,8 @@ Status DataBalanceJobExecutor::buildBalancePlan() { tasks.emplace_back(jobId_, spaceInfo_.spaceId_, partId, - srcHost->ha_, - hostVec[leftBegin]->ha_, + srcHost->host_, + hostVec[leftBegin]->host_, kvstore_, adminClient_); size_t leftIndex = leftBegin; @@ -145,8 +156,8 @@ Status DataBalanceJobExecutor::buildBalancePlan() { } return tasks; }; - for (auto& p : activeSortedHost) { - std::vector& hvec = p.second; + for (auto& pair : activeSortedHost) { + std::vector& hvec = pair.second; std::vector tasks = 
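Both balance executors lean on the same small invariant seen above: the host vector stays sorted ascending by part count, and since each move changes exactly one host by one part, a single bubble pass restores the order instead of a full re-sort. In isolation, with bare part counts standing in for Host objects:

#include <utility>
#include <vector>

// counts must already be sorted ascending; counts.front() is the least
// loaded host, which is where the moved part lands.
void assignToLeastLoaded(std::vector<int>& counts) {
  counts.front()++;
  for (size_t i = 0; i + 1 < counts.size(); i++) {
    if (counts[i] > counts[i + 1]) {
      std::swap(counts[i], counts[i + 1]);  // one pass re-establishes order
    } else {
      break;  // order already restored
    }
  }
}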
balanceHostVec(hvec); for (BalanceTask& task : tasks) { plan_->addTask(std::move(task)); @@ -175,7 +186,7 @@ nebula::cpp2::ErrorCode DataBalanceJobExecutor::prepare() { return nebula::error(spaceRet); } GraphSpaceID spaceId = nebula::value(spaceRet); - nebula::cpp2::ErrorCode rc = spaceInfo_.getInfo(spaceId, kvstore_); + nebula::cpp2::ErrorCode rc = spaceInfo_.loadInfo(spaceId, kvstore_); if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) { return rc; } diff --git a/src/meta/processors/job/DataBalanceJobExecutor.h b/src/meta/processors/job/DataBalanceJobExecutor.h index f7759320227..fe292160967 100644 --- a/src/meta/processors/job/DataBalanceJobExecutor.h +++ b/src/meta/processors/job/DataBalanceJobExecutor.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 vesoft inc. All rights reserved. +/* Copyright (c) 2021 vesoft inc. All rights reserved. * * This source code is licensed under Apache 2.0 License. */ @@ -39,4 +39,4 @@ class DataBalanceJobExecutor : public BalanceJobExecutor { } // namespace meta } // namespace nebula -#endif // META_BALANCEJOBEXECUTOR_H_ +#endif // META_DATABALANCEJOBEXECUTOR_H_ diff --git a/src/meta/processors/job/JobExecutor.cpp b/src/meta/processors/job/JobExecutor.cpp index d940afdc7c2..f8d01f143f6 100644 --- a/src/meta/processors/job/JobExecutor.cpp +++ b/src/meta/processors/job/JobExecutor.cpp @@ -1,8 +1,10 @@ -/* Copyright (c) 2019 vesoft inc. All rights reserved. +/* Copyright (c) 2021 vesoft inc. All rights reserved. * * This source code is licensed under Apache 2.0 License. */ +#include "meta/processors/job/JobExecutor.h" + #include "common/network/NetworkUtils.h" #include "common/utils/MetaKeyUtils.h" #include "common/utils/Utils.h" diff --git a/src/meta/processors/job/JobExecutor.h b/src/meta/processors/job/JobExecutor.h index 9c482dd272b..166464f3f52 100644 --- a/src/meta/processors/job/JobExecutor.h +++ b/src/meta/processors/job/JobExecutor.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 vesoft inc. All rights reserved. +/* Copyright (c) 2021 vesoft inc. All rights reserved. * * This source code is licensed under Apache 2.0 License. 
*/ diff --git a/src/meta/processors/job/JobManager.cpp b/src/meta/processors/job/JobManager.cpp index e833b735126..795ade3c73c 100644 --- a/src/meta/processors/job/JobManager.cpp +++ b/src/meta/processors/job/JobManager.cpp @@ -312,9 +312,9 @@ nebula::cpp2::ErrorCode JobManager::saveTaskStatus(TaskDescription& td, return jobExec->saveSpecialTaskStatus(req); } -void JobManager::compareChangeStatus(JbmgrStatus expected, JbmgrStatus despire) { +void JobManager::compareChangeStatus(JbmgrStatus expected, JbmgrStatus desired) { JbmgrStatus ex = expected; - status_.compare_exchange_strong(ex, despire, std::memory_order_acq_rel); + status_.compare_exchange_strong(ex, desired, std::memory_order_acq_rel); } /** diff --git a/src/meta/processors/job/JobManager.h b/src/meta/processors/job/JobManager.h index 2b4e0056a81..52e73584058 100644 --- a/src/meta/processors/job/JobManager.h +++ b/src/meta/processors/job/JobManager.h @@ -141,7 +141,7 @@ class JobManager : public nebula::cpp::NonCopyable, public nebula::cpp::NonMovab nebula::cpp2::ErrorCode saveTaskStatus(TaskDescription& td, const cpp2::ReportTaskReq& req); - void compareChangeStatus(JbmgrStatus expected, JbmgrStatus despire); + void compareChangeStatus(JbmgrStatus expected, JbmgrStatus desired); private: // Todo(pandasheep) diff --git a/src/meta/processors/job/LeaderBalanceJobExecutor.cpp b/src/meta/processors/job/LeaderBalanceJobExecutor.cpp index cd17cda99ed..d55144d6bc3 100644 --- a/src/meta/processors/job/LeaderBalanceJobExecutor.cpp +++ b/src/meta/processors/job/LeaderBalanceJobExecutor.cpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 vesoft inc. All rights reserved. +/* Copyright (c) 2021 vesoft inc. All rights reserved. * * This source code is licensed under Apache 2.0 License. */ @@ -263,7 +263,7 @@ folly::Future LeaderBalanceJobExecutor::executeInternal() { << "Space: " << spaceId; continue; } - simplifyLeaderBalnacePlan(spaceId, plan); + simplifyLeaderBalancePlan(spaceId, plan); for (const auto& task : plan) { futures.emplace_back(adminClient_->transLeader(std::get<0>(task), std::get<1>(task), @@ -531,7 +531,7 @@ int32_t LeaderBalanceJobExecutor::giveupLeaders(HostParts& leaderParts, return taskCount; } -void LeaderBalanceJobExecutor::simplifyLeaderBalnacePlan(GraphSpaceID spaceId, +void LeaderBalanceJobExecutor::simplifyLeaderBalancePlan(GraphSpaceID spaceId, LeaderBalancePlan& plan) { std::unordered_map buckets; for (auto& task : plan) { diff --git a/src/meta/processors/job/LeaderBalanceJobExecutor.h b/src/meta/processors/job/LeaderBalanceJobExecutor.h index 113257d8093..e4b7352dc2c 100644 --- a/src/meta/processors/job/LeaderBalanceJobExecutor.h +++ b/src/meta/processors/job/LeaderBalanceJobExecutor.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 vesoft inc. All rights reserved. +/* Copyright (c) 2021 vesoft inc. All rights reserved. * * This source code is licensed under Apache 2.0 License. 
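JobManager::compareChangeStatus, renamed above, is a thin wrapper over compare_exchange_strong: the status only moves to the desired value if it is still the expected one, so two racing transitions cannot both win. A minimal standalone equivalent (Status values are invented; the real enum is JbmgrStatus):

#include <atomic>

enum class StatusSketch { IDLE, BUSY, STOPPED };

std::atomic<StatusSketch> status{StatusSketch::IDLE};

bool compareChangeStatus(StatusSketch expected, StatusSketch desired) {
  // `expected` is taken by value: on failure the local copy receives the
  // current status and is simply discarded, as in the patch's wrapper.
  return status.compare_exchange_strong(expected, desired,
                                        std::memory_order_acq_rel);
}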
*/ @@ -60,7 +60,7 @@ class LeaderBalanceJobExecutor : public MetaJobExecutor { LeaderBalancePlan& plan, GraphSpaceID spaceId); - void simplifyLeaderBalnacePlan(GraphSpaceID spaceId, LeaderBalancePlan& plan); + void simplifyLeaderBalancePlan(GraphSpaceID spaceId, LeaderBalancePlan& plan); nebula::cpp2::ErrorCode getAllSpaces( std::vector>& spaces); @@ -91,4 +91,4 @@ class LeaderBalanceJobExecutor : public MetaJobExecutor { } // namespace meta } // namespace nebula -#endif // META_BALANCEJOBEXECUTOR_H_ +#endif // META_LEADERBALANCEJOBEXECUTOR_H_ diff --git a/src/meta/processors/job/StorageJobExecutor.cpp b/src/meta/processors/job/StorageJobExecutor.cpp index 40922456dc3..14ef50aa4bd 100644 --- a/src/meta/processors/job/StorageJobExecutor.cpp +++ b/src/meta/processors/job/StorageJobExecutor.cpp @@ -166,7 +166,7 @@ nebula::cpp2::ErrorCode StorageJobExecutor::execute() { }); baton.wait(); if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) { - LOG(INFO) << "write to kv store failed, error: " << apache::thrift::util::enumNameSafe(rc); + LOG(ERROR) << "write to kv store failed, error: " << apache::thrift::util::enumNameSafe(rc); return rc; } } diff --git a/src/meta/processors/job/ZoneBalanceJobExecutor.cpp b/src/meta/processors/job/ZoneBalanceJobExecutor.cpp index 7c6feabde96..eb3abd1fd24 100644 --- a/src/meta/processors/job/ZoneBalanceJobExecutor.cpp +++ b/src/meta/processors/job/ZoneBalanceJobExecutor.cpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 vesoft inc. All rights reserved. +/* Copyright (c) 2021 vesoft inc. All rights reserved. * * This source code is licensed under Apache 2.0 License. */ @@ -21,7 +21,7 @@ nebula::cpp2::ErrorCode ZoneBalanceJobExecutor::prepare() { return nebula::error(spaceRet); } GraphSpaceID spaceId = nebula::value(spaceRet); - nebula::cpp2::ErrorCode rc = spaceInfo_.getInfo(spaceId, kvstore_); + nebula::cpp2::ErrorCode rc = spaceInfo_.loadInfo(spaceId, kvstore_); if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) { return rc; } @@ -70,8 +70,8 @@ nebula::cpp2::ErrorCode ZoneBalanceJobExecutor::updateMeta() { for (std::string& zn : lostZones_) { spaceInfo_.zones_.erase(zn); } - for (auto& p : spaceInfo_.zones_) { - zones.push_back(p.first); + for (auto& zoneMapEntry : spaceInfo_.zones_) { + zones.emplace_back(zoneMapEntry.first); } properties.set_zone_names(std::move(zones)); std::vector data; @@ -146,7 +146,7 @@ Status ZoneBalanceJobExecutor::buildBalancePlan() { std::vector& sortedHosts = sortedZoneHosts[zone->zoneName_]; sortedHosts.front()->parts_.emplace(partId); zone->partNum_++; - HostAddr ha = sortedHosts.front()->ha_; + HostAddr ha = sortedHosts.front()->host_; for (size_t i = 0; i < sortedHosts.size() - 1; i++) { if (sortedHosts[i]->parts_.size() >= sortedHosts[i + 1]->parts_.size()) { std::swap(sortedHosts[i], sortedHosts[i + 1]); @@ -177,6 +177,7 @@ Status ZoneBalanceJobExecutor::buildBalancePlan() { return ha; }; + // move parts of lost zones to active zones for (auto& zoneMapEntry : lostZones) { Zone* zone = zoneMapEntry.second; for (auto& hostMapEntry : zone->hosts_) { @@ -190,11 +191,16 @@ Status ZoneBalanceJobExecutor::buildBalancePlan() { zone->calPartNum(); } + // all parts of lost zones have moved to active zones, then rebalance the active zones int32_t totalPartNum = 0; int32_t avgPartNum = 0; for (auto& z : sortedActiveZones) { totalPartNum += z->partNum_; } + if (sortedActiveZones.size() == 0) { + LOG(ERROR) << "rebalance error: no active zones"; + return {}; + } avgPartNum = totalPartNum / sortedActiveZones.size(); int32_t remainder = totalPartNum - avgPartNum 
* sortedActiveZones.size(); int32_t leftBegin = 0; @@ -241,7 +247,7 @@ Status ZoneBalanceJobExecutor::buildBalancePlan() { tasks.emplace_back(jobId_, spaceInfo_.spaceId_, partId, - sortedHosts[hostIndex]->ha_, + sortedHosts[hostIndex]->host_, dst, kvstore_, adminClient_); @@ -255,7 +261,7 @@ Status ZoneBalanceJobExecutor::buildBalancePlan() { break; } } - // if the zone's part reach the avgPartNum,is can't recieve parts any more + // if the zone's part reach the avgPartNum,it can't recieve parts any more if (newLeftIndex == leftEnd - 1 && sortedActiveZones[newLeftIndex]->partNum_ >= avgPartNum) { leftEnd--; diff --git a/src/meta/processors/job/ZoneBalanceJobExecutor.h b/src/meta/processors/job/ZoneBalanceJobExecutor.h index 9c96d066a91..14df98e62ce 100644 --- a/src/meta/processors/job/ZoneBalanceJobExecutor.h +++ b/src/meta/processors/job/ZoneBalanceJobExecutor.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 vesoft inc. All rights reserved. +/* Copyright (c) 2021 vesoft inc. All rights reserved. * * This source code is licensed under Apache 2.0 License. */ @@ -41,4 +41,4 @@ class ZoneBalanceJobExecutor : public BalanceJobExecutor { } // namespace meta } // namespace nebula -#endif // META_BALANCEJOBEXECUTOR_H_ +#endif // META_ZONEBALANCEJOBEXECUTOR_H_ diff --git a/src/meta/processors/parts/AlterSpaceProcessor.cpp b/src/meta/processors/parts/AlterSpaceProcessor.cpp index d0a9bbfe7e8..090df5f93fe 100644 --- a/src/meta/processors/parts/AlterSpaceProcessor.cpp +++ b/src/meta/processors/parts/AlterSpaceProcessor.cpp @@ -8,6 +8,7 @@ namespace nebula { namespace meta { void AlterSpaceProcessor::process(const cpp2::AlterSpaceReq& req) { + folly::SharedMutex::WriteHolder wHolder(LockUtils::spaceLock()); const std::vector& zones = req.get_paras(); const std::string& spaceName = req.get_space_name(); cpp2::AlterSpaceOp op = req.get_op(); @@ -45,13 +46,11 @@ nebula::cpp2::ErrorCode AlterSpaceProcessor::addZones(const std::string& spaceNa } meta::cpp2::SpaceDesc properties = MetaKeyUtils::parseSpace(spaceVal); const std::vector& curZones = properties.get_zone_names(); - std::set zm; - for (const std::string& z : curZones) { - zm.insert(z); - } + std::set zm(curZones.begin(), curZones.end()); + std::set distinctZones(zones.begin(), zones.end()); std::vector newZones = curZones; - newZones.reserve(curZones.size() + zones.size()); - for (const std::string& z : zones) { + newZones.reserve(curZones.size() + distinctZones.size()); + for (const std::string& z : distinctZones) { std::string zoneKey = MetaKeyUtils::zoneKey(z); std::string zoneVal; nebula::cpp2::ErrorCode zoneRet = diff --git a/src/meta/test/BalancerTest.cpp b/src/meta/test/BalancerTest.cpp index 54e918b0535..8edb86eb22c 100644 --- a/src/meta/test/BalancerTest.cpp +++ b/src/meta/test/BalancerTest.cpp @@ -111,7 +111,7 @@ SpaceInfo createSpaceInfo( for (const auto& p : h.second) { host.parts_.insert(p); } - zone.hosts_.emplace(host.ha_, host); + zone.hosts_.emplace(host.host_, host); } spaceInfo.zones_.emplace(zone.zoneName_, zone); } @@ -694,7 +694,7 @@ TEST(BalanceTest, NormalZoneTest) { NiceMock client; JobDescription jd = makeJobDescription(kv, cpp2::AdminCmd::ZONE_BALANCE); ZoneBalanceJobExecutor balancer(jd, kv, &client, {}); - balancer.spaceInfo_.getInfo(1, kv); + balancer.spaceInfo_.loadInfo(1, kv); auto ret = balancer.executeInternal(); EXPECT_EQ(Status::Balanced(), ret.value()); balancer.finish(); @@ -725,7 +725,7 @@ TEST(BalanceTest, NormalDataTest) { NiceMock client; JobDescription jd = makeJobDescription(kv, cpp2::AdminCmd::DATA_BALANCE); 
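The remainder arithmetic in the zone-balancing code above fixes the target distribution: every zone gets totalPartNum / zoneCount parts, and the `remainder` zones at the heavy (right) end of the sorted order keep one extra. A tiny standalone version with a worked example:

#include <vector>

std::vector<int> targetParts(int totalPartNum, int zoneCount) {
  int avg = totalPartNum / zoneCount;
  int remainder = totalPartNum - avg * zoneCount;
  std::vector<int> targets(zoneCount, avg);
  for (int i = zoneCount - remainder; i < zoneCount; i++) {
    targets[i]++;  // zones at the heavy end absorb the remainder
  }
  return targets;
}
// e.g. 22 parts over 5 zones -> {4, 4, 4, 5, 5}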
DataBalanceJobExecutor balancer(jd, kv, &client, {}); - balancer.spaceInfo_.getInfo(1, kv); + balancer.spaceInfo_.loadInfo(1, kv); auto ret = balancer.executeInternal(); EXPECT_EQ(Status::Balanced(), ret.value()); balancer.finish(); @@ -761,7 +761,7 @@ TEST(BalanceTest, RecoveryTest) { JobDescription jd = makeJobDescription(kv, cpp2::AdminCmd::DATA_BALANCE); DataBalanceJobExecutor balancer(jd, kv, &client, {}); - balancer.spaceInfo_.getInfo(1, kv); + balancer.spaceInfo_.loadInfo(1, kv); balancer.lostHosts_ = {{"127.0.0.1", 1}, {"127.0.0.1", 8}}; folly::Baton baton; balancer.setFinishCallBack([&](meta::cpp2::JobStatus) { @@ -823,7 +823,7 @@ TEST(BalanceTest, StopPlanTest) { FLAGS_task_concurrency = 8; JobDescription jd = makeJobDescription(kv, cpp2::AdminCmd::DATA_BALANCE); ZoneBalanceJobExecutor balancer(jd, kv, &delayClient, {}); - balancer.spaceInfo_.getInfo(1, kv); + balancer.spaceInfo_.loadInfo(1, kv); balancer.lostZones_ = {"4", "5"}; folly::Baton baton; balancer.setFinishCallBack([&](meta::cpp2::JobStatus) { diff --git a/src/parser/AdminSentences.cpp b/src/parser/AdminSentences.cpp index 28c9092a9af..e3fe023ff14 100644 --- a/src/parser/AdminSentences.cpp +++ b/src/parser/AdminSentences.cpp @@ -271,9 +271,9 @@ std::string AdminJobSentence::toString() const { return "INGEST"; case meta::cpp2::AdminCmd::DATA_BALANCE: if (paras_.empty()) { - return "SUBMIT JOB BALANCE DATA"; + return "SUBMIT JOB BALANCE IN ZONE"; } else { - std::string str = "SUBMIT JOB BALANCE DATA REMOVE"; + std::string str = "SUBMIT JOB BALANCE IN ZONE REMOVE"; for (size_t i = 0; i < paras_.size(); i++) { auto &s = paras_[i]; str += i == 0 ? " " + s : ", " + s; @@ -282,9 +282,9 @@ std::string AdminJobSentence::toString() const { } case meta::cpp2::AdminCmd::ZONE_BALANCE: if (paras_.empty()) { - return "SUBMIT JOB BALANCE ZONE"; + return "SUBMIT JOB BALANCE ACROSS ZONE"; } else { - std::string str = "SUBMIT JOB BALANCE ZONE REMOVE"; + std::string str = "SUBMIT JOB BALANCE ACROSS ZONE REMOVE"; for (size_t i = 0; i < paras_.size(); i++) { auto &s = paras_[i]; str += i == 0 ? 
" " + s : ", " + s; diff --git a/src/parser/test/ParserTest.cpp b/src/parser/test/ParserTest.cpp index 20b3d4683ae..8cdf7974f42 100644 --- a/src/parser/test/ParserTest.cpp +++ b/src/parser/test/ParserTest.cpp @@ -3207,10 +3207,10 @@ TEST_F(ParserTest, JobTest) { checkTest("SUBMIT JOB FLUSH 111", "SUBMIT JOB FLUSH 111"); checkTest("SUBMIT JOB STATS", "SUBMIT JOB STATS"); checkTest("SUBMIT JOB STATS 111", "SUBMIT JOB STATS 111"); - checkTest("SUBMIT JOB BALANCE DATA", "SUBMIT JOB BALANCE DATA"); + checkTest("SUBMIT JOB BALANCE IN ZONE", "SUBMIT JOB BALANCE IN ZONE"); checkTest( - "SUBMIT JOB BALANCE DATA REMOVE 192.168.0.1:50000, 192.168.0.1:50001, 192.168.0.1:50002", - "SUBMIT JOB BALANCE DATA REMOVE \"192.168.0.1\":50000, \"192.168.0.1\":50001, " + "SUBMIT JOB BALANCE IN ZONE REMOVE 192.168.0.1:50000, 192.168.0.1:50001, 192.168.0.1:50002", + "SUBMIT JOB BALANCE IN ZONE REMOVE \"192.168.0.1\":50000, \"192.168.0.1\":50001, " "\"192.168.0.1\":50002"); checkTest("SUBMIT JOB BALANCE LEADER", "SUBMIT JOB BALANCE LEADER"); checkTest("SHOW JOBS", "SHOW JOBS"); From 74a23fc6b0e225f62d769bf3e851b2093425220d Mon Sep 17 00:00:00 2001 From: liwenhui-soul <38217397+liwenhui-soul@users.noreply.github.com> Date: Thu, 23 Dec 2021 13:00:57 +0800 Subject: [PATCH 4/6] refactor zone balance --- .../processors/job/ZoneBalanceJobExecutor.cpp | 259 +++++++++--------- .../processors/job/ZoneBalanceJobExecutor.h | 7 + 2 files changed, 142 insertions(+), 124 deletions(-) diff --git a/src/meta/processors/job/ZoneBalanceJobExecutor.cpp b/src/meta/processors/job/ZoneBalanceJobExecutor.cpp index eb3abd1fd24..4059b5b18be 100644 --- a/src/meta/processors/job/ZoneBalanceJobExecutor.cpp +++ b/src/meta/processors/job/ZoneBalanceJobExecutor.cpp @@ -94,6 +94,129 @@ nebula::cpp2::ErrorCode ZoneBalanceJobExecutor::updateMeta() { return ret; } +HostAddr ZoneBalanceJobExecutor::insertPartIntoZone( + std::map>* sortedZoneHosts, Zone* zone, PartitionID partId) { + std::vector& sortedHosts = sortedZoneHosts->operator[](zone->zoneName_); + sortedHosts.front()->parts_.emplace(partId); + zone->partNum_++; + HostAddr ha = sortedHosts.front()->host_; + for (size_t i = 0; i < sortedHosts.size() - 1; i++) { + if (sortedHosts[i]->parts_.size() >= sortedHosts[i + 1]->parts_.size()) { + std::swap(sortedHosts[i], sortedHosts[i + 1]); + } else { + break; + } + } + return ha; +} + +nebula::cpp2::ErrorCode ZoneBalanceJobExecutor::rebalanceActiveZones( + std::vector* sortedActiveZones, + std::map>* sortedZoneHosts, + std::vector* tasks) { + std::vector& sortedActiveZonesRef = *sortedActiveZones; + std::map>& sortedZoneHostsRef = *sortedZoneHosts; + int32_t totalPartNum = 0; + int32_t avgPartNum = 0; + for (auto& z : sortedActiveZonesRef) { + totalPartNum += z->partNum_; + } + if (sortedActiveZonesRef.size() == 0) { + LOG(ERROR) << "rebalance error: no active zones"; + return nebula::cpp2::ErrorCode::E_NO_HOSTS; + } + avgPartNum = totalPartNum / sortedActiveZonesRef.size(); + int32_t remainder = totalPartNum - avgPartNum * sortedActiveZonesRef.size(); + int32_t leftBegin = 0; + int32_t leftEnd = 0; + int32_t rightBegin = 0; + int32_t rightEnd = sortedActiveZonesRef.size(); + for (size_t i = 0; i < sortedActiveZonesRef.size(); i++) { + if (avgPartNum <= sortedActiveZonesRef[i]->partNum_) { + leftEnd = i; + break; + } + } + for (size_t i = leftEnd; i < sortedActiveZonesRef.size(); i++) { + if (avgPartNum < sortedActiveZonesRef[i]->partNum_) { + rightBegin = i; + break; + } + } + auto findZoneToInsert = [&](PartitionID partId, const HostAddr& srcHost) 
-> bool {
+    for (int32_t leftIndex = leftBegin; leftIndex < leftEnd; leftIndex++) {
+      if (!sortedActiveZonesRef[leftIndex]->partExist(partId)) {
+        HostAddr dst = insertPartIntoZone(sortedZoneHosts, sortedActiveZonesRef[leftIndex], partId);
+        tasks->emplace_back(
+            jobId_, spaceInfo_.spaceId_, partId, srcHost, dst, kvstore_, adminClient_);
+        int32_t newLeftIndex = leftIndex;
+        for (; newLeftIndex < leftEnd - 1; newLeftIndex++) {
+          if (sortedActiveZonesRef[newLeftIndex]->partNum_ >
+              sortedActiveZonesRef[newLeftIndex + 1]->partNum_) {
+            std::swap(sortedActiveZonesRef[newLeftIndex], sortedActiveZonesRef[newLeftIndex + 1]);
+          } else {
+            break;
+          }
+        }
+        // if the zone's part count reaches avgPartNum, it can't receive parts any more
+        if (newLeftIndex == leftEnd - 1 &&
+            sortedActiveZonesRef[newLeftIndex]->partNum_ >= avgPartNum) {
+          leftEnd--;
+        }
+        // all zones on the left side have reached avgPartNum, and now some of them will take
+        // avgPartNum+1 if there is still a remainder
+        if (leftBegin == leftEnd) {
+          leftEnd = rightBegin;
+        }
+        return true;
+      }
+    }
+    return false;
+  };
+  for (int32_t right = rightBegin; right < rightEnd;) {
+    Zone* srcZone = sortedActiveZonesRef[right];
+    // if remainder>0, some zones will hold avgPartNum+1 parts; we prioritise choosing zones on
+    // the right side to hold them
+    if (srcZone->partNum_ == avgPartNum + 1 && remainder) {
+      right++;
+      remainder--;
+      continue;
+    }
+    if (srcZone->partNum_ == avgPartNum) {
+      right++;
+      continue;
+    }
+    std::vector<Host*>& sortedHosts = sortedZoneHostsRef[srcZone->zoneName_];
+    int32_t hostIndex = sortedHosts.size() - 1;
+    // to find a part to move, we prioritise moving parts from the host that has the most
+    for (; hostIndex >= 0; hostIndex--) {
+      std::set<PartitionID>& hostParts = sortedHosts[hostIndex]->parts_;
+      PartitionID movePart = -1;
+      for (PartitionID partId : hostParts) {
+        // to find a zone which does not contain the part in the left side to insert
+        bool matched = findZoneToInsert(partId, sortedHosts[hostIndex]->host_);
+        if (matched) {
+          movePart = partId;
+          break;
+        }
+      }
+      if (movePart != -1) {
+        hostParts.erase(movePart);
+        srcZone->partNum_--;
+        break;
+      }
+    }
+    for (int32_t i = hostIndex; i > 0; i--) {
+      if (sortedHosts[i]->parts_.size() <= sortedHosts[i - 1]->parts_.size()) {
+        std::swap(sortedHosts[i], sortedHosts[i - 1]);
+      } else {
+        break;
+      }
+    }
+  }
+  return nebula::cpp2::ErrorCode::SUCCEEDED;
+}
+
 /* first, move the lostZones' parts to the active zones
  * second, make balance for the active zones */
 Status ZoneBalanceJobExecutor::buildBalancePlan() {
@@ -142,23 +265,8 @@ Status ZoneBalanceJobExecutor::buildBalancePlan() {
-  auto insertPartIntoZone = [&sortedZoneHosts](Zone* zone, PartitionID partId) -> HostAddr {
-    std::vector<Host*>& sortedHosts = sortedZoneHosts[zone->zoneName_];
-    sortedHosts.front()->parts_.emplace(partId);
-    zone->partNum_++;
-    HostAddr ha = sortedHosts.front()->host_;
-    for (size_t i = 0; i < sortedHosts.size() - 1; i++) {
-      if (sortedHosts[i]->parts_.size() >= sortedHosts[i + 1]->parts_.size()) {
-        std::swap(sortedHosts[i], sortedHosts[i + 1]);
-      } else {
-        break;
-      }
-    }
-    return ha;
-  };
-
-  auto chooseZoneToInsert = [&insertPartIntoZone,
-                             &sortedActiveZones](PartitionID partId) -> HostAddr {
+  auto chooseZoneToInsert =
+      [this, &sortedActiveZones, &sortedZoneHosts](PartitionID partId) -> HostAddr {
     size_t index = 0;
     for (size_t i = 0; i < sortedActiveZones.size(); i++) {
       if (!sortedActiveZones[i]->partExist(partId))
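The placement constraint that findZoneToInsert enforces above: a part may only move into a zone that does not already hold a replica of it, since the zone scheme keeps at most one replica per zone. The check in isolation (ZoneModel is an invented stand-in for the patch's Zone struct):

#include <set>
#include <vector>

struct ZoneModel {
  std::set<int> parts;  // union of parts on every host in the zone
  int partNum = 0;
};

// Returns the index of the first under-loaded zone in [leftBegin, leftEnd)
// that can legally take partId, or -1 if every candidate already holds it.
int pickZone(const std::vector<ZoneModel>& zones,
             int leftBegin,
             int leftEnd,
             int partId) {
  for (int i = leftBegin; i < leftEnd; i++) {
    if (zones[i].parts.count(partId) == 0) {
      return i;
    }
  }
  return -1;
}

When pickZone returns -1 for every part on the heaviest host, the outer loop above simply tries the next host, which is why a zone can legitimately stay above the average when replica placement leaves no legal destination.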
{ break; } } - HostAddr ha = insertPartIntoZone(sortedActiveZones[index], partId); + HostAddr ha = insertPartIntoZone(&sortedZoneHosts, sortedActiveZones[index], partId); for (size_t i = index; i < sortedActiveZones.size() - 1; i++) { if (sortedActiveZones[i]->partNum_ >= sortedActiveZones[i + 1]->partNum_) { std::swap(sortedActiveZones[i], sortedActiveZones[i + 1]); @@ -181,126 +289,29 @@ Status ZoneBalanceJobExecutor::buildBalancePlan() { for (auto& zoneMapEntry : lostZones) { Zone* zone = zoneMapEntry.second; for (auto& hostMapEntry : zone->hosts_) { - for (PartitionID partId : hostMapEntry.second.parts_) { + const HostAddr& hostAddr = hostMapEntry.first; + Host& host = hostMapEntry.second; + for (PartitionID partId : host.parts_) { HostAddr dst = chooseZoneToInsert(partId); tasks.emplace_back( - jobId_, spaceInfo_.spaceId_, partId, hostMapEntry.first, dst, kvstore_, adminClient_); + jobId_, spaceInfo_.spaceId_, partId, hostAddr, dst, kvstore_, adminClient_); } - hostMapEntry.second.parts_.clear(); + host.parts_.clear(); } zone->calPartNum(); } // all parts of lost zones have moved to active zones, then rebalance the active zones - int32_t totalPartNum = 0; - int32_t avgPartNum = 0; - for (auto& z : sortedActiveZones) { - totalPartNum += z->partNum_; - } - if (sortedActiveZones.size() == 0) { - LOG(ERROR) << "rebalance error: no active zones"; - return {}; - } - avgPartNum = totalPartNum / sortedActiveZones.size(); - int32_t remainder = totalPartNum - avgPartNum * sortedActiveZones.size(); - int32_t leftBegin = 0; - int32_t leftEnd = 0; - int32_t rightBegin = 0; - int32_t rightEnd = sortedActiveZones.size(); - for (size_t i = 0; i < sortedActiveZones.size(); i++) { - if (avgPartNum <= sortedActiveZones[i]->partNum_) { - leftEnd = i; - break; - } - } - for (size_t i = leftEnd; i < sortedActiveZones.size(); i++) { - if (avgPartNum < sortedActiveZones[i]->partNum_) { - rightBegin = i; - break; - } - } - for (int32_t right = rightBegin; right < rightEnd;) { - Zone* srcZone = sortedActiveZones[right]; - // if remainder>0 some zones will hold avgPartNum+1 patrs, we prioritise taking the right side - // zones to hold them - if (srcZone->partNum_ == avgPartNum + 1 && remainder) { - right++; - remainder--; - continue; - } - if (srcZone->partNum_ == avgPartNum) { - right++; - continue; - } - std::vector& sortedHosts = sortedZoneHosts[srcZone->zoneName_]; - int32_t hostIndex = sortedHosts.size() - 1; - // to find a part to move,we prioritise moving parts from who has the most - for (; hostIndex >= 0; hostIndex--) { - std::set& hostParts = sortedHosts[hostIndex]->parts_; - PartitionID movePart = -1; - for (PartitionID partId : hostParts) { - bool matched = false; - // to find a zone which does not contain the part in the left side to insert - for (int32_t leftIndex = leftBegin; leftIndex < leftEnd; leftIndex++) { - if (!sortedActiveZones[leftIndex]->partExist(partId)) { - HostAddr dst = insertPartIntoZone(sortedActiveZones[leftIndex], partId); - tasks.emplace_back(jobId_, - spaceInfo_.spaceId_, - partId, - sortedHosts[hostIndex]->host_, - dst, - kvstore_, - adminClient_); - movePart = partId; - int32_t newLeftIndex = leftIndex; - for (; newLeftIndex < leftEnd - 1; newLeftIndex++) { - if (sortedActiveZones[newLeftIndex]->partNum_ > - sortedActiveZones[newLeftIndex + 1]->partNum_) { - std::swap(sortedActiveZones[newLeftIndex], sortedActiveZones[newLeftIndex + 1]); - } else { - break; - } - } - // if the zone's part reach the avgPartNum,it can't recieve parts any more - if (newLeftIndex == leftEnd - 1 
&& - sortedActiveZones[newLeftIndex]->partNum_ >= avgPartNum) { - leftEnd--; - } - // all zones in left side have reached avgPartNum,and now some of them will take - // avgPartNum+1 if there still has remainder - if (leftBegin == leftEnd) { - leftEnd = rightBegin; - } - matched = true; - break; - } - } - if (matched) { - break; - } - } - if (movePart != -1) { - hostParts.erase(movePart); - srcZone->partNum_--; - break; - } - } - for (int32_t i = hostIndex; i > 0; i--) { - if (sortedHosts[i]->parts_.size() <= sortedHosts[i - 1]->parts_.size()) { - std::swap(sortedHosts[i], sortedHosts[i - 1]); - } else { - break; - } - } - } - if (tasks.empty()) { + nebula::cpp2::ErrorCode rc = rebalanceActiveZones(&sortedActiveZones, &sortedZoneHosts, &tasks); + + if (tasks.empty() || rc != nebula::cpp2::ErrorCode::SUCCEEDED) { return Status::Balanced(); } plan_.reset(new BalancePlan(jobDescription_, kvstore_, adminClient_)); for (BalanceTask& task : tasks) { plan_->addTask(std::move(task)); } - nebula::cpp2::ErrorCode rc = plan_->saveInStore(); + rc = plan_->saveInStore(); if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) { return Status::Error("save balance zone plan failed"); } diff --git a/src/meta/processors/job/ZoneBalanceJobExecutor.h b/src/meta/processors/job/ZoneBalanceJobExecutor.h index 14df98e62ce..798675191b5 100644 --- a/src/meta/processors/job/ZoneBalanceJobExecutor.h +++ b/src/meta/processors/job/ZoneBalanceJobExecutor.h @@ -32,6 +32,13 @@ class ZoneBalanceJobExecutor : public BalanceJobExecutor { folly::Future executeInternal() override; Status buildBalancePlan() override; nebula::cpp2::ErrorCode updateMeta(); + HostAddr insertPartIntoZone(std::map>* sortedZoneHosts, + Zone* zone, + PartitionID partId); + nebula::cpp2::ErrorCode rebalanceActiveZones( + std::vector* sortedActiveZones, + std::map>* sortedZoneHosts, + std::vector* tasks); private: std::vector lostZones_; From 372fefb527f47ecefbf60c1ed454d7682d405be4 Mon Sep 17 00:00:00 2001 From: liwenhui-soul <38217397+liwenhui-soul@users.noreply.github.com> Date: Fri, 24 Dec 2021 13:34:33 +0800 Subject: [PATCH 5/6] fix balance parser --- src/parser/parser.yy | 54 +++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/src/parser/parser.yy b/src/parser/parser.yy index 55e62f19b67..9b5f730b8b2 100644 --- a/src/parser/parser.yy +++ b/src/parser/parser.yy @@ -3747,30 +3747,42 @@ integer_list balance_sentence : KW_BALANCE KW_LEADER { - auto sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::ADD, - meta::cpp2::AdminCmd::LEADER_BALANCE); - $$ = sentence; + auto sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::ADD, + meta::cpp2::AdminCmd::LEADER_BALANCE); + $$ = sentence; + } + | + KW_BALANCE KW_IN KW_ZONE { + auto sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::ADD, + meta::cpp2::AdminCmd::DATA_BALANCE); + $$ = sentence; } - | KW_BALANCE KW_DATA { - auto sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::ADD, - meta::cpp2::AdminCmd::DATA_BALANCE); - $$ = sentence; + | KW_BALANCE KW_IN KW_ZONE KW_REMOVE host_list { + auto sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::ADD, + meta::cpp2::AdminCmd::DATA_BALANCE); + HostList* hl = $5; + std::vector has = hl->hosts(); + for (HostAddr& ha: has) { + sentence->addPara(ha.toString()); + } + delete hl; + $$ = sentence; } - | KW_BALANCE KW_DATA legal_integer { - auto sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::SHOW); - sentence->addPara(std::to_string($3)); - $$ = sentence; + | KW_BALANCE KW_ACROSS KW_ZONE { + auto 
sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::ADD, + meta::cpp2::AdminCmd::ZONE_BALANCE); + $$ = sentence; } - | KW_BALANCE KW_DATA KW_REMOVE host_list { - auto sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::ADD, - meta::cpp2::AdminCmd::DATA_BALANCE); - HostList* hl = $4; - std::vector has = hl->hosts(); - for (HostAddr& ha: has) { - sentence->addPara(ha.toString()); - } - delete hl; - $$ = sentence; + | KW_BALANCE KW_ACROSS KW_ZONE KW_REMOVE zone_name_list { + auto sentence = new AdminJobSentence(meta::cpp2::AdminJobOp::ADD, + meta::cpp2::AdminCmd::ZONE_BALANCE); + ZoneNameList* nl = $5; + std::vector names = nl->zoneNames(); + for (std::string& name: names) { + sentence->addPara(name); + } + delete nl; + $$ = sentence; } ; From 6f5646eb5f6b62979a13950359c2228bdd4c88b7 Mon Sep 17 00:00:00 2001 From: liwenhui-soul <38217397+liwenhui-soul@users.noreply.github.com> Date: Mon, 27 Dec 2021 12:31:53 +0800 Subject: [PATCH 6/6] fix rebase conflict --- src/clients/meta/MetaClient.cpp | 6 ++-- src/graph/planner/plan/Admin.h | 12 +++++-- src/graph/validator/AdminValidator.cpp | 8 +++-- src/meta/processors/job/BalanceJobExecutor.h | 10 ++++-- src/meta/processors/job/BalancePlan.cpp | 2 +- .../processors/job/DataBalanceJobExecutor.cpp | 20 +++++++----- src/meta/processors/job/MetaJobExecutor.cpp | 32 ++++++++++++++----- src/meta/processors/job/StorageJobExecutor.h | 28 ++++++++++++---- .../processors/job/ZoneBalanceJobExecutor.cpp | 2 +- .../processors/parts/AlterSpaceProcessor.cpp | 6 +++- src/meta/test/BalancerTest.cpp | 2 +- src/meta/test/GetStatsTest.cpp | 10 +++--- src/meta/test/TestUtils.h | 8 ++--- src/parser/AdminSentences.h | 16 +++++++--- src/parser/test/ParserTest.cpp | 8 ++--- 15 files changed, 116 insertions(+), 54 deletions(-) diff --git a/src/clients/meta/MetaClient.cpp b/src/clients/meta/MetaClient.cpp index 7a6760aff39..5d03a17f95c 100644 --- a/src/clients/meta/MetaClient.cpp +++ b/src/clients/meta/MetaClient.cpp @@ -1217,9 +1217,9 @@ folly::Future> MetaClient::alterSpace(const std::string& spaceNam meta::cpp2::AlterSpaceOp op, const std::vector& paras) { cpp2::AlterSpaceReq req; - req.set_op(op); - req.set_space_name(spaceName); - req.set_paras(paras); + req.op_ref() = op; + req.space_name_ref() = spaceName; + req.paras_ref() = paras; folly::Promise> promise; auto future = promise.getFuture(); getResponse( diff --git a/src/graph/planner/plan/Admin.h b/src/graph/planner/plan/Admin.h index bd1299e555d..0e6536587e9 100644 --- a/src/graph/planner/plan/Admin.h +++ b/src/graph/planner/plan/Admin.h @@ -225,11 +225,17 @@ class AlterSpace final : public SingleDependencyNode { const std::vector& paras) { return qctx->objPool()->add(new AlterSpace(qctx, input, spaceName, op, paras)); } - const std::string& getSpaceName() const { return spaceName_; } + const std::string& getSpaceName() const { + return spaceName_; + } - meta::cpp2::AlterSpaceOp getAlterSpaceOp() const { return op_; } + meta::cpp2::AlterSpaceOp getAlterSpaceOp() const { + return op_; + } - const std::vector& getParas() const { return paras_; } + const std::vector& getParas() const { + return paras_; + } private: AlterSpace(QueryContext* qctx, diff --git a/src/graph/validator/AdminValidator.cpp b/src/graph/validator/AdminValidator.cpp index ff2f578e438..5afcdb2a925 100644 --- a/src/graph/validator/AdminValidator.cpp +++ b/src/graph/validator/AdminValidator.cpp @@ -163,7 +163,9 @@ Status CreateSpaceAsValidator::toPlan() { return Status::OK(); } -Status AlterSpaceValidator::validateImpl() { return Status::OK(); } 
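For reference on the grammar reworked earlier in this patch, the accepted sentence forms written as round-trip checks in the style of src/parser/test/ParserTest.cpp; this assumes the ParserTest fixture and its checkTest(query, expected) helper, and the zone names are made up:

TEST_F(ParserTest, BalanceZoneSyntaxSketch) {
  checkTest("SUBMIT JOB BALANCE LEADER", "SUBMIT JOB BALANCE LEADER");
  checkTest("SUBMIT JOB BALANCE IN ZONE", "SUBMIT JOB BALANCE IN ZONE");
  checkTest("SUBMIT JOB BALANCE IN ZONE REMOVE 192.168.0.1:50000",
            "SUBMIT JOB BALANCE IN ZONE REMOVE \"192.168.0.1\":50000");
  checkTest("SUBMIT JOB BALANCE ACROSS ZONE", "SUBMIT JOB BALANCE ACROSS ZONE");
  checkTest("SUBMIT JOB BALANCE ACROSS ZONE REMOVE zone_0, zone_1",
            "SUBMIT JOB BALANCE ACROSS ZONE REMOVE zone_0, zone_1");
}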
+Status AlterSpaceValidator::validateImpl() { + return Status::OK(); +} Status AlterSpaceValidator::toPlan() { auto sentence = static_cast(sentence_); @@ -174,7 +176,9 @@ Status AlterSpaceValidator::toPlan() { return Status::OK(); } -Status DescSpaceValidator::validateImpl() { return Status::OK(); } +Status DescSpaceValidator::validateImpl() { + return Status::OK(); +} Status DescSpaceValidator::toPlan() { auto sentence = static_cast(sentence_); diff --git a/src/meta/processors/job/BalanceJobExecutor.h b/src/meta/processors/job/BalanceJobExecutor.h index 639b6a6f41c..a284c3b6485 100644 --- a/src/meta/processors/job/BalanceJobExecutor.h +++ b/src/meta/processors/job/BalanceJobExecutor.h @@ -23,10 +23,13 @@ struct Host { HostAddr host_; std::set parts_; }; + struct Zone { Zone() = default; explicit Zone(const std::string name) : zoneName_(name) {} - bool hasHost(const HostAddr& ha) { return hosts_.find(ha) != hosts_.end(); } + bool hasHost(const HostAddr& ha) { + return hosts_.find(ha) != hosts_.end(); + } int32_t calPartNum(); bool partExist(PartitionID partId); @@ -34,6 +37,7 @@ struct Zone { std::map hosts_; int32_t partNum_; }; + struct SpaceInfo { nebula::cpp2::ErrorCode loadInfo(GraphSpaceID spaceId, kvstore::KVStore* kvstore); bool hasHost(const HostAddr& ha); @@ -65,7 +69,9 @@ class BalanceJobExecutor : public MetaJobExecutor { protected: nebula::cpp2::ErrorCode save(const std::string& k, const std::string& v); - virtual Status buildBalancePlan() { return Status::OK(); } + virtual Status buildBalancePlan() { + return Status::OK(); + } protected: std::unique_ptr plan_; diff --git a/src/meta/processors/job/BalancePlan.cpp b/src/meta/processors/job/BalancePlan.cpp index 4378802dd88..84ed777e57f 100644 --- a/src/meta/processors/job/BalancePlan.cpp +++ b/src/meta/processors/job/BalancePlan.cpp @@ -230,7 +230,7 @@ ErrorOr> BalancePlan::getBalan task.startTimeMs_ = std::get<2>(tup); task.endTimeMs_ = std::get<3>(tup); if (resume && task.ret_ != BalanceTaskResult::SUCCEEDED) { - // Resume the failed task, skip the in-progress and invalid tasks + // Resume the failed or invalid task, skip the in-progress tasks if (task.ret_ == BalanceTaskResult::FAILED || task.ret_ == BalanceTaskResult::INVALID) { task.ret_ = BalanceTaskResult::IN_PROGRESS; } diff --git a/src/meta/processors/job/DataBalanceJobExecutor.cpp b/src/meta/processors/job/DataBalanceJobExecutor.cpp index 8826b656e61..d125bb2ca25 100644 --- a/src/meta/processors/job/DataBalanceJobExecutor.cpp +++ b/src/meta/processors/job/DataBalanceJobExecutor.cpp @@ -36,8 +36,10 @@ Status DataBalanceJobExecutor::buildBalancePlan() { std::map> lostZoneHost; std::map> activeSortedHost; for (auto& zoneMapEntry : spaceInfo_.zones_) { - for (auto& hostMapEntry : zoneMapEntry.second.hosts_) { - activeSortedHost[zoneMapEntry.first].push_back(&hostMapEntry.second); + const auto& zoneName = zoneMapEntry.first; + auto& zone = zoneMapEntry.second; + for (auto& [addr, host] : zone.hosts_) { + activeSortedHost[zoneName].push_back(&host); } } for (HostAddr host : lostHosts_) { @@ -64,10 +66,12 @@ Status DataBalanceJobExecutor::buildBalancePlan() { plan_.reset(new BalancePlan(jobDescription_, kvstore_, adminClient_)); // move parts of lost hosts to active hosts in the same zone for (auto& zoneHostEntry : lostZoneHost) { - std::vector& hvec = activeSortedHost[zoneHostEntry.first]; - for (Host* host : zoneHostEntry.second) { + const std::string& zoneName = zoneHostEntry.first; + std::vector& lostHostVec = zoneHostEntry.second; + std::vector& activeVec = 
+    for (Host* host : lostHostVec) {
       for (PartitionID partId : host->parts_) {
-        Host* dstHost = hvec.front();
+        Host* dstHost = activeVec.front();
         dstHost->parts_.insert(partId);
         plan_->addTask(BalanceTask(jobId_,
                                    spaceInfo_.spaceId_,
@@ -76,9 +80,9 @@ Status DataBalanceJobExecutor::buildBalancePlan() {
                                    dstHost->host_,
                                    kvstore_,
                                    adminClient_));
-        for (size_t i = 0; i < hvec.size() - 1; i++) {
-          if (hvec[i]->parts_.size() > hvec[i + 1]->parts_.size()) {
-            std::swap(hvec[i], hvec[i + 1]);
+        for (size_t i = 0; i < activeVec.size() - 1; i++) {
+          if (activeVec[i]->parts_.size() > activeVec[i + 1]->parts_.size()) {
+            std::swap(activeVec[i], activeVec[i + 1]);
           } else {
             break;
           }
diff --git a/src/meta/processors/job/MetaJobExecutor.cpp b/src/meta/processors/job/MetaJobExecutor.cpp
index dc3cc4e8d7d..643aaa20921 100644
--- a/src/meta/processors/job/MetaJobExecutor.cpp
+++ b/src/meta/processors/job/MetaJobExecutor.cpp
@@ -12,10 +12,14 @@ DECLARE_uint32(expired_time_factor);
 namespace nebula {
 namespace meta {
 
-bool MetaJobExecutor::check() { return true; }
+bool MetaJobExecutor::check() {
+  return true;
+}
 
 // Prepare the Job info from the arguments.
-nebula::cpp2::ErrorCode MetaJobExecutor::prepare() { return nebula::cpp2::ErrorCode::SUCCEEDED; }
+nebula::cpp2::ErrorCode MetaJobExecutor::prepare() {
+  return nebula::cpp2::ErrorCode::SUCCEEDED;
+}
 
 // The skeleton to run the job.
 // You should rewrite the executeInternal to trigger the calling.
@@ -31,15 +35,25 @@ nebula::cpp2::ErrorCode MetaJobExecutor::execute() {
 }
 
 // Stop the job when the user cancel it.
-nebula::cpp2::ErrorCode MetaJobExecutor::stop() { return nebula::cpp2::ErrorCode::SUCCEEDED; }
+nebula::cpp2::ErrorCode MetaJobExecutor::stop() {
+  return nebula::cpp2::ErrorCode::SUCCEEDED;
+}
 
-nebula::cpp2::ErrorCode MetaJobExecutor::finish(bool) { return nebula::cpp2::ErrorCode::SUCCEEDED; }
+nebula::cpp2::ErrorCode MetaJobExecutor::finish(bool) {
+  return nebula::cpp2::ErrorCode::SUCCEEDED;
+}
 
-void MetaJobExecutor::setSpaceId(GraphSpaceID spaceId) { space_ = spaceId; }
+void MetaJobExecutor::setSpaceId(GraphSpaceID spaceId) {
+  space_ = spaceId;
+}
 
-bool MetaJobExecutor::isMetaJob() { return true; }
+bool MetaJobExecutor::isMetaJob() {
+  return true;
+}
 
-nebula::cpp2::ErrorCode MetaJobExecutor::recovery() { return nebula::cpp2::ErrorCode::SUCCEEDED; }
+nebula::cpp2::ErrorCode MetaJobExecutor::recovery() {
+  return nebula::cpp2::ErrorCode::SUCCEEDED;
+}
 
 void MetaJobExecutor::setFinishCallBack(
     std::function<nebula::cpp2::ErrorCode(meta::cpp2::JobStatus)> func) {
@@ -50,7 +64,9 @@ nebula::cpp2::ErrorCode MetaJobExecutor::saveSpecialTaskStatus(const cpp2::Repor
   return nebula::cpp2::ErrorCode::SUCCEEDED;
 }
 
-folly::Future<Status> MetaJobExecutor::executeInternal() { return Status::OK(); }
+folly::Future<Status> MetaJobExecutor::executeInternal() {
+  return Status::OK();
+}
 
 }  // namespace meta
 }  // namespace nebula
diff --git a/src/meta/processors/job/StorageJobExecutor.h b/src/meta/processors/job/StorageJobExecutor.h
index 9bb9c5bf07e..c5eb0d32170 100644
--- a/src/meta/processors/job/StorageJobExecutor.h
+++ b/src/meta/processors/job/StorageJobExecutor.h
@@ -33,10 +33,14 @@ class StorageJobExecutor : public JobExecutor {
   virtual ~StorageJobExecutor() = default;
 
   // Check the arguments about the job.
-  bool check() override { return true; }
+  bool check() override {
+    return true;
+  }
 
   // Prepare the Job info from the arguments.
-  nebula::cpp2::ErrorCode prepare() override { return nebula::cpp2::ErrorCode::SUCCEEDED; }
+  nebula::cpp2::ErrorCode prepare() override {
+    return nebula::cpp2::ErrorCode::SUCCEEDED;
+  }
 
   // The skeleton to run the job.
   // You should rewrite the executeInternal to trigger the calling.
@@ -45,19 +49,29 @@ class StorageJobExecutor : public JobExecutor {
   void interruptExecution(JobID jobId);
 
   // Stop the job when the user cancel it.
-  nebula::cpp2::ErrorCode stop() override { return nebula::cpp2::ErrorCode::SUCCEEDED; }
+  nebula::cpp2::ErrorCode stop() override {
+    return nebula::cpp2::ErrorCode::SUCCEEDED;
+  }
 
-  nebula::cpp2::ErrorCode finish(bool) override { return nebula::cpp2::ErrorCode::SUCCEEDED; }
+  nebula::cpp2::ErrorCode finish(bool) override {
+    return nebula::cpp2::ErrorCode::SUCCEEDED;
+  }
 
-  void setSpaceId(GraphSpaceID spaceId) override { space_ = spaceId; }
+  void setSpaceId(GraphSpaceID spaceId) override {
+    space_ = spaceId;
+  }
 
   nebula::cpp2::ErrorCode saveSpecialTaskStatus(const cpp2::ReportTaskReq&) override {
     return nebula::cpp2::ErrorCode::SUCCEEDED;
   }
 
-  bool isMetaJob() override { return false; }
+  bool isMetaJob() override {
+    return false;
+  }
 
-  nebula::cpp2::ErrorCode recovery() override { return nebula::cpp2::ErrorCode::SUCCEEDED; }
+  nebula::cpp2::ErrorCode recovery() override {
+    return nebula::cpp2::ErrorCode::SUCCEEDED;
+  }
 
  protected:
   ErrOrHosts getTargetHost(GraphSpaceID space);
diff --git a/src/meta/processors/job/ZoneBalanceJobExecutor.cpp b/src/meta/processors/job/ZoneBalanceJobExecutor.cpp
index 4059b5b18be..0d1833247ce 100644
--- a/src/meta/processors/job/ZoneBalanceJobExecutor.cpp
+++ b/src/meta/processors/job/ZoneBalanceJobExecutor.cpp
@@ -73,7 +73,7 @@ nebula::cpp2::ErrorCode ZoneBalanceJobExecutor::updateMeta() {
   for (auto& zoneMapEntry : spaceInfo_.zones_) {
     zones.emplace_back(zoneMapEntry.first);
   }
-  properties.set_zone_names(std::move(zones));
+  properties.zone_names_ref() = std::move(zones);
   std::vector<kvstore::KV> data;
   data.emplace_back(MetaKeyUtils::spaceKey(spaceInfo_.spaceId_),
                     MetaKeyUtils::spaceVal(properties));
diff --git a/src/meta/processors/parts/AlterSpaceProcessor.cpp b/src/meta/processors/parts/AlterSpaceProcessor.cpp
index 090df5f93fe..d89f1590f68 100644
--- a/src/meta/processors/parts/AlterSpaceProcessor.cpp
+++ b/src/meta/processors/parts/AlterSpaceProcessor.cpp
@@ -47,6 +47,7 @@ nebula::cpp2::ErrorCode AlterSpaceProcessor::addZones(const std::string& spaceNa
   meta::cpp2::SpaceDesc properties = MetaKeyUtils::parseSpace(spaceVal);
   const std::vector<std::string>& curZones = properties.get_zone_names();
   std::set<std::string> zm(curZones.begin(), curZones.end());
+  // zone_list may have duplicate zones
   std::set<std::string> distinctZones(zones.begin(), zones.end());
   std::vector<std::string> newZones = curZones;
   newZones.reserve(curZones.size() + distinctZones.size());
@@ -60,12 +61,15 @@ nebula::cpp2::ErrorCode AlterSpaceProcessor::addZones(const std::string& spaceNa
                  ? nebula::cpp2::ErrorCode::E_ZONE_NOT_FOUND
                  : zoneRet;
     }
+    // if zone_list has a zone that already exists in the current space, return an error
     if (zm.count(z)) {
       return nebula::cpp2::ErrorCode::E_CONFLICT;
     }
     newZones.emplace_back(z);
   }
-  properties.set_zone_names(newZones);
+
+  // update the zone list, then persist it to the kvstore
+  properties.zone_names_ref() = newZones;
   std::vector<kvstore::KV> data;
   data.emplace_back(MetaKeyUtils::spaceKey(spaceId), MetaKeyUtils::spaceVal(properties));
   folly::Baton<true, std::atomic> baton;
diff --git a/src/meta/test/BalancerTest.cpp b/src/meta/test/BalancerTest.cpp
index 8edb86eb22c..340e8f3260e 100644
--- a/src/meta/test/BalancerTest.cpp
+++ b/src/meta/test/BalancerTest.cpp
@@ -1285,7 +1285,7 @@ TEST(BalanceTest, LeaderBalanceWithLargerZoneTest) {
   }
 }
 
-TEST(BalanceTest, DISABLED_LeaderBalanceWithComplexZoneTest) {
+TEST(BalanceTest, LeaderBalanceWithComplexZoneTest) {
   fs::TempDir rootPath("/tmp/LeaderBalanceWithComplexZoneTest.XXXXXX");
   auto store = MockCluster::initMetaKV(rootPath.path());
   auto* kv = dynamic_cast<kvstore::NebulaStore*>(store.get());
diff --git a/src/meta/test/GetStatsTest.cpp b/src/meta/test/GetStatsTest.cpp
index 8cb7a3e1454..aa6ce2e3da4 100644
--- a/src/meta/test/GetStatsTest.cpp
+++ b/src/meta/test/GetStatsTest.cpp
@@ -62,11 +62,11 @@ struct JobCallBack {
     req.task_id_ref() = taskId_;
 
     cpp2::StatsItem item;
-    item.set_tag_vertices({{"t1", n_}, {"t2", n_}});
-    item.set_edges({{"e1", n_}, {"e2", n_}});
-    item.set_space_vertices(2 * n_);
-    item.set_space_edges(2 * n_);
-    req.set_stats(item);
+    item.tag_vertices_ref() = {{"t1", n_}, {"t2", n_}};
+    item.edges_ref() = {{"e1", n_}, {"e2", n_}};
+    item.space_vertices_ref() = 2 * n_;
+    item.space_edges_ref() = 2 * n_;
+    req.stats_ref() = item;
     jobMgr_->muJobFinished_.unlock();
     jobMgr_->reportTaskFinish(req);
     return folly::Future<nebula::Status>(Status::OK());
diff --git a/src/meta/test/TestUtils.h b/src/meta/test/TestUtils.h
index 7c09ab0646d..ba95e4bcbf7 100644
--- a/src/meta/test/TestUtils.h
+++ b/src/meta/test/TestUtils.h
@@ -215,9 +215,9 @@ class TestUtils {
                                  int32_t zoneNum,
                                  int32_t totalHost) {
     cpp2::SpaceDesc properties;
-    properties.set_space_name("test_space");
-    properties.set_partition_num(partitionNum);
-    properties.set_replica_factor(replica);
+    properties.space_name_ref() = "test_space";
+    properties.partition_num_ref() = partitionNum;
+    properties.replica_factor_ref() = replica;
     auto spaceVal = MetaKeyUtils::spaceVal(properties);
     std::vector<kvstore::KV> data;
     data.emplace_back(MetaKeyUtils::indexSpaceKey("test_space"),
@@ -230,7 +230,7 @@ class TestUtils {
       zonePartNum[std::to_string(i + 1)] = 0;
       zoneNames.push_back(std::to_string(i + 1));
     }
-    properties.set_zone_names(zoneNames);
+    properties.zone_names_ref() = zoneNames;
     data.emplace_back(MetaKeyUtils::spaceKey(id), MetaKeyUtils::spaceVal(properties));
     std::vector<HostAddr> allHosts;
     for (int32_t i = 0; i < totalHost; i++) {
diff --git a/src/parser/AdminSentences.h b/src/parser/AdminSentences.h
index 3e125b83026..812fe6f4d1e 100644
--- a/src/parser/AdminSentences.h
+++ b/src/parser/AdminSentences.h
@@ -445,13 +445,21 @@ class AlterSpaceSentence final : public Sentence {
       : op_(op), spaceName_(spaceName) {
     kind_ = Kind::kAlterSpace;
   }
-  void addPara(const std::string& para) { paras_.push_back(para); }
+  void addPara(const std::string& para) {
+    paras_.push_back(para);
+  }
 
-  std::string spaceName() const { return *spaceName_; }
+  std::string spaceName() const {
+    return *spaceName_;
+  }
 
-  const std::vector<std::string>& paras() const { return paras_; }
+  const std::vector<std::string>& paras() const {
+    return paras_;
+  }
 
-  meta::cpp2::AlterSpaceOp alterSpaceOp() const { return op_; }
+  meta::cpp2::AlterSpaceOp alterSpaceOp() const {
+    return op_;
+  }
 
   std::string toString() const override;
 
diff --git a/src/parser/test/ParserTest.cpp b/src/parser/test/ParserTest.cpp
index 8cdf7974f42..a40726b4525 100644
--- a/src/parser/test/ParserTest.cpp
+++ b/src/parser/test/ParserTest.cpp
@@ -2016,22 +2016,22 @@ TEST_F(ParserTest, BalanceOperation) {
     ASSERT_TRUE(result.ok()) << result.status();
   }
   {
-    std::string query = "BALANCE DATA";
+    std::string query = "BALANCE IN ZONE";
     auto result = parse(query);
     ASSERT_TRUE(result.ok()) << result.status();
   }
   {
-    std::string query = "BALANCE DATA 1234567890";
+    std::string query = "BALANCE ACROSS ZONE";
     auto result = parse(query);
     ASSERT_TRUE(result.ok()) << result.status();
   }
   {
-    std::string query = "BALANCE DATA REMOVE 192.168.0.1:50000,192.168.0.1:50001";
+    std::string query = "BALANCE IN ZONE REMOVE 192.168.0.1:50000,192.168.0.1:50001";
     auto result = parse(query);
     ASSERT_TRUE(result.ok()) << result.status();
   }
   {
-    std::string query = "BALANCE DATA REMOVE 192.168.0.1:50000,\"localhost\":50001";
+    std::string query = "BALANCE IN ZONE REMOVE 192.168.0.1:50000,\"localhost\":50001";
     auto result = parse(query);
     ASSERT_TRUE(result.ok()) << result.status();
  }
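A closing note on the plan builder touched in DataBalanceJobExecutor.cpp above:
activeSortedHost keeps each zone's hosts ordered by ascending part count, so the
least-loaded host always sits at the front; after a part from a lost host is
reassigned, a single bubble pass restores the ordering instead of a full re-sort.
The standalone C++ sketch below illustrates that invariant only -- the simplified
Host struct, the assignPart helper, and the sample data are illustrative stand-ins,
not the patch's actual types.

#include <cstddef>
#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>

// Simplified stand-in for the balancer's Host: an address plus owned parts.
struct Host {
  std::string addr;
  std::set<int> parts;
};

// Assign one part to the least-loaded host, then bubble that host back into
// ascending order by part count -- the same single-pass fix-up used above.
void assignPart(std::vector<Host*>& activeVec, int partId) {
  Host* dst = activeVec.front();  // least-loaded host is kept at the front
  dst->parts.insert(partId);
  for (size_t i = 0; i + 1 < activeVec.size(); i++) {
    if (activeVec[i]->parts.size() > activeVec[i + 1]->parts.size()) {
      std::swap(activeVec[i], activeVec[i + 1]);
    } else {
      break;  // only the front host grew, so one forward pass suffices
    }
  }
}

int main() {
  Host a{"h1", {1, 2}}, b{"h2", {3}}, c{"h3", {4, 5, 6}};
  std::vector<Host*> vec{&b, &a, &c};  // ascending part count: h2, h1, h3
  assignPart(vec, 7);                  // h2 takes part 7, then sinks as needed
  for (Host* h : vec) {
    std::cout << h->addr << " owns " << h->parts.size() << " parts\n";
  }
  return 0;
}

Because only the front element's part count changes, the single forward pass keeps
each reassignment O(n) in the number of hosts per zone, which is why the balancer
can afford to re-establish the sort order after every individual part move.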