storage: Avoid replica thrashing when localities are different sizes
Fixes #20241

Release note (bug fix): avoid rebalance thrashing when localities have
very different numbers of nodes
a-robinson committed Dec 20, 2017
1 parent 06f7d76 commit f01eb24
Showing 3 changed files with 120 additions and 4 deletions.
51 changes: 48 additions & 3 deletions pkg/storage/allocator.go
@@ -516,7 +516,9 @@ func (a Allocator) RebalanceTarget(
if target == nil {
return nil, ""
}
// We could make a simulation here to verify whether we'll remove the target we'll rebalance to.

// Determine whether we'll just remove the target immediately after adding it.
// If we would, we don't want to actually do the rebalance.
for len(candidates) > 0 {
newReplica := roachpb.ReplicaDescriptor{
NodeID: target.store.Node.NodeID,
@@ -539,12 +541,17 @@
raftStatus, desc.Replicas, newReplica.ReplicaID)
}

removeReplica, _, err := a.simulateRemoveTarget(ctx, target.store.StoreID, constraints, replicaCandidates, rangeInfo)
removeReplica, details, err := a.simulateRemoveTarget(
ctx,
target.store.StoreID,
constraints,
replicaCandidates,
rangeInfo)
if err != nil {
log.Warningf(ctx, "simulating RemoveTarget failed: %s", err)
return nil, ""
}
if removeReplica.StoreID != target.store.StoreID {
if shouldRebalanceBetween(ctx, a.storePool.st, *target, removeReplica, existingCandidates, details) {
break
}
// Remove the considered target from our modified RangeDescriptor and from
@@ -568,6 +575,44 @@ func (a Allocator) RebalanceTarget(
return &target.store, string(details)
}

// shouldRebalanceBetween returns whether it's a good idea to rebalance to the
// given `add` candidate if the replica that will be removed after adding it is
// `remove`. This is a last failsafe to ensure that we don't take unnecessary
// rebalance actions that cause thrashing.
func shouldRebalanceBetween(
ctx context.Context,
st *cluster.Settings,
add candidate,
remove roachpb.ReplicaDescriptor,
existingCandidates candidateList,
removeDetails string,
) bool {
if remove.StoreID == add.store.StoreID {
log.VEventf(ctx, 2, "not rebalancing to s%d because we'd immediately remove it: %s",
add.store.StoreID, removeDetails)
return false
}

// It's possible that we initially decided to rebalance based on comparing
// rebalance candidates in one locality to an existing replica in another
// locality (e.g. if one locality has many more nodes than another). This can
// make for unnecessary rebalances and even thrashing, so do a more direct
// comparison here of the replicas we'll actually be adding and removing.
for _, removeCandidate := range existingCandidates {
if removeCandidate.store.StoreID == remove.StoreID {
if removeCandidate.worthRebalancingTo(st, add) {
return true
}
log.VEventf(ctx, 2, "not rebalancing to %s because it isn't an improvement over "+
"what we'd remove after adding it: %s", add, removeCandidate)
return false
}
}
// If the code reaches this point, remove must be a non-live store, so let the
// rebalance happen.
return true
}

// TransferLeaseTarget returns a suitable replica to transfer the range lease
// to from the provided list. It excludes the current lease holder replica
// unless asked to do otherwise by the checkTransferLeaseSource parameter.
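To make the shape of the new failsafe concrete, here is a minimal, self-contained Go sketch of the same add-then-simulate-removal check. It is not the allocator's real API: the fakeCandidate struct, the store IDs, and the two-range gap standing in for worthRebalancingTo are all invented for illustration.

package main

import "fmt"

// fakeCandidate is a hypothetical stand-in for the allocator's candidate type.
type fakeCandidate struct {
	storeID    int
	rangeCount int
}

// shouldRebalance mirrors the shape of the failsafe above: given the store we
// want to add and the store a simulated removal says we would drop right
// afterwards, only go ahead if they differ and the store we'd drop is a
// meaningfully worse fit than the one we'd add.
func shouldRebalance(add, simulatedRemove fakeCandidate) bool {
	if simulatedRemove.storeID == add.storeID {
		// We'd immediately remove the replica we just added, so skip it.
		return false
	}
	// Stand-in for worthRebalancingTo: demand a gap of more than two ranges.
	return simulatedRemove.rangeCount > add.rangeCount+2
}

func main() {
	add := fakeCandidate{storeID: 4, rangeCount: 40}
	fmt.Println(shouldRebalance(add, fakeCandidate{storeID: 4, rangeCount: 40})) // false: we'd drop the replica we just added
	fmt.Println(shouldRebalance(add, fakeCandidate{storeID: 2, rangeCount: 41})) // false: the gap is too small to be worth the churn
	fmt.Println(shouldRebalance(add, fakeCandidate{storeID: 2, rangeCount: 50})) // true: a real improvement
}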
35 changes: 35 additions & 0 deletions pkg/storage/allocator_scorer.go
@@ -134,6 +134,41 @@ func (c candidate) less(o candidate) bool {
return c.rangeCount > o.rangeCount
}

// worthRebalancingTo returns true if o is enough of a better fit for some
// range than c is that it's worth rebalancing from c to o.
func (c candidate) worthRebalancingTo(st *cluster.Settings, o candidate) bool {
if !o.valid {
return false
}
if !c.valid {
return true
}
if c.constraintScore != o.constraintScore {
return c.constraintScore < o.constraintScore
}
if c.convergesScore != o.convergesScore {
return c.convergesScore < o.convergesScore
}
// You might intuitively think that we should require o's balanceScore to
// be considerably higher than c's balanceScore, but that will effectively
// rule out rebalancing in clusters where one locality is much larger or
// smaller than the others, since all the stores in that locality will tend
// to have either a maximal or minimal balanceScore.
if c.balanceScore.totalScore() != o.balanceScore.totalScore() {
return c.balanceScore.totalScore() < o.balanceScore.totalScore()
}
// Instead, just require a gap between their number of ranges. This isn't
// great, particularly for stats-based rebalancing, but it only breaks
// balanceScore ties and it's a workable stop-gap on the way to something
// like #20751.
avgRangeCount := float64(c.rangeCount+o.rangeCount) / 2.0
// Use an overfullThreshold that is at least a couple replicas larger than
// the average of the two, to ensure that we don't keep rebalancing back
// and forth between nodes that only differ by one or two replicas.
overfullThreshold := math.Max(overfullRangeThreshold(st, avgRangeCount), avgRangeCount+1.5)
return float64(c.rangeCount) > overfullThreshold
}

type candidateList []candidate

func (cl candidateList) String() string {
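As a worked illustration of the range-count gap that worthRebalancingTo demands, here is a standalone sketch of just that final tie-breaker. The real overfullRangeThreshold is driven by cluster settings; the 5% slack below is an assumed stand-in for it, and the range counts are made up.

package main

import (
	"fmt"
	"math"
)

// gapIsWorthIt reproduces only the last check above: the existing replica's
// store must be an overfull outlier relative to the average of the two stores,
// and at least ~1.5 ranges above that average, before a rebalance is worth it.
// The 5% slack is a hypothetical stand-in for the settings-driven
// overfullRangeThreshold.
func gapIsWorthIt(existingRanges, candidateRanges int) bool {
	avg := float64(existingRanges+candidateRanges) / 2.0
	overfullThreshold := math.Max(avg*1.05, avg+1.5)
	return float64(existingRanges) > overfullThreshold
}

func main() {
	// 46 vs. 44 ranges: avg 45, threshold max(47.25, 46.5) = 47.25 -> stay put.
	fmt.Println(gapIsWorthIt(46, 44)) // false
	// 52 vs. 40 ranges: avg 46, threshold max(48.3, 47.5) = 48.3 -> rebalance.
	fmt.Println(gapIsWorthIt(52, 40)) // true
}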
38 changes: 37 additions & 1 deletion pkg/storage/allocator_test.go
@@ -787,7 +787,7 @@ func TestAllocatorRebalanceTarget(t *testing.T) {
repl := &Replica{RangeID: firstRange}

repl.mu.Lock()
repl.mu.state.Stats = &enginepb.MVCCStats{}
repl.mu.state.Stats = enginepb.MVCCStats{}
repl.mu.Unlock()

rs := newReplicaStats(clock, nil)
@@ -820,6 +820,42 @@
t.Fatalf("expected no rebalance, but got target s%d; details: %s", result.StoreID, details)
}
}

// Set up a second round of testing where the other two stores in the big
// locality actually have fewer replicas, but enough that it still isn't
// worth rebalancing to them.
stores[1].Capacity.RangeCount = 46
stores[2].Capacity.RangeCount = 46
sg.GossipStores(stores, t)
for i := 0; i < 10; i++ {
result, details := a.RebalanceTarget(
context.Background(),
config.Constraints{},
status,
rangeInfo,
storeFilterThrottled,
)
if result != nil {
t.Fatalf("expected no rebalance, but got target s%d; details: %s", result.StoreID, details)
}
}

// Make sure rebalancing does happen if we drop just a little further down.
stores[1].Capacity.RangeCount = 45
sg.GossipStores(stores, t)
for i := 0; i < 10; i++ {
result, details := a.RebalanceTarget(
context.Background(),
config.Constraints{},
status,
rangeInfo,
storeFilterThrottled,
)
if result == nil || result.StoreID != stores[1].StoreID {
t.Fatalf("expected rebalance to s%d, but got %v; details: %s",
stores[1].StoreID, result, details)
}
}
}

func TestAllocatorRebalanceDeadNodes(t *testing.T) {
