From f01eb24f5a4862d74b7d335230ce1c3b2b25d7d9 Mon Sep 17 00:00:00 2001
From: Alex Robinson
Date: Fri, 15 Dec 2017 13:18:57 -0500
Subject: [PATCH] storage: Avoid replica thrashing when localities are different sizes

Fixes #20241

Release note (bug fix): avoid rebalance thrashing when localities have very
different numbers of nodes
---
 pkg/storage/allocator.go        | 51 +++++++++++++++++++++++++++++++++++++++++++++++++--
 pkg/storage/allocator_scorer.go | 35 ++++++++++++++++++++++
 pkg/storage/allocator_test.go   | 38 +++++++++++++++++++++++-
 3 files changed, 120 insertions(+), 4 deletions(-)

diff --git a/pkg/storage/allocator.go b/pkg/storage/allocator.go
index 99e9c0245e36..5c33f67861a5 100644
--- a/pkg/storage/allocator.go
+++ b/pkg/storage/allocator.go
@@ -516,7 +516,9 @@ func (a Allocator) RebalanceTarget(
 	if target == nil {
 		return nil, ""
 	}
-	// We could make a simulation here to verify whether we'll remove the target we'll rebalance to.
+
+	// Determine whether we'll just remove the target immediately after adding it.
+	// If we would, we don't want to actually do the rebalance.
 	for len(candidates) > 0 {
 		newReplica := roachpb.ReplicaDescriptor{
 			NodeID:    target.store.Node.NodeID,
@@ -539,12 +541,17 @@ func (a Allocator) RebalanceTarget(
 				raftStatus, desc.Replicas, newReplica.ReplicaID)
 		}
 
-		removeReplica, _, err := a.simulateRemoveTarget(ctx, target.store.StoreID, constraints, replicaCandidates, rangeInfo)
+		removeReplica, details, err := a.simulateRemoveTarget(
+			ctx,
+			target.store.StoreID,
+			constraints,
+			replicaCandidates,
+			rangeInfo)
 		if err != nil {
 			log.Warningf(ctx, "simulating RemoveTarget failed: %s", err)
 			return nil, ""
 		}
-		if removeReplica.StoreID != target.store.StoreID {
+		if shouldRebalanceBetween(ctx, a.storePool.st, *target, removeReplica, existingCandidates, details) {
 			break
 		}
 		// Remove the considered target from our modified RangeDescriptor and from
@@ -568,6 +575,44 @@ func (a Allocator) RebalanceTarget(
 	return &target.store, string(details)
 }
 
+// shouldRebalanceBetween returns whether it's a good idea to rebalance to the
+// given `add` candidate if the replica that will be removed after adding it is
+// `remove`. This is a last failsafe to ensure that we don't take unnecessary
+// rebalance actions that cause thrashing.
+func shouldRebalanceBetween(
+	ctx context.Context,
+	st *cluster.Settings,
+	add candidate,
+	remove roachpb.ReplicaDescriptor,
+	existingCandidates candidateList,
+	removeDetails string,
+) bool {
+	if remove.StoreID == add.store.StoreID {
+		log.VEventf(ctx, 2, "not rebalancing to s%d because we'd immediately remove it: %s",
+			add.store.StoreID, removeDetails)
+		return false
+	}
+
+	// It's possible that we initially decided to rebalance based on comparing
+	// rebalance candidates in one locality to an existing replica in another
+	// locality (e.g. if one locality has many more nodes than another). This can
+	// make for unnecessary rebalances and even thrashing, so do a more direct
+	// comparison here of the replicas we'll actually be adding and removing.
+	for _, removeCandidate := range existingCandidates {
+		if removeCandidate.store.StoreID == remove.StoreID {
+			if removeCandidate.worthRebalancingTo(st, add) {
+				return true
+			}
+			log.VEventf(ctx, 2, "not rebalancing to %s because it isn't an improvement over "+
+				"what we'd remove after adding it: %s", add, removeCandidate)
+			return false
+		}
+	}
+	// If the code reaches this point, remove must be a non-live store, so let the
+	// rebalance happen.
+	return true
+}
+
 // TransferLeaseTarget returns a suitable replica to transfer the range lease
 // to from the provided list. It excludes the current lease holder replica
 // unless asked to do otherwise by the checkTransferLeaseSource parameter.
diff --git a/pkg/storage/allocator_scorer.go b/pkg/storage/allocator_scorer.go
index 2776cc3dcca1..48ef71786c10 100644
--- a/pkg/storage/allocator_scorer.go
+++ b/pkg/storage/allocator_scorer.go
@@ -134,6 +134,41 @@ func (c candidate) less(o candidate) bool {
 	return c.rangeCount > o.rangeCount
 }
 
+// worthRebalancingTo returns true if o is enough of a better fit for some
+// range than c is that it's worth rebalancing from c to o.
+func (c candidate) worthRebalancingTo(st *cluster.Settings, o candidate) bool {
+	if !o.valid {
+		return false
+	}
+	if !c.valid {
+		return true
+	}
+	if c.constraintScore != o.constraintScore {
+		return c.constraintScore < o.constraintScore
+	}
+	if c.convergesScore != o.convergesScore {
+		return c.convergesScore < o.convergesScore
+	}
+	// You might intuitively think that we should require o's balanceScore to
+	// be considerably higher than c's balanceScore, but that will effectively
+	// rule out rebalancing in clusters where one locality is much larger or
+	// smaller than the others, since all the stores in that locality will tend
+	// to have either a maximal or minimal balanceScore.
+	if c.balanceScore.totalScore() != o.balanceScore.totalScore() {
+		return c.balanceScore.totalScore() < o.balanceScore.totalScore()
+	}
+	// Instead, just require a gap between their number of ranges. This isn't
+	// great, particularly for stats-based rebalancing, but it only breaks
+	// balanceScore ties and it's a workable stop-gap on the way to something
+	// like #20751.
+	avgRangeCount := float64(c.rangeCount+o.rangeCount) / 2.0
+	// Use an overfullThreshold that is at least a couple replicas larger than
+	// the average of the two, to ensure that we don't keep rebalancing back
+	// and forth between nodes that only differ by one or two replicas.
+	overfullThreshold := math.Max(overfullRangeThreshold(st, avgRangeCount), avgRangeCount+1.5)
+	return float64(c.rangeCount) > overfullThreshold
+}
+
 type candidateList []candidate
 
 func (cl candidateList) String() string {
diff --git a/pkg/storage/allocator_test.go b/pkg/storage/allocator_test.go
index 90599852180c..ee57b09dc70f 100644
--- a/pkg/storage/allocator_test.go
+++ b/pkg/storage/allocator_test.go
@@ -787,7 +787,7 @@ func TestAllocatorRebalanceTarget(t *testing.T) {
 	repl := &Replica{RangeID: firstRange}
 
 	repl.mu.Lock()
-	repl.mu.state.Stats = &enginepb.MVCCStats{}
+	repl.mu.state.Stats = enginepb.MVCCStats{}
 	repl.mu.Unlock()
 
 	rs := newReplicaStats(clock, nil)
@@ -820,6 +820,42 @@ func TestAllocatorRebalanceTarget(t *testing.T) {
 			t.Fatalf("expected no rebalance, but got target s%d; details: %s", result.StoreID, details)
 		}
 	}
+
+	// Set up a second round of testing where the other two stores in the big
+	// locality actually have fewer replicas, but enough that it still isn't
+	// worth rebalancing to them.
+	stores[1].Capacity.RangeCount = 46
+	stores[2].Capacity.RangeCount = 46
+	sg.GossipStores(stores, t)
+	for i := 0; i < 10; i++ {
+		result, details := a.RebalanceTarget(
+			context.Background(),
+			config.Constraints{},
+			status,
+			rangeInfo,
+			storeFilterThrottled,
+		)
+		if result != nil {
+			t.Fatalf("expected no rebalance, but got target s%d; details: %s", result.StoreID, details)
+		}
+	}
+
+	// Make sure rebalancing does happen if we drop just a little further down.
+	stores[1].Capacity.RangeCount = 45
+	sg.GossipStores(stores, t)
+	for i := 0; i < 10; i++ {
+		result, details := a.RebalanceTarget(
+			context.Background(),
+			config.Constraints{},
+			status,
+			rangeInfo,
+			storeFilterThrottled,
+		)
+		if result == nil || result.StoreID != stores[1].StoreID {
+			t.Fatalf("expected rebalance to s%d, but got %v; details: %s",
+				stores[1].StoreID, result, details)
+		}
+	}
 }
 
 func TestAllocatorRebalanceDeadNodes(t *testing.T) {
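For readers skimming the patch rather than the tree, the following is a minimal standalone Go sketch of the range-count tie-breaker that worthRebalancingTo falls back to once the constraint, convergence, and balance scores are all equal. The helper name worthMovingOnRangeCount and the flat 5% overfull slack are assumptions made for illustration; in the patch the slack comes from overfullRangeThreshold and the cluster settings.

package main

import (
	"fmt"
	"math"
)

// worthMovingOnRangeCount mirrors the patch's final tie-breaker: when all
// score components are equal, the existing store is only worth moving off of
// if its range count clears an overfull threshold that sits at least ~1.5
// replicas above the average of the two stores.
func worthMovingOnRangeCount(existingRanges, candidateRanges int) bool {
	avg := float64(existingRanges+candidateRanges) / 2.0
	// Assumed stand-in for overfullRangeThreshold(st, avg): 5% above the average.
	overfull := avg * 1.05
	return float64(existingRanges) > math.Max(overfull, avg+1.5)
}

func main() {
	// A one-replica gap never triggers a move.
	fmt.Println(worthMovingOnRangeCount(51, 50)) // false
	// A clearly overfull store does.
	fmt.Println(worthMovingOnRangeCount(60, 40)) // true
}

The avg+1.5 floor is what keeps two stores that differ by only a replica or two from trading ranges back and forth, which is the thrashing the commit message refers to.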