storage: Avoid replica thrashing when localities are different sizes
Fixes #20241

Release note (bug fix): avoid rebalance thrashing when localities have
very different numbers of nodes
a-robinson committed Dec 20, 2017
1 parent 06f7d76 commit f01eb24
Showing 3 changed files with 120 additions and 4 deletions.
51 changes: 48 additions & 3 deletions pkg/storage/allocator.go
@@ -516,7 +516,9 @@ func (a Allocator) RebalanceTarget(
if target == nil {
return nil, ""
}
// We could make a simulation here to verify whether we'll remove the target we'll rebalance to.

// Determine whether we'll just remove the target immediately after adding it.
// If we would, we don't want to actually do the rebalance.
for len(candidates) > 0 {
newReplica := roachpb.ReplicaDescriptor{
NodeID: target.store.Node.NodeID,
@@ -539,12 +541,17 @@
raftStatus, desc.Replicas, newReplica.ReplicaID)
}

removeReplica, _, err := a.simulateRemoveTarget(ctx, target.store.StoreID, constraints, replicaCandidates, rangeInfo)
removeReplica, details, err := a.simulateRemoveTarget(
ctx,
target.store.StoreID,
constraints,
replicaCandidates,
rangeInfo)
if err != nil {
log.Warningf(ctx, "simulating RemoveTarget failed: %s", err)
return nil, ""
}
if removeReplica.StoreID != target.store.StoreID {
if shouldRebalanceBetween(ctx, a.storePool.st, *target, removeReplica, existingCandidates, details) {
break
}
// Remove the considered target from our modified RangeDescriptor and from
@@ -568,6 +575,44 @@ func (a Allocator) RebalanceTarget(
return &target.store, string(details)
}

// shouldRebalanceBetween returns whether it's a good idea to rebalance to the
// given `add` candidate if the replica that will be removed after adding it is
// `remove`. This is a last failsafe to ensure that we don't take unnecessary
// rebalance actions that cause thrashing.
func shouldRebalanceBetween(
ctx context.Context,
st *cluster.Settings,
add candidate,
remove roachpb.ReplicaDescriptor,
existingCandidates candidateList,
removeDetails string,
) bool {
if remove.StoreID == add.store.StoreID {
log.VEventf(ctx, 2, "not rebalancing to s%d because we'd immediately remove it: %s",
add.store.StoreID, removeDetails)
return false
}

// It's possible that we initially decided to rebalance based on comparing
// rebalance candidates in one locality to an existing replica in another
// locality (e.g. if one locality has many more nodes than another). This can
// make for unnecessary rebalances and even thrashing, so do a more direct
// comparison here of the replicas we'll actually be adding and removing.
for _, removeCandidate := range existingCandidates {
if removeCandidate.store.StoreID == remove.StoreID {
if removeCandidate.worthRebalancingTo(st, add) {
return true
}
log.VEventf(ctx, 2, "not rebalancing to %s because it isn't an improvement over "+
"what we'd remove after adding it: %s", add, removeCandidate)
return false
}
}
// If the code reaches this point, remove must be a non-live store, so let the
// rebalance happen.
return true
}

// TransferLeaseTarget returns a suitable replica to transfer the range lease
// to from the provided list. It excludes the current lease holder replica
// unless asked to do otherwise by the checkTransferLeaseSource parameter.
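To make the shape of the new failsafe concrete, here is a minimal, self-contained Go sketch of the same add-then-simulate-removal check. It is not the allocator's real API: the fakeCandidate struct, the store IDs, and the two-range gap standing in for worthRebalancingTo are all invented for illustration.

package main

import "fmt"

// fakeCandidate is a hypothetical stand-in for the allocator's candidate type.
type fakeCandidate struct {
	storeID    int
	rangeCount int
}

// shouldRebalance mirrors the shape of the failsafe above: given the store we
// want to add and the store a simulated removal says we would drop right
// afterwards, only go ahead if they differ and the store we'd drop is a
// meaningfully worse fit than the one we'd add.
func shouldRebalance(add, simulatedRemove fakeCandidate) bool {
	if simulatedRemove.storeID == add.storeID {
		// We'd immediately remove the replica we just added, so skip it.
		return false
	}
	// Stand-in for worthRebalancingTo: demand a gap of more than two ranges.
	return simulatedRemove.rangeCount > add.rangeCount+2
}

func main() {
	add := fakeCandidate{storeID: 4, rangeCount: 40}
	fmt.Println(shouldRebalance(add, fakeCandidate{storeID: 4, rangeCount: 40})) // false: we'd drop the replica we just added
	fmt.Println(shouldRebalance(add, fakeCandidate{storeID: 2, rangeCount: 41})) // false: the gap is too small to be worth the churn
	fmt.Println(shouldRebalance(add, fakeCandidate{storeID: 2, rangeCount: 50})) // true: a real improvement
}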
35 changes: 35 additions & 0 deletions pkg/storage/allocator_scorer.go
@@ -134,6 +134,41 @@ func (c candidate) less(o candidate) bool {
return c.rangeCount > o.rangeCount
}

// worthRebalancingTo returns true if o is enough of a better fit for some
// range than c is that it's worth rebalancing from c to o.
func (c candidate) worthRebalancingTo(st *cluster.Settings, o candidate) bool {
if !o.valid {
return false
}
if !c.valid {
return true
}
if c.constraintScore != o.constraintScore {
return c.constraintScore < o.constraintScore
}
if c.convergesScore != o.convergesScore {
return c.convergesScore < o.convergesScore
}
// You might intuitively think that we should require o's balanceScore to
// be considerably higher than c's balanceScore, but that will effectively
// rule out rebalancing in clusters where one locality is much larger or
// smaller than the others, since all the stores in that locality will tend
// to have either a maximal or minimal balanceScore.
if c.balanceScore.totalScore() != o.balanceScore.totalScore() {
return c.balanceScore.totalScore() < o.balanceScore.totalScore()
}
// Instead, just require a gap between their number of ranges. This isn't
// great, particularly for stats-based rebalancing, but it only breaks
// balanceScore ties and it's a workable stop-gap on the way to something
// like #20751.
avgRangeCount := float64(c.rangeCount+o.rangeCount) / 2.0
// Use an overfullThreshold that is at least a couple replicas larger than
// the average of the two, to ensure that we don't keep rebalancing back
// and forth between nodes that only differ by one or two replicas.
overfullThreshold := math.Max(overfullRangeThreshold(st, avgRangeCount), avgRangeCount+1.5)
return float64(c.rangeCount) > overfullThreshold
}

type candidateList []candidate

func (cl candidateList) String() string {
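As a worked illustration of the range-count gap that worthRebalancingTo demands, here is a standalone sketch of just that final tie-breaker. The real overfullRangeThreshold is driven by cluster settings; the 5% slack below is an assumed stand-in for it, and the range counts are made up.

package main

import (
	"fmt"
	"math"
)

// gapIsWorthIt reproduces only the last check above: the existing replica's
// store must be an overfull outlier relative to the average of the two stores,
// and at least ~1.5 ranges above that average, before a rebalance is worth it.
// The 5% slack is a hypothetical stand-in for the settings-driven
// overfullRangeThreshold.
func gapIsWorthIt(existingRanges, candidateRanges int) bool {
	avg := float64(existingRanges+candidateRanges) / 2.0
	overfullThreshold := math.Max(avg*1.05, avg+1.5)
	return float64(existingRanges) > overfullThreshold
}

func main() {
	// 46 vs. 44 ranges: avg 45, threshold max(47.25, 46.5) = 47.25 -> stay put.
	fmt.Println(gapIsWorthIt(46, 44)) // false
	// 52 vs. 40 ranges: avg 46, threshold max(48.3, 47.5) = 48.3 -> rebalance.
	fmt.Println(gapIsWorthIt(52, 40)) // true
}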
38 changes: 37 additions & 1 deletion pkg/storage/allocator_test.go
@@ -787,7 +787,7 @@ func TestAllocatorRebalanceTarget(t *testing.T) {
repl := &Replica{RangeID: firstRange}

repl.mu.Lock()
repl.mu.state.Stats = &enginepb.MVCCStats{}
repl.mu.state.Stats = enginepb.MVCCStats{}
repl.mu.Unlock()

rs := newReplicaStats(clock, nil)
@@ -820,6 +820,42 @@
t.Fatalf("expected no rebalance, but got target s%d; details: %s", result.StoreID, details)
}
}

// Set up a second round of testing where the other two stores in the big
// locality actually have fewer replicas, but enough that it still isn't
// worth rebalancing to them.
stores[1].Capacity.RangeCount = 46
stores[2].Capacity.RangeCount = 46
sg.GossipStores(stores, t)
for i := 0; i < 10; i++ {
result, details := a.RebalanceTarget(
context.Background(),
config.Constraints{},
status,
rangeInfo,
storeFilterThrottled,
)
if result != nil {
t.Fatalf("expected no rebalance, but got target s%d; details: %s", result.StoreID, details)
}
}

// Make sure rebalancing does happen if we drop just a little further down.
stores[1].Capacity.RangeCount = 45
sg.GossipStores(stores, t)
for i := 0; i < 10; i++ {
result, details := a.RebalanceTarget(
context.Background(),
config.Constraints{},
status,
rangeInfo,
storeFilterThrottled,
)
if result == nil || result.StoreID != stores[1].StoreID {
t.Fatalf("expected rebalance to s%d, but got %v; details: %s",
stores[1].StoreID, result, details)
}
}
}

func TestAllocatorRebalanceDeadNodes(t *testing.T) {
