Skip to content

Commit

Permalink
storage: Fix simulation of rebalance removals to actually remove targets
Browse files Browse the repository at this point in the history
If the first target attempted was rejected due to the simulation
claiming that it would be immediately removed, we would reuse the
modified `rangeInfo.Desc.Replicas` that had the target added to it,
messing with future iterations of the loop.

Also, we weren't properly modifying the `candidates` slice, meaning that
we could end up trying the same replica multiple times.

Release note (bug fix): Improve data rebalancing to make thrashing
back and forth between nodes much less likely.
  • Loading branch information
a-robinson committed Dec 20, 2017
1 parent 5dbe05e commit fd46e3f
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 28 deletions.
10 changes: 6 additions & 4 deletions pkg/storage/allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -542,12 +542,14 @@ func (a Allocator) RebalanceTarget(
if removeReplica.StoreID != target.store.StoreID {
break
}
newTargets := candidates.removeCandidate(*target)
newTarget := newTargets.selectGood(a.randGen)
if newTarget == nil {
// Remove the considered target from our modified RangeDescriptor and from
// the candidates list, then try again if there are any other candidates.
rangeInfo.Desc.Replicas = rangeInfo.Desc.Replicas[:len(rangeInfo.Desc.Replicas)-1]
candidates = candidates.removeCandidate(*target)
target = candidates.selectGood(a.randGen)
if target == nil {
return nil, ""
}
target = newTarget
}
details, err := json.Marshal(decisionDetails{
Target: target.String(),
Expand Down
53 changes: 29 additions & 24 deletions pkg/storage/allocator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -695,13 +695,13 @@ func TestAllocatorRebalanceTarget(t *testing.T) {
clock := hlc.NewClock(manual.UnixNano, time.Nanosecond)
stopper, g, _, a, _ := createTestAllocator( /* deterministic */ false)
defer stopper.Stop(context.Background())
// We make 4 stores in this test, store1 and store2 are in the same datacenter a,
// store3 in the datacenter b, store4 in the datacenter c. all of our replicas are
// distributed within these three datacenters. Originally, store4 has much more replicas
// than other stores. So if we didn't make the simulation in RebalanceTarget, it will
// try to choose store2 as the target store to make an rebalance.
// However, we'll immediately remove the replica on the store2 to guarantee the locality diversity.
// But after we make the simulation in RebalanceTarget, we could avoid that happen.
// We make 5 stores in this test -- 3 in the same datacenter, and 1 each in
// 2 other datacenters. All of our replicas are distributed within these 3
// datacenters. Originally, the stores that are all alone in their datacenter
// are fuller than the other stores. If we didn't simulate RemoveTarget in
// RebalanceTarget, we would try to choose store 2 or 3 as the target store
// to make a rebalance. However, we would immediately remove the replica on
// store 1 or 2 to retain the locality diversity.
stores := []*roachpb.StoreDescriptor{
{
StoreID: 1,
Expand All @@ -728,7 +728,7 @@ func TestAllocatorRebalanceTarget(t *testing.T) {
},
},
Capacity: roachpb.StoreCapacity{
RangeCount: 56,
RangeCount: 55,
},
},
{
Expand All @@ -737,7 +737,7 @@ func TestAllocatorRebalanceTarget(t *testing.T) {
NodeID: 3,
Locality: roachpb.Locality{
Tiers: []roachpb.Tier{
{Key: "datacenter", Value: "b"},
{Key: "datacenter", Value: "a"},
},
},
},
Expand All @@ -749,14 +749,28 @@ func TestAllocatorRebalanceTarget(t *testing.T) {
StoreID: 4,
Node: roachpb.NodeDescriptor{
NodeID: 4,
Locality: roachpb.Locality{
Tiers: []roachpb.Tier{
{Key: "datacenter", Value: "b"},
},
},
},
Capacity: roachpb.StoreCapacity{
RangeCount: 100,
},
},
{
StoreID: 5,
Node: roachpb.NodeDescriptor{
NodeID: 5,
Locality: roachpb.Locality{
Tiers: []roachpb.Tier{
{Key: "datacenter", Value: "c"},
},
},
},
Capacity: roachpb.StoreCapacity{
RangeCount: 150,
RangeCount: 100,
},
},
}
Expand All @@ -766,18 +780,9 @@ func TestAllocatorRebalanceTarget(t *testing.T) {
st := a.storePool.st
EnableStatsBasedRebalancing.Override(&st.SV, false)
replicas := []roachpb.ReplicaDescriptor{
{
StoreID: 1,
NodeID: 1,
},
{
StoreID: 3,
NodeID: 3,
},
{
StoreID: 4,
NodeID: 4,
},
{NodeID: 1, StoreID: 1},
{NodeID: 4, StoreID: 4},
{NodeID: 5, StoreID: 5},
}
repl := &Replica{RangeID: firstRange}

Expand All @@ -804,15 +809,15 @@ func TestAllocatorRebalanceTarget(t *testing.T) {
}
}
for i := 0; i < 10; i++ {
result, _ := a.RebalanceTarget(
result, details := a.RebalanceTarget(
context.Background(),
config.Constraints{},
status,
rangeInfo,
storeFilterThrottled,
)
if result != nil {
t.Errorf("expected no rebalance, but got %d.", result.StoreID)
t.Fatalf("expected no rebalance, but got target s%d; details: %s", result.StoreID, details)
}
}
}
Expand Down

0 comments on commit fd46e3f

Please sign in to comment.