Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

kvserver: fix up/downreplication status of non-voters in range report #60029

Merged
merged 1 commit into from
Mar 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pkg/kv/kvserver/allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,9 @@ func GetNeededNonVoters(numVoters, zoneConfigNonVoterCount, clusterNodes int) in
// replica.
need = clusterNodes - numVoters
}
if need < 0 {
need = 0 // Must be non-negative.
}
return need
}

Expand Down
38 changes: 25 additions & 13 deletions pkg/kv/kvserver/replica_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ func calcReplicaMetrics(
m.Ticking = ticking

m.RangeCounter, m.Unavailable, m.Underreplicated, m.Overreplicated =
calcRangeCounter(storeID, desc, livenessMap, *zone.NumReplicas, clusterNodes)
calcRangeCounter(storeID, desc, livenessMap, zone.GetNumVoters(), *zone.NumReplicas, clusterNodes)

// The raft leader computes the number of raft entries that replicas are
// behind.
Expand All @@ -139,10 +139,11 @@ func calcReplicaMetrics(
return m
}

// calcRangeCounter returns whether this replica is designated as the
// replica in the range responsible for range-level metrics, whether
// the range doesn't have a quorum of live replicas, and whether the
// range is currently under-replicated.
// calcRangeCounter returns whether this replica is designated as the replica in
// the range responsible for range-level metrics, whether the range doesn't have
// a quorum of live voting replicas, and whether the range is currently
// under-replicated (with regards to either the number of voting replicas or the
// number of non-voting replicas).
//
// Note: we compute an estimated range count across the cluster by counting the
// first live replica in each descriptor. Note that the first live replica is
Expand All @@ -158,12 +159,11 @@ func calcRangeCounter(
storeID roachpb.StoreID,
desc *roachpb.RangeDescriptor,
livenessMap liveness.IsLiveMap,
numReplicas int32,
numVoters, numReplicas int32,
clusterNodes int,
) (rangeCounter, unavailable, underreplicated, overreplicated bool) {
// It seems unlikely that a learner replica would be the first live one, but
// there's no particular reason to exclude them. Note that `All` returns the
// voters first.
// there's no particular reason to exclude them.
for _, rd := range desc.Replicas().Descriptors() {
if livenessMap[rd.NodeID].IsLive {
rangeCounter = rd.StoreID == storeID
Expand All @@ -176,11 +176,13 @@ func calcRangeCounter(
unavailable = !desc.Replicas().CanMakeProgress(func(rDesc roachpb.ReplicaDescriptor) bool {
return livenessMap[rDesc.NodeID].IsLive
})
needed := GetNeededVoters(numReplicas, clusterNodes)
liveVoterReplicas := calcLiveVoterReplicas(desc, livenessMap)
if needed > liveVoterReplicas {
neededVoters := GetNeededVoters(numVoters, clusterNodes)
liveVoters := calcLiveVoterReplicas(desc, livenessMap)
neededNonVoters := GetNeededNonVoters(int(numVoters), int(numReplicas-numVoters), clusterNodes)
liveNonVoters := calcLiveNonVoterReplicas(desc, livenessMap)
if neededVoters > liveVoters || neededNonVoters > liveNonVoters {
underreplicated = true
} else if needed < liveVoterReplicas {
} else if neededVoters < liveVoters || neededNonVoters < liveNonVoters {
overreplicated = true
}
}
Expand All @@ -192,8 +194,18 @@ func calcRangeCounter(
// method is used when indicating under-replication so only voter replicas are
// considered.
func calcLiveVoterReplicas(desc *roachpb.RangeDescriptor, livenessMap liveness.IsLiveMap) int {
return calcLiveReplicas(desc.Replicas().VoterDescriptors(), livenessMap)
}

// calcLiveNonVoterReplicas returns a count of the live non-voter replicas; a live
// replica is determined by checking its node in the provided liveness map.
func calcLiveNonVoterReplicas(desc *roachpb.RangeDescriptor, livenessMap liveness.IsLiveMap) int {
return calcLiveReplicas(desc.Replicas().NonVoterDescriptors(), livenessMap)
}

func calcLiveReplicas(repls []roachpb.ReplicaDescriptor, livenessMap liveness.IsLiveMap) int {
var live int
for _, rd := range desc.Replicas().VoterDescriptors() {
for _, rd := range repls {
if livenessMap[rd.NodeID].IsLive {
live++
}
Expand Down
78 changes: 73 additions & 5 deletions pkg/kv/kvserver/replica_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,26 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
// Regression test for a bug, see:
// https://github.com/cockroachdb/cockroach/pull/39936#pullrequestreview-359059629

desc := roachpb.NewRangeDescriptor(123, roachpb.RKeyMin, roachpb.RKeyMax,
threeVotersAndSingleNonVoter := roachpb.NewRangeDescriptor(123, roachpb.RKeyMin, roachpb.RKeyMax,
roachpb.MakeReplicaSet([]roachpb.ReplicaDescriptor{
{NodeID: 10, StoreID: 11, ReplicaID: 12, Type: roachpb.ReplicaTypeVoterFull()},
{NodeID: 100, StoreID: 110, ReplicaID: 120, Type: roachpb.ReplicaTypeVoterFull()},
{NodeID: 1000, StoreID: 1100, ReplicaID: 1200, Type: roachpb.ReplicaTypeVoterFull()},
{NodeID: 2000, StoreID: 2100, ReplicaID: 2200, Type: roachpb.ReplicaTypeNonVoter()},
}))

oneVoterAndThreeNonVoters := roachpb.NewRangeDescriptor(123, roachpb.RKeyMin, roachpb.RKeyMax,
roachpb.MakeReplicaSet([]roachpb.ReplicaDescriptor{
{NodeID: 10, StoreID: 11, ReplicaID: 12, Type: roachpb.ReplicaTypeVoterFull()},
{NodeID: 100, StoreID: 110, ReplicaID: 120, Type: roachpb.ReplicaTypeNonVoter()},
{NodeID: 1000, StoreID: 1100, ReplicaID: 1200, Type: roachpb.ReplicaTypeNonVoter()},
{NodeID: 2000, StoreID: 2100, ReplicaID: 2200, Type: roachpb.ReplicaTypeNonVoter()},
}))

{
ctr, down, under, over := calcRangeCounter(1100 /* storeID */, desc, liveness.IsLiveMap{
ctr, down, under, over := calcRangeCounter(1100, threeVotersAndSingleNonVoter, liveness.IsLiveMap{
1000: liveness.IsLiveMapEntry{IsLive: true}, // by NodeID
}, 3, 3)
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)

require.True(t, ctr)
require.True(t, down)
Expand All @@ -46,9 +55,9 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
}

{
ctr, down, under, over := calcRangeCounter(1000, desc, liveness.IsLiveMap{
ctr, down, under, over := calcRangeCounter(1000, threeVotersAndSingleNonVoter, liveness.IsLiveMap{
1000: liveness.IsLiveMapEntry{IsLive: false},
}, 3, 3)
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)

// Does not confuse a non-live entry for a live one. In other words,
// does not think that the liveness map has only entries for live nodes.
Expand All @@ -57,4 +66,63 @@ func TestCalcRangeCounterIsLiveMap(t *testing.T) {
require.False(t, under)
require.False(t, over)
}

{
ctr, down, under, over := calcRangeCounter(11, threeVotersAndSingleNonVoter, liveness.IsLiveMap{
10: liveness.IsLiveMapEntry{IsLive: true},
100: liveness.IsLiveMapEntry{IsLive: true},
1000: liveness.IsLiveMapEntry{IsLive: true},
2000: liveness.IsLiveMapEntry{IsLive: true},
}, 3 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)

require.True(t, ctr)
require.False(t, down)
require.False(t, under)
require.False(t, over)
}

{
// Single non-voter dead
ctr, down, under, over := calcRangeCounter(11, oneVoterAndThreeNonVoters, liveness.IsLiveMap{
10: liveness.IsLiveMapEntry{IsLive: true},
100: liveness.IsLiveMapEntry{IsLive: true},
1000: liveness.IsLiveMapEntry{IsLive: false},
2000: liveness.IsLiveMapEntry{IsLive: true},
}, 1 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)

require.True(t, ctr)
require.False(t, down)
require.True(t, under)
require.False(t, over)
}

{
// All non-voters are dead, but range is not unavailable
ctr, down, under, over := calcRangeCounter(11, oneVoterAndThreeNonVoters, liveness.IsLiveMap{
10: liveness.IsLiveMapEntry{IsLive: true},
100: liveness.IsLiveMapEntry{IsLive: false},
1000: liveness.IsLiveMapEntry{IsLive: false},
2000: liveness.IsLiveMapEntry{IsLive: false},
}, 1 /* numVoters */, 4 /* numReplicas */, 4 /* clusterNodes */)

require.True(t, ctr)
require.False(t, down)
require.True(t, under)
require.False(t, over)
}

{
// More non-voters than needed
ctr, down, under, over := calcRangeCounter(11, oneVoterAndThreeNonVoters, liveness.IsLiveMap{
10: liveness.IsLiveMapEntry{IsLive: true},
100: liveness.IsLiveMapEntry{IsLive: true},
1000: liveness.IsLiveMapEntry{IsLive: true},
2000: liveness.IsLiveMapEntry{IsLive: true},
}, 1 /* numVoters */, 3 /* numReplicas */, 4 /* clusterNodes */)

require.True(t, ctr)
require.False(t, down)
require.False(t, under)
require.True(t, over)
}
}