Merge #24990

24990: storage: avoid acquiring raftMu in Replica.propose r=nvanbenschoten a=nvanbenschoten Related to #9827. This addresses a TODO to avoid acquiring `Replica.raftMu` in `Replica.propose` in the common case where the replica's internal raft group is not null. This showed up in blocking profiles when running sysbench's `oltp_insert` workload. Specifically, the locking of `raftMu` in `Replica.propose` was responsible for **29.3%** of blocking in the profile: ![before](https://user-images.githubusercontent.com/5438456/39137377-638396d4-46eb-11e8-8a95-876e3deb91f1.png) _raftMu locking in Replica.propose is responsible for the second large pillar_ This change increases throughput of the sysbench workload by **11%** and reduces the reported average latency by **10%**. This was averaged over a few runs. Below is the output of a representative run before and after. Before ``` SQL statistics: queries performed: read: 0 write: 202253 other: 0 total: 202253 transactions: 202253 (3368.51 per sec.) queries: 202253 (3368.51 per sec.) ignored errors: 0 (0.00 per sec.) reconnects: 0 (0.00 per sec.) Throughput: events/s (eps): 3368.5102 time elapsed: 60.0423s total number of events: 202253 Latency (ms): min: 0.99 avg: 37.98 max: 141.98 95th percentile: 58.92 sum: 7681469.72 Threads fairness: events (avg/stddev): 1580.1016/4.11 execution time (avg/stddev): 60.0115/0.01 ``` After ``` SQL statistics: queries performed: read: 0 write: 228630 other: 0 total: 228630 transactions: 228630 (3808.46 per sec.) queries: 228630 (3808.46 per sec.) ignored errors: 0 (0.00 per sec.) reconnects: 0 (0.00 per sec.) Throughput: events/s (eps): 3808.4610 time elapsed: 60.0321s total number of events: 228630 Latency (ms): min: 3.59 avg: 33.60 max: 156.34 95th percentile: 51.02 sum: 7680843.03 Threads fairness: events (avg/stddev): 1786.1719/3.89 execution time (avg/stddev): 60.0066/0.01 ``` After this change, the blocking profile shows that `Replica.propose` is no longer a problem: ![after](https://user-images.githubusercontent.com/5438456/39137437-8d4a53ea-46eb-11e8-8faa-eff3ed01914f.png) Release note (performance improvement): Reduce lock contention in Replica write path. Co-authored-by: Nathan VanBenschoten <[email protected]>
cockroachdb · Jul 30, 2018 · 5642c9f · 5642c9f
2 parents 5ef50a6 + 5ec656c
commit 5642c9f
Showing 1 changed file with 26 additions and 16 deletions.
diff --git a/pkg/storage/replica.go b/pkg/storage/replica.go
@@ -519,7 +519,9 @@ var _ KeyRange = &Replica{}
 // leader awoken). See handleRaftReady for an instance of where this value
 // varies.
 //
-// Requires that both Replica.mu and Replica.raftMu are held.
+// Requires that Replica.mu is held. Also requires that Replica.raftMu is held
+// if either the caller can't guarantee that r.mu.internalRaftGroup != nil or
+// the provided function requires Replica.raftMu.
 func (r *Replica) withRaftGroupLocked(
 	mayCampaignOnWake bool, f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error),
 ) error {
@@ -535,9 +537,10 @@ func (r *Replica) withRaftGroupLocked(
 		return nil
 	}
 
-	ctx := r.AnnotateCtx(context.TODO())
-
 	if r.mu.internalRaftGroup == nil {
+		r.raftMu.Mutex.AssertHeld()
+
+		ctx := r.AnnotateCtx(context.TODO())
 		raftGroup, err := raft.NewRawNode(newRaftConfig(
 			raft.Storage((*replicaRaftStorage)(r)),
 			uint64(r.mu.replicaID),
@@ -578,7 +581,7 @@ func (r *Replica) withRaftGroupLocked(
 // messages (which may include MsgVotes from an election in progress,
 // and this election would be disrupted if we started our own).
 //
-// Requires that Replica.raftMu is held.
+// Has the same requirement for Replica.raftMu as withRaftGroupLocked.
 func (r *Replica) withRaftGroup(
 	mayCampaignOnWake bool, f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error),
 ) error {
@@ -3504,20 +3507,27 @@ func (r *Replica) propose(
 		return nil, nil, roachpb.NewError(err)
 	}
 
-	// submitProposalLocked calls withRaftGroupLocked which requires that
-	// raftMu is held. In order to maintain our lock ordering we need to lock
-	// Replica.raftMu here before locking Replica.mu.
-	//
-	// TODO(peter): It appears that we only need to hold Replica.raftMu when
-	// calling raft.NewRawNode. We could get fancier with the locking here to
-	// optimize for the common case where Replica.mu.internalRaftGroup is
-	// non-nil, but that doesn't currently seem worth it. Always locking raftMu
-	// has a tiny (~1%) performance hit for single-node block_writer testing.
-	r.raftMu.Lock()
-	defer r.raftMu.Unlock()
+	// submitProposalLocked calls withRaftGroupLocked which requires that raftMu
+	// is held, but only if Replica.mu.internalRaftGroup == nil. To avoid
+	// locking Replica.raftMu in the common case where the raft group is
+	// non-nil, we lock only Replica.mu at first before checking the status of
+	// the internal raft group. If it equals nil then we fall back to the slow
+	// path of locking Replica.raftMu. However, in order to maintain our lock
+	// ordering we need to lock Replica.raftMu here before locking Replica.mu,
+	// so we unlock Replica.mu before locking them both again.
 	r.mu.Lock()
 	defer r.mu.Unlock()
-	log.Event(proposal.ctx, "acquired {raft,replica}mu")
+	if r.mu.internalRaftGroup == nil {
+		// Unlock first before locking in {raft,replica}mu order.
+		r.mu.Unlock()
+
+		r.raftMu.Lock()
+		defer r.raftMu.Unlock()
+		r.mu.Lock()
+		log.Event(proposal.ctx, "acquired {raft,replica}mu")
+	} else {
+		log.Event(proposal.ctx, "acquired replica mu")
+	}
 
 	// Add size of proposal to commandSizes map.
 	if r.mu.commandSizes != nil {