Commit 58413cb

storage: replace remote proposal tracking with uncommitted log size protection
This change reverts most of the non-testing code from 03b116f and f2f3fd2 and
replaces it with use of the MaxUncommittedEntriesSize config.

This configuration was added in etcd-io/etcd#10167 and provides protection
against unbounded Raft log growth when a Raft group stops being able to commit
entries. It makes proposals into Raft safer because proposers don't need to
verify before the fact that the proposal isn't a duplicate that might be
blowing up the size of the Raft group.

By default, the configuration is set to double the Replica's proposal quota.
The logic here is that the quotaPool should be responsible for throttling
proposals in all cases except for unbounded Raft re-proposals because it
queues efficiently instead of dropping proposals on the floor indiscriminately.

Release note (bug fix): Fix a bug where Raft proposals could get stuck if
forwarded to a leader who could not itself append a new entry to its log.
1 parent e4003e1 commit 58413cb
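
For orientation, below is a minimal sketch of the etcd/raft knob this commit relies on. The raft.Config field MaxUncommittedEntriesSize and the raft.ErrProposalDropped error are real parts of the etcd-io/etcd raft package; the helper name and the concrete tick/size values are illustrative assumptions, not code from this commit.

package example

import "github.com/coreos/etcd/raft"

// newIllustrativeRaftConfig shows where the uncommitted-size cap plugs in.
// Once the leader's uncommitted log tail exceeds maxUncommitted bytes, new
// proposals are rejected with raft.ErrProposalDropped instead of growing the
// log without bound. The helper and the numeric values are hypothetical.
func newIllustrativeRaftConfig(id uint64, storage raft.Storage, maxUncommitted uint64) *raft.Config {
	return &raft.Config{
		ID:                        id,
		ElectionTick:              10,       // assumed value
		HeartbeatTick:             1,        // assumed value
		Storage:                   storage,
		MaxSizePerMsg:             16 << 10, // assumed value
		MaxInflightMsgs:           64,       // assumed value
		MaxUncommittedEntriesSize: maxUncommitted,
	}
}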

8 files changed: +108 -362 lines

pkg/base/config.go  (+29)
@@ -446,6 +446,23 @@ type RaftConfig struct {
 	// performing log truncations.
 	RaftLogMaxSize int64
 
+	// RaftProposalQuota controls the maximum aggregate size of Raft commands
+	// that a leader is allowed to propose concurrently.
+	//
+	// By default, the quota is set to a fraction of the RaftLogMaxSize. In
+	// doing so, we ensure all replicas have sufficiently up to date logs so
+	// that when the log gets truncated, the followers do not need
+	// non-preemptive snapshots. Changing this deserves care. Too low and
+	// everything comes to a grinding halt, too high and we're not really
+	// throttling anything (we'll still generate snapshots).
+	RaftProposalQuota int64
+
+	// RaftMaxUncommittedEntriesSize controls how large the uncommitted tail of
+	// the Raft log can grow. The limit is meant to provide protection against
+	// unbounded Raft log growth when quorum is lost and entries stop being
+	// committed but continue to be proposed.
+	RaftMaxUncommittedEntriesSize uint64
+
 	// RaftMaxSizePerMsg controls how many Raft log entries the leader will send to
 	// followers in a single MsgApp.
 	RaftMaxSizePerMsg uint64
@@ -474,6 +491,18 @@ func (cfg *RaftConfig) SetDefaults() {
 	if cfg.RaftLogMaxSize == 0 {
 		cfg.RaftLogMaxSize = defaultRaftLogMaxSize
 	}
+	if cfg.RaftProposalQuota == 0 {
+		// By default, set this to a fraction of RaftLogMaxSize. See comment
+		// above for the tradeoffs of setting this higher or lower.
+		cfg.RaftProposalQuota = cfg.RaftLogMaxSize / 4
+	}
+	if cfg.RaftMaxUncommittedEntriesSize == 0 {
+		// By default, set this to twice the RaftProposalQuota. The logic here
+		// is that the quotaPool should be responsible for throttling proposals
+		// in all cases except for unbounded Raft re-proposals because it queues
+		// efficiently instead of dropping proposals on the floor indiscriminately.
+		cfg.RaftMaxUncommittedEntriesSize = uint64(2 * cfg.RaftProposalQuota)
+	}
 	if cfg.RaftMaxSizePerMsg == 0 {
 		cfg.RaftMaxSizePerMsg = uint64(defaultRaftMaxSizePerMsg)
 	}
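
To make the defaulting above concrete, here is a small worked example, assuming a defaultRaftLogMaxSize of 4 MiB (the actual constant may differ):

package example

// Mirrors the SetDefaults arithmetic above with an assumed 4 MiB log cap.
const (
	assumedRaftLogMaxSize = int64(4 << 20)                   // 4 MiB (assumption)
	derivedProposalQuota  = assumedRaftLogMaxSize / 4        // 1 MiB
	derivedMaxUncommitted = uint64(2 * derivedProposalQuota) // 2 MiB
)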

pkg/storage/client_raft_test.go  (+4, -3)
@@ -1155,6 +1155,8 @@ func TestLogGrowthWhenRefreshingPendingCommands(t *testing.T) {
 	sc.RaftTickInterval = 10 * time.Millisecond
 	// Don't timeout raft leader. We don't want leadership moving.
 	sc.RaftElectionTimeoutTicks = 1000000
+	// Reduce the max uncommitted entry size.
+	sc.RaftMaxUncommittedEntriesSize = 64 << 10 // 64 KB
 	// Disable leader transfers during leaseholder changes so that we
 	// can easily create leader-not-leaseholder scenarios.
 	sc.TestingKnobs.DisableLeaderFollowsLeaseholder = true
@@ -1233,7 +1235,7 @@ func TestLogGrowthWhenRefreshingPendingCommands(t *testing.T) {
 	// While a majority nodes are down, write some data.
 	putRes := make(chan *roachpb.Error)
 	go func() {
-		putArgs := putArgs([]byte("b"), make([]byte, 8<<10 /* 8 KB */))
+		putArgs := putArgs([]byte("b"), make([]byte, sc.RaftMaxUncommittedEntriesSize/8))
 		_, err := client.SendWrapped(context.Background(), propNode, putArgs)
 		putRes <- err
 	}()
@@ -1254,11 +1256,10 @@ func TestLogGrowthWhenRefreshingPendingCommands(t *testing.T) {
 	}
 
 	// Check raft log size.
-	const logSizeLimit = 64 << 10 // 64 KB
 	curlogSize := leaderRepl.GetRaftLogSize()
 	logSize := curlogSize - initLogSize
 	logSizeStr := humanizeutil.IBytes(logSize)
-	if logSize > logSizeLimit {
+	if uint64(logSize) > sc.RaftMaxUncommittedEntriesSize {
 		t.Fatalf("raft log size grew to %s", logSizeStr)
 	}
 	t.Logf("raft log size grew to %s", logSizeStr)
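
For a feel for the test's sizing, a small worked example using the constants from the diff above; the interpretation is ours, not part of the test:

package example

import "fmt"

// testSizing restates the numbers the test uses: the knob is lowered to
// 64 KiB and each put proposes a payload of one eighth of that, so without
// the cap a handful of duplicate re-proposals would already blow past it.
func testSizing() {
	maxUncommitted := uint64(64 << 10) // sc.RaftMaxUncommittedEntriesSize in the test
	putSize := maxUncommitted / 8      // 8 KiB payload per proposal
	fmt.Printf("cap=%d putSize=%d duplicatesToExceed=%d\n",
		maxUncommitted, putSize, maxUncommitted/putSize+1)
}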

pkg/storage/replica.go  (+3, -108)
@@ -387,12 +387,7 @@ type Replica struct {
 		// map must only be referenced while Replica.mu is held, except if the
 		// element is removed from the map first. The notable exception is the
 		// contained RaftCommand, which we treat as immutable.
-		localProposals map[storagebase.CmdIDKey]*ProposalData
-		// remoteProposals is maintained by Raft leaders and stores in-flight
-		// commands that were forwarded to the leader during its current term.
-		// The set allows leaders to detect duplicate forwarded commands and
-		// avoid re-proposing the same forwarded command multiple times.
-		remoteProposals map[storagebase.CmdIDKey]struct{}
+		localProposals map[storagebase.CmdIDKey]*ProposalData
 		internalRaftGroup *raft.RawNode
 		// The ID of the replica within the Raft group. May be 0 if the replica has
 		// been created from a preemptive snapshot (i.e. before being added to the
@@ -883,7 +878,6 @@ func (r *Replica) cancelPendingCommandsLocked() {
 		r.cleanupFailedProposalLocked(p)
 		p.finishApplication(pr)
 	}
-	r.mu.remoteProposals = nil
 }
 
 // cleanupFailedProposalLocked cleans up after a proposal that has failed. It
@@ -1118,22 +1112,12 @@ func (r *Replica) updateProposalQuotaRaftMuLocked(
 			log.Fatalf(ctx, "len(r.mu.commandSizes) = %d, expected 0", commandSizesLen)
 		}
 
-		// We set the defaultProposalQuota to be less than RaftLogMaxSize,
-		// in doing so we ensure all replicas have sufficiently up to date
-		// logs so that when the log gets truncated, the followers do not
-		// need non-preemptive snapshots. Changing this deserves care. Too
-		// low and everything comes to a grinding halt, too high and we're
-		// not really throttling anything (we'll still generate snapshots).
-		//
-		// TODO(nvanbenschoten): clean this up in later commits.
-		proposalQuota := r.store.cfg.RaftLogMaxSize / 4
-
 		// Raft may propose commands itself (specifically the empty
 		// commands when leadership changes), and these commands don't go
 		// through the code paths where we acquire quota from the pool. To
 		// offset this we reset the quota pool whenever leadership changes
 		// hands.
-		r.mu.proposalQuota = newQuotaPool(proposalQuota)
+		r.mu.proposalQuota = newQuotaPool(r.store.cfg.RaftProposalQuota)
 		r.mu.lastUpdateTimes = make(map[roachpb.ReplicaID]time.Time)
 		r.mu.commandSizes = make(map[storagebase.CmdIDKey]int)
 	} else if r.mu.proposalQuota != nil {
@@ -1913,7 +1897,6 @@ func (r *Replica) State() storagepb.RangeInfo {
 	ri.ReplicaState = *(protoutil.Clone(&r.mu.state)).(*storagepb.ReplicaState)
 	ri.LastIndex = r.mu.lastIndex
 	ri.NumPending = uint64(len(r.mu.localProposals))
-	ri.NumRemotePending = uint64(len(r.mu.remoteProposals))
 	ri.RaftLogSize = r.mu.raftLogSize
 	ri.NumDropped = uint64(r.mu.droppedMessages)
 	if r.mu.proposalQuota != nil {
@@ -4042,20 +4025,7 @@ func (r *Replica) stepRaftGroup(req *RaftMessageRequest) error {
 		// we expect the originator to campaign instead.
 		r.unquiesceWithOptionsLocked(false /* campaignOnWake */)
 		r.refreshLastUpdateTimeForReplicaLocked(req.FromReplica.ReplicaID)
-
-		// Check if the message is a proposal that should be dropped.
-		if r.shouldDropForwardedProposalLocked(req) {
-			// If we could signal to the sender that its proposal was accepted
-			// or dropped then we wouldn't need to track anything.
-			return false /* unquiesceAndWakeLeader */, nil
-		}
-
 		err := raftGroup.Step(req.Message)
-		if err == nil {
-			// If we stepped successfully and the request is a proposal, consider
-			// tracking it so that we can ignore identical proposals in the future.
-			r.maybeTrackForwardedProposalLocked(raftGroup, req)
-		}
 		if err == raft.ErrProposalDropped {
 			// A proposal was forwarded to this replica but we couldn't propose it.
 			// Swallow the error since we don't have an effective way of signaling
@@ -4068,68 +4038,6 @@ func (r *Replica) stepRaftGroup(req *RaftMessageRequest) error {
 	})
 }
 
-func (r *Replica) shouldDropForwardedProposalLocked(req *RaftMessageRequest) bool {
-	if req.Message.Type != raftpb.MsgProp {
-		// Not a proposal.
-		return false
-	}
-
-	for _, e := range req.Message.Entries {
-		switch e.Type {
-		case raftpb.EntryNormal:
-			cmdID, _ := DecodeRaftCommand(e.Data)
-			if _, ok := r.mu.remoteProposals[cmdID]; !ok {
-				// Untracked remote proposal. Don't drop.
-				return false
-			}
-		case raftpb.EntryConfChange:
-			// Never drop EntryConfChange proposals.
-			return false
-		default:
-			log.Fatalf(context.TODO(), "unexpected Raft entry: %v", e)
-		}
-	}
-	// All entries tracked.
-	return true
-}
-
-func (r *Replica) maybeTrackForwardedProposalLocked(rg *raft.RawNode, req *RaftMessageRequest) {
-	if req.Message.Type != raftpb.MsgProp {
-		// Not a proposal.
-		return
-	}
-
-	if rg.Status().RaftState != raft.StateLeader {
-		// We're not the leader. We can't be sure that the proposal made it into
-		// the Raft log, so don't track it.
-		return
-	}
-
-	// Record that each of the proposal's entries was seen and appended. This
-	// allows us to catch duplicate forwarded proposals in the future and
-	// prevent them from being repeatedly appended to a leader's raft log.
-	for _, e := range req.Message.Entries {
-		switch e.Type {
-		case raftpb.EntryNormal:
-			cmdID, data := DecodeRaftCommand(e.Data)
-			if len(data) == 0 {
-				// An empty command is proposed to unquiesce a range and
-				// wake the leader. Don't keep track of these forwarded
-				// proposals because they will never be cleaned up.
-			} else {
-				if r.mu.remoteProposals == nil {
-					r.mu.remoteProposals = map[storagebase.CmdIDKey]struct{}{}
-				}
-				r.mu.remoteProposals[cmdID] = struct{}{}
-			}
-		case raftpb.EntryConfChange:
-			// Don't track EntryConfChanges.
-		default:
-			log.Fatalf(context.TODO(), "unexpected Raft entry: %v", e)
-		}
-	}
-}
-
 type handleRaftReadyStats struct {
 	processed int
 }
@@ -4394,7 +4302,6 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 		r.mu.leaderID = leaderID
 		// Clear the remote proposal set. Would have been nil already if not
 		// previously the leader.
-		r.mu.remoteProposals = nil
 		becameLeader = r.mu.leaderID == r.mu.replicaID
 	}
 	r.mu.Unlock()
@@ -4599,22 +4506,13 @@ func (r *Replica) tick(livenessMap IsLiveMap) (bool, error) {
 	if knob := r.store.TestingKnobs().RefreshReasonTicksPeriod; knob > 0 {
 		refreshAtDelta = knob
 	}
-	if !r.store.TestingKnobs().DisableRefreshReasonTicks &&
-		r.mu.replicaID != r.mu.leaderID &&
-		r.mu.ticks%refreshAtDelta == 0 {
+	if !r.store.TestingKnobs().DisableRefreshReasonTicks && r.mu.ticks%refreshAtDelta == 0 {
 		// RaftElectionTimeoutTicks is a reasonable approximation of how long we
 		// should wait before deciding that our previous proposal didn't go
 		// through. Note that the combination of the above condition and passing
 		// RaftElectionTimeoutTicks to refreshProposalsLocked means that commands
 		// will be refreshed when they have been pending for 1 to 2 election
 		// cycles.
-		//
-		// However, we don't refresh proposals if we are the leader because
-		// doing so would be useless. The commands tracked by a leader replica
-		// were either all proposed when the replica was a leader or were
-		// re-proposed when the replica became a leader. Either way, they are
-		// guaranteed to be in the leader's Raft log so re-proposing won't do
-		// anything.
 		r.refreshProposalsLocked(refreshAtDelta, reasonTicks)
 	}
 	return true, nil
@@ -5407,9 +5305,6 @@ func (r *Replica) processRaftCommand(
 		delete(r.mu.localProposals, idKey)
 	}
 
-	// Delete the entry for a forwarded proposal set.
-	delete(r.mu.remoteProposals, idKey)
-
 	leaseIndex, proposalRetry, forcedErr := r.checkForcedErrLocked(ctx, idKey, raftCmd, proposal, proposedLocally)
 
 	r.mu.Unlock()
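
For reference, a hedged sketch of the behavior the stepRaftGroup hunk above now leans on: when the uncommitted tail is over the cap, etcd/raft rejects the proposal with raft.ErrProposalDropped, and the replica swallows it and lets the periodic refresh re-propose later. The function below is illustrative only, not CockroachDB's actual proposal path.

package example

import "github.com/coreos/etcd/raft"

// proposeWithDropTolerance proposes data through a RawNode and treats
// ErrProposalDropped as a retryable condition rather than a hard failure.
// Hypothetical helper; the real code re-proposes via its refresh loop.
func proposeWithDropTolerance(rn *raft.RawNode, data []byte) error {
	if err := rn.Propose(data); err != nil {
		if err == raft.ErrProposalDropped {
			// The uncommitted log is full (or there is currently no leader).
			// Drop silently; a later refresh tick will re-propose the command.
			return nil
		}
		return err
	}
	return nil
}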
