Skip to content

Commit 3f511eb

Browse files
authored
Merge pull request #1403 from hashicorp/f-hold-rpc
Gracefully handle short lived outages by holding RPC calls
2 parents 5798a23 + 33e655b commit 3f511eb

File tree

3 files changed

+78
-15
lines changed

3 files changed

+78
-15
lines changed

nomad/config.go

+8
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,13 @@ type Config struct {
181181

182182
// ConsulConfig is this Agent's Consul configuration
183183
ConsulConfig *config.ConsulConfig
184+
185+
// RPCHoldTimeout is how long an RPC can be "held" before it is errored.
186+
// This is used to paper over a loss of leadership by instead holding RPCs,
187+
// so that the caller experiences a slow response rather than an error.
188+
// This period is meant to be long enough for a leader election to take
189+
// place, and a small jitter is applied to avoid a thundering herd.
190+
RPCHoldTimeout time.Duration
184191
}
185192

186193
// CheckVersion is used to check if the ProtocolVersion is valid
@@ -227,6 +234,7 @@ func DefaultConfig() *Config {
227234
HeartbeatGrace: 10 * time.Second,
228235
FailoverHeartbeatTTL: 300 * time.Second,
229236
ConsulConfig: config.DefaultConsulConfig(),
237+
RPCHoldTimeout: 5 * time.Second,
230238
}
231239

232240
// Enable all known schedulers by default

nomad/rpc.go

+48-8
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ const (
3939

4040
// jitterFraction is the limit to the amount of jitter we apply
4141
// to a user specified MaxQueryTime. We divide the specified time by
42-
// the fraction. So 16 == 6.25% limit of jitter
42+
// the fraction. So 16 == 6.25% limit of jitter. This jitter is also
43+
// applied to RPCHoldTimeout.
4344
jitterFraction = 16
4445

4546
// Warn if the Raft command is larger than this.
@@ -175,6 +176,8 @@ func (s *Server) handleNomadConn(conn net.Conn) {
175176
// forward is used to forward to a remote region or to forward to the local leader
176177
// Returns a bool indicating whether forwarding was performed, as well as any error
177178
func (s *Server) forward(method string, info structs.RPCInfo, args interface{}, reply interface{}) (bool, error) {
179+
var firstCheck time.Time
180+
178181
region := info.RequestRegion()
179182
if region == "" {
180183
return true, fmt.Errorf("missing target RPC")
@@ -191,27 +194,64 @@ func (s *Server) forward(method string, info structs.RPCInfo, args interface{},
191194
return false, nil
192195
}
193196

194-
// Handle leader forwarding
195-
if !s.IsLeader() {
196-
err := s.forwardLeader(method, args, reply)
197+
CHECK_LEADER:
198+
// Find the leader
199+
isLeader, remoteServer := s.getLeader()
200+
201+
// Handle the case we are the leader
202+
if isLeader {
203+
return false, nil
204+
}
205+
206+
// Handle the case of a known leader
207+
if remoteServer != nil {
208+
err := s.forwardLeader(remoteServer, method, args, reply)
197209
return true, err
198210
}
199-
return false, nil
211+
212+
// Gate the request until there is a leader
213+
if firstCheck.IsZero() {
214+
firstCheck = time.Now()
215+
}
216+
if time.Now().Sub(firstCheck) < s.config.RPCHoldTimeout {
217+
jitter := lib.RandomStagger(s.config.RPCHoldTimeout / jitterFraction)
218+
select {
219+
case <-time.After(jitter):
220+
goto CHECK_LEADER
221+
case <-s.shutdownCh:
222+
}
223+
}
224+
225+
// No leader found and hold time exceeded
226+
return true, structs.ErrNoLeader
200227
}
201228

202-
// forwardLeader is used to forward an RPC call to the leader, or fail if no leader
203-
func (s *Server) forwardLeader(method string, args interface{}, reply interface{}) error {
229+
// getLeader returns whether the current node is the leader, and if not
230+
// then it returns the leader which is potentially nil if the cluster
231+
// has not yet elected a leader.
232+
func (s *Server) getLeader() (bool, *serverParts) {
233+
// Check if we are the leader
234+
if s.IsLeader() {
235+
return true, nil
236+
}
237+
204238
// Get the leader
205239
leader := s.raft.Leader()
206240
if leader == "" {
207-
return structs.ErrNoLeader
241+
return false, nil
208242
}
209243

210244
// Lookup the server
211245
s.peerLock.RLock()
212246
server := s.localPeers[leader]
213247
s.peerLock.RUnlock()
214248

249+
// Server could be nil
250+
return false, server
251+
}
252+
253+
// forwardLeader is used to forward an RPC call to the leader, or fail if no leader
254+
func (s *Server) forwardLeader(server *serverParts, method string, args interface{}, reply interface{}) error {
215255
// Handle a missing server
216256
if server == nil {
217257
return structs.ErrNoLeader

nomad/rpc_test.go

+22-7
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,30 @@ func TestRPC_forwardLeader(t *testing.T) {
3333
testutil.WaitForLeader(t, s1.RPC)
3434
testutil.WaitForLeader(t, s2.RPC)
3535

36-
var out struct{}
37-
err := s1.forwardLeader("Status.Ping", struct{}{}, &out)
38-
if err != nil {
39-
t.Fatalf("err: %v", err)
36+
isLeader, remote := s1.getLeader()
37+
if !isLeader && remote == nil {
38+
t.Fatalf("missing leader")
4039
}
4140

42-
err = s2.forwardLeader("Status.Ping", struct{}{}, &out)
43-
if err != nil {
44-
t.Fatalf("err: %v", err)
41+
if remote != nil {
42+
var out struct{}
43+
err := s1.forwardLeader(remote, "Status.Ping", struct{}{}, &out)
44+
if err != nil {
45+
t.Fatalf("err: %v", err)
46+
}
47+
}
48+
49+
isLeader, remote = s2.getLeader()
50+
if !isLeader && remote == nil {
51+
t.Fatalf("missing leader")
52+
}
53+
54+
if remote != nil {
55+
var out struct{}
56+
err := s2.forwardLeader(remote, "Status.Ping", struct{}{}, &out)
57+
if err != nil {
58+
t.Fatalf("err: %v", err)
59+
}
4560
}
4661
}
4762

0 commit comments

Comments
 (0)