
Commit cbc521e

Authored and committed by Mahmood Ali
alloc_runner: wait when starting suspicious allocs
This commit aims to help users running with clients susceptible to the destroyed-alloc-being-restarted bug upgrade to the latest release. Without it, such users would have their tasks run unexpectedly on upgrade and would only see the bug resolved after a subsequent restart.

If, on restore, the client sees a pending alloc without any other persisted info, it errs on the side of treating it as corrupt persisted alloc state rather than assuming the client happened to be killed right as the alloc was assigned to it.

A few reasons motivate this behavior. Statistically speaking, corruption is the more likely cause: a long-running client has a higher chance of having allocs persisted incorrectly in the pending state, while being killed right as an alloc is about to start is relatively unlikely. Also, delaying the start of an alloc that hasn't started yet (hopefully by seconds) is far less severe than launching too many allocs and bringing the client down.

More importantly, this helps customers upgrade their clients without risking taking their clients down and destabilizing their cluster. We don't want existing users to be forced to trigger the bug while they upgrade and restart the cluster.
1 parent f616370 commit cbc521e
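In outline, the change has two parts: Restore marks a suspicious alloc, and Run refuses to start it until the servers have been contacted. The sketch below is a simplified illustration of that logic under assumed names (allocState, suspicious, and waitToStart are made up for this example); the committed implementation is in the diff that follows.

package main

import "fmt"

// allocState stands in for the persisted data the client inspects on
// restore; it is not a Nomad type.
type allocState struct {
    clientStatus    string // e.g. "pending", "running", "complete"
    hasDeployStatus bool   // was an AllocDeploymentStatus persisted?
    hasTaskState    bool   // was any task runner state persisted?
}

// suspicious mirrors the heuristic from the commit message: a pending alloc
// with no other persisted info is treated as corrupt state rather than an
// alloc assigned moments before the client was killed.
func suspicious(s allocState) bool {
    return s.clientStatus == "pending" && !s.hasDeployStatus && !s.hasTaskState
}

// waitToStart sketches the Run-side gate: a suspicious alloc is only started
// once the servers have been contacted (serversContacted closed); a shutdown
// (shutdown closed) aborts the wait instead.
func waitToStart(serversContacted, shutdown <-chan struct{}) bool {
    select {
    case <-shutdown:
        return false // shutting down; never start the alloc
    case <-serversContacted:
        return true // servers confirmed the alloc; safe to proceed
    }
}

func main() {
    fmt.Println(suspicious(allocState{clientStatus: "pending"})) // true: delay the start

    contacted := make(chan struct{})
    close(contacted)
    fmt.Println(waitToStart(contacted, make(chan struct{}))) // true once servers respond
}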

2 files changed: +97 -0 lines changed


client/allocrunner/alloc_runner.go

+56
@@ -141,6 +141,11 @@ type allocRunner struct {
 	// servers have been contacted for the first time in case of a failed
 	// restore.
 	serversContactedCh chan struct{}
+
+	// waitOnServers defaults to false but will be set true if a restore
+	// fails and the Run method should wait until serversContactedCh is
+	// closed.
+	waitOnServers bool
 }

 // NewAllocRunner returns a new allocation runner.
@@ -243,6 +248,16 @@ func (ar *allocRunner) Run() {
 	// Start the alloc update handler
 	go ar.handleAllocUpdates()

+	if ar.waitOnServers {
+		ar.logger.Info(" waiting to contact server before restarting")
+		select {
+		case <-ar.taskStateUpdateHandlerCh:
+			return
+		case <-ar.serversContactedCh:
+			ar.logger.Info("server contacted; unblocking waiting alloc")
+		}
+	}
+
 	// If task update chan has been closed, that means we've been shutdown.
 	select {
 	case <-ar.taskStateUpdateHandlerCh:
@@ -353,9 +368,50 @@ func (ar *allocRunner) Restore() error {
 		}
 	}

+	ar.waitOnServers = ar.shouldWaitForServers(ds)
 	return nil
 }

+// shouldWaitForServers returns true if we suspect the alloc
+// is potentially a completed alloc that got resurrected after AR was destroyed.
+// In such cases, rerunning the alloc can lead to process and task exhaustion.
+//
+// The heuristic used here is that an alloc is suspect if it's in a pending state
+// and no other task/status info is found.
+//
+// See:
+// * https://github.com/hashicorp/nomad/pull/6207
+// * https://github.com/hashicorp/nomad/issues/5984
+//
+// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
+func (ar *allocRunner) shouldWaitForServers(ds *structs.AllocDeploymentStatus) bool {
+	alloc := ar.Alloc()
+
+	if alloc.ClientStatus != structs.AllocClientStatusPending {
+		return false
+	}
+
+	// check if we restore a task but see no other data
+	if ds != nil {
+		return false
+	}
+
+	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
+	if tg == nil {
+		// corrupt alloc?!
+		return true
+	}
+
+	for _, task := range tg.Tasks {
+		ls, tr, _ := ar.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
+		if ls != nil || tr != nil {
+			return false
+		}
+	}
+
+	return true
+}
+
 // persistDeploymentStatus stores AllocDeploymentStatus.
 func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
 	if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
client/allocrunner/alloc_runner_test.go

+41
@@ -1059,3 +1059,44 @@ func TestAllocRunner_PersistState_Destroyed(t *testing.T) {
 	require.NoError(t, err)
 	require.Nil(t, ts)
 }
+
+// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
+func TestAllocRunner_WaitForServer_Detects_Suspicious_Allocs(t *testing.T) {
+	t.Parallel()
+	alloc := mock.BatchAlloc()
+
+	conf, cleanup := testAllocRunnerConfig(t, alloc)
+	conf.StateDB = state.NewMemDB(conf.Logger)
+
+	defer cleanup()
+	ar, err := NewAllocRunner(conf)
+	require.NoError(t, err)
+	defer destroy(ar)
+
+	defer destroy(ar)
+	go ar.Run()
+
+	select {
+	case <-ar.WaitCh():
+	case <-time.After(10 * time.Second):
+		require.Fail(t, "timed out waiting for alloc to complete")
+	}
+
+	// shouldn't wait after successful completion
+	require.False(t, ar.shouldWaitForServers(nil))
+
+	// new alloc runner shouldn't restore completed alloc
+	ar, err = NewAllocRunner(conf)
+	require.NoError(t, err)
+	ar.Restore()
+	require.False(t, ar.shouldWaitForServers(nil))
+
+	// simulate 0.9.5 behavior
+	require.NoError(t, conf.StateDB.DeleteAllocationBucket(alloc.ID))
+	require.NoError(t, conf.StateDB.PutAllocation(alloc))
+
+	ar, err = NewAllocRunner(conf)
+	require.NoError(t, err)
+	ar.Restore()
+	require.True(t, ar.shouldWaitForServers(nil))
+}
