
Commit cbc521e

Authored and committed by Mahmood Ali
alloc_runner: wait when starting suspicious allocs
This commit aims to help users running with clients susceptible to the destroyed-alloc-being-restarted bug upgrade to the latest release. Without it, such users would have their tasks run unexpectedly on upgrade and would only see the bug resolved after a subsequent restart.

If, on restore, the client sees a pending alloc without any other persisted info, it errs on the side of treating it as corrupt persisted alloc state rather than assuming the client happened to be killed right as the alloc was assigned to it.

A few reasons motivate this behavior. Statistically speaking, corruption is the more likely cause: a long-running client has a higher chance of having allocs persisted incorrectly in the pending state, while being killed right as an alloc is about to start is relatively unlikely. Also, delaying the start of an alloc that hasn't started yet (hopefully by seconds) is far less severe than launching too many allocs and bringing the client down.

More importantly, this helps customers upgrade their clients without risking taking their clients down and destabilizing their cluster. We don't want existing users to be forced to trigger the bug while they upgrade and restart the cluster.
1 parent f616370 commit cbc521e
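In outline, the change has two parts: Restore marks a suspicious alloc, and Run refuses to start it until the servers have been contacted. The sketch below is a simplified illustration of that logic under assumed names (allocState, suspicious, and waitToStart are made up for this example); the committed implementation is in the diff that follows.

package main

import "fmt"

// allocState stands in for the persisted data the client inspects on
// restore; it is not a Nomad type.
type allocState struct {
    clientStatus    string // e.g. "pending", "running", "complete"
    hasDeployStatus bool   // was an AllocDeploymentStatus persisted?
    hasTaskState    bool   // was any task runner state persisted?
}

// suspicious mirrors the heuristic from the commit message: a pending alloc
// with no other persisted info is treated as corrupt state rather than an
// alloc assigned moments before the client was killed.
func suspicious(s allocState) bool {
    return s.clientStatus == "pending" && !s.hasDeployStatus && !s.hasTaskState
}

// waitToStart sketches the Run-side gate: a suspicious alloc is only started
// once the servers have been contacted (serversContacted closed); a shutdown
// (shutdown closed) aborts the wait instead.
func waitToStart(serversContacted, shutdown <-chan struct{}) bool {
    select {
    case <-shutdown:
        return false // shutting down; never start the alloc
    case <-serversContacted:
        return true // servers confirmed the alloc; safe to proceed
    }
}

func main() {
    fmt.Println(suspicious(allocState{clientStatus: "pending"})) // true: delay the start

    contacted := make(chan struct{})
    close(contacted)
    fmt.Println(waitToStart(contacted, make(chan struct{}))) // true once servers respond
}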

2 files changed: +97 -0 lines changed


client/allocrunner/alloc_runner.go

+56
@@ -141,6 +141,11 @@ type allocRunner struct {
 	// servers have been contacted for the first time in case of a failed
 	// restore.
 	serversContactedCh chan struct{}
+
+	// waitOnServers defaults to false but will be set true if a restore
+	// fails and the Run method should wait until serversContactedCh is
+	// closed.
+	waitOnServers bool
 }

 // NewAllocRunner returns a new allocation runner.
@@ -243,6 +248,16 @@ func (ar *allocRunner) Run() {
 	// Start the alloc update handler
 	go ar.handleAllocUpdates()

+	if ar.waitOnServers {
+		ar.logger.Info(" waiting to contact server before restarting")
+		select {
+		case <-ar.taskStateUpdateHandlerCh:
+			return
+		case <-ar.serversContactedCh:
+			ar.logger.Info("server contacted; unblocking waiting alloc")
+		}
+	}
+
 	// If task update chan has been closed, that means we've been shutdown.
 	select {
 	case <-ar.taskStateUpdateHandlerCh:
@@ -353,9 +368,50 @@ func (ar *allocRunner) Restore() error {
 		}
 	}

+	ar.waitOnServers = ar.shouldWaitForServers(ds)
 	return nil
 }

+// shouldWaitForServers returns true if we suspect the alloc
+// is potentially a completed alloc that got resurrected after AR was destroyed.
+// In such cases, rerunning the alloc can lead to process and task exhaustion.
+//
+// The heuristic used here is that an alloc is suspect if it's in a pending state
+// and no other task/status info is found.
+//
+// See:
+// * https://github.com/hashicorp/nomad/pull/6207
+// * https://github.com/hashicorp/nomad/issues/5984
+//
+// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
+func (ar *allocRunner) shouldWaitForServers(ds *structs.AllocDeploymentStatus) bool {
+	alloc := ar.Alloc()
+
+	if alloc.ClientStatus != structs.AllocClientStatusPending {
+		return false
+	}
+
+	// check if we restore a task but see no other data
+	if ds != nil {
+		return false
+	}
+
+	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
+	if tg == nil {
+		// corrupt alloc?!
+		return true
+	}
+
+	for _, task := range tg.Tasks {
+		ls, tr, _ := ar.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
+		if ls != nil || tr != nil {
+			return false
+		}
+	}
+
+	return true
+}
+
 // persistDeploymentStatus stores AllocDeploymentStatus.
 func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
 	if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
client/allocrunner/alloc_runner_test.go

+41
@@ -1059,3 +1059,44 @@ func TestAllocRunner_PersistState_Destroyed(t *testing.T) {
 	require.NoError(t, err)
 	require.Nil(t, ts)
 }
+
+// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
+func TestAllocRunner_WaitForServer_Detects_Suspicious_Allocs(t *testing.T) {
+	t.Parallel()
+	alloc := mock.BatchAlloc()
+
+	conf, cleanup := testAllocRunnerConfig(t, alloc)
+	conf.StateDB = state.NewMemDB(conf.Logger)
+
+	defer cleanup()
+	ar, err := NewAllocRunner(conf)
+	require.NoError(t, err)
+	defer destroy(ar)
+
+	defer destroy(ar)
+	go ar.Run()
+
+	select {
+	case <-ar.WaitCh():
+	case <-time.After(10 * time.Second):
+		require.Fail(t, "timed out waiting for alloc to complete")
+	}
+
+	// shouldn't wait after successful completion
+	require.False(t, ar.shouldWaitForServers(nil))
+
+	// new alloc runner shouldn't restore completed alloc
+	ar, err = NewAllocRunner(conf)
+	require.NoError(t, err)
+	ar.Restore()
+	require.False(t, ar.shouldWaitForServers(nil))
+
+	// simulate 0.9.5 behavior
+	require.NoError(t, conf.StateDB.DeleteAllocationBucket(alloc.ID))
+	require.NoError(t, conf.StateDB.PutAllocation(alloc))
+
+	ar, err = NewAllocRunner(conf)
+	require.NoError(t, err)
+	ar.Restore()
+	require.True(t, ar.shouldWaitForServers(nil))
+}
