
Commit 493945a

Author: Mahmood Ali
Alternative approach: avoid restoring
This uses an alternative approach: avoid restoring the alloc runner in the first place if we suspect that the alloc may already have been completed.
1 parent cbc521e commit 493945a
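
In outline, the suspicious-alloc check now runs in the client's restoreState loop instead of the alloc runner's Restore/Run path. Below is a condensed sketch of the new flow, pulled from the client/client.go hunks in this diff; unrelated restore steps and error handling are omitted, so treat it as illustrative rather than the full method body.

    // Inside (c *Client) restoreState(), before building an alloc runner
    // for each persisted alloc:
    for _, alloc := range allocs {
        // COMPAT(0.12): a pending alloc with no deployment status and no
        // task-runner state is suspected to be a completed alloc that a
        // 0.9.5 client resurrected; skip restoring it. If it should run,
        // the servers will send it to the client again.
        if c.isPotentiallyCompletedAlloc(alloc) {
            c.logger.Warn("found an alloc that may have been completed already, skipping restore",
                "alloc_id", alloc.ID)
            continue
        }

        // ...otherwise restore the alloc runner as before...
    }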

File tree: 4 files changed, +95 -102 lines changed

client/allocrunner/alloc_runner.go (-56 lines)
@@ -141,11 +141,6 @@ type allocRunner struct {
 	// servers have been contacted for the first time in case of a failed
 	// restore.
 	serversContactedCh chan struct{}
-
-	// waitOnServers defaults to false but will be set true if a restore
-	// fails and the Run method should wait until serversContactedCh is
-	// closed.
-	waitOnServers bool
 }

 // NewAllocRunner returns a new allocation runner.

@@ -248,16 +243,6 @@ func (ar *allocRunner) Run() {
 	// Start the alloc update handler
 	go ar.handleAllocUpdates()

-	if ar.waitOnServers {
-		ar.logger.Info(" waiting to contact server before restarting")
-		select {
-		case <-ar.taskStateUpdateHandlerCh:
-			return
-		case <-ar.serversContactedCh:
-			ar.logger.Info("server contacted; unblocking waiting alloc")
-		}
-	}
-
 	// If task update chan has been closed, that means we've been shutdown.
 	select {
 	case <-ar.taskStateUpdateHandlerCh:

@@ -368,50 +353,9 @@ func (ar *allocRunner) Restore() error {
 		}
 	}

-	ar.waitOnServers = ar.shouldWaitForServers(ds)
 	return nil
 }

-// shouldWaitForServers returns true if we suspect the alloc
-// is potentially a completed alloc that got resurrected after AR was destroyed.
-// In such cases, rerunning the alloc can lead to process and task exhaustion.
-//
-// The heaurstic used here is an alloc is suspect if it's in a pending state
-// and no other task/status info is found.
-//
-// See:
-//  * https://github.com/hashicorp/nomad/pull/6207
-//  * https://github.com/hashicorp/nomad/issues/5984
-//
-// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
-func (ar *allocRunner) shouldWaitForServers(ds *structs.AllocDeploymentStatus) bool {
-	alloc := ar.Alloc()
-
-	if alloc.ClientStatus != structs.AllocClientStatusPending {
-		return false
-	}
-
-	// check if we restore a task but see no other data
-	if ds != nil {
-		return false
-	}
-
-	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
-	if tg == nil {
-		// corrupt alloc?!
-		return true
-	}
-
-	for _, task := range tg.Tasks {
-		ls, tr, _ := ar.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
-		if ls != nil || tr != nil {
-			return false
-		}
-	}
-
-	return true
-}
-
 // persistDeploymentStatus stores AllocDeploymentStatus.
 func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
 	if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {

client/allocrunner/alloc_runner_test.go (-41 lines)
@@ -1059,44 +1059,3 @@ func TestAllocRunner_PersistState_Destroyed(t *testing.T) {
 	require.NoError(t, err)
 	require.Nil(t, ts)
 }
-
-// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
-func TestAllocRunner_WaitForServer_Detects_Suspicious_Allocs(t *testing.T) {
-	t.Parallel()
-	alloc := mock.BatchAlloc()
-
-	conf, cleanup := testAllocRunnerConfig(t, alloc)
-	conf.StateDB = state.NewMemDB(conf.Logger)
-
-	defer cleanup()
-	ar, err := NewAllocRunner(conf)
-	require.NoError(t, err)
-	defer destroy(ar)
-
-	defer destroy(ar)
-	go ar.Run()
-
-	select {
-	case <-ar.WaitCh():
-	case <-time.After(10 * time.Second):
-		require.Fail(t, "timed out waiting for alloc to complete")
-	}
-
-	// shouldn't wait after successful completion
-	require.False(t, ar.shouldWaitForServers(nil))
-
-	// new alloc runner shouldn't restore completed alloc
-	ar, err = NewAllocRunner(conf)
-	require.NoError(t, err)
-	ar.Restore()
-	require.False(t, ar.shouldWaitForServers(nil))
-
-	// simulate 0.9.5 behavior
-	require.NoError(t, conf.StateDB.DeleteAllocationBucket(alloc.ID))
-	require.NoError(t, conf.StateDB.PutAllocation(alloc))
-
-	ar, err = NewAllocRunner(conf)
-	require.NoError(t, err)
-	ar.Restore()
-	require.True(t, ar.shouldWaitForServers(nil))
-}

client/client.go (+50 -3 lines)
@@ -14,11 +14,11 @@ import (
 	"sync"
 	"time"

-	"github.com/armon/go-metrics"
+	metrics "github.com/armon/go-metrics"
 	consulapi "github.com/hashicorp/consul/api"
 	"github.com/hashicorp/consul/lib"
-	"github.com/hashicorp/go-hclog"
-	"github.com/hashicorp/go-multierror"
+	hclog "github.com/hashicorp/go-hclog"
+	multierror "github.com/hashicorp/go-multierror"
 	"github.com/hashicorp/nomad/client/allocdir"
 	"github.com/hashicorp/nomad/client/allocrunner"
 	"github.com/hashicorp/nomad/client/allocrunner/interfaces"

@@ -1006,6 +1006,15 @@ func (c *Client) restoreState() error {
 	// Load each alloc back
 	for _, alloc := range allocs {

+		// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
+		// See isPotentiallyCompletedAlloc for details. Skip suspicious allocs
+		// now; if the allocs should be run, they will be started when the client
+		// gets allocs from servers.
+		if c.isPotentiallyCompletedAlloc(alloc) {
+			c.logger.Warn("found an alloc that may have been completed already, skipping restore", "alloc_id", alloc.ID)
+			continue
+		}
+
 		//XXX On Restore we give up on watching previous allocs because
 		// we need the local AllocRunners initialized first. We could
 		// add a second loop to initialize just the alloc watcher.

@@ -1062,6 +1071,44 @@ func (c *Client) restoreState() error {
 	return nil
 }

+// isPotentiallyCompletedAlloc returns true if we suspect the alloc
+// is potentially a completed alloc that got resurrected after AR was destroyed.
+// In such cases, rerunning the alloc can lead to process and task exhaustion.
+//
+// The heuristic used here is that an alloc is suspect if it is still pending
+// and no other task/status info is found.
+//
+// See:
+//  * https://github.com/hashicorp/nomad/pull/6207
+//  * https://github.com/hashicorp/nomad/issues/5984
+//
+// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
+func (c *Client) isPotentiallyCompletedAlloc(alloc *structs.Allocation) bool {
+	if alloc.ClientStatus != structs.AllocClientStatusPending {
+		return false
+	}
+
+	ds, _ := c.stateDB.GetDeploymentStatus(alloc.ID)
+	if ds != nil {
+		return false
+	}
+
+	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
+	if tg == nil {
+		// corrupt alloc?!
+		return true
+	}
+
+	for _, task := range tg.Tasks {
+		ls, tr, _ := c.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
+		if ls != nil || tr != nil {
+			return false
+		}
+	}
+
+	return true
+}
+
 func (c *Client) handleInvalidAllocs(alloc *structs.Allocation, err error) {
 	c.invalidAllocsLock.Lock()
 	c.invalidAllocs[alloc.ID] = struct{}{}
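
Worth noting about the placement of this check: isPotentiallyCompletedAlloc consults the client's state DB directly (GetDeploymentStatus and GetTaskRunnerState) before any alloc runner is constructed, so a suspect alloc is never restored at all. The removed shouldWaitForServers path in alloc_runner.go instead restored the runner and only deferred Run() until the servers had been contacted.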

client/client_test.go (+45 -2 lines)
@@ -10,10 +10,12 @@ import (
 	"testing"
 	"time"

-	"github.com/hashicorp/go-memdb"
+	memdb "github.com/hashicorp/go-memdb"
+	trstate "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
 	"github.com/hashicorp/nomad/client/config"
 	consulApi "github.com/hashicorp/nomad/client/consul"
 	"github.com/hashicorp/nomad/client/fingerprint"
+	"github.com/hashicorp/nomad/client/state"
 	"github.com/hashicorp/nomad/command/agent/consul"
 	"github.com/hashicorp/nomad/helper/pluginutils/catalog"
 	"github.com/hashicorp/nomad/helper/testlog"

@@ -27,7 +29,7 @@ import (
 	"github.com/hashicorp/nomad/testutil"
 	"github.com/stretchr/testify/assert"

-	"github.com/hashicorp/go-hclog"
+	hclog "github.com/hashicorp/go-hclog"
 	cstate "github.com/hashicorp/nomad/client/state"
 	ctestutil "github.com/hashicorp/nomad/client/testutil"
 	"github.com/stretchr/testify/require"

@@ -1644,3 +1646,44 @@ func TestClient_updateNodeFromDriverUpdatesAll(t *testing.T) {
 		assert.EqualValues(t, n, un)
 	}
 }
+
+// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
+func TestClient_Restore_PotentiallyCompletedAlloc(t *testing.T) {
+	t.Parallel()
+
+	c, cleanup := TestClient(t, nil)
+	defer cleanup()
+
+	c.stateDB = state.NewMemDB(c.logger)
+
+	t.Run("plain alloc", func(t *testing.T) {
+		alloc := mock.BatchAlloc()
+		c.stateDB.PutAllocation(alloc)
+
+		require.True(t, c.isPotentiallyCompletedAlloc(alloc))
+	})
+
+	t.Run("alloc with a task with local state", func(t *testing.T) {
+		alloc := mock.BatchAlloc()
+		taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
+		ls := &trstate.LocalState{}
+
+		c.stateDB.PutAllocation(alloc)
+		c.stateDB.PutTaskRunnerLocalState(alloc.ID, taskName, ls)
+
+		require.False(t, c.isPotentiallyCompletedAlloc(alloc))
+	})
+
+	t.Run("alloc with a task with task state", func(t *testing.T) {
+		alloc := mock.BatchAlloc()
+		taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
+		ts := &structs.TaskState{
+			State: structs.TaskStateRunning,
+		}
+
+		c.stateDB.PutAllocation(alloc)
+		c.stateDB.PutTaskState(alloc.ID, taskName, ts)
+
+		require.False(t, c.isPotentiallyCompletedAlloc(alloc))
+	})
+}
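
The new client-side behavior is covered by TestClient_Restore_PotentiallyCompletedAlloc above, which exercises the heuristic against a bare pending alloc, an alloc with persisted task-runner local state, and an alloc with a persisted task state. Assuming a full checkout of the repository, it can be run on its own with "go test ./client -run TestClient_Restore_PotentiallyCompletedAlloc".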
