Skip to content

Commit b2ef75e

Browse files
author
Mahmood Ali
authored
Merge pull request #6216 from hashicorp/b-recognize-pending-allocs
alloc_runner: wait when starting suspicious allocs
2 parents ddf2f6b + 8b05f87 commit b2ef75e

File tree

2 files changed

+93
-5
lines changed

2 files changed

+93
-5
lines changed

client/client.go

+48-3
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,11 @@ import (
1414
"sync"
1515
"time"
1616

17-
"github.com/armon/go-metrics"
17+
metrics "github.com/armon/go-metrics"
1818
consulapi "github.com/hashicorp/consul/api"
1919
"github.com/hashicorp/consul/lib"
20-
"github.com/hashicorp/go-hclog"
21-
"github.com/hashicorp/go-multierror"
20+
hclog "github.com/hashicorp/go-hclog"
21+
multierror "github.com/hashicorp/go-multierror"
2222
"github.com/hashicorp/nomad/client/allocdir"
2323
"github.com/hashicorp/nomad/client/allocrunner"
2424
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
@@ -1006,6 +1006,15 @@ func (c *Client) restoreState() error {
10061006
// Load each alloc back
10071007
for _, alloc := range allocs {
10081008

1009+
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
1010+
// See hasLocalState for details. Skipping suspicious allocs
1011+
// now. If allocs should be run, they will be started when the client
1012+
// gets allocs from servers.
1013+
if !c.hasLocalState(alloc) {
1014+
c.logger.Warn("found a alloc without any local state, skipping restore", "alloc_id", alloc.ID)
1015+
continue
1016+
}
1017+
10091018
//XXX On Restore we give up on watching previous allocs because
10101019
// we need the local AllocRunners initialized first. We could
10111020
// add a second loop to initialize just the alloc watcher.
@@ -1062,6 +1071,42 @@ func (c *Client) restoreState() error {
10621071
return nil
10631072
}
10641073

1074+
// hasLocalState returns true if we have any other associated state
1075+
// with alloc beyond the task itself
1076+
//
1077+
// Useful for detecting if a potentially completed alloc got resurrected
1078+
// after AR was destroyed. In such cases, re-running the alloc leads to
1079+
// unexpected reruns and may lead to process and task exhaustion on node.
1080+
//
1081+
// The heuristic used here is that an alloc is suspect if we see no other information
1082+
// and no other task/status info is found.
1083+
//
1084+
// Also, an alloc without any client state will not be restored correctly; there will
1085+
// be no task processes to reattach to, etc. In such cases, client should
1086+
// wait until it gets allocs from server to launch them.
1087+
//
1088+
// See:
1089+
// * https://github.com/hashicorp/nomad/pull/6207
1090+
// * https://github.com/hashicorp/nomad/issues/5984
1091+
//
1092+
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
1093+
func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
1094+
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
1095+
if tg == nil {
1096+
// corrupt alloc?!
1097+
return false
1098+
}
1099+
1100+
for _, task := range tg.Tasks {
1101+
ls, tr, _ := c.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
1102+
if ls != nil || tr != nil {
1103+
return true
1104+
}
1105+
}
1106+
1107+
return false
1108+
}
1109+
10651110
func (c *Client) handleInvalidAllocs(alloc *structs.Allocation, err error) {
10661111
c.invalidAllocsLock.Lock()
10671112
c.invalidAllocs[alloc.ID] = struct{}{}

client/client_test.go

+45-2
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@ import (
1010
"testing"
1111
"time"
1212

13-
"github.com/hashicorp/go-memdb"
13+
memdb "github.com/hashicorp/go-memdb"
14+
trstate "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
1415
"github.com/hashicorp/nomad/client/config"
1516
consulApi "github.com/hashicorp/nomad/client/consul"
1617
"github.com/hashicorp/nomad/client/fingerprint"
18+
"github.com/hashicorp/nomad/client/state"
1719
"github.com/hashicorp/nomad/command/agent/consul"
1820
"github.com/hashicorp/nomad/helper/pluginutils/catalog"
1921
"github.com/hashicorp/nomad/helper/testlog"
@@ -27,7 +29,7 @@ import (
2729
"github.com/hashicorp/nomad/testutil"
2830
"github.com/stretchr/testify/assert"
2931

30-
"github.com/hashicorp/go-hclog"
32+
hclog "github.com/hashicorp/go-hclog"
3133
cstate "github.com/hashicorp/nomad/client/state"
3234
ctestutil "github.com/hashicorp/nomad/client/testutil"
3335
"github.com/stretchr/testify/require"
@@ -1644,3 +1646,44 @@ func TestClient_updateNodeFromDriverUpdatesAll(t *testing.T) {
16441646
assert.EqualValues(t, n, un)
16451647
}
16461648
}
1649+
1650+
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
1651+
func TestClient_hasLocalState(t *testing.T) {
1652+
t.Parallel()
1653+
1654+
c, cleanup := TestClient(t, nil)
1655+
defer cleanup()
1656+
1657+
c.stateDB = state.NewMemDB(c.logger)
1658+
1659+
t.Run("plain alloc", func(t *testing.T) {
1660+
alloc := mock.BatchAlloc()
1661+
c.stateDB.PutAllocation(alloc)
1662+
1663+
require.False(t, c.hasLocalState(alloc))
1664+
})
1665+
1666+
t.Run("alloc with a task with local state", func(t *testing.T) {
1667+
alloc := mock.BatchAlloc()
1668+
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
1669+
ls := &trstate.LocalState{}
1670+
1671+
c.stateDB.PutAllocation(alloc)
1672+
c.stateDB.PutTaskRunnerLocalState(alloc.ID, taskName, ls)
1673+
1674+
require.True(t, c.hasLocalState(alloc))
1675+
})
1676+
1677+
t.Run("alloc with a task with task state", func(t *testing.T) {
1678+
alloc := mock.BatchAlloc()
1679+
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
1680+
ts := &structs.TaskState{
1681+
State: structs.TaskStateRunning,
1682+
}
1683+
1684+
c.stateDB.PutAllocation(alloc)
1685+
c.stateDB.PutTaskState(alloc.ID, taskName, ts)
1686+
1687+
require.True(t, c.hasLocalState(alloc))
1688+
})
1689+
}

0 commit comments

Comments
 (0)