@@ -14,11 +14,11 @@ import (
14
14
"sync"
15
15
"time"
16
16
17
- "github.com/armon/go-metrics"
17
+ metrics "github.com/armon/go-metrics"
18
18
consulapi "github.com/hashicorp/consul/api"
19
19
"github.com/hashicorp/consul/lib"
20
- "github.com/hashicorp/go-hclog"
21
- "github.com/hashicorp/go-multierror"
20
+ hclog "github.com/hashicorp/go-hclog"
21
+ multierror "github.com/hashicorp/go-multierror"
22
22
"github.com/hashicorp/nomad/client/allocdir"
23
23
"github.com/hashicorp/nomad/client/allocrunner"
24
24
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
@@ -1006,6 +1006,15 @@ func (c *Client) restoreState() error {
1006
1006
// Load each alloc back
1007
1007
for _ , alloc := range allocs {
1008
1008
1009
+ // COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
1010
+ // See hasLocalState for details. Skipping suspicious allocs
1011
+ // now. If allocs should be run, they will be started when the client
1012
+ // gets allocs from servers.
1013
+ if ! c .hasLocalState (alloc ) {
1014
+ c .logger .Warn ("found a alloc without any local state, skipping restore" , "alloc_id" , alloc .ID )
1015
+ continue
1016
+ }
1017
+
1009
1018
//XXX On Restore we give up on watching previous allocs because
1010
1019
// we need the local AllocRunners initialized first. We could
1011
1020
// add a second loop to initialize just the alloc watcher.
@@ -1062,6 +1071,42 @@ func (c *Client) restoreState() error {
1062
1071
return nil
1063
1072
}
1064
1073
1074
+ // hasLocalState returns true if we have any other associated state
1075
+ // with alloc beyond the task itself
1076
+ //
1077
+ // Useful for detecting if a potentially completed alloc got resurrected
1078
+ // after AR was destroyed. In such cases, re-running the alloc lead to
1079
+ // unexpected reruns and may lead to process and task exhaustion on node.
1080
+ //
1081
+ // The heuristic used here is an alloc is suspect if we see no other information
1082
+ // and no other task/status info is found.
1083
+ //
1084
+ // Also, an alloc without any client state will not be restored correctly; there will
1085
+ // be no tasks processes to reattach to, etc. In such cases, client should
1086
+ // wait until it gets allocs from server to launch them.
1087
+ //
1088
+ // See:
1089
+ // * https://github.com/hashicorp/nomad/pull/6207
1090
+ // * https://github.com/hashicorp/nomad/issues/5984
1091
+ //
1092
+ // COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
1093
+ func (c * Client ) hasLocalState (alloc * structs.Allocation ) bool {
1094
+ tg := alloc .Job .LookupTaskGroup (alloc .TaskGroup )
1095
+ if tg == nil {
1096
+ // corrupt alloc?!
1097
+ return false
1098
+ }
1099
+
1100
+ for _ , task := range tg .Tasks {
1101
+ ls , tr , _ := c .stateDB .GetTaskRunnerState (alloc .ID , task .Name )
1102
+ if ls != nil || tr != nil {
1103
+ return true
1104
+ }
1105
+ }
1106
+
1107
+ return false
1108
+ }
1109
+
1065
1110
func (c * Client ) handleInvalidAllocs (alloc * structs.Allocation , err error ) {
1066
1111
c .invalidAllocsLock .Lock ()
1067
1112
c .invalidAllocs [alloc .ID ] = struct {}{}
0 commit comments