Skip to content

Commit f143501

Browse files
authored
Merge pull request #3043 from hashicorp/f-2441-shutdown-delay
Add optional shutdown delay to tasks
2 parents 880c303 + fbe85b5 commit f143501

File tree

11 files changed

+119
-11
lines changed

11 files changed

+119
-11
lines changed

api/tasks.go

+1
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,7 @@ type Task struct {
292292
Templates []*Template
293293
DispatchPayload *DispatchPayloadConfig
294294
Leader bool
295+
ShutdownDelay time.Duration `mapstructure:"shutdown_delay"`
295296
}
296297

297298
func (t *Task) Canonicalize(tg *TaskGroup, job *Job) {

client/task_runner.go

+7
Original file line numberDiff line numberDiff line change
@@ -1171,6 +1171,13 @@ func (r *TaskRunner) run() {
11711171
interpTask := interpolateServices(r.envBuilder.Build(), r.task)
11721172
r.consul.RemoveTask(r.alloc.ID, interpTask)
11731173

1174+
// Delay actually killing the task if configured. See #2441
1175+
if r.task.ShutdownDelay > 0 {
1176+
r.logger.Printf("[DEBUG] client: delaying shutdown of alloc %q task %q for %q",
1177+
r.alloc.ID, r.task.Name, r.task.ShutdownDelay)
1178+
<-time.After(r.task.ShutdownDelay)
1179+
}
1180+
11741181
// Store the task event that provides context on the task
11751182
// destroy. The Killed event is set from the alloc_runner and
11761183
// doesn't add detail

client/task_runner_test.go

+64
Original file line numberDiff line numberDiff line change
@@ -1614,3 +1614,67 @@ func TestTaskRunner_Pre06ScriptCheck(t *testing.T) {
16141614
t.Run(run("0.5.6", "java", "tcp", false))
16151615
t.Run(run("0.5.6", "mock_driver", "tcp", false))
16161616
}
1617+
1618+
func TestTaskRunner_ShutdownDelay(t *testing.T) {
1619+
t.Parallel()
1620+
1621+
alloc := mock.Alloc()
1622+
task := alloc.Job.TaskGroups[0].Tasks[0]
1623+
task.Driver = "mock_driver"
1624+
task.Config = map[string]interface{}{
1625+
"run_for": "1000s",
1626+
}
1627+
1628+
// No shutdown escape hatch for this delay, so don't set it too high
1629+
task.ShutdownDelay = 500 * time.Duration(testutil.TestMultiplier()) * time.Millisecond
1630+
1631+
ctx := testTaskRunnerFromAlloc(t, true, alloc)
1632+
ctx.tr.MarkReceived()
1633+
go ctx.tr.Run()
1634+
defer ctx.Cleanup()
1635+
1636+
// Wait for the task to start
1637+
testWaitForTaskToStart(t, ctx)
1638+
1639+
// Begin the tear down
1640+
ctx.tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
1641+
destroyed := time.Now()
1642+
1643+
// Service should get removed quickly; loop until RemoveTask is called
1644+
found := false
1645+
mockConsul := ctx.tr.consul.(*mockConsulServiceClient)
1646+
deadline := destroyed.Add(task.ShutdownDelay)
1647+
for time.Now().Before(deadline) {
1648+
time.Sleep(5 * time.Millisecond)
1649+
1650+
mockConsul.mu.Lock()
1651+
n := len(mockConsul.ops)
1652+
if n < 2 {
1653+
mockConsul.mu.Unlock()
1654+
continue
1655+
}
1656+
1657+
lastOp := mockConsul.ops[n-1].op
1658+
mockConsul.mu.Unlock()
1659+
1660+
if lastOp == "remove" {
1661+
found = true
1662+
break
1663+
}
1664+
}
1665+
if !found {
1666+
t.Errorf("task was not removed from Consul first")
1667+
}
1668+
1669+
// Wait for actual exit
1670+
select {
1671+
case <-ctx.tr.WaitCh():
1672+
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
1673+
t.Fatalf("timeout")
1674+
}
1675+
1676+
// It should be impossible to reach here in less time than the shutdown delay
1677+
if time.Now().Before(destroyed.Add(task.ShutdownDelay)) {
1678+
t.Fatalf("task exited before shutdown delay")
1679+
}
1680+
}

command/agent/job_endpoint.go

+1
Original file line numberDiff line numberDiff line change
@@ -663,6 +663,7 @@ func ApiTaskToStructsTask(apiTask *api.Task, structsTask *structs.Task) {
663663
structsTask.Env = apiTask.Env
664664
structsTask.Meta = apiTask.Meta
665665
structsTask.KillTimeout = *apiTask.KillTimeout
666+
structsTask.ShutdownDelay = apiTask.ShutdownDelay
666667

667668
if l := len(apiTask.Constraints); l != 0 {
668669
structsTask.Constraints = make([]*structs.Constraint, l)

jobspec/parse.go

+1
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,7 @@ func parseTasks(jobName string, taskGroupName string, result *[]*api.Task, list
586586
"meta",
587587
"resources",
588588
"service",
589+
"shutdown_delay",
589590
"template",
590591
"user",
591592
"vault",

jobspec/parse_test.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,8 @@ func TestParse(t *testing.T) {
148148
},
149149
},
150150
},
151-
KillTimeout: helper.TimeToPtr(22 * time.Second),
151+
KillTimeout: helper.TimeToPtr(22 * time.Second),
152+
ShutdownDelay: 11 * time.Second,
152153
LogConfig: &api.LogConfig{
153154
MaxFiles: helper.IntToPtr(14),
154155
MaxFileSizeMB: helper.IntToPtr(101),

jobspec/test-fixtures/basic.hcl

+2
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ job "binstore-storagelocker" {
129129

130130
kill_timeout = "22s"
131131

132+
shutdown_delay = "11s"
133+
132134
artifact {
133135
source = "http://foo.com/artifact"
134136

nomad/structs/diff_test.go

+21-9
Original file line numberDiff line numberDiff line change
@@ -1922,8 +1922,8 @@ func TestTaskGroupDiff(t *testing.T) {
19221922
Driver: "docker",
19231923
},
19241924
{
1925-
Name: "baz",
1926-
Driver: "docker",
1925+
Name: "baz",
1926+
ShutdownDelay: 1 * time.Second,
19271927
},
19281928
},
19291929
},
@@ -1933,14 +1933,14 @@ func TestTaskGroupDiff(t *testing.T) {
19331933
Name: "bar",
19341934
Driver: "docker",
19351935
},
1936-
{
1937-
Name: "baz",
1938-
Driver: "exec",
1939-
},
19401936
{
19411937
Name: "bam",
19421938
Driver: "docker",
19431939
},
1940+
{
1941+
Name: "baz",
1942+
ShutdownDelay: 2 * time.Second,
1943+
},
19441944
},
19451945
},
19461946
Expected: &TaskGroupDiff{
@@ -1968,6 +1968,12 @@ func TestTaskGroupDiff(t *testing.T) {
19681968
Old: "",
19691969
New: "false",
19701970
},
1971+
{
1972+
Type: DiffTypeAdded,
1973+
Name: "ShutdownDelay",
1974+
Old: "",
1975+
New: "0",
1976+
},
19711977
},
19721978
},
19731979
{
@@ -1980,9 +1986,9 @@ func TestTaskGroupDiff(t *testing.T) {
19801986
Fields: []*FieldDiff{
19811987
{
19821988
Type: DiffTypeEdited,
1983-
Name: "Driver",
1984-
Old: "docker",
1985-
New: "exec",
1989+
Name: "ShutdownDelay",
1990+
Old: "1000000000",
1991+
New: "2000000000",
19861992
},
19871993
},
19881994
},
@@ -2008,6 +2014,12 @@ func TestTaskGroupDiff(t *testing.T) {
20082014
Old: "false",
20092015
New: "",
20102016
},
2017+
{
2018+
Type: DiffTypeDeleted,
2019+
Name: "ShutdownDelay",
2020+
Old: "0",
2021+
New: "",
2022+
},
20112023
},
20122024
},
20132025
},

nomad/structs/structs.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -2997,6 +2997,10 @@ type Task struct {
29972997
// Leader marks the task as the leader within the group. When the leader
29982998
// task exits, other tasks will be gracefully terminated.
29992999
Leader bool
3000+
3001+
// ShutdownDelay is the duration of the delay between deregistering a
3002+
// task from Consul and sending it a signal to shut down. See #2441
3003+
ShutdownDelay time.Duration
30003004
}
30013005

30023006
func (t *Task) Copy() *Task {
@@ -3104,9 +3108,12 @@ func (t *Task) Validate(ephemeralDisk *EphemeralDisk) error {
31043108
if t.Driver == "" {
31053109
mErr.Errors = append(mErr.Errors, errors.New("Missing task driver"))
31063110
}
3107-
if t.KillTimeout.Nanoseconds() < 0 {
3111+
if t.KillTimeout < 0 {
31083112
mErr.Errors = append(mErr.Errors, errors.New("KillTimeout must be a positive value"))
31093113
}
3114+
if t.ShutdownDelay < 0 {
3115+
mErr.Errors = append(mErr.Errors, errors.New("ShutdownDelay must be a positive value"))
3116+
}
31103117

31113118
// Validate the resources.
31123119
if t.Resources == nil {

website/source/api/json-jobs.html.md

+6
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,12 @@ The `Task` object supports the following keys:
367367
- `TLSSkipVerify`: If true, Consul will not attempt to verify the
368368
certificate when performing HTTPS checks. Requires Consul >= 0.7.2.
369369

370+
- `ShutdownDelay` - Specifies the duration to wait when killing a task between
371+
removing it from Consul and sending it a shutdown signal. Ideally services
372+
would fail healthchecks once they receive a shutdown signal. Alternatively
373+
`ShutdownDelay` may be set to give in-flight requests time to complete before
374+
shutting down.
375+
370376
- `Templates` - Specifies the set of [`Template`](#template) objects to render for the task.
371377
Templates can be used to inject both static and dynamic configuration with
372378
data populated from environment variables, Consul and Vault.

website/source/docs/job-specification/task.html.md

+6
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@ job "docs" {
7171
[Consul][] for service discovery. Nomad automatically registers when a task
7272
is started and de-registers it when the task dies.
7373

74+
- `shutdown_delay` `(string: "0s")` - Specifies the duration to wait when
75+
killing a task between removing it from Consul and sending it a shutdown
76+
signal. Ideally services would fail healthchecks once they receive a shutdown
77+
signal. Alternatively `shutdown_delay` may be set to give in-flight requests
78+
time to complete before shutting down.
79+
7480
- `user` `(string: <varies>)` - Specifies the user that will run the task.
7581
Defaults to `nobody` for the [`exec`][exec] and [`java`][java] drivers.
7682
[Docker][] and [rkt][] images specify their own default users. This can only

0 commit comments

Comments
 (0)