From f9b11a045d84b8cff622190be1483f0f08de69a8 Mon Sep 17 00:00:00 2001 From: Derek Strickland Date: Fri, 3 Jun 2022 10:24:17 -0400 Subject: [PATCH 1/6] task_runner: Add IsSidecarTask helper and correct doc string --- client/allocrunner/task_hook_coordinator.go | 8 +++----- client/allocrunner/taskrunner/task_runner_getters.go | 5 +++++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/client/allocrunner/task_hook_coordinator.go b/client/allocrunner/task_hook_coordinator.go index 09f90d22d67..a056fa3eb83 100644 --- a/client/allocrunner/task_hook_coordinator.go +++ b/client/allocrunner/task_hook_coordinator.go @@ -179,8 +179,7 @@ func (c *taskHookCoordinator) StartPoststopTasks() { // hasNonSidecarTasks returns false if all the passed tasks are sidecar tasks func hasNonSidecarTasks(tasks []*taskrunner.TaskRunner) bool { for _, tr := range tasks { - lc := tr.Task().Lifecycle - if lc == nil || !lc.Sidecar { + if !tr.IsSidecarTask() { return true } } @@ -188,11 +187,10 @@ func hasNonSidecarTasks(tasks []*taskrunner.TaskRunner) bool { return false } -// hasSidecarTasks returns true if all the passed tasks are sidecar tasks +// hasSidecarTasks returns true if any of the passed tasks are sidecar tasks func hasSidecarTasks(tasks map[string]*taskrunner.TaskRunner) bool { for _, tr := range tasks { - lc := tr.Task().Lifecycle - if lc != nil && lc.Sidecar { + if tr.IsSidecarTask() { return true } } diff --git a/client/allocrunner/taskrunner/task_runner_getters.go b/client/allocrunner/taskrunner/task_runner_getters.go index fcf3189b7a1..4d9c35e6e41 100644 --- a/client/allocrunner/taskrunner/task_runner_getters.go +++ b/client/allocrunner/taskrunner/task_runner_getters.go @@ -33,6 +33,11 @@ func (tr *TaskRunner) IsPoststopTask() bool { return tr.Task().Lifecycle != nil && tr.Task().Lifecycle.Hook == structs.TaskLifecycleHookPoststop } +// IsSidecarTask returns true if this task is a sidecar task in its task group. +func (tr *TaskRunner) IsSidecarTask() bool { + return tr.Task().Lifecycle != nil && tr.Task().Lifecycle.Sidecar +} + func (tr *TaskRunner) Task() *structs.Task { tr.taskLock.RLock() defer tr.taskLock.RUnlock() From 340fe58ce351cb22a20d964067f7c93280eb1148 Mon Sep 17 00:00:00 2001 From: Derek Strickland Date: Fri, 3 Jun 2022 10:24:44 -0400 Subject: [PATCH 2/6] alloc_runner: stop sidecar tasks last --- client/allocrunner/alloc_runner.go | 38 ++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index c5294e22301..15a035be3ff 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -626,16 +626,16 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { ar.logger.Warn("error stopping leader task", "error", err, "task_name", name) } - state := tr.TaskState() - states[name] = state + taskState := tr.TaskState() + states[name] = taskState break } - // Kill the rest concurrently + // Kill the rest non-sidecar or poststop tasks concurrently wg := sync.WaitGroup{} for name, tr := range ar.tasks { - // Filter out poststop tasks so they run after all the other tasks are killed - if tr.IsLeader() || tr.IsPoststopTask() { + // Filter out poststop and sidecar tasks so that they stop after all the other tasks are killed + if tr.IsLeader() || tr.IsPoststopTask() || tr.IsSidecarTask() { continue } @@ -649,9 +649,33 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { ar.logger.Warn("error stopping task", "error", err, "task_name", name) } - state := tr.TaskState() + taskState := tr.TaskState() mu.Lock() - states[name] = state + states[name] = taskState + mu.Unlock() + }(name, tr) + } + wg.Wait() + + // Kill the sidecar tasks last. + for name, tr := range ar.tasks { + if !tr.IsSidecarTask() || tr.IsLeader() || tr.IsPoststopTask() { + continue + } + + wg.Add(1) + go func(name string, tr *taskrunner.TaskRunner) { + defer wg.Done() + taskEvent := structs.NewTaskEvent(structs.TaskKilling) + taskEvent.SetKillTimeout(tr.Task().KillTimeout) + err := tr.Kill(context.TODO(), taskEvent) + if err != nil && err != taskrunner.ErrTaskNotRunning { + ar.logger.Warn("error stopping task", "error", err, "task_name", name) + } + + taskState := tr.TaskState() + mu.Lock() + states[name] = taskState mu.Unlock() }(name, tr) } From be7adb9a84c0c32f102b414b20c7621114dbbc27 Mon Sep 17 00:00:00 2001 From: Derek Strickland Date: Fri, 3 Jun 2022 10:26:08 -0400 Subject: [PATCH 3/6] alloc_runner: rename variable to avoid import collision --- client/allocrunner/alloc_runner.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 15a035be3ff..c9480c7c44f 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -525,21 +525,21 @@ func (ar *allocRunner) handleTaskStateUpdates() { states := make(map[string]*structs.TaskState, trNum) for name, tr := range ar.tasks { - state := tr.TaskState() - states[name] = state + taskState := tr.TaskState() + states[name] = taskState if tr.IsPoststopTask() { continue } // Capture live task runners in case we need to kill them - if state.State != structs.TaskStateDead { + if taskState.State != structs.TaskStateDead { liveRunners = append(liveRunners, tr) continue } // Task is dead, determine if other tasks should be killed - if state.Failed { + if taskState.Failed { // Only set failed event if no event has been // set yet to give dead leaders priority. if killEvent == nil { From 6a4777ee3fc44432153faf3a7cf050e1ca3f3efb Mon Sep 17 00:00:00 2001 From: Derek Strickland Date: Tue, 7 Jun 2022 06:58:20 -0400 Subject: [PATCH 4/6] alloc_runner: differentiate logging messages on task shutdown --- client/allocrunner/alloc_runner.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index c9480c7c44f..5ef9e5743f2 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -670,7 +670,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { taskEvent.SetKillTimeout(tr.Task().KillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { - ar.logger.Warn("error stopping task", "error", err, "task_name", name) + ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name) } taskState := tr.TaskState() From 0311a4d04ac4e814522c8923273f95f3fdabac2e Mon Sep 17 00:00:00 2001 From: Derek Strickland Date: Tue, 7 Jun 2022 08:59:28 -0400 Subject: [PATCH 5/6] alloc_runner: add test for task shutdown order --- client/allocrunner/alloc_runner_test.go | 130 +++++++++++++++++++++++- 1 file changed, 129 insertions(+), 1 deletion(-) diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go index 510d19940dd..a33df2dd6ef 100644 --- a/client/allocrunner/alloc_runner_test.go +++ b/client/allocrunner/alloc_runner_test.go @@ -1648,7 +1648,6 @@ func TestAllocRunner_Reconnect(t *testing.T) { require.Equal(t, tc.clientStatus, ar.AllocState().ClientStatus) - // Make sure the runner's alloc indexes match the update. require.Equal(t, update.AllocModifyIndex, ar.Alloc().AllocModifyIndex) require.Equal(t, update.ModifyIndex, ar.Alloc().ModifyIndex) @@ -1683,3 +1682,132 @@ func TestAllocRunner_Reconnect(t *testing.T) { }) } } + +// TestAllocRunner_Lifecycle_Shutdown_Order asserts that a service job with 3 +// lifecycle hooks (1 sidecar, 1 ephemeral, 1 poststop) starts all 4 tasks, and shuts down +// the sidecar after main, but before poststop. +func TestAllocRunner_Lifecycle_Shutdown_Order(t *testing.T) { + alloc := mock.LifecycleAllocWithPoststopDeploy() + + alloc.Job.Type = structs.JobTypeService + + mainTask := alloc.Job.TaskGroups[0].Tasks[0] + mainTask.Config["run_for"] = "100s" + + sidecarTask := alloc.Job.TaskGroups[0].Tasks[1] + sidecarTask.Lifecycle.Hook = structs.TaskLifecycleHookPoststart + sidecarTask.Config["run_for"] = "100s" + + poststopTask := alloc.Job.TaskGroups[0].Tasks[2] + ephemeralTask := alloc.Job.TaskGroups[0].Tasks[3] + + alloc.Job.TaskGroups[0].Tasks = []*structs.Task{mainTask, ephemeralTask, sidecarTask, poststopTask} + + conf, cleanup := testAllocRunnerConfig(t, alloc) + defer cleanup() + ar, err := NewAllocRunner(conf) + require.NoError(t, err) + defer destroy(ar) + go ar.Run() + + upd := conf.StateUpdater.(*MockStateUpdater) + + // Wait for main and sidecar tasks to be running, and that the + // ephemeral task ran and exited. + testutil.WaitForResult(func() (bool, error) { + last := upd.Last() + if last == nil { + return false, fmt.Errorf("No updates") + } + + if last.ClientStatus != structs.AllocClientStatusRunning { + return false, fmt.Errorf("expected alloc to be running not %s", last.ClientStatus) + } + + if s := last.TaskStates[mainTask.Name].State; s != structs.TaskStateRunning { + return false, fmt.Errorf("expected main task to be running not %s", s) + } + + if s := last.TaskStates[sidecarTask.Name].State; s != structs.TaskStateRunning { + return false, fmt.Errorf("expected sidecar task to be running not %s", s) + } + + if s := last.TaskStates[ephemeralTask.Name].State; s != structs.TaskStateDead { + return false, fmt.Errorf("expected ephemeral task to be dead not %s", s) + } + + if last.TaskStates[ephemeralTask.Name].Failed { + return false, fmt.Errorf("expected ephemeral task to be successful not failed") + } + + return true, nil + }, func(err error) { + t.Fatalf("error waiting for initial state:\n%v", err) + }) + + // Tell the alloc to stop + stopAlloc := alloc.Copy() + stopAlloc.DesiredStatus = structs.AllocDesiredStatusStop + ar.Update(stopAlloc) + + // Wait for tasks to stop. + testutil.WaitForResult(func() (bool, error) { + last := upd.Last() + + if s := last.TaskStates[ephemeralTask.Name].State; s != structs.TaskStateDead { + return false, fmt.Errorf("expected ephemeral task to be dead not %s", s) + } + + if last.TaskStates[ephemeralTask.Name].Failed { + return false, fmt.Errorf("expected ephemeral task to be successful not failed") + } + + if s := last.TaskStates[mainTask.Name].State; s != structs.TaskStateDead { + return false, fmt.Errorf("expected main task to be dead not %s", s) + } + + if last.TaskStates[mainTask.Name].Failed { + return false, fmt.Errorf("expected main task to be successful not failed") + } + + if s := last.TaskStates[sidecarTask.Name].State; s != structs.TaskStateDead { + return false, fmt.Errorf("expected sidecar task to be dead not %s", s) + } + + if last.TaskStates[sidecarTask.Name].Failed { + return false, fmt.Errorf("expected sidecar task to be successful not failed") + } + + if s := last.TaskStates[poststopTask.Name].State; s != structs.TaskStateRunning { + return false, fmt.Errorf("expected poststop task to be running not %s", s) + } + + return true, nil + }, func(err error) { + t.Fatalf("error waiting for kill state:\n%v", err) + }) + + last := upd.Last() + require.Less(t, last.TaskStates[ephemeralTask.Name].FinishedAt, last.TaskStates[mainTask.Name].FinishedAt) + require.Less(t, last.TaskStates[mainTask.Name].FinishedAt, last.TaskStates[sidecarTask.Name].FinishedAt) + + // Wait for poststop task to stop. + testutil.WaitForResult(func() (bool, error) { + last := upd.Last() + + if s := last.TaskStates[poststopTask.Name].State; s != structs.TaskStateDead { + return false, fmt.Errorf("expected poststop task to be dead not %s", s) + } + + if last.TaskStates[poststopTask.Name].Failed { + return false, fmt.Errorf("expected poststop task to be successful not failed") + } + + return true, nil + }, func(err error) { + t.Fatalf("error waiting for poststop state:\n%v", err) + }) + + last = upd.Last() + require.Less(t, last.TaskStates[sidecarTask.Name].FinishedAt, last.TaskStates[poststopTask.Name].FinishedAt) +} From d78cfa9e96ab4dc6e7524c755a5513d8ca77442a Mon Sep 17 00:00:00 2001 From: Derek Strickland Date: Tue, 7 Jun 2022 09:07:11 -0400 Subject: [PATCH 6/6] changelog: 13055 stop sidecars last --- .changelog/13055.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/13055.txt diff --git a/.changelog/13055.txt b/.changelog/13055.txt new file mode 100644 index 00000000000..f04454a1938 --- /dev/null +++ b/.changelog/13055.txt @@ -0,0 +1,3 @@ +```release-note:bug +lifecycle: fixed a bug where sidecar tasks were not being stopped last +```