Skip to content

Commit e620cd1

Browse files
committed
taskrunner: add different Restart modes
Using the task event to differentiate between the allocrunner restart methods proved confusing for developers trying to understand how it all worked. So instead of relying on the event type, this commit separates the logic of restarting a taskRunner into two methods: - `Restart` retains the current behaviour and will only restart the task if it's currently running. - `ForceRestart` is the new method where a `dead` task is allowed to restart if its `Run()` method is still active. Callers will need to restart the allocRunner taskCoordinator to make sure it will allow the task to run again.
1 parent 647f071 commit e620cd1

File tree

7 files changed

+72
-76
lines changed

7 files changed

+72
-76
lines changed

client/allocrunner/alloc_runner.go

+13-30
Original file line numberDiff line numberDiff line change
@@ -1230,25 +1230,12 @@ func (ar *allocRunner) GetTaskEventHandler(taskName string) drivermanager.EventH
12301230

12311231
// Restart satisfies the WorkloadRestarter interface and restarts all tasks
12321232
// that are currently running.
1233-
//
1234-
// The event type will be set to TaskRestartRunningSignal to comply with
1235-
// internal restart logic requirements.
12361233
func (ar *allocRunner) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
1237-
if event.Type != structs.TaskRestartRunningSignal {
1238-
event.Type = structs.TaskRestartRunningSignal
1239-
}
1240-
return ar.restartTasks(ctx, event, failure)
1234+
return ar.restartTasks(ctx, event, failure, false)
12411235
}
12421236

12431237
// RestartTask restarts the provided task.
1244-
//
1245-
// The event type will be set to TaskRestartSignal to comply with internal
1246-
// restart logic requirements.
12471238
func (ar *allocRunner) RestartTask(taskName string, event *structs.TaskEvent) error {
1248-
if event.Type != structs.TaskRestartSignal {
1249-
event.Type = structs.TaskRestartSignal
1250-
}
1251-
12521239
tr, ok := ar.tasks[taskName]
12531240
if !ok {
12541241
return fmt.Errorf("Could not find task runner for task: %s", taskName)
@@ -1258,31 +1245,20 @@ func (ar *allocRunner) RestartTask(taskName string, event *structs.TaskEvent) er
12581245
}
12591246

12601247
// RestartRunning restarts all tasks that are currently running.
1261-
//
1262-
// The event type will be set to TaskRestartRunningSignal to comply with
1263-
// internal restart logic requirements.
12641248
func (ar *allocRunner) RestartRunning(event *structs.TaskEvent) error {
1265-
if event.Type != structs.TaskRestartRunningSignal {
1266-
event.Type = structs.TaskRestartRunningSignal
1267-
}
1268-
return ar.restartTasks(context.TODO(), event, false)
1249+
return ar.restartTasks(context.TODO(), event, false, false)
12691250
}
12701251

12711252
// RestartAll restarts all tasks in the allocation, including dead ones. They
1272-
// will restart following their lifecycle order. Only the TaskRestartAllSignal
1273-
// event type may be used.
1253+
// will restart following their lifecycle order.
12741254
func (ar *allocRunner) RestartAll(event *structs.TaskEvent) error {
1275-
if event.Type != structs.TaskRestartAllSignal {
1276-
return fmt.Errorf("Invalid event %s for all tasks restart request", event.Type)
1277-
}
1278-
12791255
// Restart the taskCoordinator to allow dead tasks to run again.
12801256
ar.taskCoordinator.Restart()
1281-
return ar.restartTasks(context.TODO(), event, false)
1257+
return ar.restartTasks(context.TODO(), event, false, true)
12821258
}
12831259

12841260
// restartTasks restarts all task runners concurrently.
1285-
func (ar *allocRunner) restartTasks(ctx context.Context, event *structs.TaskEvent, failure bool) error {
1261+
func (ar *allocRunner) restartTasks(ctx context.Context, event *structs.TaskEvent, failure bool, force bool) error {
12861262
waitCh := make(chan struct{})
12871263
var err *multierror.Error
12881264
var errMutex sync.Mutex
@@ -1297,7 +1273,14 @@ func (ar *allocRunner) restartTasks(ctx context.Context, event *structs.TaskEven
12971273
wg.Add(1)
12981274
go func(taskName string, taskRunner *taskrunner.TaskRunner) {
12991275
defer wg.Done()
1300-
e := taskRunner.Restart(ctx, event.Copy(), failure)
1276+
1277+
var e error
1278+
if force {
1279+
e = taskRunner.ForceRestart(ctx, event.Copy(), failure)
1280+
} else {
1281+
e = taskRunner.Restart(ctx, event.Copy(), failure)
1282+
}
1283+
13011284
// Ignore ErrTaskNotRunning errors since tasks that are not
13021285
// running are expected to not be restarted.
13031286
if e != nil && e != taskrunner.ErrTaskNotRunning {

client/allocrunner/alloc_runner_test.go

+6-10
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,8 @@ func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
504504
Mode: structs.RestartPolicyModeFail,
505505
}
506506

507+
ev := &structs.TaskEvent{Type: structs.TaskRestartSignal}
508+
507509
testCases := []struct {
508510
name string
509511
taskDefs []mock.LifecycleTaskDef
@@ -516,7 +518,6 @@ func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
516518
{
517519
name: "restart entire allocation",
518520
action: func(ar *allocRunner, alloc *structs.Allocation) error {
519-
ev := &structs.TaskEvent{Type: structs.TaskRestartAllSignal}
520521
return ar.RestartAll(ev)
521522
},
522523
expectedAfter: map[string]structs.TaskState{
@@ -531,7 +532,6 @@ func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
531532
{
532533
name: "restart only running tasks",
533534
action: func(ar *allocRunner, alloc *structs.Allocation) error {
534-
ev := &structs.TaskEvent{Type: structs.TaskRestartRunningSignal}
535535
return ar.RestartRunning(ev)
536536
},
537537
expectedAfter: map[string]structs.TaskState{
@@ -555,7 +555,6 @@ func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
555555
},
556556
isBatch: true,
557557
action: func(ar *allocRunner, alloc *structs.Allocation) error {
558-
ev := &structs.TaskEvent{Type: structs.TaskRestartAllSignal}
559558
return ar.RestartAll(ev)
560559
},
561560
expectedAfter: map[string]structs.TaskState{
@@ -579,7 +578,6 @@ func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
579578
},
580579
isBatch: true,
581580
action: func(ar *allocRunner, alloc *structs.Allocation) error {
582-
ev := &structs.TaskEvent{Type: structs.TaskRestartRunningSignal}
583581
return ar.RestartRunning(ev)
584582
},
585583
expectedAfter: map[string]structs.TaskState{
@@ -595,7 +593,6 @@ func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
595593
name: "restart entire allocation with leader",
596594
hasLeader: true,
597595
action: func(ar *allocRunner, alloc *structs.Allocation) error {
598-
ev := &structs.TaskEvent{Type: structs.TaskRestartAllSignal}
599596
return ar.RestartAll(ev)
600597
},
601598
expectedAfter: map[string]structs.TaskState{
@@ -627,7 +624,6 @@ func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
627624
{
628625
name: "restart main task",
629626
action: func(ar *allocRunner, alloc *structs.Allocation) error {
630-
ev := &structs.TaskEvent{Type: structs.TaskRestartSignal}
631627
return ar.RestartTask("main", ev)
632628
},
633629
expectedAfter: map[string]structs.TaskState{
@@ -643,7 +639,7 @@ func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
643639
name: "restart leader main task",
644640
hasLeader: true,
645641
action: func(ar *allocRunner, alloc *structs.Allocation) error {
646-
return ar.RestartTask("main", &structs.TaskEvent{Type: structs.TaskRestartSignal})
642+
return ar.RestartTask("main", ev)
647643
},
648644
expectedAfter: map[string]structs.TaskState{
649645
"main": structs.TaskState{State: "running", Restarts: 1},
@@ -761,7 +757,7 @@ func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
761757
// make sure main task has had a chance to restart once on its
762758
// own and fail again before we try to manually restart it
763759
time.Sleep(5 * time.Second)
764-
return ar.RestartTask("main", &structs.TaskEvent{Type: structs.TaskRestartSignal})
760+
return ar.RestartTask("main", ev)
765761
},
766762
expectedErr: "Task not running",
767763
expectedAfter: map[string]structs.TaskState{
@@ -776,7 +772,7 @@ func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
776772
{
777773
name: "restart prestart-sidecar task",
778774
action: func(ar *allocRunner, alloc *structs.Allocation) error {
779-
return ar.RestartTask("prestart-sidecar", &structs.TaskEvent{Type: structs.TaskRestartSignal})
775+
return ar.RestartTask("prestart-sidecar", ev)
780776
},
781777
expectedAfter: map[string]structs.TaskState{
782778
"main": structs.TaskState{State: "running", Restarts: 0},
@@ -790,7 +786,7 @@ func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
790786
{
791787
name: "restart poststart-sidecar task",
792788
action: func(ar *allocRunner, alloc *structs.Allocation) error {
793-
return ar.RestartTask("poststart-sidecar", &structs.TaskEvent{Type: structs.TaskRestartSignal})
789+
return ar.RestartTask("poststart-sidecar", ev)
794790
},
795791
expectedAfter: map[string]structs.TaskState{
796792
"main": structs.TaskState{State: "running", Restarts: 0},

client/allocrunner/taskrunner/lifecycle.go

+46-9
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,34 @@ import (
66
"github.com/hashicorp/nomad/nomad/structs"
77
)
88

9-
// Restart a task. Returns immediately if no task is running. Blocks until
10-
// existing task exits or passed-in context is canceled.
9+
// Restart restarts a task that is already running. Returns an error if the
10+
// task is not running. Blocks until existing task exits or passed-in context
11+
// is canceled.
1112
func (tr *TaskRunner) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
1213
tr.logger.Trace("Restart requested", "failure", failure, "event", event.GoString())
1314

14-
// Check if the task is able to restart based on its state and the type of
15-
// restart event that was triggered.
15+
taskState := tr.TaskState()
16+
if taskState == nil {
17+
return ErrTaskNotRunning
18+
}
19+
20+
switch taskState.State {
21+
case structs.TaskStatePending, structs.TaskStateDead:
22+
return ErrTaskNotRunning
23+
}
24+
25+
return tr.restartImpl(ctx, event, failure)
26+
}
27+
28+
// ForceRestart restarts a task that is already running or reruns it if dead.
29+
// Returns an error if the task is not able to rerun. Blocks until existing
30+
// task exits or passed-in context is canceled.
31+
//
32+
// Callers must restart the AllocRunner taskCoordinator beforehand to make sure
33+
// the task will be able to run again.
34+
func (tr *TaskRunner) ForceRestart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
35+
tr.logger.Trace("Force restart requested", "failure", failure, "event", event.GoString())
36+
1637
taskState := tr.TaskState()
1738
if taskState == nil {
1839
return ErrTaskNotRunning
@@ -21,23 +42,39 @@ func (tr *TaskRunner) Restart(ctx context.Context, event *structs.TaskEvent, fai
2142
tr.stateLock.Lock()
2243
localState := tr.localState.Copy()
2344
tr.stateLock.Unlock()
45+
2446
if localState == nil {
2547
return ErrTaskNotRunning
2648
}
2749

2850
switch taskState.State {
2951
case structs.TaskStatePending:
30-
// Tasks that are "pending" are never allowed to restart.
3152
return ErrTaskNotRunning
53+
3254
case structs.TaskStateDead:
33-
// Tasks that are "dead" are only allowed to restart when restarting
34-
// all tasks in the alloc, otherwise the taskCoordinator will prevent
35-
// it from running again, and if their Run method is still running.
36-
if event.Type != structs.TaskRestartAllSignal || localState.RunComplete {
55+
// Tasks that are in the "dead" state are only allowed to restart if
56+
// their Run() method is still active.
57+
if localState.RunComplete {
3758
return ErrTaskNotRunning
3859
}
3960
}
4061

62+
return tr.restartImpl(ctx, event, failure)
63+
}
64+
65+
// restartImpl implements the task restart process.
66+
//
67+
// It should never be called directly as it doesn't verify if the task state
68+
// allows for a restart.
69+
func (tr *TaskRunner) restartImpl(ctx context.Context, event *structs.TaskEvent, failure bool) error {
70+
71+
// Bail out if the task has no recorded state. The type of restart event
72+
// that was triggered no longer affects whether a restart is allowed.
73+
taskState := tr.TaskState()
74+
if taskState == nil {
75+
return ErrTaskNotRunning
76+
}
77+
4178
// Emit the event since it may take a long time to kill
4279
tr.EmitEvent(event)
4380

client/allocrunner/taskrunner/task_runner_test.go

+4-4
Original file line numberDiff line numberDiff line change
@@ -397,8 +397,8 @@ func TestTaskRunner_Restore_Dead(t *testing.T) {
397397
// Verify that we can restart task.
398398
// Retry a few times as the newTR.Run() may not have started yet.
399399
testutil.WaitForResult(func() (bool, error) {
400-
ev := &structs.TaskEvent{Type: structs.TaskRestartAllSignal}
401-
err = newTR.Restart(context.Background(), ev, false)
400+
ev := &structs.TaskEvent{Type: structs.TaskRestartSignal}
401+
err = newTR.Rerun(context.Background(), ev, false)
402402
return err == nil, err
403403
}, func(err error) {
404404
require.NoError(t, err)
@@ -425,8 +425,8 @@ func TestTaskRunner_Restore_Dead(t *testing.T) {
425425
go newTR2.Run()
426426
defer newTR2.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
427427

428-
ev := &structs.TaskEvent{Type: structs.TaskRestartAllSignal}
429-
err = newTR2.Restart(context.Background(), ev, false)
428+
ev := &structs.TaskEvent{Type: structs.TaskRestartSignal}
429+
err = newTR2.Rerun(context.Background(), ev, false)
430430
require.Equal(t, err, ErrTaskNotRunning)
431431
}
432432

client/client.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -943,12 +943,12 @@ func (c *Client) RestartAllocation(allocID, taskName string, allTasks bool) erro
943943
}
944944

945945
if allTasks {
946-
event := structs.NewTaskEvent(structs.TaskRestartAllSignal).
946+
event := structs.NewTaskEvent(structs.TaskRestartSignal).
947947
SetRestartReason("User requested all tasks to restart")
948948
return ar.RestartAll(event)
949949
}
950950

951-
event := structs.NewTaskEvent(structs.TaskRestartRunningSignal).
951+
event := structs.NewTaskEvent(structs.TaskRestartSignal).
952952
SetRestartReason("User requested running tasks to restart")
953953
return ar.RestartRunning(event)
954954
}

command/agent/consul/check_watcher.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ func (c *checkRestart) apply(ctx context.Context, now time.Time, status string)
103103

104104
// Tell TaskRunner to restart due to failure
105105
reason := fmt.Sprintf("healthcheck: check %q unhealthy", c.checkName)
106-
event := structs.NewTaskEvent(structs.TaskRestartRunningSignal).SetRestartReason(reason)
106+
event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason)
107107
go asyncRestart(ctx, c.logger, c.task, event)
108108
return true
109109
}

nomad/structs/structs.go

-20
Original file line numberDiff line numberDiff line change
@@ -8022,14 +8022,6 @@ const (
80228022
// restarted
80238023
TaskRestartSignal = "Restart Signaled"
80248024

8025-
// TaskRestartRunningSignal indicates that all tasks in the allocation that
8026-
// are currently running have been signaled to be restarted.
8027-
TaskRestartRunningSignal = "Restart Running Signaled"
8028-
8029-
// TaskRestartAllSignal indicates that all tasks in the allocation have
8030-
// been signaled to be restarted, even the ones that have already run.
8031-
TaskRestartAllSignal = "Restart All Signaled"
8032-
80338025
// TaskSignaling indicates that the task is being signalled.
80348026
TaskSignaling = "Signaling"
80358027

@@ -8287,18 +8279,6 @@ func (e *TaskEvent) PopulateEventDisplayMessage() {
82878279
} else {
82888280
desc = "Task signaled to restart"
82898281
}
8290-
case TaskRestartRunningSignal:
8291-
if e.RestartReason != "" {
8292-
desc = e.RestartReason
8293-
} else {
8294-
desc = "Running tasks signaled to restart"
8295-
}
8296-
case TaskRestartAllSignal:
8297-
if e.RestartReason != "" {
8298-
desc = e.RestartReason
8299-
} else {
8300-
desc = "All tasks signaled to restart"
8301-
}
83028282
case TaskDriverMessage:
83038283
desc = e.DriverMessage
83048284
case TaskLeaderDead:

0 commit comments

Comments
 (0)