Skip to content

Commit e20527f

Browse files
author
Mahmood Ali
committed
per-task restart policy
1 parent 79ce20a commit e20527f

File tree

9 files changed

+300
-40
lines changed

9 files changed

+300
-40
lines changed

api/jobs_test.go

+203-14
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,10 @@ func TestJobs_Canonicalize(t *testing.T) {
157157
Migrate: DefaultMigrateStrategy(),
158158
Tasks: []*Task{
159159
{
160-
KillTimeout: timeToPtr(5 * time.Second),
161-
LogConfig: DefaultLogConfig(),
162-
Resources: DefaultResources(),
160+
KillTimeout: timeToPtr(5 * time.Second),
161+
LogConfig: DefaultLogConfig(),
162+
Resources: DefaultResources(),
163+
RestartPolicy: defaultServiceJobRestartPolicy(),
163164
},
164165
},
165166
},
@@ -222,9 +223,10 @@ func TestJobs_Canonicalize(t *testing.T) {
222223
},
223224
Tasks: []*Task{
224225
{
225-
KillTimeout: timeToPtr(5 * time.Second),
226-
LogConfig: DefaultLogConfig(),
227-
Resources: DefaultResources(),
226+
KillTimeout: timeToPtr(5 * time.Second),
227+
LogConfig: DefaultLogConfig(),
228+
Resources: DefaultResources(),
229+
RestartPolicy: defaultBatchJobRestartPolicy(),
228230
},
229231
},
230232
},
@@ -316,10 +318,11 @@ func TestJobs_Canonicalize(t *testing.T) {
316318
Migrate: DefaultMigrateStrategy(),
317319
Tasks: []*Task{
318320
{
319-
Name: "task1",
320-
LogConfig: DefaultLogConfig(),
321-
Resources: DefaultResources(),
322-
KillTimeout: timeToPtr(5 * time.Second),
321+
Name: "task1",
322+
LogConfig: DefaultLogConfig(),
323+
Resources: DefaultResources(),
324+
KillTimeout: timeToPtr(5 * time.Second),
325+
RestartPolicy: defaultServiceJobRestartPolicy(),
323326
},
324327
},
325328
},
@@ -363,6 +366,10 @@ func TestJobs_Canonicalize(t *testing.T) {
363366
"db": 6379,
364367
}},
365368
},
369+
RestartPolicy: &RestartPolicy{
370+
// inherit other values from TG
371+
Attempts: intToPtr(20),
372+
},
366373
Resources: &Resources{
367374
CPU: intToPtr(500),
368375
MemoryMB: intToPtr(256),
@@ -487,6 +494,12 @@ func TestJobs_Canonicalize(t *testing.T) {
487494
"db": 6379,
488495
}},
489496
},
497+
RestartPolicy: &RestartPolicy{
498+
Interval: timeToPtr(5 * time.Minute),
499+
Attempts: intToPtr(20),
500+
Delay: timeToPtr(25 * time.Second),
501+
Mode: stringToPtr("delay"),
502+
},
490503
Resources: &Resources{
491504
CPU: intToPtr(500),
492505
MemoryMB: intToPtr(256),
@@ -713,10 +726,11 @@ func TestJobs_Canonicalize(t *testing.T) {
713726
Migrate: DefaultMigrateStrategy(),
714727
Tasks: []*Task{
715728
{
716-
Name: "task1",
717-
LogConfig: DefaultLogConfig(),
718-
Resources: DefaultResources(),
719-
KillTimeout: timeToPtr(5 * time.Second),
729+
Name: "task1",
730+
LogConfig: DefaultLogConfig(),
731+
Resources: DefaultResources(),
732+
KillTimeout: timeToPtr(5 * time.Second),
733+
RestartPolicy: defaultServiceJobRestartPolicy(),
720734
},
721735
},
722736
},
@@ -754,12 +768,187 @@ func TestJobs_Canonicalize(t *testing.T) {
754768
AutoPromote: boolToPtr(false),
755769
},
756770
Migrate: DefaultMigrateStrategy(),
771+
Tasks: []*Task{
772+
{
773+
Name: "task1",
774+
LogConfig: DefaultLogConfig(),
775+
Resources: DefaultResources(),
776+
KillTimeout: timeToPtr(5 * time.Second),
777+
RestartPolicy: defaultServiceJobRestartPolicy(),
778+
},
779+
},
780+
},
781+
},
782+
},
783+
},
784+
785+
{
786+
name: "restart_merge",
787+
input: &Job{
788+
Name: stringToPtr("foo"),
789+
ID: stringToPtr("bar"),
790+
ParentID: stringToPtr("lol"),
791+
TaskGroups: []*TaskGroup{
792+
{
793+
Name: stringToPtr("bar"),
794+
RestartPolicy: &RestartPolicy{
795+
Delay: timeToPtr(15 * time.Second),
796+
Attempts: intToPtr(2),
797+
Interval: timeToPtr(30 * time.Minute),
798+
Mode: stringToPtr("fail"),
799+
},
800+
Tasks: []*Task{
801+
{
802+
Name: "task1",
803+
RestartPolicy: &RestartPolicy{
804+
Attempts: intToPtr(5),
805+
Delay: timeToPtr(1 * time.Second),
806+
},
807+
},
808+
},
809+
},
810+
{
811+
Name: stringToPtr("baz"),
812+
RestartPolicy: &RestartPolicy{
813+
Delay: timeToPtr(20 * time.Second),
814+
Attempts: intToPtr(2),
815+
Interval: timeToPtr(30 * time.Minute),
816+
Mode: stringToPtr("fail"),
817+
},
818+
Tasks: []*Task{
819+
{
820+
Name: "task1",
821+
},
822+
},
823+
},
824+
},
825+
},
826+
expected: &Job{
827+
Namespace: stringToPtr(DefaultNamespace),
828+
ID: stringToPtr("bar"),
829+
Name: stringToPtr("foo"),
830+
Region: stringToPtr("global"),
831+
Type: stringToPtr("service"),
832+
ParentID: stringToPtr("lol"),
833+
Priority: intToPtr(50),
834+
AllAtOnce: boolToPtr(false),
835+
ConsulToken: stringToPtr(""),
836+
VaultToken: stringToPtr(""),
837+
Stop: boolToPtr(false),
838+
Stable: boolToPtr(false),
839+
Version: uint64ToPtr(0),
840+
Status: stringToPtr(""),
841+
StatusDescription: stringToPtr(""),
842+
CreateIndex: uint64ToPtr(0),
843+
ModifyIndex: uint64ToPtr(0),
844+
JobModifyIndex: uint64ToPtr(0),
845+
Update: &UpdateStrategy{
846+
Stagger: timeToPtr(30 * time.Second),
847+
MaxParallel: intToPtr(1),
848+
HealthCheck: stringToPtr("checks"),
849+
MinHealthyTime: timeToPtr(10 * time.Second),
850+
HealthyDeadline: timeToPtr(5 * time.Minute),
851+
ProgressDeadline: timeToPtr(10 * time.Minute),
852+
AutoRevert: boolToPtr(false),
853+
Canary: intToPtr(0),
854+
AutoPromote: boolToPtr(false),
855+
},
856+
TaskGroups: []*TaskGroup{
857+
{
858+
Name: stringToPtr("bar"),
859+
Count: intToPtr(1),
860+
EphemeralDisk: &EphemeralDisk{
861+
Sticky: boolToPtr(false),
862+
Migrate: boolToPtr(false),
863+
SizeMB: intToPtr(300),
864+
},
865+
RestartPolicy: &RestartPolicy{
866+
Delay: timeToPtr(15 * time.Second),
867+
Attempts: intToPtr(2),
868+
Interval: timeToPtr(30 * time.Minute),
869+
Mode: stringToPtr("fail"),
870+
},
871+
ReschedulePolicy: &ReschedulePolicy{
872+
Attempts: intToPtr(0),
873+
Interval: timeToPtr(0),
874+
DelayFunction: stringToPtr("exponential"),
875+
Delay: timeToPtr(30 * time.Second),
876+
MaxDelay: timeToPtr(1 * time.Hour),
877+
Unlimited: boolToPtr(true),
878+
},
879+
Update: &UpdateStrategy{
880+
Stagger: timeToPtr(30 * time.Second),
881+
MaxParallel: intToPtr(1),
882+
HealthCheck: stringToPtr("checks"),
883+
MinHealthyTime: timeToPtr(10 * time.Second),
884+
HealthyDeadline: timeToPtr(5 * time.Minute),
885+
ProgressDeadline: timeToPtr(10 * time.Minute),
886+
AutoRevert: boolToPtr(false),
887+
Canary: intToPtr(0),
888+
AutoPromote: boolToPtr(false),
889+
},
890+
Migrate: DefaultMigrateStrategy(),
757891
Tasks: []*Task{
758892
{
759893
Name: "task1",
760894
LogConfig: DefaultLogConfig(),
761895
Resources: DefaultResources(),
762896
KillTimeout: timeToPtr(5 * time.Second),
897+
RestartPolicy: &RestartPolicy{
898+
Attempts: intToPtr(5),
899+
Delay: timeToPtr(1 * time.Second),
900+
Interval: timeToPtr(30 * time.Minute),
901+
Mode: stringToPtr("fail"),
902+
},
903+
},
904+
},
905+
},
906+
{
907+
Name: stringToPtr("baz"),
908+
Count: intToPtr(1),
909+
EphemeralDisk: &EphemeralDisk{
910+
Sticky: boolToPtr(false),
911+
Migrate: boolToPtr(false),
912+
SizeMB: intToPtr(300),
913+
},
914+
RestartPolicy: &RestartPolicy{
915+
Delay: timeToPtr(20 * time.Second),
916+
Attempts: intToPtr(2),
917+
Interval: timeToPtr(30 * time.Minute),
918+
Mode: stringToPtr("fail"),
919+
},
920+
ReschedulePolicy: &ReschedulePolicy{
921+
Attempts: intToPtr(0),
922+
Interval: timeToPtr(0),
923+
DelayFunction: stringToPtr("exponential"),
924+
Delay: timeToPtr(30 * time.Second),
925+
MaxDelay: timeToPtr(1 * time.Hour),
926+
Unlimited: boolToPtr(true),
927+
},
928+
Update: &UpdateStrategy{
929+
Stagger: timeToPtr(30 * time.Second),
930+
MaxParallel: intToPtr(1),
931+
HealthCheck: stringToPtr("checks"),
932+
MinHealthyTime: timeToPtr(10 * time.Second),
933+
HealthyDeadline: timeToPtr(5 * time.Minute),
934+
ProgressDeadline: timeToPtr(10 * time.Minute),
935+
AutoRevert: boolToPtr(false),
936+
Canary: intToPtr(0),
937+
AutoPromote: boolToPtr(false),
938+
},
939+
Migrate: DefaultMigrateStrategy(),
940+
Tasks: []*Task{
941+
{
942+
Name: "task1",
943+
LogConfig: DefaultLogConfig(),
944+
Resources: DefaultResources(),
945+
KillTimeout: timeToPtr(5 * time.Second),
946+
RestartPolicy: &RestartPolicy{
947+
Delay: timeToPtr(20 * time.Second),
948+
Attempts: intToPtr(2),
949+
Interval: timeToPtr(30 * time.Minute),
950+
Mode: stringToPtr("fail"),
951+
},
763952
},
764953
},
765954
},

api/tasks.go

+37-19
Original file line numberDiff line numberDiff line change
@@ -443,9 +443,6 @@ func (g *TaskGroup) Canonicalize(job *Job) {
443443
if g.Count == nil {
444444
g.Count = intToPtr(1)
445445
}
446-
for _, t := range g.Tasks {
447-
t.Canonicalize(g, job)
448-
}
449446
if g.EphemeralDisk == nil {
450447
g.EphemeralDisk = DefaultEphemeralDisk()
451448
} else {
@@ -505,30 +502,20 @@ func (g *TaskGroup) Canonicalize(job *Job) {
505502
var defaultRestartPolicy *RestartPolicy
506503
switch *job.Type {
507504
case "service", "system":
508-
// These needs to be in sync with DefaultServiceJobRestartPolicy in
509-
// in nomad/structs/structs.go
510-
defaultRestartPolicy = &RestartPolicy{
511-
Delay: timeToPtr(15 * time.Second),
512-
Attempts: intToPtr(2),
513-
Interval: timeToPtr(30 * time.Minute),
514-
Mode: stringToPtr(RestartPolicyModeFail),
515-
}
505+
defaultRestartPolicy = defaultServiceJobRestartPolicy()
516506
default:
517-
// These needs to be in sync with DefaultBatchJobRestartPolicy in
518-
// in nomad/structs/structs.go
519-
defaultRestartPolicy = &RestartPolicy{
520-
Delay: timeToPtr(15 * time.Second),
521-
Attempts: intToPtr(3),
522-
Interval: timeToPtr(24 * time.Hour),
523-
Mode: stringToPtr(RestartPolicyModeFail),
524-
}
507+
defaultRestartPolicy = defaultBatchJobRestartPolicy()
525508
}
526509

527510
if g.RestartPolicy != nil {
528511
defaultRestartPolicy.Merge(g.RestartPolicy)
529512
}
530513
g.RestartPolicy = defaultRestartPolicy
531514

515+
for _, t := range g.Tasks {
516+
t.Canonicalize(g, job)
517+
}
518+
532519
for _, spread := range g.Spreads {
533520
spread.Canonicalize()
534521
}
@@ -543,6 +530,28 @@ func (g *TaskGroup) Canonicalize(job *Job) {
543530
}
544531
}
545532

533+
// These values need to be kept in sync with DefaultServiceJobRestartPolicy
534+
// in nomad/structs/structs.go
535+
func defaultServiceJobRestartPolicy() *RestartPolicy {
536+
return &RestartPolicy{
537+
Delay: timeToPtr(15 * time.Second),
538+
Attempts: intToPtr(2),
539+
Interval: timeToPtr(30 * time.Minute),
540+
Mode: stringToPtr(RestartPolicyModeFail),
541+
}
542+
}
543+
544+
// These values need to be kept in sync with DefaultBatchJobRestartPolicy
545+
// in nomad/structs/structs.go
546+
func defaultBatchJobRestartPolicy() *RestartPolicy {
547+
return &RestartPolicy{
548+
Delay: timeToPtr(15 * time.Second),
549+
Attempts: intToPtr(3),
550+
Interval: timeToPtr(24 * time.Hour),
551+
Mode: stringToPtr(RestartPolicyModeFail),
552+
}
553+
}
554+
546555
// Constrain is used to add a constraint to a task group.
547556
func (g *TaskGroup) Constrain(c *Constraint) *TaskGroup {
548557
g.Constraints = append(g.Constraints, c)
@@ -620,6 +629,7 @@ type Task struct {
620629
Env map[string]string
621630
Services []*Service
622631
Resources *Resources
632+
RestartPolicy *RestartPolicy
623633
Meta map[string]string
624634
KillTimeout *time.Duration `mapstructure:"kill_timeout"`
625635
LogConfig *LogConfig `mapstructure:"logs"`
@@ -665,6 +675,14 @@ func (t *Task) Canonicalize(tg *TaskGroup, job *Job) {
665675
for _, vm := range t.VolumeMounts {
666676
vm.Canonicalize()
667677
}
678+
if t.RestartPolicy == nil {
679+
t.RestartPolicy = tg.RestartPolicy
680+
} else {
681+
tgrp := &RestartPolicy{}
682+
*tgrp = *tg.RestartPolicy
683+
tgrp.Merge(t.RestartPolicy)
684+
t.RestartPolicy = tgrp
685+
}
668686
}
669687

670688
// TaskArtifact is used to download artifacts before running a task.

client/allocrunner/taskrunner/task_runner.go

+9-5
Original file line numberDiff line numberDiff line change
@@ -315,12 +315,16 @@ func NewTaskRunner(config *Config) (*TaskRunner, error) {
315315
tr.taskResources = tres
316316

317317
// Build the restart tracker.
318-
tg := tr.alloc.Job.LookupTaskGroup(tr.alloc.TaskGroup)
319-
if tg == nil {
320-
tr.logger.Error("alloc missing task group")
321-
return nil, fmt.Errorf("alloc missing task group")
318+
rp := config.Task.RestartPolicy
319+
if rp == nil {
320+
tg := tr.alloc.Job.LookupTaskGroup(tr.alloc.TaskGroup)
321+
if tg == nil {
322+
tr.logger.Error("alloc missing task group")
323+
return nil, fmt.Errorf("alloc missing task group")
324+
}
325+
rp = tg.RestartPolicy
322326
}
323-
tr.restartTracker = restarts.NewRestartTracker(tg.RestartPolicy, tr.alloc.Job.Type)
327+
tr.restartTracker = restarts.NewRestartTracker(rp, tr.alloc.Job.Type)
324328

325329
// Get the driver
326330
if err := tr.initDriver(); err != nil {

0 commit comments

Comments
 (0)