Skip to content

Commit b807491

Browse files
author
Mahmood Ali
authored
Merge pull request #7288 from hashicorp/f-task-restart-policy
Support per-task RestartPolicy
2 parents 10bdc6f + 8083022 commit b807491

File tree

11 files changed

+312
-42
lines changed

11 files changed

+312
-42
lines changed

api/jobs_test.go

+203-14
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,10 @@ func TestJobs_Canonicalize(t *testing.T) {
157157
Migrate: DefaultMigrateStrategy(),
158158
Tasks: []*Task{
159159
{
160-
KillTimeout: timeToPtr(5 * time.Second),
161-
LogConfig: DefaultLogConfig(),
162-
Resources: DefaultResources(),
160+
KillTimeout: timeToPtr(5 * time.Second),
161+
LogConfig: DefaultLogConfig(),
162+
Resources: DefaultResources(),
163+
RestartPolicy: defaultServiceJobRestartPolicy(),
163164
},
164165
},
165166
},
@@ -222,9 +223,10 @@ func TestJobs_Canonicalize(t *testing.T) {
222223
},
223224
Tasks: []*Task{
224225
{
225-
KillTimeout: timeToPtr(5 * time.Second),
226-
LogConfig: DefaultLogConfig(),
227-
Resources: DefaultResources(),
226+
KillTimeout: timeToPtr(5 * time.Second),
227+
LogConfig: DefaultLogConfig(),
228+
Resources: DefaultResources(),
229+
RestartPolicy: defaultBatchJobRestartPolicy(),
228230
},
229231
},
230232
},
@@ -316,10 +318,11 @@ func TestJobs_Canonicalize(t *testing.T) {
316318
Migrate: DefaultMigrateStrategy(),
317319
Tasks: []*Task{
318320
{
319-
Name: "task1",
320-
LogConfig: DefaultLogConfig(),
321-
Resources: DefaultResources(),
322-
KillTimeout: timeToPtr(5 * time.Second),
321+
Name: "task1",
322+
LogConfig: DefaultLogConfig(),
323+
Resources: DefaultResources(),
324+
KillTimeout: timeToPtr(5 * time.Second),
325+
RestartPolicy: defaultServiceJobRestartPolicy(),
323326
},
324327
},
325328
},
@@ -363,6 +366,10 @@ func TestJobs_Canonicalize(t *testing.T) {
363366
"db": 6379,
364367
}},
365368
},
369+
RestartPolicy: &RestartPolicy{
370+
// inherit other values from TG
371+
Attempts: intToPtr(20),
372+
},
366373
Resources: &Resources{
367374
CPU: intToPtr(500),
368375
MemoryMB: intToPtr(256),
@@ -486,6 +493,12 @@ func TestJobs_Canonicalize(t *testing.T) {
486493
"db": 6379,
487494
}},
488495
},
496+
RestartPolicy: &RestartPolicy{
497+
Interval: timeToPtr(5 * time.Minute),
498+
Attempts: intToPtr(20),
499+
Delay: timeToPtr(25 * time.Second),
500+
Mode: stringToPtr("delay"),
501+
},
489502
Resources: &Resources{
490503
CPU: intToPtr(500),
491504
MemoryMB: intToPtr(256),
@@ -712,10 +725,11 @@ func TestJobs_Canonicalize(t *testing.T) {
712725
Migrate: DefaultMigrateStrategy(),
713726
Tasks: []*Task{
714727
{
715-
Name: "task1",
716-
LogConfig: DefaultLogConfig(),
717-
Resources: DefaultResources(),
718-
KillTimeout: timeToPtr(5 * time.Second),
728+
Name: "task1",
729+
LogConfig: DefaultLogConfig(),
730+
Resources: DefaultResources(),
731+
KillTimeout: timeToPtr(5 * time.Second),
732+
RestartPolicy: defaultServiceJobRestartPolicy(),
719733
},
720734
},
721735
},
@@ -753,12 +767,187 @@ func TestJobs_Canonicalize(t *testing.T) {
753767
AutoPromote: boolToPtr(false),
754768
},
755769
Migrate: DefaultMigrateStrategy(),
770+
Tasks: []*Task{
771+
{
772+
Name: "task1",
773+
LogConfig: DefaultLogConfig(),
774+
Resources: DefaultResources(),
775+
KillTimeout: timeToPtr(5 * time.Second),
776+
RestartPolicy: defaultServiceJobRestartPolicy(),
777+
},
778+
},
779+
},
780+
},
781+
},
782+
},
783+
784+
{
785+
name: "restart_merge",
786+
input: &Job{
787+
Name: stringToPtr("foo"),
788+
ID: stringToPtr("bar"),
789+
ParentID: stringToPtr("lol"),
790+
TaskGroups: []*TaskGroup{
791+
{
792+
Name: stringToPtr("bar"),
793+
RestartPolicy: &RestartPolicy{
794+
Delay: timeToPtr(15 * time.Second),
795+
Attempts: intToPtr(2),
796+
Interval: timeToPtr(30 * time.Minute),
797+
Mode: stringToPtr("fail"),
798+
},
799+
Tasks: []*Task{
800+
{
801+
Name: "task1",
802+
RestartPolicy: &RestartPolicy{
803+
Attempts: intToPtr(5),
804+
Delay: timeToPtr(1 * time.Second),
805+
},
806+
},
807+
},
808+
},
809+
{
810+
Name: stringToPtr("baz"),
811+
RestartPolicy: &RestartPolicy{
812+
Delay: timeToPtr(20 * time.Second),
813+
Attempts: intToPtr(2),
814+
Interval: timeToPtr(30 * time.Minute),
815+
Mode: stringToPtr("fail"),
816+
},
817+
Tasks: []*Task{
818+
{
819+
Name: "task1",
820+
},
821+
},
822+
},
823+
},
824+
},
825+
expected: &Job{
826+
Namespace: stringToPtr(DefaultNamespace),
827+
ID: stringToPtr("bar"),
828+
Name: stringToPtr("foo"),
829+
Region: stringToPtr("global"),
830+
Type: stringToPtr("service"),
831+
ParentID: stringToPtr("lol"),
832+
Priority: intToPtr(50),
833+
AllAtOnce: boolToPtr(false),
834+
ConsulToken: stringToPtr(""),
835+
VaultToken: stringToPtr(""),
836+
Stop: boolToPtr(false),
837+
Stable: boolToPtr(false),
838+
Version: uint64ToPtr(0),
839+
Status: stringToPtr(""),
840+
StatusDescription: stringToPtr(""),
841+
CreateIndex: uint64ToPtr(0),
842+
ModifyIndex: uint64ToPtr(0),
843+
JobModifyIndex: uint64ToPtr(0),
844+
Update: &UpdateStrategy{
845+
Stagger: timeToPtr(30 * time.Second),
846+
MaxParallel: intToPtr(1),
847+
HealthCheck: stringToPtr("checks"),
848+
MinHealthyTime: timeToPtr(10 * time.Second),
849+
HealthyDeadline: timeToPtr(5 * time.Minute),
850+
ProgressDeadline: timeToPtr(10 * time.Minute),
851+
AutoRevert: boolToPtr(false),
852+
Canary: intToPtr(0),
853+
AutoPromote: boolToPtr(false),
854+
},
855+
TaskGroups: []*TaskGroup{
856+
{
857+
Name: stringToPtr("bar"),
858+
Count: intToPtr(1),
859+
EphemeralDisk: &EphemeralDisk{
860+
Sticky: boolToPtr(false),
861+
Migrate: boolToPtr(false),
862+
SizeMB: intToPtr(300),
863+
},
864+
RestartPolicy: &RestartPolicy{
865+
Delay: timeToPtr(15 * time.Second),
866+
Attempts: intToPtr(2),
867+
Interval: timeToPtr(30 * time.Minute),
868+
Mode: stringToPtr("fail"),
869+
},
870+
ReschedulePolicy: &ReschedulePolicy{
871+
Attempts: intToPtr(0),
872+
Interval: timeToPtr(0),
873+
DelayFunction: stringToPtr("exponential"),
874+
Delay: timeToPtr(30 * time.Second),
875+
MaxDelay: timeToPtr(1 * time.Hour),
876+
Unlimited: boolToPtr(true),
877+
},
878+
Update: &UpdateStrategy{
879+
Stagger: timeToPtr(30 * time.Second),
880+
MaxParallel: intToPtr(1),
881+
HealthCheck: stringToPtr("checks"),
882+
MinHealthyTime: timeToPtr(10 * time.Second),
883+
HealthyDeadline: timeToPtr(5 * time.Minute),
884+
ProgressDeadline: timeToPtr(10 * time.Minute),
885+
AutoRevert: boolToPtr(false),
886+
Canary: intToPtr(0),
887+
AutoPromote: boolToPtr(false),
888+
},
889+
Migrate: DefaultMigrateStrategy(),
756890
Tasks: []*Task{
757891
{
758892
Name: "task1",
759893
LogConfig: DefaultLogConfig(),
760894
Resources: DefaultResources(),
761895
KillTimeout: timeToPtr(5 * time.Second),
896+
RestartPolicy: &RestartPolicy{
897+
Attempts: intToPtr(5),
898+
Delay: timeToPtr(1 * time.Second),
899+
Interval: timeToPtr(30 * time.Minute),
900+
Mode: stringToPtr("fail"),
901+
},
902+
},
903+
},
904+
},
905+
{
906+
Name: stringToPtr("baz"),
907+
Count: intToPtr(1),
908+
EphemeralDisk: &EphemeralDisk{
909+
Sticky: boolToPtr(false),
910+
Migrate: boolToPtr(false),
911+
SizeMB: intToPtr(300),
912+
},
913+
RestartPolicy: &RestartPolicy{
914+
Delay: timeToPtr(20 * time.Second),
915+
Attempts: intToPtr(2),
916+
Interval: timeToPtr(30 * time.Minute),
917+
Mode: stringToPtr("fail"),
918+
},
919+
ReschedulePolicy: &ReschedulePolicy{
920+
Attempts: intToPtr(0),
921+
Interval: timeToPtr(0),
922+
DelayFunction: stringToPtr("exponential"),
923+
Delay: timeToPtr(30 * time.Second),
924+
MaxDelay: timeToPtr(1 * time.Hour),
925+
Unlimited: boolToPtr(true),
926+
},
927+
Update: &UpdateStrategy{
928+
Stagger: timeToPtr(30 * time.Second),
929+
MaxParallel: intToPtr(1),
930+
HealthCheck: stringToPtr("checks"),
931+
MinHealthyTime: timeToPtr(10 * time.Second),
932+
HealthyDeadline: timeToPtr(5 * time.Minute),
933+
ProgressDeadline: timeToPtr(10 * time.Minute),
934+
AutoRevert: boolToPtr(false),
935+
Canary: intToPtr(0),
936+
AutoPromote: boolToPtr(false),
937+
},
938+
Migrate: DefaultMigrateStrategy(),
939+
Tasks: []*Task{
940+
{
941+
Name: "task1",
942+
LogConfig: DefaultLogConfig(),
943+
Resources: DefaultResources(),
944+
KillTimeout: timeToPtr(5 * time.Second),
945+
RestartPolicy: &RestartPolicy{
946+
Delay: timeToPtr(20 * time.Second),
947+
Attempts: intToPtr(2),
948+
Interval: timeToPtr(30 * time.Minute),
949+
Mode: stringToPtr("fail"),
950+
},
762951
},
763952
},
764953
},

api/tasks.go

+37-19
Original file line numberDiff line numberDiff line change
@@ -453,9 +453,6 @@ func (g *TaskGroup) Canonicalize(job *Job) {
453453
if g.Scaling != nil {
454454
g.Scaling.Canonicalize(*g.Count)
455455
}
456-
for _, t := range g.Tasks {
457-
t.Canonicalize(g, job)
458-
}
459456
if g.EphemeralDisk == nil {
460457
g.EphemeralDisk = DefaultEphemeralDisk()
461458
} else {
@@ -515,30 +512,20 @@ func (g *TaskGroup) Canonicalize(job *Job) {
515512
var defaultRestartPolicy *RestartPolicy
516513
switch *job.Type {
517514
case "service", "system":
518-
// These needs to be in sync with DefaultServiceJobRestartPolicy in
519-
// in nomad/structs/structs.go
520-
defaultRestartPolicy = &RestartPolicy{
521-
Delay: timeToPtr(15 * time.Second),
522-
Attempts: intToPtr(2),
523-
Interval: timeToPtr(30 * time.Minute),
524-
Mode: stringToPtr(RestartPolicyModeFail),
525-
}
515+
defaultRestartPolicy = defaultServiceJobRestartPolicy()
526516
default:
527-
// These needs to be in sync with DefaultBatchJobRestartPolicy in
528-
// in nomad/structs/structs.go
529-
defaultRestartPolicy = &RestartPolicy{
530-
Delay: timeToPtr(15 * time.Second),
531-
Attempts: intToPtr(3),
532-
Interval: timeToPtr(24 * time.Hour),
533-
Mode: stringToPtr(RestartPolicyModeFail),
534-
}
517+
defaultRestartPolicy = defaultBatchJobRestartPolicy()
535518
}
536519

537520
if g.RestartPolicy != nil {
538521
defaultRestartPolicy.Merge(g.RestartPolicy)
539522
}
540523
g.RestartPolicy = defaultRestartPolicy
541524

525+
for _, t := range g.Tasks {
526+
t.Canonicalize(g, job)
527+
}
528+
542529
for _, spread := range g.Spreads {
543530
spread.Canonicalize()
544531
}
@@ -553,6 +540,28 @@ func (g *TaskGroup) Canonicalize(job *Job) {
553540
}
554541
}
555542

543+
// These values need to be kept in sync with DefaultServiceJobRestartPolicy
544+
// in nomad/structs/structs.go
545+
func defaultServiceJobRestartPolicy() *RestartPolicy {
546+
return &RestartPolicy{
547+
Delay: timeToPtr(15 * time.Second),
548+
Attempts: intToPtr(2),
549+
Interval: timeToPtr(30 * time.Minute),
550+
Mode: stringToPtr(RestartPolicyModeFail),
551+
}
552+
}
553+
554+
// These values need to be kept in sync with DefaultBatchJobRestartPolicy
555+
// in nomad/structs/structs.go
556+
func defaultBatchJobRestartPolicy() *RestartPolicy {
557+
return &RestartPolicy{
558+
Delay: timeToPtr(15 * time.Second),
559+
Attempts: intToPtr(3),
560+
Interval: timeToPtr(24 * time.Hour),
561+
Mode: stringToPtr(RestartPolicyModeFail),
562+
}
563+
}
564+
556565
// Constrain is used to add a constraint to a task group.
557566
func (g *TaskGroup) Constrain(c *Constraint) *TaskGroup {
558567
g.Constraints = append(g.Constraints, c)
@@ -645,6 +654,7 @@ type Task struct {
645654
Env map[string]string
646655
Services []*Service
647656
Resources *Resources
657+
RestartPolicy *RestartPolicy
648658
Meta map[string]string
649659
KillTimeout *time.Duration `mapstructure:"kill_timeout"`
650660
LogConfig *LogConfig `mapstructure:"logs"`
@@ -697,6 +707,14 @@ func (t *Task) Canonicalize(tg *TaskGroup, job *Job) {
697707
if t.CSIPluginConfig != nil {
698708
t.CSIPluginConfig.Canonicalize()
699709
}
710+
if t.RestartPolicy == nil {
711+
t.RestartPolicy = tg.RestartPolicy
712+
} else {
713+
tgrp := &RestartPolicy{}
714+
*tgrp = *tg.RestartPolicy
715+
tgrp.Merge(t.RestartPolicy)
716+
t.RestartPolicy = tgrp
717+
}
700718
}
701719

702720
// TaskArtifact is used to download artifacts before running a task.

client/alloc_endpoint_test.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -216,10 +216,12 @@ func TestAllocations_GarbageCollect(t *testing.T) {
216216

217217
a := mock.Alloc()
218218
a.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
219-
a.Job.TaskGroups[0].RestartPolicy = &nstructs.RestartPolicy{
219+
rp := &nstructs.RestartPolicy{
220220
Attempts: 0,
221221
Mode: nstructs.RestartPolicyModeFail,
222222
}
223+
a.Job.TaskGroups[0].RestartPolicy = rp
224+
a.Job.TaskGroups[0].Tasks[0].RestartPolicy = rp
223225
a.Job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
224226
"run_for": "10ms",
225227
}

0 commit comments

Comments
 (0)