
Commit 6711525

Merge pull request #755 from hashicorp/b-alloc-runner-destroy

Fix AllocRunner not capturing destroy signal and tests

2 parents: c87a4f9 + 3308841

File tree (4 files changed, +241 −19 lines):
  client/alloc_runner.go
  client/alloc_runner_test.go
  client/allocdir/alloc_dir.go
  testutil/wait.go
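
The commit message refers to the runner "capturing the destroy signal": instead of checking a boolean r.destroy flag once at the end of Run, the runner now blocks on a destroy channel. The signaling side is not part of this diff; the sketch below is a hypothetical illustration, assuming Destroy() sets the flag and closes r.destroyCh exactly once (destroyLock is an assumed guard, not shown in this commit):

// Hypothetical sketch, not taken from this diff: Destroy marks the runner
// for destruction and closes destroyCh so that handleDestroy (added below)
// unblocks in its select statement.
func (r *AllocRunner) Destroy() {
	r.destroyLock.Lock()
	defer r.destroyLock.Unlock()

	if r.destroy {
		// Already signaled; closing destroyCh a second time would panic.
		return
	}
	r.destroy = true
	close(r.destroyCh)
}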

client/alloc_runner.go  (+23 −12)

@@ -326,15 +326,8 @@ func (r *AllocRunner) Run() {
 	defer close(r.waitCh)
 	go r.dirtySyncState()
 
-	// Check if the allocation is in a terminal status
-	alloc := r.alloc
-	if alloc.TerminalStatus() {
-		r.logger.Printf("[DEBUG] client: aborting runner for alloc '%s', terminal status", r.alloc.ID)
-		return
-	}
-	r.logger.Printf("[DEBUG] client: starting runner for alloc '%s'", r.alloc.ID)
-
 	// Find the task group to run in the allocation
+	alloc := r.alloc
 	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
 	if tg == nil {
 		r.logger.Printf("[ERR] client: alloc '%s' for missing task group '%s'", alloc.ID, alloc.TaskGroup)
@@ -353,7 +346,18 @@ func (r *AllocRunner) Run() {
 		r.ctx = driver.NewExecContext(allocDir, r.alloc.ID)
 	}
 
+	// Check if the allocation is in a terminal status. In this case, we don't
+	// start any of the task runners and directly wait for the destroy signal to
+	// clean up the allocation.
+	if alloc.TerminalStatus() {
+		r.logger.Printf("[DEBUG] client: alloc %q in terminal status, waiting for destroy", r.alloc.ID)
+		r.handleDestroy()
+		r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID)
+		return
+	}
+
 	// Start the task runners
+	r.logger.Printf("[DEBUG] client: starting task runners for alloc '%s'", r.alloc.ID)
 	r.taskLock.Lock()
 	for _, task := range tg.Tasks {
 		if _, ok := r.restored[task.Name]; ok {
@@ -415,7 +419,6 @@ OUTER:
 
 	// Destroy each sub-task
 	r.taskLock.Lock()
-	defer r.taskLock.Unlock()
 	for _, tr := range r.tasks {
 		tr.Destroy()
 	}
@@ -424,12 +427,21 @@ OUTER:
 	for _, tr := range r.tasks {
 		<-tr.WaitCh()
 	}
+	r.taskLock.Unlock()
 
 	// Final state sync
 	r.retrySyncState(nil)
 
-	// Check if we should destroy our state
-	if r.destroy {
+	// Block until we should destroy the state of the alloc
+	r.handleDestroy()
+	r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID)
+}
+
+// handleDestroy blocks till the AllocRunner should be destroyed and does the
+// necessary cleanup.
+func (r *AllocRunner) handleDestroy() {
+	select {
+	case <-r.destroyCh:
 		if err := r.DestroyContext(); err != nil {
 			r.logger.Printf("[ERR] client: failed to destroy context for alloc '%s': %v",
 				r.alloc.ID, err)
@@ -439,7 +451,6 @@ OUTER:
 				r.alloc.ID, err)
 		}
 	}
-	r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID)
 }
 
 // Update is used to update the allocation of the context
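
A side note on the locking change above: the removed defer r.taskLock.Unlock() would have kept the task lock held for the rest of Run, including the new blocking wait in handleDestroy, so the diff unlocks explicitly once every task runner has exited. A minimal, self-contained sketch of that hazard (the names are illustrative, not from this codebase):

package main

import "sync"

// badWait blocks on a channel while still holding the mutex, so every other
// goroutine that needs the mutex is stuck for as long as the wait lasts.
func badWait(mu *sync.Mutex, done <-chan struct{}) {
	mu.Lock()
	defer mu.Unlock()
	<-done
}

// goodWait releases the mutex before blocking, mirroring the explicit
// r.taskLock.Unlock() added in this commit.
func goodWait(mu *sync.Mutex, done <-chan struct{}) {
	mu.Lock()
	// ...work that actually needs the lock...
	mu.Unlock()
	<-done
}

func main() {
	var mu sync.Mutex
	done := make(chan struct{})
	close(done) // already closed so neither call blocks in this demo
	goodWait(&mu, done)
	badWait(&mu, done)
}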

client/alloc_runner_test.go  (+214 −3)

@@ -62,6 +62,95 @@ func TestAllocRunner_SimpleRun(t *testing.T) {
 	})
 }
 
+func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
+	ctestutil.ExecCompatible(t)
+	upd, ar := testAllocRunner(false)
+
+	// Ensure task takes some time
+	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
+	task.Config["command"] = "/bin/sleep"
+	task.Config["args"] = []string{"10"}
+	go ar.Run()
+
+	testutil.WaitForResult(func() (bool, error) {
+		if upd.Count == 0 {
+			return false, fmt.Errorf("No updates")
+		}
+		last := upd.Allocs[upd.Count-1]
+		if last.ClientStatus == structs.AllocClientStatusRunning {
+			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
+		}
+		return true, nil
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
+
+	// Update the alloc to be terminal which should cause the alloc runner to
+	// stop the tasks and wait for a destroy.
+	update := ar.alloc.Copy()
+	update.DesiredStatus = structs.AllocDesiredStatusStop
+	ar.Update(update)
+
+	testutil.WaitForResult(func() (bool, error) {
+		if upd.Count == 0 {
+			return false, nil
+		}
+
+		// Check the status has changed.
+		last := upd.Allocs[upd.Count-1]
+		if last.ClientStatus != structs.AllocClientStatusDead {
+			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusDead)
+		}
+
+		// Check the state still exists
+		if _, err := os.Stat(ar.stateFilePath()); err != nil {
+			return false, fmt.Errorf("state file destroyed: %v", err)
+		}
+
+		// Check the alloc directory still exists
+		if _, err := os.Stat(ar.ctx.AllocDir.AllocDir); err != nil {
+			return false, fmt.Errorf("alloc dir destroyed: %v", ar.ctx.AllocDir.AllocDir)
+		}
+
+		return true, nil
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
+
+	// Send the destroy signal and ensure the AllocRunner cleans up.
+	ar.Destroy()
+
+	testutil.WaitForResult(func() (bool, error) {
+		if upd.Count == 0 {
+			return false, nil
+		}
+
+		// Check the status has changed.
+		last := upd.Allocs[upd.Count-1]
+		if last.ClientStatus != structs.AllocClientStatusDead {
+			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusDead)
+		}
+
+		// Check the state was cleaned
+		if _, err := os.Stat(ar.stateFilePath()); err == nil {
+			return false, fmt.Errorf("state file still exists: %v", ar.stateFilePath())
+		} else if !os.IsNotExist(err) {
+			return false, fmt.Errorf("stat err: %v", err)
+		}
+
+		// Check the alloc directory was cleaned
+		if _, err := os.Stat(ar.ctx.AllocDir.AllocDir); err == nil {
+			return false, fmt.Errorf("alloc dir still exists: %v", ar.ctx.AllocDir.AllocDir)
+		} else if !os.IsNotExist(err) {
+			return false, fmt.Errorf("stat err: %v", err)
+		}
+
+		return true, nil
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
+}
+
 func TestAllocRunner_Destroy(t *testing.T) {
 	ctestutil.ExecCompatible(t)
 	upd, ar := testAllocRunner(false)
@@ -83,10 +172,30 @@ func TestAllocRunner_Destroy(t *testing.T) {
 		if upd.Count == 0 {
 			return false, nil
 		}
+
+		// Check the status has changed.
 		last := upd.Allocs[upd.Count-1]
-		return last.ClientStatus == structs.AllocClientStatusDead, nil
+		if last.ClientStatus != structs.AllocClientStatusDead {
+			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusDead)
+		}
+
+		// Check the state was cleaned
+		if _, err := os.Stat(ar.stateFilePath()); err == nil {
+			return false, fmt.Errorf("state file still exists: %v", ar.stateFilePath())
+		} else if !os.IsNotExist(err) {
+			return false, fmt.Errorf("stat err: %v", err)
+		}
+
+		// Check the alloc directory was cleaned
+		if _, err := os.Stat(ar.ctx.AllocDir.AllocDir); err == nil {
+			return false, fmt.Errorf("alloc dir still exists: %v", ar.ctx.AllocDir.AllocDir)
+		} else if !os.IsNotExist(err) {
+			return false, fmt.Errorf("stat err: %v", err)
+		}
+
+		return true, nil
 	}, func(err error) {
-		t.Fatalf("err: %v %#v %#v", err, upd.Allocs[0], ar.alloc.TaskStates)
+		t.Fatalf("err: %v", err)
 	})
 
 	if time.Since(start) > 15*time.Second {
@@ -129,7 +238,6 @@ func TestAllocRunner_SaveRestoreState(t *testing.T) {
 	task.Config["command"] = "/bin/sleep"
 	task.Config["args"] = []string{"10"}
 	go ar.Run()
-	defer ar.Destroy()
 
 	// Snapshot state
 	testutil.WaitForResult(func() (bool, error) {
@@ -171,3 +279,106 @@ func TestAllocRunner_SaveRestoreState(t *testing.T) {
 		t.Fatalf("took too long to terminate")
 	}
 }
+
+func TestAllocRunner_SaveRestoreState_TerminalAlloc(t *testing.T) {
+	ctestutil.ExecCompatible(t)
+	upd, ar := testAllocRunner(false)
+
+	// Ensure task takes some time
+	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
+	task.Config["command"] = "/bin/sleep"
+	task.Config["args"] = []string{"10"}
+	go ar.Run()
+
+	testutil.WaitForResult(func() (bool, error) {
+		if upd.Count == 0 {
+			return false, fmt.Errorf("No updates")
+		}
+		last := upd.Allocs[upd.Count-1]
+		if last.ClientStatus == structs.AllocClientStatusRunning {
+			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
+		}
+		return true, nil
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
+
+	// Update the alloc to be terminal which should cause the alloc runner to
+	// stop the tasks and wait for a destroy.
+	update := ar.alloc.Copy()
+	update.DesiredStatus = structs.AllocDesiredStatusStop
+	ar.Update(update)
+
+	testutil.WaitForResult(func() (bool, error) {
+		return ar.alloc.DesiredStatus == structs.AllocDesiredStatusStop, nil
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
+
+	err := ar.SaveState()
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Ensure both alloc runners don't destroy
+	ar.destroy = true
+
+	// Create a new alloc runner
+	consulClient, err := NewConsulService(&consulServiceConfig{ar.logger, "127.0.0.1:8500", "", "", false, false, &structs.Node{}})
+	ar2 := NewAllocRunner(ar.logger, ar.config, upd.Update,
+		&structs.Allocation{ID: ar.alloc.ID}, consulClient)
+	err = ar2.RestoreState()
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	go ar2.Run()
+
+	testutil.WaitForResult(func() (bool, error) {
+		// Check the state still exists
+		if _, err := os.Stat(ar.stateFilePath()); err != nil {
+			return false, fmt.Errorf("state file destroyed: %v", err)
+		}
+
+		// Check the alloc directory still exists
+		if _, err := os.Stat(ar.ctx.AllocDir.AllocDir); err != nil {
+			return false, fmt.Errorf("alloc dir destroyed: %v", ar.ctx.AllocDir.AllocDir)
+		}
+
+		return true, nil
+	}, func(err error) {
+		t.Fatalf("err: %v %#v %#v", err, upd.Allocs[0], ar.alloc.TaskStates)
+	})
+
+	// Send the destroy signal and ensure the AllocRunner cleans up.
+	ar2.Destroy()
+
+	testutil.WaitForResult(func() (bool, error) {
+		if upd.Count == 0 {
+			return false, nil
+		}
+
+		// Check the status has changed.
+		last := upd.Allocs[upd.Count-1]
+		if last.ClientStatus != structs.AllocClientStatusDead {
+			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusDead)
+		}
+
+		// Check the state was cleaned
+		if _, err := os.Stat(ar.stateFilePath()); err == nil {
+			return false, fmt.Errorf("state file still exists: %v", ar.stateFilePath())
+		} else if !os.IsNotExist(err) {
+			return false, fmt.Errorf("stat err: %v", err)
+		}
+
+		// Check the alloc directory was cleaned
+		if _, err := os.Stat(ar.ctx.AllocDir.AllocDir); err == nil {
+			return false, fmt.Errorf("alloc dir still exists: %v", ar.ctx.AllocDir.AllocDir)
+		} else if !os.IsNotExist(err) {
+			return false, fmt.Errorf("stat err: %v", err)
+		}
+
+		return true, nil
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
+}
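
The new tests poll upd.Count and upd.Allocs on the updater returned by testAllocRunner, which is not shown in this diff. A rough sketch of what such a recording updater presumably looks like (the type and field names here are assumptions, not confirmed by this commit):

// Hypothetical mock state updater: the AllocRunner calls Update on every
// client-status change, and the tests inspect Count/Allocs afterwards.
type MockAllocStateUpdater struct {
	Count  int
	Allocs []*structs.Allocation
}

func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) {
	m.Count += 1
	m.Allocs = append(m.Allocs, alloc)
}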

client/allocdir/alloc_dir.go  (+3 −3)

@@ -36,7 +36,7 @@ type AllocDir struct {
 	TaskDirs map[string]string
 
 	// A list of locations the shared alloc has been mounted to.
-	mounted []string
+	Mounted []string
 }
 
 // AllocFileInfo holds information about a file inside the AllocDir
@@ -64,7 +64,7 @@ func NewAllocDir(allocDir string) *AllocDir {
 // Tears down previously build directory structure.
 func (d *AllocDir) Destroy() error {
 	// Unmount all mounted shared alloc dirs.
-	for _, m := range d.mounted {
+	for _, m := range d.Mounted {
 		if err := d.unmountSharedDir(m); err != nil {
 			return fmt.Errorf("Failed to unmount shared directory: %v", err)
 		}
@@ -233,7 +233,7 @@ func (d *AllocDir) MountSharedDir(task string) error {
 		return fmt.Errorf("Failed to mount shared directory for task %v: %v", task, err)
 	}
 
-	d.mounted = append(d.mounted, taskLoc)
+	d.Mounted = append(d.Mounted, taskLoc)
 	return nil
 }
 
testutil/wait.go  (+1 −1)

@@ -18,7 +18,7 @@ type testFn func() (bool, error)
 type errorFn func(error)
 
 func WaitForResult(test testFn, error errorFn) {
-	WaitForResultRetries(1000*TestMultiplier(), test, error)
+	WaitForResultRetries(2000*TestMultiplier(), test, error)
 }
 
 func WaitForResultRetries(retries int64, test testFn, error errorFn) {
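
Doubling the default retry count from 1000 to 2000 gives the new, slower destroy-path tests a larger time budget. The retry helper itself is unchanged here; the sketch below is only a guess at its general shape (the fixed 10ms pause is an assumption, not shown in this diff):

// Assumed shape of the helper the tests rely on: poll the test function
// until it reports success or the retry budget runs out, then hand the
// last error to the error callback.
func WaitForResultRetries(retries int64, test testFn, error errorFn) {
	for retries > 0 {
		time.Sleep(10 * time.Millisecond)
		retries--

		success, err := test()
		if success {
			return
		}
		if retries == 0 {
			error(err)
		}
	}
}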
