Skip to content

Commit f616370

Browse files
author
Mahmood Ali
authored
Merge pull request #6207 from hashicorp/b-gc-destroyed-allocs-rerun
Don't persist allocs of destroyed alloc runners
2 parents e2efeb4 + ff3dedd commit f616370

File tree

3 files changed

+83
-5
lines changed

3 files changed

+83
-5
lines changed

client/allocrunner/alloc_runner.go

+23-4
Original file line numberDiff line numberDiff line change
@@ -765,14 +765,15 @@ func (ar *allocRunner) destroyImpl() {
765765
// state if Run() ran at all.
766766
<-ar.taskStateUpdateHandlerCh
767767

768-
// Cleanup state db
768+
// Mark alloc as destroyed
769+
ar.destroyedLock.Lock()
770+
771+
// Cleanup state db; while holding the lock to avoid
772+
// a race periodic PersistState that may resurrect the alloc
769773
if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
770774
ar.logger.Warn("failed to delete allocation state", "error", err)
771775
}
772776

773-
// Mark alloc as destroyed
774-
ar.destroyedLock.Lock()
775-
776777
if !ar.shutdown {
777778
ar.shutdown = true
778779
close(ar.shutdownCh)
@@ -784,6 +785,24 @@ func (ar *allocRunner) destroyImpl() {
784785
ar.destroyedLock.Unlock()
785786
}
786787

788+
func (ar *allocRunner) PersistState() error {
789+
ar.destroyedLock.Lock()
790+
defer ar.destroyedLock.Unlock()
791+
792+
if ar.destroyed {
793+
err := ar.stateDB.DeleteAllocationBucket(ar.id)
794+
if err != nil {
795+
ar.logger.Warn("failed to delete allocation bucket", "error", err)
796+
}
797+
return nil
798+
}
799+
800+
// TODO: consider persisting deployment state along with task status.
801+
// While we study why only the alloc is persisted, I opted to maintain current
802+
// behavior and not risk adding yet more IO calls unnecessarily.
803+
return ar.stateDB.PutAllocation(ar.Alloc())
804+
}
805+
787806
// Destroy the alloc runner by stopping it if it is still running and cleaning
788807
// up all of its resources.
789808
//

client/allocrunner/alloc_runner_test.go

+58
Original file line numberDiff line numberDiff line change
@@ -1001,3 +1001,61 @@ func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
10011001
require.Fail(t, "err: %v", err)
10021002
})
10031003
}
1004+
1005+
// TestAllocRunner_PersistState_Destroyed asserts that destroyed allocs don't persist anymore
1006+
func TestAllocRunner_PersistState_Destroyed(t *testing.T) {
1007+
t.Parallel()
1008+
1009+
alloc := mock.BatchAlloc()
1010+
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
1011+
1012+
conf, cleanup := testAllocRunnerConfig(t, alloc)
1013+
conf.StateDB = state.NewMemDB(conf.Logger)
1014+
1015+
defer cleanup()
1016+
ar, err := NewAllocRunner(conf)
1017+
require.NoError(t, err)
1018+
defer destroy(ar)
1019+
1020+
go ar.Run()
1021+
1022+
select {
1023+
case <-ar.WaitCh():
1024+
case <-time.After(10 * time.Second):
1025+
require.Fail(t, "timed out waiting for alloc to complete")
1026+
}
1027+
1028+
// test final persisted state upon completion
1029+
require.NoError(t, ar.PersistState())
1030+
allocs, _, err := conf.StateDB.GetAllAllocations()
1031+
require.NoError(t, err)
1032+
require.Len(t, allocs, 1)
1033+
require.Equal(t, alloc.ID, allocs[0].ID)
1034+
_, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
1035+
require.NoError(t, err)
1036+
require.Equal(t, structs.TaskStateDead, ts.State)
1037+
1038+
// check that DB alloc is empty after destroying AR
1039+
ar.Destroy()
1040+
select {
1041+
case <-ar.DestroyCh():
1042+
case <-time.After(10 * time.Second):
1043+
require.Fail(t, "timedout waiting for destruction")
1044+
}
1045+
1046+
allocs, _, err = conf.StateDB.GetAllAllocations()
1047+
require.NoError(t, err)
1048+
require.Empty(t, allocs)
1049+
_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
1050+
require.NoError(t, err)
1051+
require.Nil(t, ts)
1052+
1053+
// check that DB alloc is empty after persisting state of destroyed AR
1054+
ar.PersistState()
1055+
allocs, _, err = conf.StateDB.GetAllAllocations()
1056+
require.NoError(t, err)
1057+
require.Empty(t, allocs)
1058+
_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
1059+
require.NoError(t, err)
1060+
require.Nil(t, ts)
1061+
}

client/client.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ type AllocRunner interface {
139139
ShutdownCh() <-chan struct{}
140140
Signal(taskName, signal string) error
141141
GetTaskEventHandler(taskName string) drivermanager.EventHandler
142+
PersistState() error
142143

143144
RestartTask(taskName string, taskEvent *structs.TaskEvent) error
144145
RestartAll(taskEvent *structs.TaskEvent) error
@@ -1084,7 +1085,7 @@ func (c *Client) saveState() error {
10841085

10851086
for id, ar := range runners {
10861087
go func(id string, ar AllocRunner) {
1087-
err := c.stateDB.PutAllocation(ar.Alloc())
1088+
err := ar.PersistState()
10881089
if err != nil {
10891090
c.logger.Error("error saving alloc state", "error", err, "alloc_id", id)
10901091
l.Lock()

0 commit comments

Comments
 (0)