Skip to content

Commit ba5bb14

Browse files
Backport of client: ignore restart issued to terminal allocations into release/1.5.x (#17211)
This pull request was automerged via backport-assistant
1 parent c09e319 commit ba5bb14

File tree

4 files changed

+129
-0
lines changed

4 files changed

+129
-0
lines changed

.changelog/17175.txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
```release-note:bug
2+
client: Fixed a bug where restarting a terminal allocation turned it into a zombie whose allocation and task hooks would run unexpectedly
3+
```

client/allocrunner/alloc_runner.go

+6
Original file line numberDiff line numberDiff line change
@@ -1264,6 +1264,12 @@ func (ar *allocRunner) RestartAll(event *structs.TaskEvent) error {
12641264

12651265
// restartTasks restarts all task runners concurrently.
12661266
func (ar *allocRunner) restartTasks(ctx context.Context, event *structs.TaskEvent, failure bool, force bool) error {
1267+
1268+
// ensure we are not trying to restart an alloc that is terminal
1269+
if !ar.shouldRun() {
1270+
return fmt.Errorf("restart of an alloc that should not run")
1271+
}
1272+
12671273
waitCh := make(chan struct{})
12681274
var err *multierror.Error
12691275
var errMutex sync.Mutex

e2e/clientstate/allocs_test.go

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// Copyright (c) HashiCorp, Inc.
2+
// SPDX-License-Identifier: MPL-2.0
3+
4+
package clientstate
5+
6+
import (
7+
"testing"
8+
"time"
9+
10+
"github.com/hashicorp/nomad/e2e/e2eutil"
11+
"github.com/hashicorp/nomad/helper/uuid"
12+
"github.com/shoenig/test/must"
13+
"github.com/shoenig/test/wait"
14+
)
15+
16+
// TestClientAllocs is the top-level entry point for the client allocation
// e2e tests in this package. It obtains a Nomad API client, calls the e2eutil
// leader and node-readiness helpers (asking for 1 node), and then runs each
// scenario as a named subtest.
func TestClientAllocs(t *testing.T) {
17+
nomad := e2eutil.NomadClient(t)
18+
19+
// cluster preconditions; presumably these block until a leader is elected
// and at least one client node is ready -- confirm against the e2eutil helpers
e2eutil.WaitForLeader(t, nomad)
20+
e2eutil.WaitForNodesReady(t, nomad, 1)
21+
22+
t.Run("testAllocZombie", testAllocZombie)
23+
}
24+
25+
// testAllocZombie ensures that a restart of a dead allocation does not cause
26+
// it to come back to life in a not-quite alive state.
27+
//
// The test registers ./input/alloc_zombie.nomad under a unique job ID, waits
// for the job's first allocation to reach the "failed" status and for
// rescheduled allocations to show up, and then runs `nomad alloc restart`
// against that first allocation. The command is expected to fail with the
// "restart of an alloc that should not run" error.
//
28+
// https://github.com/hashicorp/nomad/issues/17079
29+
func testAllocZombie(t *testing.T) {
30+
nomad := e2eutil.NomadClient(t)
31+
32+
// unique job ID per run; the cleanup func is handed a pointer to the ID slice
jobID := "alloc-zombie-" + uuid.Short()
33+
jobIDs := []string{jobID}
34+
t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs))
35+
36+
// register the job; waiting for its alloc to fail happens further below
37+
err := e2eutil.Register(jobID, "./input/alloc_zombie.nomad")
38+
must.NoError(t, err)
39+
40+
// remember the ID of the job's initial allocation; it is the restart target
// at the end of the test (meaning of the "" and 0 arguments is defined by
// e2eutil.SingleAllocID -- presumably namespace and index, TODO confirm)
allocID := e2eutil.SingleAllocID(t, jobID, "", 0)
41+
42+
// wait for alloc to be marked as failed
43+
e2eutil.WaitForAllocStatus(t, nomad, allocID, "failed")
44+
45+
// wait until more than two statuses are reported by AllocStatusesRescheduled,
// taken as evidence that the failed alloc has been rescheduled; checked every
// 1s for up to 1 minute
46+
must.Wait(t, wait.InitialSuccess(
47+
wait.BoolFunc(func() bool {
48+
statuses, err := e2eutil.AllocStatusesRescheduled(jobID, "")
49+
must.NoError(t, err)
50+
return len(statuses) > 2
51+
}),
52+
wait.Timeout(1*time.Minute),
53+
wait.Gap(1*time.Second),
54+
))
55+
56+
// now attempt to restart our initial allocation
57+
// which should do nothing but give us an error; the expected error text is
// the message returned by allocRunner.restartTasks for an alloc that should
// not run
58+
output, err := e2eutil.Command("nomad", "alloc", "restart", allocID)
59+
must.ErrorContains(t, err, "restart of an alloc that should not run")
60+
must.StrContains(t, output, "Failed to restart allocation")
61+
}
e2e/clientstate/input/alloc_zombie.nomad

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Copyright (c) HashiCorp, Inc.
2+
# SPDX-License-Identifier: MPL-2.0
3+
4+
# Job used by the testAllocZombie e2e test. It is built so that its allocation
# fails shortly after starting and is then rescheduled: the service check
# targets a path that is expected not to exist, check_restart reacts to the
# failing check, and the restart block allows no local restart attempts.
job "alloc_zombie" {
5+
6+
group "group" {
7+
network {
8+
mode = "host"
9+
# no static port value is set for the "http" label
port "http" {}
10+
}
11+
12+
service {
13+
name = "alloczombie"
14+
port = "http"
15+
provider = "nomad"
16+
17+
check {
18+
name = "alloczombiecheck"
19+
type = "http"
20+
port = "http"
21+
# intentionally bogus path so the check keeps failing (the task serves /tmp,
# which presumably has no such file)
path = "/does/not/exist.txt"
22+
interval = "2s"
23+
timeout = "1s"
24+
# trigger a restart after a single failed check, once the 3s grace has passed
check_restart {
25+
limit = 1
26+
grace = "3s"
27+
}
28+
}
29+
}
30+
31+
# bounded rescheduling: up to 3 attempts per 1m interval, constant 5s delay
reschedule {
32+
attempts = 3
33+
interval = "1m"
34+
delay = "5s"
35+
delay_function = "constant"
36+
unlimited = false
37+
}
38+
39+
# zero restart attempts in "fail" mode, so a triggered restart is not retried
# in place
restart {
40+
attempts = 0
41+
delay = "5s"
42+
mode = "fail"
43+
}
44+
45+
task "python" {
46+
driver = "raw_exec"
47+
48+
# runs python3's http.server module on the port labelled "http", serving /tmp
config {
49+
command = "python3"
50+
args = ["-m", "http.server", "${NOMAD_PORT_http}", "--directory", "/tmp"]
51+
}
52+
53+
resources {
54+
cpu = 10
55+
memory = 64
56+
}
57+
}
58+
}
59+
}

0 commit comments

Comments
 (0)