
Commit 3cb5551

Authored by Mahmood Ali
Merge pull request #7944 from hashicorp/b-health-checks-after-task-health
Allocs are healthy if service checks get healthy before task health
2 parents cf47153 + 22b65f2 commit 3cb5551

File tree: 2 files changed, +119 −3 lines

client/allochealth/tracker.go

+10 −3
@@ -210,7 +210,8 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) {
 }
 
 // setCheckHealth is used to mark the checks as either healthy or unhealthy.
-func (t *Tracker) setCheckHealth(healthy bool) {
+// returns true if health is propagated and no more health monitoring is needed
+func (t *Tracker) setCheckHealth(healthy bool) bool {
 	t.l.Lock()
 	defer t.l.Unlock()
 
@@ -220,7 +221,7 @@ func (t *Tracker) setCheckHealth(healthy bool) {
 
 	// Only signal if we are healthy and so is the tasks
 	if !t.checksHealthy {
-		return
+		return false
 	}
 
 	select {
@@ -230,6 +231,7 @@ func (t *Tracker) setCheckHealth(healthy bool) {
 
 	// Shutdown the tracker
 	t.cancelFn()
+	return true
 }
 
 // markAllocStopped is used to mark the allocation as having stopped.
@@ -379,7 +381,12 @@ OUTER:
 				allocReg = newAllocReg
 			}
 		case <-healthyTimer.C:
-			t.setCheckHealth(true)
+			if t.setCheckHealth(true) {
+				// final health set and propagated
+				return
+			}
+			// tasks are unhealthy, reset and wait until all is healthy
+			primed = false
 		}
 
 		if allocReg == nil {
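
For illustration, here is a minimal, self-contained sketch of the pattern this change introduces: the check-health setter reports whether overall health was actually propagated, so the watcher loop can either stop monitoring or reset and keep waiting for the tasks. The types and names below are simplified stand-ins for this sketch, not the real Nomad tracker.

// health_sketch.go: simplified illustration with hypothetical names, not Nomad's code.
package main

import (
	"fmt"
	"sync"
)

type tracker struct {
	l             sync.Mutex
	tasksHealthy  bool
	checksHealthy bool
	healthyCh     chan bool // buffered so this sketch never blocks
}

// setCheckHealth marks the checks healthy and reports whether overall health
// was propagated. It returns false when the tasks are not yet healthy, so the
// caller knows to keep monitoring instead of shutting down.
func (t *tracker) setCheckHealth(healthy bool) bool {
	t.l.Lock()
	defer t.l.Unlock()

	t.checksHealthy = healthy
	if !healthy || !t.tasksHealthy {
		// Checks passed before the tasks did: nothing to propagate yet.
		return false
	}

	t.healthyCh <- true
	return true
}

func main() {
	t := &tracker{healthyCh: make(chan bool, 1)}

	// Checks become healthy first: nothing is propagated, so a watcher
	// would reset its state and keep waiting.
	fmt.Println(t.setCheckHealth(true)) // false

	// Once the tasks are healthy too, the next pass propagates health and
	// monitoring can stop.
	t.tasksHealthy = true
	fmt.Println(t.setCheckHealth(true)) // true
	fmt.Println(<-t.healthyCh)          // true
}

In the real change, the propagation path also cancels the tracker's context; the sketch only shows the boolean contract that lets the loop above choose between returning and resetting primed.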

client/allochealth/tracker_test.go

+109 −0
@@ -227,3 +227,112 @@ func TestTracker_Healthy_IfBothTasksAndConsulChecksAreHealthy(t *testing.T) {
 		require.Fail(t, "expected a health status")
 	}
 }
+
+// TestTracker_Checks_Healthy_Before_TaskHealth asserts that we mark an alloc
+// healthy, if the checks pass before task health pass
+func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) {
+	t.Parallel()
+
+	alloc := mock.Alloc()
+	alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
+	task := alloc.Job.TaskGroups[0].Tasks[0]
+
+	// new task starting unhealthy, without services
+	task2 := task.Copy()
+	task2.Name = task2.Name + "2"
+	task2.Services = nil
+	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
+
+	// Synthesize running alloc and tasks
+	alloc.ClientStatus = structs.AllocClientStatusRunning
+	alloc.TaskStates = map[string]*structs.TaskState{
+		task.Name: {
+			State:     structs.TaskStateRunning,
+			StartedAt: time.Now(),
+		},
+		task2.Name: {
+			State: structs.TaskStatePending,
+		},
+	}
+
+	// Make Consul response
+	check := &consulapi.AgentCheck{
+		Name:   task.Services[0].Checks[0].Name,
+		Status: consulapi.HealthPassing,
+	}
+	taskRegs := map[string]*agentconsul.ServiceRegistrations{
+		task.Name: {
+			Services: map[string]*agentconsul.ServiceRegistration{
+				task.Services[0].Name: {
+					Service: &consulapi.AgentService{
+						ID:      "foo",
+						Service: task.Services[0].Name,
+					},
+					Checks: []*consulapi.AgentCheck{check},
+				},
+			},
+		},
+	}
+
+	logger := testlog.HCLogger(t)
+	b := cstructs.NewAllocBroadcaster(logger)
+	defer b.Close()
+
+	// Don't reply on the first call
+	var called uint64
+	consul := consul.NewMockConsulServiceClient(t, logger)
+	consul.AllocRegistrationsFn = func(string) (*agentconsul.AllocRegistration, error) {
+		if atomic.AddUint64(&called, 1) == 1 {
+			return nil, nil
+		}
+
+		reg := &agentconsul.AllocRegistration{
+			Tasks: taskRegs,
+		}
+
+		return reg, nil
+	}
+
+	ctx, cancelFn := context.WithCancel(context.Background())
+	defer cancelFn()
+
+	checkInterval := 10 * time.Millisecond
+	tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
+		time.Millisecond, true)
+	tracker.checkLookupInterval = checkInterval
+	tracker.Start()
+
+	// assert that we don't get marked healthy
+	select {
+	case <-time.After(4 * checkInterval):
+		// still unhealthy, good
+	case h := <-tracker.HealthyCh():
+		require.Fail(t, "unexpected health event", h)
+	}
+	require.False(t, tracker.tasksHealthy)
+	require.False(t, tracker.checksHealthy)
+
+	// now set task to healthy
+	runningAlloc := alloc.Copy()
+	runningAlloc.TaskStates = map[string]*structs.TaskState{
+		task.Name: {
+			State:     structs.TaskStateRunning,
+			StartedAt: time.Now(),
+		},
+		task2.Name: {
+			State:     structs.TaskStateRunning,
+			StartedAt: time.Now(),
+		},
+	}
+	err := b.Send(runningAlloc)
+	require.NoError(t, err)
+
+	// eventually, it is marked as healthy
+	select {
+	case <-time.After(4 * checkInterval):
+		require.Fail(t, "timed out while waiting for health")
+	case h := <-tracker.HealthyCh():
+		require.True(t, h)
+	}
+
+}
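
Assuming a standard Go toolchain and a checkout of the repository root, the new regression test can likely be run on its own with something like the following (the exact invocation is an assumption, not part of the commit):

go test -v -run TestTracker_Checks_Healthy_Before_TaskHealth ./client/allochealth/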
