Commit 108a422

failed core jobs should not have follow-ups (#8682)
If a core job eval fails more times than the delivery limit, the leader creates a new eval with the TriggeredBy field set to `failed-follow-up`. Evaluations for core jobs carry the leader's ACL, which is not valid on another leader after an election. The `failed-follow-up` evals have no ACL at all, so core job evals that exceed the delivery limit or that span a leader election can never succeed and are re-enqueued forever. Therefore we should not retry core job evals with a `failed-follow-up`.
1 parent: a8b144c
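
In short, the reaper now skips the follow-up for core job evals and relies on the leader to enqueue fresh core job evals on its next interval. A condensed sketch of that guard, assuming the surrounding reapFailedEvaluations loop and its variables (`eval`, `updateEval`, `token`) from nomad/leader.go; the full diff is below:

// Sketch only: condensed from the reapFailedEvaluations change in this commit.
// Core job evals carry the enqueuing leader's ACL; a failed-follow-up eval
// would have no ACL, so retrying a core job this way can never succeed.
// Skip the follow-up and rely on the leader to enqueue fresh core job evals.
if eval.Type != structs.JobTypeCore {
	followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
		time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
	followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
	updateEval.NextEval = followupEval.ID
	// ... submit updateEval and followupEval via Raft as before ...
}
// Ack the failed eval either way so the broker stops redelivering it.
s.evalBroker.Ack(eval.ID, token)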

2 files changed: +85 −18 lines

2 files changed

+85
-18
lines changed

nomad/core_sched_test.go (+61)
@@ -2394,3 +2394,64 @@ func TestCoreScheduler_CSIVolumeClaimGC(t *testing.T) {
 	}, time.Second*1, 10*time.Millisecond, "claims were released unexpectedly")
 
 }
+
+func TestCoreScheduler_FailLoop(t *testing.T) {
+	t.Parallel()
+	require := require.New(t)
+
+	srv, cleanupSrv := TestServer(t, func(c *Config) {
+		c.NumSchedulers = 0 // Prevent automatic dequeue
+		c.EvalDeliveryLimit = 2
+		c.EvalFailedFollowupBaselineDelay = time.Duration(50 * time.Millisecond)
+		c.EvalFailedFollowupDelayRange = time.Duration(1 * time.Millisecond)
+	})
+	defer cleanupSrv()
+	codec := rpcClient(t, srv)
+	sched := []string{structs.JobTypeCore}
+
+	testutil.WaitForResult(func() (bool, error) {
+		return srv.evalBroker.Enabled(), nil
+	}, func(err error) {
+		t.Fatalf("should enable eval broker")
+	})
+
+	// Enqueue a core job eval that can never succeed because it was enqueued
+	// by another leader that's now gone
+	expected := srv.coreJobEval(structs.CoreJobCSIPluginGC, 100)
+	expected.LeaderACL = "nonsense"
+	srv.evalBroker.Enqueue(expected)
+
+	nack := func(evalID, token string) error {
+		req := &structs.EvalAckRequest{
+			EvalID:       evalID,
+			Token:        token,
+			WriteRequest: structs.WriteRequest{Region: "global"},
+		}
+		var resp structs.GenericResponse
+		return msgpackrpc.CallWithCodec(codec, "Eval.Nack", req, &resp)
+	}
+
+	out, token, err := srv.evalBroker.Dequeue(sched, time.Second*5)
+	require.NoError(err)
+	require.NotNil(out)
+	require.Equal(expected, out)
+
+	// first fail
+	require.NoError(nack(out.ID, token))
+
+	out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5)
+	require.NoError(err)
+	require.NotNil(out)
+	require.Equal(expected, out)
+
+	// second fail, should not result in failed-follow-up
+	require.NoError(nack(out.ID, token))
+
+	out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5)
+	require.NoError(err)
+	if out != nil {
+		t.Fatalf(
+			"failed core jobs should not result in follow-up. TriggeredBy: %v",
+			out.TriggeredBy)
+	}
+}
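
To exercise just the new test locally, a standard `go test` invocation scoped to the nomad package should work (the exact invocation is an assumption, not part of this change):

go test ./nomad -run TestCoreScheduler_FailLoop -v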

nomad/leader.go (+24 −18)
@@ -640,25 +640,31 @@ func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
 			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
 			s.logger.Warn("eval reached delivery limit, marking as failed", "eval", updateEval.GoString())
 
-			// Create a follow-up evaluation that will be used to retry the
-			// scheduling for the job after the cluster is hopefully more stable
-			// due to the fairly large backoff.
-			followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
-				time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
-
-			followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
-			updateEval.NextEval = followupEval.ID
-			updateEval.UpdateModifyTime()
-
-			// Update via Raft
-			req := structs.EvalUpdateRequest{
-				Evals: []*structs.Evaluation{updateEval, followupEval},
-			}
-			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
-				s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err)
-				continue
+			// Core job evals that fail or span leader elections will never
+			// succeed because the follow-up doesn't have the leader ACL. We
+			// rely on the leader to schedule new core jobs periodically
+			// instead.
+			if eval.Type != structs.JobTypeCore {
+
+				// Create a follow-up evaluation that will be used to retry the
+				// scheduling for the job after the cluster is hopefully more stable
+				// due to the fairly large backoff.
+				followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
+					time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
+
+				followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
+				updateEval.NextEval = followupEval.ID
+				updateEval.UpdateModifyTime()
+
+				// Update via Raft
+				req := structs.EvalUpdateRequest{
+					Evals: []*structs.Evaluation{updateEval, followupEval},
+				}
+				if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
+					s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err)
+					continue
+				}
 			}
-
 			// Ack completion
 			s.evalBroker.Ack(eval.ID, token)
 		}
