Skip to content

Commit 025a613

Browse files
committed
don't emit follow-up eval for core jobs
1 parent 5895555 commit 025a613

File tree

2 files changed

+29
-21
lines changed

2 files changed

+29
-21
lines changed

nomad/core_sched_test.go

+5 -3
Original file line number | Diff line number | Diff line change
@@ -2449,7 +2449,9 @@ func TestCoreScheduler_FailLoop(t *testing.T) {
2449 2449

2450 2450
out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5)
2451 2451
require.NoError(err)
2452-
require.Nil(out,
2453-
"failed core jobs should not result in follow-up. TriggeredBy: %v",
2454-
out.TriggeredBy)
2452+
if out != nil {
2453+
t.Fatalf(
2454+
"failed core jobs should not result in follow-up. TriggeredBy: %v",
2455+
out.TriggeredBy)
2456+
}
2455 2457
}

nomad/leader.go

+24 -18
Original file line number | Diff line number | Diff line change
@@ -640,25 +640,31 @@ func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
640 640
updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
641 641
s.logger.Warn("eval reached delivery limit, marking as failed", "eval", updateEval.GoString())
642 642

643-
// Create a follow-up evaluation that will be used to retry the
644-
// scheduling for the job after the cluster is hopefully more stable
645-
// due to the fairly large backoff.
646-
followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
647-
time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
648-
649-
followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
650-
updateEval.NextEval = followupEval.ID
651-
updateEval.UpdateModifyTime()
652-
653-
// Update via Raft
654-
req := structs.EvalUpdateRequest{
655-
Evals: []*structs.Evaluation{updateEval, followupEval},
656-
}
657-
if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
658-
s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err)
659-
continue
643+
// Core job evals that fail or span leader elections will never
644+
// succeed because the follow-up doesn't have the leader ACL. We
645+
// rely on the leader to schedule new core jobs periodically
646+
// instead.
647+
if eval.Type != structs.JobTypeCore {
648+
649+
// Create a follow-up evaluation that will be used to retry the
650+
// scheduling for the job after the cluster is hopefully more stable
651+
// due to the fairly large backoff.
652+
followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
653+
time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
654+
655+
followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
656+
updateEval.NextEval = followupEval.ID
657+
updateEval.UpdateModifyTime()
658+
659+
// Update via Raft
660+
req := structs.EvalUpdateRequest{
661+
Evals: []*structs.Evaluation{updateEval, followupEval},
662+
}
663+
if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
664+
s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err)
665+
continue
666+
}
660 667
}
661-
662 668
// Ack completion
663 669
s.evalBroker.Ack(eval.ID, token)
664 670
}

0 commit comments

Comments
 (0)