Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[15090] Ensure no leakage of evaluations for batch jobs. #15097

Merged
merged 12 commits into from
Jan 31, 2023
Prev Previous commit
Next Next commit
Changes requested by lgfa29 and tgross, primarily to the handling of batch
evals in the face of a job purge.
  • Loading branch information
stswidwinski committed Jan 30, 2023
commit 4609d8e0b6112cadb5157ea275e9221914dd10ea
2 changes: 1 addition & 1 deletion nomad/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ func DefaultConfig() *Config {
ReconcileInterval: 60 * time.Second,
EvalGCInterval: 5 * time.Minute,
EvalGCThreshold: 1 * time.Hour,
BatchEvalGCThreshold: 168 * time.Hour,
BatchEvalGCThreshold: 24 * time.Hour,
JobGCInterval: 5 * time.Minute,
JobGCThreshold: 4 * time.Hour,
NodeGCInterval: 5 * time.Minute,
Expand Down
34 changes: 11 additions & 23 deletions nomad/core_sched.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ OUTER:
allEvalsGC := true
var jobAlloc, jobEval []string
for _, eval := range evals {
gc, allocs, err := c.gcEval(eval, oldThreshold, oldThreshold, true)
gc, allocs, err := c.gcEval(eval, oldThreshold, true)
if err != nil {
continue OUTER
} else if gc {
Expand Down Expand Up @@ -246,9 +246,12 @@ func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
for raw := iter.Next(); raw != nil; raw = iter.Next() {
eval := raw.(*structs.Evaluation)

// The Evaluation GC should not handle batch jobs since those need to be
// garbage collected in one shot
gc, allocs, err := c.gcEval(eval, oldThreshold, batchOldThreshold, false)
gcThreshold := oldThreshold
if eval.Type == structs.JobTypeBatch {
gcThreshold = batchOldThreshold
}

gc, allocs, err := c.gcEval(eval, gcThreshold, false)
if err != nil {
return err
}
Expand All @@ -274,15 +277,10 @@ func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
// allocs are not older than the threshold. If the eval should be garbage
// collected, the IDs of the associated allocs that should be removed along
// with it are returned as well
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, batchThresholdIndex uint64, allowBatch bool) (
func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, allowBatch bool) (
bool, []string, error) {
// Ignore non-terminal and new evaluations
if !eval.TerminalStatus() {
return false, nil, nil
}

if (eval.Type == structs.JobTypeBatch && eval.ModifyIndex > batchThresholdIndex) ||
(eval.Type != structs.JobTypeBatch && eval.ModifyIndex > thresholdIndex) {
if !eval.TerminalStatus() || eval.ModifyIndex > thresholdIndex {
return false, nil, nil
}

Expand Down Expand Up @@ -319,19 +317,9 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
// - allowBatch and the job is dead
//
// If we cannot collect outright, check if a partial GC may occur
collect := false
if job == nil {
collect = true
} else if job.Status != structs.JobStatusDead {
collect = false
} else if job.Stop {
collect = true
} else if allowBatch {
collect = true
}

collect := job == nil || job.Status == structs.JobStatusDead && (job.Stop || allowBatch)
if !collect {
oldAllocs := olderVersionTerminalAllocs(allocs, job, batchThresholdIndex)
oldAllocs := olderVersionTerminalAllocs(allocs, job, thresholdIndex)
gcEval := (len(oldAllocs) == len(allocs))
return gcEval, oldAllocs, nil
}
Expand Down