Skip to content

Commit 3d4c185

Browse files
committed
Reconciling the queued allocations during restore
1 parent a9c995b commit 3d4c185

File tree

4 files changed

+144
-10
lines changed

4 files changed

+144
-10
lines changed

nomad/fsm.go

+65-2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/armon/go-metrics"
1010
"github.com/hashicorp/nomad/nomad/state"
1111
"github.com/hashicorp/nomad/nomad/structs"
12+
"github.com/hashicorp/nomad/scheduler"
1213
"github.com/hashicorp/raft"
1314
"github.com/ugorji/go/codec"
1415
)
@@ -579,11 +580,73 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error {
579580

580581
// Create Job Summaries
581582
// COMPAT 0.4 -> 0.4.1
582-
if err := restore.CreateJobSummaries(); err != nil {
583+
jobs, err := restore.JobsWithoutSummary()
584+
if err != nil {
585+
fmt.Errorf("error retreiving jobs during restore: %v", err)
586+
}
587+
if err := restore.CreateJobSummaries(jobs); err != nil {
583588
return fmt.Errorf("error creating job summaries: %v", err)
584589
}
585590

586-
// Commit the state restore
591+
restore.Commit()
592+
return n.reconcileSummaries(jobs)
593+
}
594+
595+
// reconcileSummaries re-calculates the queued allocations for every job that we
596+
// created a Job Summary during the snap shot restore
597+
func (n *nomadFSM) reconcileSummaries(jobs []*structs.Job) error {
598+
// Start the state restore
599+
restore, err := n.state.Restore()
600+
if err != nil {
601+
return err
602+
}
603+
defer restore.Abort()
604+
snap, err := n.state.Snapshot()
605+
if err != nil {
606+
return fmt.Errorf("unable to create snapshot: %v", err)
607+
}
608+
for _, job := range jobs {
609+
planner := &scheduler.Harness{
610+
State: &snap.StateStore,
611+
}
612+
// Create an eval and mark it as requiring annotations and insert that as well
613+
eval := &structs.Evaluation{
614+
ID: structs.GenerateUUID(),
615+
Priority: job.Priority,
616+
Type: job.Type,
617+
TriggeredBy: structs.EvalTriggerJobRegister,
618+
JobID: job.ID,
619+
JobModifyIndex: job.JobModifyIndex + 1,
620+
Status: structs.EvalStatusPending,
621+
AnnotatePlan: true,
622+
}
623+
624+
// Create the scheduler and run it
625+
sched, err := scheduler.NewScheduler(eval.Type, n.logger, snap, planner)
626+
if err != nil {
627+
return err
628+
}
629+
630+
if err := sched.Process(eval); err != nil {
631+
return err
632+
}
633+
summary, err := snap.JobSummaryByID(job.ID)
634+
if err != nil {
635+
return err
636+
}
637+
for tg, queued := range planner.Evals[0].QueuedAllocations {
638+
tgSummary, ok := summary.Summary[tg]
639+
if !ok {
640+
return fmt.Errorf("task group %q not found while updating queued count", tg)
641+
}
642+
tgSummary.Queued = queued
643+
summary.Summary[tg] = tgSummary
644+
}
645+
646+
if err := restore.JobSummaryRestore(summary); err != nil {
647+
return err
648+
}
649+
}
587650
restore.Commit()
588651
return nil
589652
}

nomad/fsm_test.go

+26
Original file line numberDiff line numberDiff line change
@@ -972,3 +972,29 @@ func TestFSM_SnapshotRestore_JobSummary(t *testing.T) {
972972
t.Fatalf("bad: \n%#v\n%#v", js2, out2)
973973
}
974974
}
975+
976+
func TestFSM_SnapshotRestore_AddMissingSummary(t *testing.T) {
977+
// Add some state
978+
fsm := testFSM(t)
979+
state := fsm.State()
980+
981+
job1 := mock.Job()
982+
state.UpsertJob(1000, job1)
983+
state.DeleteJobSummary(1010, job1.ID)
984+
985+
fsm2 := testSnapshotRestore(t, fsm)
986+
state2 := fsm2.State()
987+
out1, _ := state2.JobSummaryByID(job1.ID)
988+
expected := structs.JobSummary{
989+
JobID: job1.ID,
990+
Summary: map[string]structs.TaskGroupSummary{
991+
"web": structs.TaskGroupSummary{
992+
Queued: 10,
993+
},
994+
},
995+
}
996+
997+
if !reflect.DeepEqual(&expected, out1) {
998+
t.Fatalf("expected: %#v, actual: %#v", expected, out1)
999+
}
1000+
}

nomad/state/state_store.go

+28-7
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,20 @@ func (s *StateStore) UpsertJobSummary(index uint64, jobSummary *structs.JobSumma
9898
return nil
9999
}
100100

101+
// DeleteJobSummary deletes the job summary with the given ID. This is for
102+
// testing purposes only.
103+
func (s *StateStore) DeleteJobSummary(index uint64, id string) error {
104+
txn := s.db.Txn(true)
105+
defer txn.Abort()
106+
107+
// Delete the job summary
108+
if _, err := txn.DeleteAll("job_summary", "id", id); err != nil {
109+
return fmt.Errorf("deleting job summary failed: %v", err)
110+
}
111+
txn.Commit()
112+
return nil
113+
}
114+
101115
// UpsertNode is used to register a node or update a node definition
102116
// This is assumed to be triggered by the client, so we retain the value
103117
// of drain which is set by the scheduler.
@@ -1501,13 +1515,13 @@ func (r *StateRestore) JobSummaryRestore(jobSummary *structs.JobSummary) error {
15011515
return nil
15021516
}
15031517

1504-
// CreateJobSummaries computes the job summaries for all the jobs
1505-
func (r *StateRestore) CreateJobSummaries() error {
1518+
// JobsWithoutSummary returns the list of jobs which don't have any summary
1519+
func (r *StateRestore) JobsWithoutSummary() ([]*structs.Job, error) {
15061520
// Get all the jobs
15071521
var jobs []*structs.Job
15081522
iter, err := r.txn.Get("jobs", "id")
15091523
if err != nil {
1510-
return fmt.Errorf("couldn't retrieve jobs: %v", err)
1524+
return nil, fmt.Errorf("couldn't retrieve jobs: %v", err)
15111525
}
15121526
for {
15131527
raw := iter.Next()
@@ -1517,21 +1531,24 @@ func (r *StateRestore) CreateJobSummaries() error {
15171531

15181532
// Filter the jobs which have summaries
15191533
job := raw.(*structs.Job)
1520-
jobSummary, err := r.txn.Get("job_summary", "id", job.ID)
1534+
jobSummary, err := r.txn.First("job_summary", "id", job.ID)
15211535
if err != nil {
1522-
return fmt.Errorf("unable to get job summary: %v", err)
1536+
return nil, fmt.Errorf("unable to get job summary: %v", err)
15231537
}
15241538
if jobSummary != nil {
15251539
continue
15261540
}
15271541

15281542
jobs = append(jobs, job)
15291543
}
1544+
return jobs, nil
1545+
}
15301546

1547+
// CreateJobSummaries computes the job summaries for all the jobs
1548+
func (r *StateRestore) CreateJobSummaries(jobs []*structs.Job) error {
15311549
for _, job := range jobs {
1532-
15331550
// Get all the allocations for the job
1534-
iter, err = r.txn.Get("allocs", "job", job.ID)
1551+
iter, err := r.txn.Get("allocs", "job", job.ID)
15351552
if err != nil {
15361553
return fmt.Errorf("couldn't retrieve allocations for job %v: %v", job.ID, err)
15371554
}
@@ -1549,6 +1566,9 @@ func (r *StateRestore) CreateJobSummaries() error {
15491566
JobID: job.ID,
15501567
Summary: make(map[string]structs.TaskGroupSummary),
15511568
}
1569+
for _, tg := range job.TaskGroups {
1570+
summary.Summary[tg.Name] = structs.TaskGroupSummary{}
1571+
}
15521572
// Calculate the summary for the job
15531573
for _, alloc := range allocs {
15541574
if _, ok := summary.Summary[alloc.TaskGroup]; !ok {
@@ -1570,6 +1590,7 @@ func (r *StateRestore) CreateJobSummaries() error {
15701590
summary.Summary[alloc.TaskGroup] = tg
15711591
}
15721592
// Insert the job summary
1593+
15731594
if err := r.txn.Insert("job_summary", summary); err != nil {
15741595
return fmt.Errorf("error inserting job summary: %v", err)
15751596
}

nomad/state/state_store_test.go

+25-1
Original file line numberDiff line numberDiff line change
@@ -1132,7 +1132,7 @@ func TestStateStore_CreateJobSummaries(t *testing.T) {
11321132
}
11331133

11341134
// Create the job summaries
1135-
if err := restore.CreateJobSummaries(); err != nil {
1135+
if err := restore.CreateJobSummaries([]*structs.Job{job}); err != nil {
11361136
t.Fatalf("err: %v", err)
11371137
}
11381138
restore.Commit()
@@ -1155,6 +1155,30 @@ func TestStateStore_CreateJobSummaries(t *testing.T) {
11551155
}
11561156
}
11571157

1158+
func TestStateRestore_JobsWithoutSummaries(t *testing.T) {
1159+
state := testStateStore(t)
1160+
restore, err := state.Restore()
1161+
if err != nil {
1162+
t.Fatalf("err: %v", err)
1163+
}
1164+
// Restore a job
1165+
job := mock.Job()
1166+
if err := restore.JobRestore(job); err != nil {
1167+
t.Fatalf("err: %v", err)
1168+
}
1169+
1170+
jobs, err := restore.JobsWithoutSummary()
1171+
if err != nil {
1172+
t.Fatalf("err: %v", err)
1173+
}
1174+
if len(jobs) != 1 {
1175+
t.Fatalf("expected: %v, actual: %v", 1, len(jobs))
1176+
}
1177+
if !reflect.DeepEqual(job, jobs[0]) {
1178+
t.Fatalf("Bad: %#v %#v", job, jobs[0])
1179+
}
1180+
}
1181+
11581182
func TestStateStore_Indexes(t *testing.T) {
11591183
state := testStateStore(t)
11601184
node := mock.Node()

0 commit comments

Comments
 (0)