From 674a1b52fa3951dcc5b4dbc7b2197cc0baa7d62b Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 4 Feb 2022 13:42:06 -0500 Subject: [PATCH 1/2] scheduler: recover from panic If processing a specific evaluation causes the scheduler (and therefore the entire server) to panic, that evaluation will never get a chance to be nack'd and cleared from the state store. It will get dequeued by another scheduler, causing that server to panic, and so forth until all servers are in a panic loop. This prevents the operator from intervening to remove the evaluation or update the state. Recover the goroutine from the top-level `Process` methods for each scheduler so that this condition can be detected without panicking the server process. This will lead to a loop of recovering the scheduler goroutine until the eval can be removed or nack'd, but that's much better than taking a downtime. --- .changelog/12009.txt | 3 +++ scheduler/generic_sched.go | 9 ++++++++- scheduler/scheduler_system.go | 8 +++++++- 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 .changelog/12009.txt diff --git a/.changelog/12009.txt b/.changelog/12009.txt new file mode 100644 index 00000000000..e57cbca6a45 --- /dev/null +++ b/.changelog/12009.txt @@ -0,0 +1,3 @@ +```release-note:improvement +scheduler: recover scheduler goroutines on panic +``` diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index f263d864b24..2d83caad195 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -125,7 +125,14 @@ func NewBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state Sta } // Process is used to handle a single evaluation -func (s *GenericScheduler) Process(eval *structs.Evaluation) error { +func (s *GenericScheduler) Process(eval *structs.Evaluation) (err error) { + + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("processing eval %q panicked scheduler: %v", eval.ID, r) + } + }() + // Store the evaluation s.eval = eval diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index dff79aa8bc2..fefdc0101cf 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -72,7 +72,13 @@ func NewSysBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state } // Process is used to handle a single evaluation. -func (s *SystemScheduler) Process(eval *structs.Evaluation) error { +func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) { + + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("processing eval %q panicked scheduler: %v", eval.ID, r) + } + }() // Store the evaluation s.eval = eval From f13c7beea62df33be2fb54fb84ac438b85b5e72e Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Mon, 7 Feb 2022 11:11:28 -0500 Subject: [PATCH 2/2] address comments from code review --- scheduler/generic_sched.go | 2 +- scheduler/scheduler_system.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 2d83caad195..6d6c262db6c 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -129,7 +129,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) (err error) { defer func() { if r := recover(); r != nil { - err = fmt.Errorf("processing eval %q panicked scheduler: %v", eval.ID, r) + err = fmt.Errorf("processing eval %q panicked scheduler - please report this as a bug! - %v", eval.ID, r) } }() diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index fefdc0101cf..59de58d8986 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -76,7 +76,7 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) { defer func() { if r := recover(); r != nil { - err = fmt.Errorf("processing eval %q panicked scheduler: %v", eval.ID, r) + err = fmt.Errorf("processing eval %q panicked scheduler - please report this as a bug! - %v", eval.ID, r) } }()