Skip to content

Commit

Permalink
agent: Add runner panics metrics (#180)
Browse files Browse the repository at this point in the history
Typically, panics would be visible in other ways, like K8s events. But
because the autoscaler-agent isolates each panic to the threads
handling a single VM, panics can go unnoticed unless we do something
about them.
  • Loading branch information
sharnoff authored Apr 19, 2023
1 parent 7d2c3cb commit 649ace1
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 0 deletions.
35 changes: 35 additions & 0 deletions pkg/agent/prommetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ type PromMetrics struct {
schedulerRequests *prometheus.CounterVec
informantRequestsOutbound *prometheus.CounterVec
informantRequestsInbound *prometheus.CounterVec
runnerThreadPanics prometheus.Counter
}

func makePrometheusParts(globalstate *agentState) (PromMetrics, *prometheus.Registry) {
Expand All @@ -33,6 +34,37 @@ func makePrometheusParts(globalstate *agentState) (PromMetrics, *prometheus.Regi
},
[]string{"endpoint", "code"},
)
runnerThreadPanics := prometheus.NewCounter(
prometheus.CounterOpts{
Name: "autoscaling_agent_runner_thread_panics_total",
Help: "Number of panics from autoscaler-agent per-VM runner threads",
},
)
totalPanickedVMs := prometheus.NewGaugeFunc(
prometheus.GaugeOpts{
Name: "autoscaling_panicked_vm_runners_current",
Help: "Number of VMs whose per-VM runner has panicked (and not restarted)",
},
func() float64 {
globalstate.lock.Lock()
defer globalstate.lock.Unlock()

count := 0

for _, p := range globalstate.pods {
func() {
p.status.mu.Lock()
defer p.status.mu.Unlock()

if p.status.panicked {
count += 1
}
}()
}

return float64(count)
},
)
totalVMs := prometheus.NewGaugeFunc(
prometheus.GaugeOpts{
Name: "autoscaling_agent_tracked_vms_current",
Expand Down Expand Up @@ -73,6 +105,8 @@ func makePrometheusParts(globalstate *agentState) (PromMetrics, *prometheus.Regi
schedulerRequests,
informantRequestsOutbound,
informantRequestsInbound,
runnerThreadPanics,
totalPanickedVMs,
totalVMs,
totalVMsWithUnhealthyInformants,
)
Expand All @@ -81,5 +115,6 @@ func makePrometheusParts(globalstate *agentState) (PromMetrics, *prometheus.Regi
schedulerRequests: schedulerRequests,
informantRequestsOutbound: informantRequestsOutbound,
informantRequestsInbound: informantRequestsInbound,
runnerThreadPanics: runnerThreadPanics,
}, reg
}
2 changes: 2 additions & 0 deletions pkg/agent/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,8 @@ func (r *Runner) spawnBackgroundWorker(ctx context.Context, name string, f func(
r.backgroundWorkerCount.Add(-1)

if v := recover(); v != nil {
r.global.metrics.runnerThreadPanics.Inc()

err := fmt.Errorf("background worker %q panicked: %v", name, v)
r.logger.Errorf("%s", err)
// note: In Go, the stack doesn't "unwind" on panic. Instead, a panic will traverse up
Expand Down

0 comments on commit 649ace1

Please sign in to comment.