Skip to content

Commit

Permalink
add metric for tracking failure to start the controller-runtime manag…
Browse files Browse the repository at this point in the history
…er (#1860)

Signed-off-by: Evan Baker <[email protected]>
  • Loading branch information
rbtr authored Sep 8, 2023
1 parent 16eeee7 commit 554055b
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 1 deletion.
3 changes: 2 additions & 1 deletion cns/service/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1236,7 +1236,8 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
if err := manager.Start(ctx); err != nil {
logger.Errorf("[Azure CNS] Failed to start request controller: %v", err)
// retry to start the request controller
// todo: add a CNS metric to count # of failures
// inc the managerStartFailures metric for failure tracking
managerStartFailures.Inc()
} else {
logger.Printf("exiting NodeNetworkConfig reconciler")
return
Expand Down
23 changes: 23 additions & 0 deletions cns/service/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package main

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// managerStartFailures is a monotonic counter of how many times the
// controller-runtime manager has failed to start.
//
// Alerting should be driven by the rate of increase over a time window rather
// than the absolute value: a positive rate of change means CNS is actively
// failing to start the manager and retrying.
var managerStartFailures = prometheus.NewCounter(prometheus.CounterOpts{
	Name: "manager_start_failures_total",
	Help: "Number of times the controller-runtime manager failed to start.",
})

// init registers the CNS service metrics with the controller-runtime metrics
// registry. MustRegister panics if registration fails.
func init() {
	metrics.Registry.MustRegister(managerStartFailures)
}

0 comments on commit 554055b

Please sign in to comment.