From 554055b794c2cd8ef200eae9a78151e6dad04e96 Mon Sep 17 00:00:00 2001 From: Evan Baker Date: Tue, 21 Mar 2023 11:20:26 -0500 Subject: [PATCH] add metric for tracking failure to start the controller-runtime manager (#1860) Signed-off-by: Evan Baker --- cns/service/main.go | 3 ++- cns/service/metrics.go | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 cns/service/metrics.go diff --git a/cns/service/main.go b/cns/service/main.go index efa509e382..3e6b902c98 100644 --- a/cns/service/main.go +++ b/cns/service/main.go @@ -1236,7 +1236,8 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn if err := manager.Start(ctx); err != nil { logger.Errorf("[Azure CNS] Failed to start request controller: %v", err) // retry to start the request controller - // todo: add a CNS metric to count # of failures + // inc the managerStartFailures metric for failure tracking + managerStartFailures.Inc() } else { logger.Printf("exiting NodeNetworkConfig reconciler") return diff --git a/cns/service/metrics.go b/cns/service/metrics.go new file mode 100644 index 0000000000..2416a858bd --- /dev/null +++ b/cns/service/metrics.go @@ -0,0 +1,23 @@ +package main + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +// managerStartFailures is a monotonic counter which tracks the number of times the controller-runtime +// manager failed to start. To drive alerting based on this metric, it is recommended to use the rate +// of increase over a period of time. A positive rate of change indicates that the CNS is actively +// failing and retrying. +var managerStartFailures = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "manager_start_failures_total", + Help: "Number of times the controller-runtime manager failed to start.", + }, +) + +func init() { + metrics.Registry.MustRegister( + managerStartFailures, + ) +}