Skip to content

Commit

Permalink
Add client-go metrics (#131)
Browse files Browse the repository at this point in the history
## Overview
Register a few metric callbacks with the client-go metrics interface so that we can monitor request latencies and rate limiting of kubeclient.

```
❯ curl http://localhost:10254/metrics | rg k8s_client
# HELP k8s_client_rate_limiter_latency Kubernetes client rate limiter latency in seconds
# TYPE k8s_client_rate_limiter_latency histogram
k8s_client_rate_limiter_latency_bucket{verb="GET",le="0.005"} 84
k8s_client_rate_limiter_latency_bucket{verb="GET",le="0.01"} 87
k8s_client_rate_limiter_latency_bucket{verb="GET",le="0.025"} 89
k8s_client_rate_limiter_latency_bucket{verb="GET",le="0.05"} 99
k8s_client_rate_limiter_latency_bucket{verb="GET",le="0.1"} 114
k8s_client_rate_limiter_latency_bucket{verb="GET",le="0.25"} 117
k8s_client_rate_limiter_latency_bucket{verb="GET",le="0.5"} 117
k8s_client_rate_limiter_latency_bucket{verb="GET",le="1"} 117
k8s_client_rate_limiter_latency_bucket{verb="GET",le="2.5"} 117
k8s_client_rate_limiter_latency_bucket{verb="GET",le="5"} 117
k8s_client_rate_limiter_latency_bucket{verb="GET",le="10"} 117
k8s_client_rate_limiter_latency_bucket{verb="GET",le="+Inf"} 117
k8s_client_rate_limiter_latency_sum{verb="GET"} 1.9358371670000003
k8s_client_rate_limiter_latency_count{verb="GET"} 117
k8s_client_rate_limiter_latency_bucket{verb="POST",le="0.005"} 6
k8s_client_rate_limiter_latency_bucket{verb="POST",le="0.01"} 6
k8s_client_rate_limiter_latency_bucket{verb="POST",le="0.025"} 6
k8s_client_rate_limiter_latency_bucket{verb="POST",le="0.05"} 6
k8s_client_rate_limiter_latency_bucket{verb="POST",le="0.1"} 6
k8s_client_rate_limiter_latency_bucket{verb="POST",le="0.25"} 6
k8s_client_rate_limiter_latency_bucket{verb="POST",le="0.5"} 6
k8s_client_rate_limiter_latency_bucket{verb="POST",le="1"} 6
k8s_client_rate_limiter_latency_bucket{verb="POST",le="2.5"} 6
k8s_client_rate_limiter_latency_bucket{verb="POST",le="5"} 6
k8s_client_rate_limiter_latency_bucket{verb="POST",le="10"} 6
k8s_client_rate_limiter_latency_bucket{verb="POST",le="+Inf"} 6
k8s_client_rate_limiter_latency_sum{verb="POST"} 1.0542e-05
k8s_client_rate_limiter_latency_count{verb="POST"} 6
k8s_client_rate_limiter_latency_bucket{verb="PUT",le="0.005"} 1
k8s_client_rate_limiter_latency_bucket{verb="PUT",le="0.01"} 1
k8s_client_rate_limiter_latency_bucket{verb="PUT",le="0.025"} 1
k8s_client_rate_limiter_latency_bucket{verb="PUT",le="0.05"} 1
k8s_client_rate_limiter_latency_bucket{verb="PUT",le="0.1"} 1
k8s_client_rate_limiter_latency_bucket{verb="PUT",le="0.25"} 1
k8s_client_rate_limiter_latency_bucket{verb="PUT",le="0.5"} 1
k8s_client_rate_limiter_latency_bucket{verb="PUT",le="1"} 1
k8s_client_rate_limiter_latency_bucket{verb="PUT",le="2.5"} 1
k8s_client_rate_limiter_latency_bucket{verb="PUT",le="5"} 1
k8s_client_rate_limiter_latency_bucket{verb="PUT",le="10"} 1
k8s_client_rate_limiter_latency_bucket{verb="PUT",le="+Inf"} 1
k8s_client_rate_limiter_latency_sum{verb="PUT"} 5e-07
k8s_client_rate_limiter_latency_count{verb="PUT"} 1
# HELP k8s_client_request_latency Kubernetes client request latency in seconds
# TYPE k8s_client_request_latency histogram
k8s_client_request_latency_bucket{verb="GET",le="0.005"} 84
k8s_client_request_latency_bucket{verb="GET",le="0.01"} 86
k8s_client_request_latency_bucket{verb="GET",le="0.025"} 89
k8s_client_request_latency_bucket{verb="GET",le="0.05"} 99
k8s_client_request_latency_bucket{verb="GET",le="0.1"} 112
k8s_client_request_latency_bucket{verb="GET",le="0.25"} 117
k8s_client_request_latency_bucket{verb="GET",le="0.5"} 117
k8s_client_request_latency_bucket{verb="GET",le="1"} 117
k8s_client_request_latency_bucket{verb="GET",le="2.5"} 117
k8s_client_request_latency_bucket{verb="GET",le="5"} 117
k8s_client_request_latency_bucket{verb="GET",le="10"} 117
k8s_client_request_latency_bucket{verb="GET",le="+Inf"} 117
k8s_client_request_latency_sum{verb="GET"} 2.1254330859999997
k8s_client_request_latency_count{verb="GET"} 117
k8s_client_request_latency_bucket{verb="POST",le="0.005"} 5
k8s_client_request_latency_bucket{verb="POST",le="0.01"} 5
k8s_client_request_latency_bucket{verb="POST",le="0.025"} 5
k8s_client_request_latency_bucket{verb="POST",le="0.05"} 6
k8s_client_request_latency_bucket{verb="POST",le="0.1"} 6
k8s_client_request_latency_bucket{verb="POST",le="0.25"} 6
k8s_client_request_latency_bucket{verb="POST",le="0.5"} 6
k8s_client_request_latency_bucket{verb="POST",le="1"} 6
k8s_client_request_latency_bucket{verb="POST",le="2.5"} 6
k8s_client_request_latency_bucket{verb="POST",le="5"} 6
k8s_client_request_latency_bucket{verb="POST",le="10"} 6
k8s_client_request_latency_bucket{verb="POST",le="+Inf"} 6
k8s_client_request_latency_sum{verb="POST"} 0.048558582
k8s_client_request_latency_count{verb="POST"} 6
k8s_client_request_latency_bucket{verb="PUT",le="0.005"} 1
k8s_client_request_latency_bucket{verb="PUT",le="0.01"} 1
k8s_client_request_latency_bucket{verb="PUT",le="0.025"} 1
k8s_client_request_latency_bucket{verb="PUT",le="0.05"} 1
k8s_client_request_latency_bucket{verb="PUT",le="0.1"} 1
k8s_client_request_latency_bucket{verb="PUT",le="0.25"} 1
k8s_client_request_latency_bucket{verb="PUT",le="0.5"} 1
k8s_client_request_latency_bucket{verb="PUT",le="1"} 1
k8s_client_request_latency_bucket{verb="PUT",le="2.5"} 1
k8s_client_request_latency_bucket{verb="PUT",le="5"} 1
k8s_client_request_latency_bucket{verb="PUT",le="10"} 1
k8s_client_request_latency_bucket{verb="PUT",le="+Inf"} 1
k8s_client_request_latency_sum{verb="PUT"} 0.002381375
k8s_client_request_latency_count{verb="PUT"} 1
# HELP k8s_client_request_total Kubernetes client request total
# TYPE k8s_client_request_total counter
k8s_client_request_total{code="200",method="GET"} 120
k8s_client_request_total{code="200",method="PUT"} 1
k8s_client_request_total{code="409",method="POST"} 6
```

![Screenshot 2024-03-13 at 12 56 22 PM](https://github.com/unionai/flyte/assets/5725707/115cf54c-d5b6-4d39-8735-505377fb559a)


## Test Plan
- [x] Run single binary and verify metrics exist
- [x] Deploy to a staging propeller instance and observe there 

## Rollout Plan (if applicable)
Once this is merged, we need to pull the change into cloud and configure scraping for these metrics.

## Upstream Changes
Should this change be upstreamed to OSS (flyteorg/flyte)? If so, please check this box for auditing. Note, this is the responsibility of each developer. See [this guide](https://unionai.atlassian.net/wiki/spaces/ENG/pages/447610883/Flyte+-+Union+Cloud+Development+Runbook/#When-are-versions-updated%3F).
- [x] To be upstreamed

## Jira Issue
https://unionai.atlassian.net/browse/CLOUD-1677
  • Loading branch information
andrewwdye authored Mar 13, 2024
1 parent 8535a62 commit e2b3041
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 2 deletions.
6 changes: 4 additions & 2 deletions cmd/single/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@ import (

"github.com/flyteorg/flyte/flytestdlib/logger"

"github.com/flyteorg/flyte/flytestdlib/config"
"github.com/flyteorg/flyte/flytestdlib/config/viper"
"github.com/spf13/cobra"
"github.com/spf13/pflag"

"github.com/flyteorg/flyte/flytestdlib/config"
"github.com/flyteorg/flyte/flytestdlib/config/viper"
_ "github.com/flyteorg/flyte/flytestdlib/promutils"
)

var (
Expand Down
1 change: 1 addition & 0 deletions flyteadmin/cmd/entrypoints/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/flyteorg/flyte/flytestdlib/logger"
"github.com/flyteorg/flyte/flytestdlib/otelutils"
"github.com/flyteorg/flyte/flytestdlib/profutils"
_ "github.com/flyteorg/flyte/flytestdlib/promutils"
)

var pluginRegistryStore = plugins.NewAtomicRegistry(plugins.NewRegistry())
Expand Down
81 changes: 81 additions & 0 deletions flytestdlib/promutils/client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package promutils

import (
"context"
"net/url"
"time"

"k8s.io/client-go/tools/metrics"

"github.com/prometheus/client_golang/prometheus"
)

// init builds the request and rate limiter metric providers and hands them to
// the client-go metrics package so that Kubernetes client calls are recorded
// as Prometheus metrics.
func init() {
	reqProvider := newRequestMetricsProvider()
	limiterProvider := newRateLimiterMetricsAdapter()

	// The request provider serves as both the latency and the result hook.
	opts := metrics.RegisterOpts{
		RequestLatency:     &reqProvider,
		RequestResult:      &reqProvider,
		RateLimiterLatency: &limiterProvider,
	}
	metrics.Register(opts)
}

// latencyBuckets lists the histogram bucket upper bounds, in seconds, shared
// by the request latency and rate limiter latency histograms.
var latencyBuckets = []float64{
	0.0005, 0.001, 0.005,
	0.01, 0.025, 0.05,
	0.1, 0.25, 0.5,
	1, 2.5, 5, 10,
}

// requestMetricsProvider holds the Prometheus collectors behind the request
// latency and request result callbacks registered with client-go.
type requestMetricsProvider struct {
	// requestLatency is a histogram of request latencies in seconds, labeled by verb.
	requestLatency *prometheus.HistogramVec
	// requestResult counts requests, labeled by response code and method.
	requestResult *prometheus.CounterVec
}

// Observe records the latency of a single request, in seconds, in the
// histogram series for the given verb. The context and URL are not used.
func (r *requestMetricsProvider) Observe(_ context.Context, verb string, _ url.URL, latency time.Duration) {
	series := r.requestLatency.WithLabelValues(verb)
	seconds := latency.Seconds()
	series.Observe(seconds)
}

// Increment adds one to the request counter series identified by the response
// code and method. The context and the final string argument are not used.
func (r *requestMetricsProvider) Increment(_ context.Context, code string, method string, _ string) {
	series := r.requestResult.WithLabelValues(code, method)
	series.Inc()
}

// newRequestMetricsProvider creates the request latency histogram and the
// request result counter, registers both with the default Prometheus
// registerer, and returns a provider wrapping them.
//
// If an equivalent collector is already registered, the registered collector
// is reused so that observations land on the metric that is actually exported
// rather than on an orphaned duplicate. Any other registration error is
// tolerated: the provider still works, but the affected metric will not be
// exported.
func newRequestMetricsProvider() requestMetricsProvider {
	requestLatency := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "k8s_client_request_latency",
			Help:    "Kubernetes client request latency in seconds",
			Buckets: latencyBuckets,
		},
		[]string{"verb"})
	if err := prometheus.Register(requestLatency); err != nil {
		// Registration is best-effort; only the duplicate case is recoverable.
		if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
			if existing, ok := are.ExistingCollector.(*prometheus.HistogramVec); ok {
				requestLatency = existing
			}
		}
	}

	requestResult := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "k8s_client_request_total",
			Help: "Kubernetes client request total",
		},
		[]string{"code", "method"},
	)
	if err := prometheus.Register(requestResult); err != nil {
		// Registration is best-effort; only the duplicate case is recoverable.
		if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
			if existing, ok := are.ExistingCollector.(*prometheus.CounterVec); ok {
				requestResult = existing
			}
		}
	}

	return requestMetricsProvider{
		requestLatency: requestLatency,
		requestResult:  requestResult,
	}
}

// rateLimiterMetricsProvider holds the Prometheus collector behind the rate
// limiter latency callback registered with client-go.
type rateLimiterMetricsProvider struct {
	// rateLimiterLatency is a histogram of rate limiter latencies in seconds, labeled by verb.
	rateLimiterLatency *prometheus.HistogramVec
}

// Observe records a rate limiter latency, in seconds, in the histogram series
// for the given verb. The context and URL are not used.
func (r *rateLimiterMetricsProvider) Observe(_ context.Context, verb string, _ url.URL, latency time.Duration) {
	series := r.rateLimiterLatency.WithLabelValues(verb)
	seconds := latency.Seconds()
	series.Observe(seconds)
}

// newRateLimiterMetricsAdapter creates the rate limiter latency histogram,
// registers it with the default Prometheus registerer, and returns a provider
// wrapping it.
//
// If an equivalent collector is already registered, the registered collector
// is reused so that observations land on the metric that is actually exported
// rather than on an orphaned duplicate. Any other registration error is
// tolerated: the provider still works, but the metric will not be exported.
func newRateLimiterMetricsAdapter() rateLimiterMetricsProvider {
	rateLimiterLatency := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "k8s_client_rate_limiter_latency",
			Help:    "Kubernetes client rate limiter latency in seconds",
			Buckets: latencyBuckets,
		},
		[]string{"verb"})
	if err := prometheus.Register(rateLimiterLatency); err != nil {
		// Registration is best-effort; only the duplicate case is recoverable.
		if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
			if existing, ok := are.ExistingCollector.(*prometheus.HistogramVec); ok {
				rateLimiterLatency = existing
			}
		}
	}

	return rateLimiterMetricsProvider{
		rateLimiterLatency: rateLimiterLatency,
	}
}

0 comments on commit e2b3041

Please sign in to comment.