Skip to content

Commit d3d31f1

Browse files
authored
chore(aggregated_metrics): Add metrics to aggregated metrics (#14986)
1 parent 62e7d61 commit d3d31f1

File tree

6 files changed

+221
-11
lines changed

6 files changed

+221
-11
lines changed

pkg/pattern/aggregation/metrics.go

+100-5
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,121 @@ package aggregation
33
import (
44
"github.com/prometheus/client_golang/prometheus"
55
"github.com/prometheus/client_golang/prometheus/promauto"
6+
7+
"github.com/grafana/loki/v3/pkg/util/constants"
68
)
79

8-
type ChunkMetrics struct {
10+
type Metrics struct {
11+
reg prometheus.Registerer
12+
913
chunks *prometheus.GaugeVec
1014
samples *prometheus.CounterVec
15+
16+
// push operation
17+
pushErrors *prometheus.CounterVec
18+
pushRetries *prometheus.CounterVec
19+
pushSuccesses *prometheus.CounterVec
20+
payloadSize *prometheus.HistogramVec
21+
22+
// Batch metrics
23+
streamsPerPush *prometheus.HistogramVec
24+
entriesPerPush *prometheus.HistogramVec
25+
servicesTracked *prometheus.GaugeVec
26+
27+
writeTimeout *prometheus.CounterVec
1128
}
1229

13-
func NewChunkMetrics(r prometheus.Registerer, metricsNamespace string) *ChunkMetrics {
14-
return &ChunkMetrics{
30+
func NewMetrics(r prometheus.Registerer) *Metrics {
31+
var m Metrics
32+
m.reg = r
33+
34+
m = Metrics{
1535
chunks: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
16-
Namespace: metricsNamespace,
36+
Namespace: constants.Loki,
1737
Subsystem: "pattern_ingester",
1838
Name: "metric_chunks",
1939
Help: "The total number of chunks in memory.",
2040
}, []string{"service_name"}),
2141
samples: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
22-
Namespace: metricsNamespace,
42+
Namespace: constants.Loki,
2343
Subsystem: "pattern_ingester",
2444
Name: "metric_samples",
2545
Help: "The total number of samples in memory.",
2646
}, []string{"service_name"}),
47+
pushErrors: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
48+
Namespace: constants.Loki,
49+
Subsystem: "pattern_ingester",
50+
Name: "push_errors_total",
51+
Help: "Total number of errors when pushing metrics to Loki.",
52+
}, []string{"tenant_id", "error_type"}),
53+
54+
pushRetries: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
55+
Namespace: constants.Loki,
56+
Subsystem: "pattern_ingester",
57+
Name: "push_retries_total",
58+
Help: "Total number of retries when pushing metrics to Loki.",
59+
}, []string{"tenant_id"}),
60+
61+
pushSuccesses: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
62+
Namespace: constants.Loki,
63+
Subsystem: "pattern_ingester",
64+
Name: "push_successes_total",
65+
Help: "Total number of successful pushes to Loki.",
66+
}, []string{"tenant_id"}),
67+
68+
// Batch metrics
69+
payloadSize: promauto.With(r).NewHistogramVec(prometheus.HistogramOpts{
70+
Namespace: constants.Loki,
71+
Subsystem: "pattern_ingester",
72+
Name: "push_payload_bytes",
73+
Help: "Size of push payloads in bytes.",
74+
Buckets: []float64{1024, 4096, 16384, 65536, 262144, 1048576},
75+
}, []string{"tenant_id"}),
76+
77+
streamsPerPush: promauto.With(r).NewHistogramVec(prometheus.HistogramOpts{
78+
Namespace: constants.Loki,
79+
Subsystem: "pattern_ingester",
80+
Name: "streams_per_push",
81+
Help: "Number of streams in each push request.",
82+
Buckets: []float64{1, 5, 10, 25, 50, 100, 250, 500, 1000},
83+
}, []string{"tenant_id"}),
84+
85+
entriesPerPush: promauto.With(r).NewHistogramVec(prometheus.HistogramOpts{
86+
Namespace: constants.Loki,
87+
Subsystem: "pattern_ingester",
88+
Name: "entries_per_push",
89+
Help: "Number of entries in each push request.",
90+
Buckets: []float64{10, 50, 100, 500, 1000, 5000, 10000},
91+
}, []string{"tenant_id"}),
92+
93+
servicesTracked: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
94+
Namespace: constants.Loki,
95+
Subsystem: "pattern_ingester",
96+
Name: "services_tracked",
97+
Help: "Number of unique services being tracked.",
98+
}, []string{"tenant_id"}),
99+
writeTimeout: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
100+
Namespace: constants.Loki,
101+
Subsystem: "pattern_ingester",
102+
Name: "write_timeouts_total",
103+
Help: "Total number of write timeouts.",
104+
}, []string{"tenant_id"}),
27105
}
106+
107+
if m.reg != nil {
108+
m.reg.MustRegister(
109+
m.chunks,
110+
m.samples,
111+
m.pushErrors,
112+
m.pushRetries,
113+
m.pushSuccesses,
114+
m.payloadSize,
115+
m.streamsPerPush,
116+
m.entriesPerPush,
117+
m.servicesTracked,
118+
m.writeTimeout,
119+
)
120+
}
121+
122+
return &m
28123
}

pkg/pattern/aggregation/push.go

+23-5
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"bufio"
55
"bytes"
66
"context"
7+
"errors"
78
"fmt"
89
"io"
910
"net/http"
@@ -16,6 +17,7 @@ import (
1617
"github.com/go-kit/log/level"
1718
"github.com/golang/snappy"
1819
"github.com/opentracing/opentracing-go"
20+
"github.com/prometheus/client_golang/prometheus"
1921
"github.com/prometheus/common/config"
2022
"github.com/prometheus/common/model"
2123
"github.com/prometheus/prometheus/model/labels"
@@ -71,6 +73,8 @@ type Push struct {
7173
backoff *backoff.Config
7274

7375
entries entries
76+
77+
metrics *Metrics
7478
}
7579

7680
type entry struct {
@@ -108,6 +112,7 @@ func NewPush(
108112
useTLS bool,
109113
backoffCfg *backoff.Config,
110114
logger log.Logger,
115+
registrer prometheus.Registerer,
111116
) (*Push, error) {
112117
client, err := config.NewClientFromConfig(cfg, "pattern-ingester-push", config.WithHTTP2Disabled())
113118
if err != nil {
@@ -142,6 +147,7 @@ func NewPush(
142147
entries: entries{
143148
entries: make([]entry, 0),
144149
},
150+
metrics: NewMetrics(registrer),
145151
}
146152

147153
go p.run(pushPeriod)
@@ -222,6 +228,10 @@ func (p *Push) buildPayload(ctx context.Context) ([]byte, error) {
222228

223229
payload = snappy.Encode(nil, payload)
224230

231+
p.metrics.streamsPerPush.WithLabelValues(p.tenantID).Observe(float64(len(streams)))
232+
p.metrics.entriesPerPush.WithLabelValues(p.tenantID).Observe(float64(len(entries)))
233+
p.metrics.servicesTracked.WithLabelValues(p.tenantID).Set(float64(serviceLimit))
234+
225235
sp.LogKV(
226236
"event", "build aggregated metrics payload",
227237
"num_service", len(entriesByStream),
@@ -267,7 +277,7 @@ func (p *Push) run(pushPeriod time.Duration) {
267277
break
268278
}
269279

270-
if status > 0 && status != 429 && status/100 != 5 {
280+
if status > 0 && util.IsRateLimited(status) && !util.IsServerError(status) {
271281
level.Error(p.logger).Log("msg", "failed to send entry, server rejected push with a non-retryable status code", "status", status, "err", err)
272282
pushTicker.Reset(pushPeriod)
273283
break
@@ -302,6 +312,8 @@ func (p *Push) send(ctx context.Context, payload []byte) (int, error) {
302312
defer sp.Finish()
303313

304314
req, err := http.NewRequestWithContext(ctx, "POST", p.lokiURL, bytes.NewReader(payload))
315+
p.metrics.payloadSize.WithLabelValues(p.tenantID).Observe(float64(len(payload)))
316+
305317
if err != nil {
306318
return -1, fmt.Errorf("failed to create push request: %w", err)
307319
}
@@ -320,23 +332,29 @@ func (p *Push) send(ctx context.Context, payload []byte) (int, error) {
320332

321333
resp, err = p.httpClient.Do(req)
322334
if err != nil {
335+
if errors.Is(ctx.Err(), context.DeadlineExceeded) {
336+
p.metrics.writeTimeout.WithLabelValues(p.tenantID).Inc()
337+
}
323338
return -1, fmt.Errorf("failed to push payload: %w", err)
324339
}
325-
status := resp.StatusCode
326-
if status/100 != 2 {
340+
statusCode := resp.StatusCode
341+
if util.IsError(statusCode) {
342+
errType := util.ErrorTypeFromHTTPStatus(statusCode)
343+
327344
scanner := bufio.NewScanner(io.LimitReader(resp.Body, defaultMaxReponseBufferLen))
328345
line := ""
329346
if scanner.Scan() {
330347
line = scanner.Text()
331348
}
332-
err = fmt.Errorf("server returned HTTP status %s (%d): %s", resp.Status, status, line)
349+
err = fmt.Errorf("server returned HTTP status %s (%d): %s", resp.Status, statusCode, line)
350+
p.metrics.pushErrors.WithLabelValues(p.tenantID, errType).Inc()
333351
}
334352

335353
if err := resp.Body.Close(); err != nil {
336354
level.Error(p.logger).Log("msg", "failed to close response body", "error", err)
337355
}
338356

339-
return status, err
357+
return statusCode, err
340358
}
341359

342360
func AggregatedMetricEntry(

pkg/pattern/aggregation/push_test.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ func Test_Push(t *testing.T) {
5858
false,
5959
&backoff,
6060
log.NewNopLogger(),
61+
nil,
6162
)
6263
require.NoError(t, err)
6364
ts, payload := testPayload()
@@ -82,7 +83,7 @@ func Test_Push(t *testing.T) {
8283
"user", "secret",
8384
false,
8485
&backoff,
85-
log.NewNopLogger(),
86+
log.NewNopLogger(), nil,
8687
)
8788
require.NoError(t, err)
8889
ts, payload := testPayload()
@@ -123,6 +124,7 @@ func Test_Push(t *testing.T) {
123124
quit: make(chan struct{}),
124125
backoff: &backoff,
125126
entries: entries{},
127+
metrics: NewMetrics(nil),
126128
}
127129

128130
lbls1 := labels.New(labels.Label{Name: "test", Value: "test"})

pkg/pattern/ingester.go

+1
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,7 @@ func (i *Ingester) GetOrCreateInstance(instanceID string) (*instance, error) { /
403403
aggCfg.UseTLS,
404404
&aggCfg.BackoffConfig,
405405
i.logger,
406+
i.registerer,
406407
)
407408
if err != nil {
408409
return nil, err

pkg/util/http.go

+32
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@ import (
2323

2424
const messageSizeLargerErrFmt = "received message larger than max (%d vs %d)"
2525

26+
// Error-type classifications returned by ErrorTypeFromHTTPStatus.
const (
	// HTTPRateLimited classifies an HTTP 429 response.
	HTTPRateLimited = "rate_limited"
	// HTTPServerError classifies a 5xx response.
	HTTPServerError = "server_error"
	// HTTPErrorUnknown is returned for statuses that are not classified as
	// an error (2xx).
	HTTPErrorUnknown = "unknown"
	// HTTPClientError classifies every remaining non-2xx response.
	HTTPClientError = "client_error"
)
32+
2633
// IsRequestBodyTooLarge returns true if the error is "http: request body too large".
2734
func IsRequestBodyTooLarge(err error) bool {
2835
return err != nil && strings.Contains(err.Error(), "http: request body too large")
@@ -307,3 +314,28 @@ func IsValidURL(endpoint string) bool {
307314

308315
return u.Scheme != "" && u.Host != ""
309316
}
317+
318+
func ErrorTypeFromHTTPStatus(status int) string {
319+
errorType := HTTPErrorUnknown
320+
if status == 429 {
321+
errorType = HTTPRateLimited
322+
} else if status/100 == 5 {
323+
errorType = HTTPServerError
324+
} else if status/100 != 2 {
325+
errorType = HTTPClientError
326+
}
327+
328+
return errorType
329+
}
330+
331+
// IsError reports whether status is anything other than a 2xx success code.
//
// The check is on the status class (status/100), so 200-299 are successes and
// every other value, including 1xx, 3xx, 4xx, 5xx and out-of-range codes, is
// reported as an error.
func IsError(status int) bool {
	return status/100 != 2
}
334+
335+
// IsServerError reports whether status is in the 5xx class (500-599).
func IsServerError(status int) bool {
	return status >= 500 && status <= 599
}
338+
339+
// IsRateLimited reports whether status is HTTP 429 (Too Many Requests).
func IsRateLimited(status int) bool {
	const tooManyRequests = 429
	return status == tooManyRequests
}

pkg/util/http_test.go

+62
Original file line numberDiff line numberDiff line change
@@ -218,3 +218,65 @@ func TestIsRequestBodyTooLargeRegression(t *testing.T) {
218218
_, err := io.ReadAll(http.MaxBytesReader(httptest.NewRecorder(), io.NopCloser(bytes.NewReader([]byte{1, 2, 3, 4})), 1))
219219
assert.True(t, util.IsRequestBodyTooLarge(err))
220220
}
221+
222+
func TestErrorTypeFromHTTPStatus(t *testing.T) {
223+
tests := []struct {
224+
name string
225+
status int
226+
expectedResult string
227+
}{
228+
{
229+
name: "rate limited error",
230+
status: 429,
231+
expectedResult: util.HTTPRateLimited,
232+
},
233+
{
234+
name: "server error - 500",
235+
status: 500,
236+
expectedResult: util.HTTPServerError,
237+
},
238+
{
239+
name: "server error - 503",
240+
status: 503,
241+
expectedResult: util.HTTPServerError,
242+
},
243+
{
244+
name: "client error - 400",
245+
status: 400,
246+
expectedResult: util.HTTPClientError,
247+
},
248+
{
249+
name: "client error - 404",
250+
status: 404,
251+
expectedResult: util.HTTPClientError,
252+
},
253+
{
254+
name: "success status should return unknown - 200",
255+
status: 200,
256+
expectedResult: util.HTTPErrorUnknown,
257+
},
258+
{
259+
name: "success status should return unknown - 201",
260+
status: 201,
261+
expectedResult: util.HTTPErrorUnknown,
262+
},
263+
{
264+
name: "invalid status should return unknown - 600",
265+
status: 600,
266+
expectedResult: util.HTTPClientError,
267+
},
268+
{
269+
name: "invalid status should return unknown - -1",
270+
status: -1,
271+
expectedResult: util.HTTPClientError,
272+
},
273+
}
274+
275+
for _, tt := range tests {
276+
t.Run(tt.name, func(t *testing.T) {
277+
result := util.ErrorTypeFromHTTPStatus(tt.status)
278+
assert.Equal(t, tt.expectedResult, result, "ErrorTypeFromHTTPStatus(%d) = %s; want %s",
279+
tt.status, result, tt.expectedResult)
280+
})
281+
}
282+
}

0 commit comments

Comments
 (0)