Skip to content

Commit 4c9b22f

Browse files
authored
feat(blooms): ignore individual bloom-gw failures (#12863)
1 parent fcd544c commit 4c9b22f

File tree

2 files changed

+32
-25
lines changed

2 files changed

+32
-25
lines changed

pkg/bloomgateway/client.go

+20-17
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
ringclient "github.com/grafana/dskit/ring/client"
1515
"github.com/pkg/errors"
1616
"github.com/prometheus/client_golang/prometheus"
17+
"go.uber.org/atomic"
1718
"golang.org/x/exp/slices"
1819
"google.golang.org/grpc"
1920
"google.golang.org/grpc/health/grpc_health_v1"
@@ -238,20 +239,8 @@ func (c *GatewayClient) FilterChunks(ctx context.Context, _ string, interval blo
238239
}
239240
}
240241

241-
if len(servers) > 0 {
242-
// cache locality score (higher is better):
243-
// `% keyspace / % instances`. Ideally converges to 1 (querying x% of keyspace requires x% of instances),
244-
// but can be less if the keyspace is not evenly distributed across instances. Ideal operation will see the range of
245-
// `1-2/num_instances` -> `1`, where the former represents slight
246-
// overlap on instances to the left and right of the range.
247-
pctKeyspace := float64(lastFp-firstFp) / float64(math.MaxUint64)
248-
pctInstances := float64(len(servers)) / float64(max(1, len(c.pool.Addrs())))
249-
cacheLocalityScore := pctKeyspace / pctInstances
250-
c.metrics.cacheLocalityScore.Observe(cacheLocalityScore)
251-
}
252-
253242
results := make([][]*logproto.GroupedChunkRefs, len(servers))
254-
count := 0
243+
count := atomic.NewInt64(0)
255244
err := concurrency.ForEachJob(ctx, len(servers), len(servers), func(ctx context.Context, i int) error {
256245
rs := servers[i]
257246

@@ -269,10 +258,24 @@ func (c *GatewayClient) FilterChunks(ctx context.Context, _ string, interval blo
269258
}
270259
resp, err := client.FilterChunkRefs(ctx, req)
271260
if err != nil {
272-
return err
261+
// We don't want a single bloom-gw failure to fail the entire query,
262+
// so instrument & move on
263+
level.Error(c.logger).Log(
264+
"msg", "filter failed for instance, skipping",
265+
"addr", rs.addr,
266+
"series", len(rs.groups),
267+
"blocks", len(rs.blocks),
268+
"err", err,
269+
)
270+
// filter none of the results on failed request
271+
c.metrics.clientRequests.WithLabelValues(typeError).Inc()
272+
results[i] = rs.groups
273+
} else {
274+
c.metrics.clientRequests.WithLabelValues(typeSuccess).Inc()
275+
results[i] = resp.ChunkRefs
273276
}
274-
results[i] = resp.ChunkRefs
275-
count += len(resp.ChunkRefs)
277+
278+
count.Add(int64(len(results[i])))
276279
return nil
277280
})
278281
})
@@ -281,7 +284,7 @@ func (c *GatewayClient) FilterChunks(ctx context.Context, _ string, interval blo
281284
return nil, err
282285
}
283286

284-
buf := make([]*logproto.GroupedChunkRefs, 0, count)
287+
buf := make([]*logproto.GroupedChunkRefs, 0, int(count.Load()))
285288
return mergeSeries(results, buf)
286289
}
287290

pkg/bloomgateway/metrics.go

+12-8
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,25 @@ type metrics struct {
1515
*serverMetrics
1616
}
1717

18+
const (
19+
typeSuccess = "success"
20+
typeError = "error"
21+
)
22+
1823
type clientMetrics struct {
19-
cacheLocalityScore prometheus.Histogram
20-
requestLatency *prometheus.HistogramVec
21-
clients prometheus.Gauge
24+
clientRequests *prometheus.CounterVec
25+
requestLatency *prometheus.HistogramVec
26+
clients prometheus.Gauge
2227
}
2328

2429
func newClientMetrics(registerer prometheus.Registerer) *clientMetrics {
2530
return &clientMetrics{
26-
cacheLocalityScore: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
31+
clientRequests: promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
2732
Namespace: constants.Loki,
2833
Subsystem: "bloom_gateway_client",
29-
Name: "cache_locality_score",
30-
Help: "Cache locality score of the bloom filter, as measured by % of keyspace touched / % of bloom_gws required",
31-
Buckets: prometheus.LinearBuckets(0.01, 0.2, 5),
32-
}),
34+
Name: "requests_total",
35+
Help: "Total number of requests made to the bloom gateway",
36+
}, []string{"type"}),
3337
requestLatency: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{
3438
Namespace: constants.Loki,
3539
Subsystem: "bloom_gateway_client",

0 commit comments

Comments
 (0)