
Commit 5540c92

perf(blooms): Resolve bloom blocks on index gateway and shard by block address (#12720)
This pull request changes how data is sharded across bloom gateways. Currently, chunks are grouped by series and sharded across the available bloom gateways by the fingerprint of the series, using the jumphash algorithm. This, however, leads to over-querying of bloom blocks, because bloom blocks cover consecutive fingerprint ranges, whereas the sharding keys are evenly distributed across the keyspace.

This PR changes the sharding so that the bloom blocks for the series are already resolved on the index gateways, and their addresses are used as the sharding keys. This has the advantage that the grouped series can be mapped to the correct bloom blocks on the client side. Sending the block refs along with the grouped series to the bloom gateways allows for efficient querying, because each bloom gateway then owns exactly 1/nth of the blocks.

---

Signed-off-by: Christian Haudum <[email protected]>
1 parent a00f1f1 commit 5540c92
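To illustrate the sharding idea from the description above, here is a minimal, standalone sketch of jump-consistent-hash assignment keyed by block address rather than by series fingerprint. The block keys, gateway addresses, and FNV hashing below are made up for the example; the actual client resolves owners through its JumpHashClientPool (the `c.pool.Addr(...)` call in pkg/bloomgateway/client.go further down).

package main

import (
	"fmt"
	"hash/fnv"
)

// jumpHash implements jump consistent hashing (Lamping & Veach) and maps a
// 64-bit key to a bucket in [0, numBuckets).
func jumpHash(key uint64, numBuckets int) int {
	var b, j int64 = -1, 0
	for j < int64(numBuckets) {
		b = j
		key = key*2862933555777941757 + 1
		j = int64(float64(b+1) * (float64(int64(1)<<31) / float64((key>>33)+1)))
	}
	return int(b)
}

func main() {
	// Hypothetical block addresses and gateway instances. Because each block
	// address hashes to exactly one gateway, every gateway owns roughly 1/n of
	// the blocks instead of being queried for overlapping fingerprint ranges.
	blocks := []string{"block-0000-0fff", "block-1000-1fff", "block-2000-2fff", "block-3000-3fff"}
	gateways := []string{"bloom-gw-0:9095", "bloom-gw-1:9095", "bloom-gw-2:9095"}

	for _, block := range blocks {
		h := fnv.New64a()
		_, _ = h.Write([]byte(block))
		addr := gateways[jumpHash(h.Sum64(), len(gateways))]
		fmt.Printf("%s -> %s\n", block, addr)
	}
}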

21 files changed (+747 −220 lines)

pkg/bloomgateway/bloomgateway.go (+25 −7)

@@ -238,31 +238,49 @@ func (g *Gateway) FilterChunkRefs(ctx context.Context, req *logproto.FilterChunk
 		}, nil
 	}
 
-	seriesByDay := partitionRequest(req)
-	stats.NumTasks = len(seriesByDay)
+	blocks := make([]bloomshipper.BlockRef, 0, len(req.Blocks))
+	for _, key := range req.Blocks {
+		block, err := bloomshipper.BlockRefFromKey(key)
+		if err != nil {
+			stats.Status = labelFailure
+			return nil, errors.New("could not parse block key")
+		}
+		blocks = append(blocks, block)
+	}
 
-	// no tasks --> empty response
-	if len(seriesByDay) == 0 {
+	// Shortcut if request does not contain blocks
+	if len(blocks) == 0 {
 		stats.Status = labelSuccess
 		return &logproto.FilterChunkRefResponse{
-			ChunkRefs: []*logproto.GroupedChunkRefs{},
+			ChunkRefs: req.Refs,
 		}, nil
 	}
 
+	// TODO(chaudum): I intentionally keep the logic for handling multiple tasks,
+	// so that the PR does not explode in size. This should be cleaned up at some point.
+
+	seriesByDay := partitionRequest(req)
+	stats.NumTasks = len(seriesByDay)
+
 	sp.LogKV(
 		"filters", len(filters),
 		"days", len(seriesByDay),
+		"blocks", len(req.Blocks),
 		"series_requested", len(req.Refs),
 	)
 
+	if len(seriesByDay) != 1 {
+		stats.Status = labelFailure
+		return nil, errors.New("request time range must span exactly one day")
+	}
+
 	tasks := make([]Task, 0, len(seriesByDay))
 	responses := make([][]v1.Output, 0, len(seriesByDay))
 	for _, seriesForDay := range seriesByDay {
-		task, err := NewTask(ctx, tenantID, seriesForDay, filters)
+		task, err := NewTask(ctx, tenantID, seriesForDay, filters, blocks)
 		if err != nil {
 			return nil, err
 		}
-
 		// TODO(owen-d): include capacity in constructor?
 		task.responses = responsesPool.Get(len(seriesForDay.series))
 		tasks = append(tasks, task)
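The hunk above front-loads two checks on the gateway: block keys from the request are parsed into bloomshipper.BlockRefs, and requests whose time range does not map to exactly one day are rejected. Below is a rough, standalone sketch of that flow, with a hypothetical parseBlockKey standing in for bloomshipper.BlockRefFromKey and a plain day count standing in for partitionRequest.

package main

import (
	"errors"
	"fmt"
	"strings"
)

// blockRef is a simplified stand-in for bloomshipper.BlockRef.
type blockRef struct{ key string }

// parseBlockKey is a hypothetical stand-in for bloomshipper.BlockRefFromKey.
func parseBlockKey(key string) (blockRef, error) {
	if !strings.HasPrefix(key, "bloom/") {
		return blockRef{}, errors.New("could not parse block key")
	}
	return blockRef{key: key}, nil
}

// resolveBlocks mirrors the request validation above: unparsable keys fail the
// request, an empty block list short-circuits (the gateway returns the input
// refs unfiltered), and multi-day requests are rejected.
func resolveBlocks(blockKeys []string, days int) ([]blockRef, error) {
	blocks := make([]blockRef, 0, len(blockKeys))
	for _, key := range blockKeys {
		block, err := parseBlockKey(key)
		if err != nil {
			return nil, err
		}
		blocks = append(blocks, block)
	}
	if len(blocks) == 0 {
		return nil, nil // shortcut: nothing to filter
	}
	if days != 1 {
		return nil, errors.New("request time range must span exactly one day")
	}
	return blocks, nil
}

func main() {
	_, err := resolveBlocks([]string{"invalid/block.tar.gz"}, 1)
	fmt.Println(err) // could not parse block key
}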

pkg/bloomgateway/bloomgateway_test.go (+66 −7)

@@ -30,16 +30,24 @@ import (
 	"github.com/grafana/loki/v3/pkg/validation"
 )
 
+func stringSlice[T fmt.Stringer](s []T) []string {
+	res := make([]string, len(s))
+	for i := range res {
+		res[i] = s[i].String()
+	}
+	return res
+}
+
 func groupRefs(t *testing.T, chunkRefs []*logproto.ChunkRef) []*logproto.GroupedChunkRefs {
 	t.Helper()
-	grouped := make([]*logproto.GroupedChunkRefs, 0, len(chunkRefs))
-	return groupChunkRefs(chunkRefs, grouped)
+	return groupChunkRefs(chunkRefs, nil)
 }
 
 func newLimits() *validation.Overrides {
 	limits := validation.Limits{}
 	flagext.DefaultValues(&limits)
 	limits.BloomGatewayEnabled = true
+	limits.BloomGatewayShardSize = 1
 
 	overrides, _ := validation.NewOverrides(limits, nil)
 	return overrides
@@ -129,11 +137,46 @@ func TestBloomGateway_FilterChunkRefs(t *testing.T) {
 		MaxOutstandingPerTenant: 1024,
 	}
 
-	t.Run("shipper error is propagated", func(t *testing.T) {
+	t.Run("request fails when providing invalid block", func(t *testing.T) {
 		now := mktime("2023-10-03 10:00")
 
 		_, metas, queriers, data := createBlocks(t, tenantID, 10, now.Add(-1*time.Hour), now, 0x0000, 0x0fff)
 		mockStore := newMockBloomStore(queriers, metas)
+
+		reg := prometheus.NewRegistry()
+		gw, err := New(cfg, mockStore, logger, reg)
+		require.NoError(t, err)
+
+		err = services.StartAndAwaitRunning(context.Background(), gw)
+		require.NoError(t, err)
+		t.Cleanup(func() {
+			err = services.StopAndAwaitTerminated(context.Background(), gw)
+			require.NoError(t, err)
+		})
+
+		chunkRefs := createQueryInputFromBlockData(t, tenantID, data, 100)
+
+		expr, err := syntax.ParseExpr(`{foo="bar"} |= "does not match"`)
+		require.NoError(t, err)
+
+		req := &logproto.FilterChunkRefRequest{
+			From:    now.Add(-24 * time.Hour),
+			Through: now,
+			Refs:    groupRefs(t, chunkRefs),
+			Plan:    plan.QueryPlan{AST: expr},
+			Blocks:  []string{"bloom/invalid/block.tar.gz"},
+		}
+
+		ctx := user.InjectOrgID(context.Background(), tenantID)
+		res, err := gw.FilterChunkRefs(ctx, req)
+		require.ErrorContainsf(t, err, "could not parse block key", "%+v", res)
+	})
+
+	t.Run("shipper error is propagated", func(t *testing.T) {
+		now := mktime("2023-10-03 10:00")
+
+		refs, metas, queriers, data := createBlocks(t, tenantID, 10, now.Add(-1*time.Hour), now, 0x0000, 0x0fff)
+		mockStore := newMockBloomStore(queriers, metas)
 		mockStore.err = errors.New("request failed")
 
 		reg := prometheus.NewRegistry()
@@ -160,6 +203,7 @@ func TestBloomGateway_FilterChunkRefs(t *testing.T) {
 			Through: now,
 			Refs:    groupRefs(t, chunkRefs),
 			Plan:    plan.QueryPlan{AST: expr},
+			Blocks:  stringSlice(refs),
 		}
 
 		ctx, cancelFn := context.WithTimeout(context.Background(), 10*time.Second)
@@ -175,7 +219,7 @@ func TestBloomGateway_FilterChunkRefs(t *testing.T) {
 		now := mktime("2024-01-25 10:00")
 
 		// replace store implementation and re-initialize workers and sub-services
-		_, metas, queriers, data := createBlocks(t, tenantID, 10, now.Add(-1*time.Hour), now, 0x0000, 0x0fff)
+		refs, metas, queriers, data := createBlocks(t, tenantID, 10, now.Add(-1*time.Hour), now, 0x0000, 0x0fff)
 		mockStore := newMockBloomStore(queriers, metas)
 		mockStore.delay = 2000 * time.Millisecond
 
@@ -203,6 +247,7 @@ func TestBloomGateway_FilterChunkRefs(t *testing.T) {
 			Through: now,
 			Refs:    groupRefs(t, chunkRefs),
 			Plan:    plan.QueryPlan{AST: expr},
+			Blocks:  stringSlice(refs),
 		}
 
 		ctx, cancelFn := context.WithTimeout(context.Background(), 500*time.Millisecond)
@@ -228,11 +273,12 @@ func TestBloomGateway_FilterChunkRefs(t *testing.T) {
 			require.NoError(t, err)
 		})
 
+		// input chunks need to be sorted by their fingerprint
 		chunkRefs := []*logproto.ChunkRef{
-			{Fingerprint: 3000, UserID: tenantID, From: now.Add(-24 * time.Hour), Through: now.Add(-23 * time.Hour), Checksum: 1},
 			{Fingerprint: 1000, UserID: tenantID, From: now.Add(-22 * time.Hour), Through: now.Add(-21 * time.Hour), Checksum: 2},
-			{Fingerprint: 2000, UserID: tenantID, From: now.Add(-20 * time.Hour), Through: now.Add(-19 * time.Hour), Checksum: 3},
 			{Fingerprint: 1000, UserID: tenantID, From: now.Add(-23 * time.Hour), Through: now.Add(-22 * time.Hour), Checksum: 4},
+			{Fingerprint: 2000, UserID: tenantID, From: now.Add(-20 * time.Hour), Through: now.Add(-19 * time.Hour), Checksum: 3},
+			{Fingerprint: 3000, UserID: tenantID, From: now.Add(-24 * time.Hour), Through: now.Add(-23 * time.Hour), Checksum: 1},
 		}
 		req := &logproto.FilterChunkRefRequest{
 			From:    now.Add(-24 * time.Hour),
@@ -284,13 +330,24 @@ func TestBloomGateway_FilterChunkRefs(t *testing.T) {
 				Checksum: uint32(idx),
 			},
 		}
+		ref := bloomshipper.BlockRef{
+			Ref: bloomshipper.Ref{
+				TenantID:       tenantID,
+				TableName:      "table_1",
+				Bounds:         v1.NewBounds(0, 10000),
+				StartTimestamp: now.Add(-24 * time.Hour),
+				EndTimestamp:   now,
+				Checksum:       uint32(idx),
+			},
+		}
 		expr, err := syntax.ParseExpr(`{foo="bar"} |= "foo"`)
 		require.NoError(t, err)
 		req := &logproto.FilterChunkRefRequest{
 			From:    now.Add(-24 * time.Hour),
 			Through: now,
 			Refs:    groupRefs(t, chunkRefs),
 			Plan:    plan.QueryPlan{AST: expr},
+			Blocks:  stringSlice([]bloomshipper.BlockRef{ref}),
 		}
 		ctx := user.InjectOrgID(context.Background(), tenantID)
 		_, err = gw.FilterChunkRefs(ctx, req)
@@ -303,7 +360,7 @@ func TestBloomGateway_FilterChunkRefs(t *testing.T) {
 		now := mktime("2023-10-03 10:00")
 
 		// replace store implementation and re-initialize workers and sub-services
-		_, metas, queriers, data := createBlocks(t, tenantID, 10, now.Add(-1*time.Hour), now, 0x0000, 0x0fff)
+		refs, metas, queriers, data := createBlocks(t, tenantID, 10, now.Add(-1*time.Hour), now, 0x0000, 0x0fff)
 
 		reg := prometheus.NewRegistry()
 		store := newMockBloomStore(queriers, metas)
@@ -329,6 +386,7 @@ func TestBloomGateway_FilterChunkRefs(t *testing.T) {
 			Through: now,
 			Refs:    inputChunkRefs,
 			Plan:    plan.QueryPlan{AST: expr},
+			Blocks:  stringSlice(refs),
 		}
 		ctx := user.InjectOrgID(context.Background(), tenantID)
 		res, err := gw.FilterChunkRefs(ctx, req)
@@ -361,6 +419,7 @@ func TestBloomGateway_FilterChunkRefs(t *testing.T) {
 			Through: now,
 			Refs:    inputChunkRefs,
 			Plan:    plan.QueryPlan{AST: expr},
+			Blocks:  stringSlice(refs),
 		}
 		ctx := user.InjectOrgID(context.Background(), tenantID)
 		res, err := gw.FilterChunkRefs(ctx, req)

pkg/bloomgateway/cache_test.go (+2 −2)

@@ -450,14 +450,14 @@ func TestCache(t *testing.T) {
 	res, err = cacheMiddleware.FilterChunkRefs(ctx, req)
 	require.NoError(t, err)
 	require.Equal(t, 2, *calls)
-	require.Equal(t, expectedRes, res)
+	require.ElementsMatch(t, expectedRes.ChunkRefs, res.ChunkRefs)
 
 	// Doing a request again should only hit the cache
 	*calls = 0
 	res, err = cacheMiddleware.FilterChunkRefs(ctx, req)
 	require.NoError(t, err)
 	require.Equal(t, 0, *calls)
-	require.Equal(t, expectedRes, res)
+	require.ElementsMatch(t, expectedRes.ChunkRefs, res.ChunkRefs)
 }
 
 type mockServer struct {

pkg/bloomgateway/client.go (+47 −28)

@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"io"
 	"math"
+	"sort"
 
 	"github.com/go-kit/log"
 	"github.com/go-kit/log/level"
@@ -14,7 +15,6 @@ import (
 	ringclient "github.com/grafana/dskit/ring/client"
 	"github.com/pkg/errors"
 	"github.com/prometheus/client_golang/prometheus"
-	"github.com/prometheus/common/model"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/health/grpc_health_v1"
 
@@ -24,6 +24,7 @@ import (
 	"github.com/grafana/loki/v3/pkg/queue"
 	"github.com/grafana/loki/v3/pkg/storage/chunk/cache"
 	"github.com/grafana/loki/v3/pkg/storage/chunk/cache/resultscache"
+	"github.com/grafana/loki/v3/pkg/storage/stores/shipper/bloomshipper"
 	"github.com/grafana/loki/v3/pkg/util/constants"
 	"github.com/grafana/loki/v3/pkg/util/discovery"
 )
@@ -111,12 +112,11 @@ func (i *ClientConfig) Validate() error {
 }
 
 type Client interface {
-	FilterChunks(ctx context.Context, tenant string, from, through model.Time, groups []*logproto.GroupedChunkRefs, plan plan.QueryPlan) ([]*logproto.GroupedChunkRefs, error)
+	FilterChunks(ctx context.Context, tenant string, interval bloomshipper.Interval, blocks []blockWithSeries, plan plan.QueryPlan) ([]*logproto.GroupedChunkRefs, error)
 }
 
 type GatewayClient struct {
 	cfg     ClientConfig
-	limits  Limits
 	logger  log.Logger
 	metrics *clientMetrics
 	pool    *JumpHashClientPool
@@ -188,7 +188,6 @@ func NewClient(
 	return &GatewayClient{
 		cfg:         cfg,
 		logger:      logger,
-		limits:      limits,
 		metrics:     metrics,
 		pool:        pool,
 		dnsProvider: dnsProvider, // keep reference so we can stop it when the client is closed
@@ -201,26 +200,41 @@ func (c *GatewayClient) Close() {
 }
 
 // FilterChunkRefs implements Client
-func (c *GatewayClient) FilterChunks(ctx context.Context, tenant string, from, through model.Time, groups []*logproto.GroupedChunkRefs, plan plan.QueryPlan) ([]*logproto.GroupedChunkRefs, error) {
-	if !c.limits.BloomGatewayEnabled(tenant) || len(groups) == 0 {
-		return groups, nil
+func (c *GatewayClient) FilterChunks(ctx context.Context, tenant string, interval bloomshipper.Interval, blocks []blockWithSeries, plan plan.QueryPlan) ([]*logproto.GroupedChunkRefs, error) {
+	// no block and therefore no series with chunks
+	if len(blocks) == 0 {
+		return nil, nil
 	}
 
-	clients := make(map[string][]*logproto.GroupedChunkRefs)
-	for _, g := range groups {
-		addr, err := c.pool.AddrForFingerprint(g.Fingerprint)
+	firstFp, lastFp := uint64(math.MaxUint64), uint64(0)
+	pos := make(map[string]int)
+	servers := make([]addrWithGroups, 0, len(blocks))
+	for _, blockWithSeries := range blocks {
+		addr, err := c.pool.Addr(blockWithSeries.block.String())
 		if err != nil {
-			return nil, errors.Wrap(err, "server address for fingerprint")
+			return nil, errors.Wrapf(err, "server address for block: %s", blockWithSeries.block)
 		}
-		clients[addr] = append(clients[addr], g)
-	}
 
-	servers := make([]addrWithGroups, 0, len(clients))
-	for k, v := range clients {
-		servers = append(servers, addrWithGroups{
-			groups: v,
-			addr:   k,
-		})
+		// min/max fingerprint needed for the cache locality score
+		first, last := getFirstLast(blockWithSeries.series)
+		if first.Fingerprint < firstFp {
+			firstFp = first.Fingerprint
+		}
+		if last.Fingerprint > lastFp {
+			lastFp = last.Fingerprint
+		}
+
+		if idx, found := pos[addr]; found {
+			servers[idx].groups = append(servers[idx].groups, blockWithSeries.series...)
+			servers[idx].blocks = append(servers[idx].blocks, blockWithSeries.block.String())
+		} else {
+			pos[addr] = len(servers)
+			servers = append(servers, addrWithGroups{
+				addr:   addr,
+				blocks: []string{blockWithSeries.block.String()},
+				groups: blockWithSeries.series,
+			})
+		}
 	}
 
 	if len(servers) > 0 {
@@ -229,7 +243,6 @@ func (c *GatewayClient) FilterChunks(ctx context.Context, tenant string, from, t
 		// but can be less if the keyspace is not evenly distributed across instances. Ideal operation will see the range of
 		// `1-2/num_instances` -> `1`, where the former represents slight
 		// overlap on instances to the left and right of the range.
-		firstFp, lastFp := groups[0].Fingerprint, groups[len(groups)-1].Fingerprint
 		pctKeyspace := float64(lastFp-firstFp) / float64(math.MaxUint64)
 		pctInstances := float64(len(servers)) / float64(max(1, len(c.pool.Addrs())))
 		cacheLocalityScore := pctKeyspace / pctInstances
@@ -241,22 +254,27 @@ func (c *GatewayClient) FilterChunks(ctx context.Context, tenant string, from, t
 	err := concurrency.ForEachJob(ctx, len(servers), len(servers), func(ctx context.Context, i int) error {
 		rs := servers[i]
 
+		sort.Slice(rs.groups, func(i, j int) bool {
+			return rs.groups[i].Fingerprint < rs.groups[j].Fingerprint
+		})
+
 		level.Info(c.logger).Log(
 			"msg", "do FilterChunkRefs for addresses",
-			"progress", fmt.Sprintf("%d/%d", i+1, len(servers)),
+			"part", fmt.Sprintf("%d/%d", i+1, len(servers)),
 			"addr", rs.addr,
-			"from", from.Time(),
-			"through", through.Time(),
-			"num_refs", len(rs.groups),
-			"plan", plan.String(),
-			"plan_hash", plan.Hash(),
+			"from", interval.Start.Time(),
+			"through", interval.End.Time(),
+			"series", len(rs.groups),
+			"blocks", len(rs.blocks),
+			"tenant", tenant,
 		)
 
 		return c.doForAddrs([]string{rs.addr}, func(client logproto.BloomGatewayClient) error {
 			req := &logproto.FilterChunkRefRequest{
-				From:    from,
-				Through: through,
+				From:    interval.Start,
+				Through: interval.End,
 				Refs:    rs.groups,
+				Blocks:  rs.blocks,
 				Plan:    plan,
 			}
 			resp, err := client.FilterChunkRefs(ctx, req)
@@ -308,5 +326,6 @@ func (c *GatewayClient) doForAddrs(addrs []string, fn func(logproto.BloomGateway
 
 type addrWithGroups struct {
 	addr   string
+	blocks []string
 	groups []*logproto.GroupedChunkRefs
 }
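The core of the client change above is grouping blockWithSeries entries by the gateway address that owns each block, so that every gateway receives one request containing only its blocks and their series. Below is a simplified, standalone sketch of that grouping, with hypothetical types and a toy address resolver in place of the jump-hash client pool.

package main

import "fmt"

// blockWithSeries and addrWithGroups are simplified, hypothetical versions of
// the client-side types referenced in the diff above.
type blockWithSeries struct {
	block  string   // block address, used as the sharding key
	series []uint64 // fingerprints of the series covered by this block
}

type addrWithGroups struct {
	addr   string
	blocks []string
	groups []uint64
}

// resolveAddr is a toy stand-in for the jump-hash client pool lookup.
func resolveAddr(block string, addrs []string) string {
	sum := 0
	for _, c := range block {
		sum += int(c)
	}
	return addrs[sum%len(addrs)]
}

// groupByAddr bundles blocks (and their series) per resolved gateway address,
// mirroring the position-map pattern used in FilterChunks.
func groupByAddr(blocks []blockWithSeries, addrs []string) []addrWithGroups {
	pos := make(map[string]int)
	servers := make([]addrWithGroups, 0, len(blocks))
	for _, b := range blocks {
		addr := resolveAddr(b.block, addrs)
		if idx, found := pos[addr]; found {
			servers[idx].groups = append(servers[idx].groups, b.series...)
			servers[idx].blocks = append(servers[idx].blocks, b.block)
			continue
		}
		pos[addr] = len(servers)
		servers = append(servers, addrWithGroups{
			addr:   addr,
			blocks: []string{b.block},
			groups: b.series,
		})
	}
	return servers
}

func main() {
	out := groupByAddr([]blockWithSeries{
		{block: "block-a", series: []uint64{100, 200}},
		{block: "block-b", series: []uint64{300}},
		{block: "block-c", series: []uint64{400}},
	}, []string{"bloom-gw-0:9095", "bloom-gw-1:9095"})
	for _, s := range out {
		fmt.Printf("%s blocks=%v series=%v\n", s.addr, s.blocks, s.groups)
	}
}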
