Skip to content

Commit 1665e85

Browse files
authored
fix(blooms): dont break iterator conventions (#12808)
Follow-up to #12806, which exposes skipped pages more explicitly rather than as an error. * refactors skip logic for bloom pages that are too large * s/Seek/LoadOffset/ for LazyBloomIter * removes unused code
1 parent af5be90 commit 1665e85

File tree

5 files changed

+67
-123
lines changed

5 files changed

+67
-123
lines changed

pkg/storage/bloom/v1/block.go

+5-74
Original file line numberDiff line numberDiff line change
@@ -144,14 +144,12 @@ func (bq *BlockQuerier) Seek(fp model.Fingerprint) error {
144144
func (bq *BlockQuerier) Next() bool {
145145
for bq.series.Next() {
146146
series := bq.series.At()
147-
bq.blooms.Seek(series.Offset)
147+
if skip := bq.blooms.LoadOffset(series.Offset); skip {
148+
// can't seek to the desired bloom, likely because the page was too large to load
149+
// so we skip this series and move on to the next
150+
continue
151+
}
148152
if !bq.blooms.Next() {
149-
// skip blocks that are too large
150-
if errors.Is(bq.blooms.Err(), ErrPageTooLarge) {
151-
// fmt.Printf("skipping bloom page: %s (%d)\n", series.Fingerprint, series.Chunks.Len())
152-
bq.blooms.err = nil
153-
continue
154-
}
155153
return false
156154
}
157155
bloom := bq.blooms.At()
@@ -175,70 +173,3 @@ func (bq *BlockQuerier) Err() error {
175173

176174
return bq.blooms.Err()
177175
}
178-
179-
// CheckChunksForSeries checks if the given chunks pass a set of searches in the given bloom block.
180-
// It returns the list of chunks which will need to be downloaded for a query based on the initial list
181-
// passed as the `chks` argument. Chunks will be removed from the result set if they are indexed in the bloom
182-
// and fail to pass all the searches.
183-
func (bq *BlockQuerier) CheckChunksForSeries(fp model.Fingerprint, chks ChunkRefs, searches [][]byte) (ChunkRefs, error) {
184-
schema, err := bq.Schema()
185-
if err != nil {
186-
return chks, fmt.Errorf("getting schema: %w", err)
187-
}
188-
189-
if err := bq.Seek(fp); err != nil {
190-
return chks, errors.Wrapf(err, "seeking to series for fp: %v", fp)
191-
}
192-
193-
if !bq.series.Next() {
194-
return chks, nil
195-
}
196-
197-
series := bq.series.At()
198-
if series.Fingerprint != fp {
199-
return chks, nil
200-
}
201-
202-
bq.blooms.Seek(series.Offset)
203-
if !bq.blooms.Next() {
204-
return chks, fmt.Errorf("seeking to bloom for fp: %v", fp)
205-
}
206-
207-
bloom := bq.blooms.At()
208-
209-
// First, see if the search passes the series level bloom before checking for chunks individually
210-
for _, search := range searches {
211-
if !bloom.Test(search) {
212-
// the entire series bloom didn't pass one of the searches,
213-
// so we can skip checking chunks individually.
214-
// We still return all chunks that are not included in the bloom
215-
// as they may still have the data
216-
return chks.Unless(series.Chunks), nil
217-
}
218-
}
219-
220-
// TODO(salvacorts): pool tokenBuf
221-
var tokenBuf []byte
222-
var prefixLen int
223-
224-
// Check chunks individually now
225-
mustCheck, inBlooms := chks.Compare(series.Chunks, true)
226-
227-
outer:
228-
for _, chk := range inBlooms {
229-
// Get buf to concatenate the chunk and search token
230-
tokenBuf, prefixLen = prefixedToken(schema.NGramLen(), chk, tokenBuf)
231-
for _, search := range searches {
232-
tokenBuf = append(tokenBuf[:prefixLen], search...)
233-
234-
if !bloom.Test(tokenBuf) {
235-
// chunk didn't pass the search, continue to the next chunk
236-
continue outer
237-
}
238-
}
239-
// chunk passed all searches, add to the list of chunks to download
240-
mustCheck = append(mustCheck, chk)
241-
242-
}
243-
return mustCheck, nil
244-
}

pkg/storage/bloom/v1/bloom.go

+9-7
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ import (
1717
// gateways to OOM.
1818
// Figure out a decent default maximum page size that we can process.
1919
var DefaultMaxPageSize = 64 << 20 // 64MB
20-
var ErrPageTooLarge = errors.Errorf("bloom page too large")
2120

2221
type Bloom struct {
2322
filter.ScalableBloomFilter
@@ -275,11 +274,14 @@ func (b *BloomBlock) DecodeHeaders(r io.ReadSeeker) (uint32, error) {
275274
return checksum, nil
276275
}
277276

278-
func (b *BloomBlock) BloomPageDecoder(r io.ReadSeeker, pageIdx int, maxPageSize int, metrics *Metrics) (res *BloomPageDecoder, err error) {
277+
// BloomPageDecoder returns a decoder for the given page index.
278+
// It may skip the page if it's too large.
279+
// NB(owen-d): if `skip` is true, err _must_ be nil.
280+
func (b *BloomBlock) BloomPageDecoder(r io.ReadSeeker, pageIdx int, maxPageSize int, metrics *Metrics) (res *BloomPageDecoder, skip bool, err error) {
279281
if pageIdx < 0 || pageIdx >= len(b.pageHeaders) {
280282
metrics.pagesSkipped.WithLabelValues(pageTypeBloom, skipReasonOOB).Inc()
281283
metrics.bytesSkipped.WithLabelValues(pageTypeBloom, skipReasonOOB).Add(float64(b.pageHeaders[pageIdx].DecompressedLen))
282-
return nil, fmt.Errorf("invalid page (%d) for bloom page decoding", pageIdx)
284+
return nil, false, fmt.Errorf("invalid page (%d) for bloom page decoding", pageIdx)
283285
}
284286

285287
page := b.pageHeaders[pageIdx]
@@ -288,13 +290,13 @@ func (b *BloomBlock) BloomPageDecoder(r io.ReadSeeker, pageIdx int, maxPageSize
288290
if page.Len > maxPageSize {
289291
metrics.pagesSkipped.WithLabelValues(pageTypeBloom, skipReasonTooLarge).Inc()
290292
metrics.bytesSkipped.WithLabelValues(pageTypeBloom, skipReasonTooLarge).Add(float64(page.DecompressedLen))
291-
return nil, ErrPageTooLarge
293+
return nil, true, nil
292294
}
293295

294296
if _, err = r.Seek(int64(page.Offset), io.SeekStart); err != nil {
295297
metrics.pagesSkipped.WithLabelValues(pageTypeBloom, skipReasonErr).Inc()
296298
metrics.bytesSkipped.WithLabelValues(pageTypeBloom, skipReasonErr).Add(float64(page.DecompressedLen))
297-
return nil, errors.Wrap(err, "seeking to bloom page")
299+
return nil, false, errors.Wrap(err, "seeking to bloom page")
298300
}
299301

300302
if b.schema.encoding == chunkenc.EncNone {
@@ -306,10 +308,10 @@ func (b *BloomBlock) BloomPageDecoder(r io.ReadSeeker, pageIdx int, maxPageSize
306308
if err != nil {
307309
metrics.pagesSkipped.WithLabelValues(pageTypeBloom, skipReasonErr).Inc()
308310
metrics.bytesSkipped.WithLabelValues(pageTypeBloom, skipReasonErr).Add(float64(page.DecompressedLen))
309-
return nil, errors.Wrap(err, "decoding bloom page")
311+
return nil, false, errors.Wrap(err, "decoding bloom page")
310312
}
311313

312314
metrics.pagesRead.WithLabelValues(pageTypeBloom).Inc()
313315
metrics.bytesRead.WithLabelValues(pageTypeBloom).Add(float64(page.DecompressedLen))
314-
return res, nil
316+
return res, false, nil
315317
}

pkg/storage/bloom/v1/bloom_querier.go

+19-11
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,11 @@ func (it *LazyBloomIter) ensureInit() {
4242
}
4343
}
4444

45-
func (it *LazyBloomIter) Seek(offset BloomOffset) {
45+
// LoadOffset returns whether the bloom page at the given offset should
46+
// be skipped (due to being too large) _and_ there's no error
47+
func (it *LazyBloomIter) LoadOffset(offset BloomOffset) (skip bool) {
4648
it.ensureInit()
4749

48-
// reset error from any previous seek/next that yield pages too large
49-
if errors.Is(it.err, ErrPageTooLarge) {
50-
it.err = nil
51-
}
52-
5350
// if we need a different page or the current page hasn't been loaded,
5451
// load the desired page
5552
if it.curPageIndex != offset.Page || it.curPage == nil {
@@ -63,12 +60,16 @@ func (it *LazyBloomIter) Seek(offset BloomOffset) {
6360
r, err := it.b.reader.Blooms()
6461
if err != nil {
6562
it.err = errors.Wrap(err, "getting blooms reader")
66-
return
63+
return false
6764
}
68-
decoder, err := it.b.blooms.BloomPageDecoder(r, offset.Page, it.m, it.b.metrics)
65+
decoder, skip, err := it.b.blooms.BloomPageDecoder(r, offset.Page, it.m, it.b.metrics)
6966
if err != nil {
7067
it.err = errors.Wrap(err, "loading bloom page")
71-
return
68+
return false
69+
}
70+
71+
if skip {
72+
return true
7273
}
7374

7475
it.curPageIndex = offset.Page
@@ -77,6 +78,7 @@ func (it *LazyBloomIter) Seek(offset BloomOffset) {
7778
}
7879

7980
it.curPage.Seek(offset.ByteOffset)
81+
return false
8082
}
8183

8284
func (it *LazyBloomIter) Next() bool {
@@ -101,17 +103,23 @@ func (it *LazyBloomIter) next() bool {
101103
return false
102104
}
103105

104-
it.curPage, err = it.b.blooms.BloomPageDecoder(
106+
var skip bool
107+
it.curPage, skip, err = it.b.blooms.BloomPageDecoder(
105108
r,
106109
it.curPageIndex,
107110
it.m,
108111
it.b.metrics,
109112
)
113+
// this page wasn't skipped & produced an error, return
110114
if err != nil {
111115
it.err = err
112116
return false
113117
}
114-
continue
118+
if skip {
119+
// this page was skipped; check the next
120+
it.curPageIndex++
121+
continue
122+
}
115123
}
116124

117125
if !it.curPage.Next() {

pkg/storage/bloom/v1/fuse.go

+19-13
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,15 @@ func NewFusedQuerier(bq *BlockQuerier, inputs []PeekingIterator[Request], logger
5959
}
6060
}
6161

62+
func (fq *FusedQuerier) noRemovals(batch []Request, fp model.Fingerprint) {
63+
for _, input := range batch {
64+
input.Response <- Output{
65+
Fp: fp,
66+
Removals: nil,
67+
}
68+
}
69+
}
70+
6271
func (fq *FusedQuerier) Run() error {
6372
schema, err := fq.bq.Schema()
6473
if err != nil {
@@ -85,26 +94,23 @@ func (fq *FusedQuerier) Run() error {
8594
if series.Fingerprint != fp {
8695
// fingerprint not found, can't remove chunks
8796
level.Debug(fq.logger).Log("msg", "fingerprint not found", "fp", series.Fingerprint, "err", fq.bq.series.Err())
88-
for _, input := range nextBatch {
89-
input.Response <- Output{
90-
Fp: fp,
91-
Removals: nil,
92-
}
93-
}
97+
fq.noRemovals(nextBatch, fp)
9498
continue
9599
}
96100

97101
// Now that we've found the series, we need to find the unpack the bloom
98-
fq.bq.blooms.Seek(series.Offset)
102+
skip := fq.bq.blooms.LoadOffset(series.Offset)
103+
if skip {
104+
// could not seek to the desired bloom,
105+
// likely because the page was too large to load
106+
fq.noRemovals(nextBatch, fp)
107+
continue
108+
}
109+
99110
if !fq.bq.blooms.Next() {
100111
// fingerprint not found, can't remove chunks
101112
level.Debug(fq.logger).Log("msg", "fingerprint not found", "fp", series.Fingerprint, "err", fq.bq.blooms.Err())
102-
for _, input := range nextBatch {
103-
input.Response <- Output{
104-
Fp: fp,
105-
Removals: nil,
106-
}
107-
}
113+
fq.noRemovals(nextBatch, fp)
108114
continue
109115
}
110116

pkg/storage/bloom/v1/fuse_test.go

+15-18
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,10 @@ func TestLazyBloomIter_Seek_ResetError(t *testing.T) {
152152
writer := NewMemoryBlockWriter(indexBuf, bloomsBuf)
153153
reader := NewByteReader(indexBuf, bloomsBuf)
154154

155+
largeSeries := func(i int) bool {
156+
return i%2 == 0
157+
}
158+
155159
numSeries := 4
156160
data := make([]SeriesWithBloom, 0, numSeries)
157161
tokenizer := NewNGramTokenizer(4, 0)
@@ -170,8 +174,10 @@ func TestLazyBloomIter_Seek_ResetError(t *testing.T) {
170174
bloom.ScalableBloomFilter = *filter.NewScalableBloomFilter(1024, 0.01, 0.8)
171175

172176
nLines := 10
173-
if i == 0 || i == 2 {
174-
// Add enough lines to make the bloom page too large for series 1
177+
// all even series will have a larger bloom (more than 1 filter)
178+
if largeSeries(i) {
179+
// Add enough lines to make the bloom page too large and
180+
// trigger another filter addition
175181
nLines = 10000
176182
}
177183

@@ -218,14 +224,15 @@ func TestLazyBloomIter_Seek_ResetError(t *testing.T) {
218224
series := querier.series.At()
219225
require.Equal(t, fp, series.Fingerprint)
220226

221-
querier.blooms.Seek(series.Offset)
222-
223-
if fp == 0 || fp == 2 {
224-
require.False(t, querier.blooms.Next())
225-
require.Error(t, querier.blooms.Err())
227+
seekable := true
228+
if largeSeries(int(fp)) {
229+
seekable = false
230+
}
231+
if !seekable {
232+
require.True(t, querier.blooms.LoadOffset(series.Offset))
226233
continue
227234
}
228-
235+
require.False(t, querier.blooms.LoadOffset(series.Offset))
229236
require.True(t, querier.blooms.Next())
230237
require.NoError(t, querier.blooms.Err())
231238
}
@@ -293,16 +300,6 @@ func BenchmarkBlockQuerying(b *testing.B) {
293300
// benchmark
294301
b.StartTimer()
295302

296-
b.Run("single-pass", func(b *testing.B) {
297-
for i := 0; i < b.N; i++ {
298-
for _, chain := range requestChains {
299-
for _, req := range chain {
300-
_, _ = querier.CheckChunksForSeries(req.Fp, req.Chks, nil)
301-
}
302-
}
303-
}
304-
305-
})
306303
b.Run("fused", func(b *testing.B) {
307304
// spin up some goroutines to consume the responses so they don't block
308305
go func() {

0 commit comments

Comments
 (0)