feat(blooms): limit bloom size during creation (#12796)
1 parent c0113db commit eac5622

14 files changed, +179 -96 lines

docs/sources/shared/configuration.md (+6)

@@ -3333,6 +3333,12 @@ shard_streams:
 # CLI flag: -bloom-compactor.max-block-size
 [bloom_compactor_max_block_size: <int> | default = 200MB]

+# Experimental. The maximum bloom size per log stream. A log stream whose
+# generated bloom filter exceeds this size will be discarded. A value of 0 sets
+# an unlimited size. Default is 128MB.
+# CLI flag: -bloom-compactor.max-bloom-size
+[bloom_compactor_max_bloom_size: <int> | default = 128MB]
+
 # Experimental. Length of the n-grams created when computing blooms from log
 # lines.
 # CLI flag: -bloom-compactor.ngram-length
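For reference, a minimal sketch of how the new limit might be set in a Loki configuration file, assuming the setting lives alongside the other bloom-compactor limits under `limits_config` (the equivalent CLI flag is `-bloom-compactor.max-bloom-size`); the values are illustrative, not recommendations.

```yaml
# Hypothetical excerpt, values for illustration only.
limits_config:
  bloom_compactor_max_block_size: 200MB
  # Discard any stream whose generated bloom filter grows beyond this size;
  # 0 disables the cap entirely.
  bloom_compactor_max_bloom_size: 128MB
```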

pkg/bloomcompactor/bloomcompactor_test.go (+4)

@@ -197,6 +197,10 @@ func (m mockLimits) BloomCompactorMaxBlockSize(_ string) int {
 	panic("implement me")
 }

+func (m mockLimits) BloomCompactorMaxBloomSize(_ string) int {
+	panic("implement me")
+}
+
 func TestTokenRangesForInstance(t *testing.T) {
 	desc := func(id int, tokens ...uint32) ring.InstanceDesc {
 		return ring.InstanceDesc{Id: fmt.Sprintf("%d", id), Tokens: tokens}

pkg/bloomcompactor/config.go (+1)

@@ -93,5 +93,6 @@ type Limits interface {
 	BloomNGramSkip(tenantID string) int
 	BloomFalsePositiveRate(tenantID string) float64
 	BloomCompactorMaxBlockSize(tenantID string) int
+	BloomCompactorMaxBloomSize(tenantID string) int
 	BloomBlockEncoding(tenantID string) string
 }
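The interface only grows by one per-tenant getter. As a rough, hypothetical illustration of something satisfying it (Loki's real implementation resolves tenant overrides from runtime configuration; `staticLimits` below is invented for the example):

```go
package main

import "fmt"

// staticLimits is an illustration-only stand-in for Loki's overrides type.
type staticLimits struct {
	maxBloomSizeBytes int
}

// BloomCompactorMaxBloomSize returns the maximum bloom size in bytes for the
// given tenant. Zero means "no limit", matching the documented semantics.
func (l staticLimits) BloomCompactorMaxBloomSize(_ string) int {
	return l.maxBloomSizeBytes
}

func main() {
	limits := staticLimits{maxBloomSizeBytes: 128 << 20} // 128MB
	fmt.Println(limits.BloomCompactorMaxBloomSize("tenant-a"))
}
```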

pkg/bloomcompactor/controller.go (+2 -1)

@@ -352,7 +352,8 @@ func (s *SimpleBloomController) buildGaps(
 		nGramSize    = uint64(s.limits.BloomNGramLength(tenant))
 		nGramSkip    = uint64(s.limits.BloomNGramSkip(tenant))
 		maxBlockSize = uint64(s.limits.BloomCompactorMaxBlockSize(tenant))
-		blockOpts    = v1.NewBlockOptions(blockEnc, nGramSize, nGramSkip, maxBlockSize)
+		maxBloomSize = uint64(s.limits.BloomCompactorMaxBloomSize(tenant))
+		blockOpts    = v1.NewBlockOptions(blockEnc, nGramSize, nGramSkip, maxBlockSize, maxBloomSize)
 		created      []bloomshipper.Meta
 		totalSeries  int
 		bytesAdded   int

pkg/bloomcompactor/spec.go (+13 -8)

@@ -89,12 +89,17 @@ func NewSimpleBloomGenerator(
 		metrics:  metrics,
 		reporter: reporter,

-		tokenizer: v1.NewBloomTokenizer(opts.Schema.NGramLen(), opts.Schema.NGramSkip(), metrics.bloomMetrics),
+		tokenizer: v1.NewBloomTokenizer(
+			opts.Schema.NGramLen(),
+			opts.Schema.NGramSkip(),
+			int(opts.UnencodedBlockOptions.MaxBloomSizeBytes),
+			metrics.bloomMetrics,
+		),
 	}
 }

-func (s *SimpleBloomGenerator) populator(ctx context.Context) func(series *v1.Series, bloom *v1.Bloom) (int, error) {
-	return func(series *v1.Series, bloom *v1.Bloom) (int, error) {
+func (s *SimpleBloomGenerator) populator(ctx context.Context) func(series *v1.Series, bloom *v1.Bloom) (int, bool, error) {
+	return func(series *v1.Series, bloom *v1.Bloom) (int, bool, error) {
 		start := time.Now()
 		level.Debug(s.logger).Log(
 			"msg", "populating bloom filter",
@@ -104,10 +109,10 @@ func (s *SimpleBloomGenerator) populator(ctx context.Context) func(series *v1.Se
 		)
 		chunkItersWithFP, err := s.chunkLoader.Load(ctx, s.userID, series)
 		if err != nil {
-			return 0, errors.Wrapf(err, "failed to load chunks for series: %+v", series)
+			return 0, false, errors.Wrapf(err, "failed to load chunks for series: %+v", series)
 		}

-		bytesAdded, err := s.tokenizer.Populate(
+		bytesAdded, skip, err := s.tokenizer.Populate(
 			&v1.SeriesWithBloom{
 				Series: series,
 				Bloom:  bloom,
@@ -128,7 +133,7 @@ func (s *SimpleBloomGenerator) populator(ctx context.Context) func(series *v1.Se
 		if s.reporter != nil {
 			s.reporter(series.Fingerprint)
 		}
-		return bytesAdded, err
+		return bytesAdded, skip, err
 	}

 }
@@ -174,7 +179,7 @@ type LazyBlockBuilderIterator struct {
 	ctx          context.Context
 	opts         v1.BlockOptions
 	metrics      *Metrics
-	populate     func(*v1.Series, *v1.Bloom) (int, error)
+	populate     func(*v1.Series, *v1.Bloom) (int, bool, error)
 	readWriterFn func() (v1.BlockWriter, v1.BlockReader)
 	series       v1.PeekingIterator[*v1.Series]
 	blocks       v1.ResettableIterator[*v1.SeriesWithBloom]
@@ -188,7 +193,7 @@ func NewLazyBlockBuilderIterator(
 	ctx context.Context,
 	opts v1.BlockOptions,
 	metrics *Metrics,
-	populate func(*v1.Series, *v1.Bloom) (int, error),
+	populate func(*v1.Series, *v1.Bloom) (int, bool, error),
 	readWriterFn func() (v1.BlockWriter, v1.BlockReader),
 	series v1.PeekingIterator[*v1.Series],
 	blocks v1.ResettableIterator[*v1.SeriesWithBloom],
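The widened closure signature threads the tokenizer's new `skip` result up through the block builder, which can drop the series instead of writing an oversized bloom. Below is a minimal, self-contained sketch of that calling pattern using toy types; it is not the actual `LazyBlockBuilderIterator` logic, just the shape of it.

```go
package main

import "fmt"

// Toy stand-ins for v1.Series and v1.Bloom, for illustration only.
type series struct{ fingerprint uint64 }
type bloom struct{ sizeBytes int }

// populateFn mirrors the new closure shape: bytes added, skip, error.
type populateFn func(*series, *bloom) (int, bool, error)

// buildBlooms walks the series set and keeps only blooms that were not skipped.
func buildBlooms(all []*series, populate populateFn) ([]*bloom, error) {
	var kept []*bloom
	for _, s := range all {
		b := &bloom{}
		bytesAdded, skip, err := populate(s, b)
		if err != nil {
			return nil, err
		}
		if skip {
			// The bloom grew past the configured limit; discard the series
			// rather than writing an oversized filter into the block.
			fmt.Printf("skipping fp=%d after %d bytes\n", s.fingerprint, bytesAdded)
			continue
		}
		kept = append(kept, b)
	}
	return kept, nil
}

func main() {
	populate := func(s *series, b *bloom) (int, bool, error) {
		// Pretend every series adds 1KB and large fingerprints blow the limit.
		b.sizeBytes = 1 << 10
		return b.sizeBytes, s.fingerprint > 100, nil
	}
	blooms, _ := buildBlooms([]*series{{fingerprint: 7}, {fingerprint: 101}}, populate)
	fmt.Println("kept", len(blooms), "blooms")
}
```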

pkg/bloomcompactor/spec_test.go (+4 -4)

@@ -121,13 +121,13 @@ func TestSimpleBloomGenerator(t *testing.T) {
 	}{
 		{
 			desc:       "SkipsIncompatibleSchemas",
-			fromSchema: v1.NewBlockOptions(enc, 3, 0, maxBlockSize),
-			toSchema:   v1.NewBlockOptions(enc, 4, 0, maxBlockSize),
+			fromSchema: v1.NewBlockOptions(enc, 3, 0, maxBlockSize, 0),
+			toSchema:   v1.NewBlockOptions(enc, 4, 0, maxBlockSize, 0),
 		},
 		{
 			desc:       "CombinesBlocks",
-			fromSchema: v1.NewBlockOptions(enc, 4, 0, maxBlockSize),
-			toSchema:   v1.NewBlockOptions(enc, 4, 0, maxBlockSize),
+			fromSchema: v1.NewBlockOptions(enc, 4, 0, maxBlockSize, 0),
+			toSchema:   v1.NewBlockOptions(enc, 4, 0, maxBlockSize, 0),
 		},
 	} {
 		t.Run(fmt.Sprintf("%s/%s", tc.desc, enc), func(t *testing.T) {

pkg/storage/bloom/v1/bloom_tokenizer.go (+67 -49)

@@ -5,6 +5,7 @@ import (
 	"math"
 	"time"

+	"github.com/c2h5oh/datasize"
 	"github.com/go-kit/log/level"
 	"github.com/pkg/errors"

@@ -25,6 +26,7 @@ Bloom filters are utilized for faster lookups of log lines.
 type BloomTokenizer struct {
 	metrics *Metrics

+	maxBloomSize  int
 	lineTokenizer *NGramTokenizer
 	cache         map[string]interface{}
 }
@@ -38,13 +40,14 @@ const eightBits = 8
 // 1) The token slices generated must not be mutated externally
 // 2) The token slice must not be used after the next call to `Tokens()` as it will repopulate the slice.
 // 2) This is not thread safe.
-func NewBloomTokenizer(nGramLen, nGramSkip int, metrics *Metrics) *BloomTokenizer {
+func NewBloomTokenizer(nGramLen, nGramSkip int, maxBloomSize int, metrics *Metrics) *BloomTokenizer {
 	// TODO(chaudum): Replace logger
 	level.Info(util_log.Logger).Log("msg", "create new bloom tokenizer", "ngram length", nGramLen, "ngram skip", nGramSkip)
 	return &BloomTokenizer{
 		metrics:       metrics,
 		cache:         make(map[string]interface{}, cacheSize),
 		lineTokenizer: NewNGramTokenizer(nGramLen, nGramSkip),
+		maxBloomSize:  maxBloomSize,
 	}
 }

@@ -89,7 +92,9 @@ type ChunkRefWithIter struct {
 }

 // Populate adds the tokens from the given chunks to the given seriesWithBloom.
-func (bt *BloomTokenizer) Populate(swb *SeriesWithBloom, chks Iterator[ChunkRefWithIter]) (int, error) {
+// The `skip` return value indicates whether this series should be discarded and is used to short-circuit
+// bloom generation for series that are too large. We will undoubtedly improve this in the future.
+func (bt *BloomTokenizer) Populate(swb *SeriesWithBloom, chks Iterator[ChunkRefWithIter]) (bytesAdded int, skip bool, err error) {
 	startTime := time.Now().UnixMilli()

 	clearCache(bt.cache)
@@ -119,61 +124,53 @@ func (bt *BloomTokenizer) Populate(swb *SeriesWithBloom, chks Iterator[ChunkRefW
 		tokenBuf, prefixLn = prefixedToken(bt.lineTokenizer.N(), chk.Ref, tokenBuf)

 		// Iterate over lines in the chunk
+	entries:
 		for itr.Next() && itr.Error() == nil {
 			// TODO(owen-d): rather than iterate over the line twice, once for prefixed tokenizer & once for
 			// raw tokenizer, we could iterate once and just return (prefix, token) pairs from the tokenizer.
 			// Double points for them being different-ln references to the same data.
 			line := itr.Entry().Line
 			chunkBytes += len(line)
-			chunkTokenizer := NewPrefixedTokenIter(tokenBuf, prefixLn, bt.lineTokenizer.Tokens(line))
-			for chunkTokenizer.Next() {
-				tok := chunkTokenizer.At()
-				tokens++
-				// TODO(owen-d): [n]byte this
-				str := string(tok)
-				_, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters
-				if found {
-					cachedInserts++
-					continue
-				}
-
-				bt.cache[str] = nil
-				collision := swb.Bloom.ScalableBloomFilter.TestAndAdd(tok)
-				if collision {
-					collisionInserts++
-				} else {
-					successfulInserts++
-				}

-				if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other
-					clearCache(bt.cache)
-				}
+			tokenItrs := []Iterator[[]byte]{
+				// two iterators, one for the raw tokens and one for the chunk prefixed tokens.
+				// Warning: the underlying line tokenizer (used in both iterators) uses the same buffer for tokens.
+				// They are NOT SAFE for concurrent use.
+				NewPrefixedTokenIter(tokenBuf, prefixLn, bt.lineTokenizer.Tokens(line)),
+				bt.lineTokenizer.Tokens(line),
 			}

-			lineTokenizer := bt.lineTokenizer.Tokens(line)
-			for lineTokenizer.Next() {
-				tok := lineTokenizer.At()
-				tokens++
-				str := string(tok)
-				_, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters
-				if found {
-					chunkCachedInserts++
-					continue
+			for _, itr := range tokenItrs {
+				for itr.Next() {
+					tok := itr.At()
+					tokens++
+					// TODO(owen-d): [n]byte this
+					str := string(tok)
+					_, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters
+					if found {
+						cachedInserts++
+						continue
+					}
+
+					bt.cache[str] = nil
+					collision, sz := swb.Bloom.ScalableBloomFilter.HeavyAdd(tok)
+					if collision {
+						collisionInserts++
+					} else {
+						successfulInserts++
+					}
+
+					if bt.maxBloomSize > 0 && sz > bt.maxBloomSize {
+						skip = true
+						break entries
+					}
+
+					if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other
+						clearCache(bt.cache)
+					}
 				}
-				bt.cache[str] = nil

-				collision := swb.Bloom.ScalableBloomFilter.TestAndAdd(tok)
-				if collision {
-					chunkCollisionInserts++
-				} else {
-					chunkSuccessfulInserts++
-				}
-
-				if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other
-					clearCache(bt.cache)
-				}
 			}
-
 		}

 		// add the recorded chunkbytes to the sourcebytes counter in case we return early via error
@@ -187,7 +184,7 @@ func (bt *BloomTokenizer) Populate(swb *SeriesWithBloom, chks Iterator[ChunkRefW
 			es.Add(errors.Wrapf(err, "error iterating chunk: %#v", chk.Ref))
 		}
 		if combined := es.Err(); combined != nil {
-			return sourceBytes, combined
+			return sourceBytes, skip, combined
 		}
 		swb.Series.Chunks = append(swb.Series.Chunks, chk.Ref)

@@ -200,13 +197,27 @@ func (bt *BloomTokenizer) Populate(swb *SeriesWithBloom, chks Iterator[ChunkRefW
 		bt.metrics.insertsTotal.WithLabelValues(tokenTypeChunkPrefixed, collisionTypeCache).Add(float64(chunkCachedInserts))
 		bt.metrics.insertsTotal.WithLabelValues(tokenTypeChunkPrefixed, collisionTypeTrue).Add(float64(chunkCollisionInserts))
 		bt.metrics.sourceBytesAdded.Add(float64(chunkBytes))
+
+		// Exit early if the series is too large
+		if skip {
+			break
+		}
 	}

 	if err := chks.Err(); err != nil {
 		level.Error(util_log.Logger).Log("msg", "error downloading chunks batch", "err", err)
-		return sourceBytes, fmt.Errorf("error downloading chunks batch: %w", err)
+		return sourceBytes, skip, fmt.Errorf("error downloading chunks batch: %w", err)
 	}

+	level.Debug(util_log.Logger).Log(
+		"msg", "bloom filter populated",
+		"chunks", len(swb.Series.Chunks),
+		"fp", swb.Series.Fingerprint,
+		"sourceBytes", datasize.ByteSize(sourceBytes).HumanReadable(),
+		"bloomSize", datasize.ByteSize(swb.Bloom.Capacity()/8).HumanReadable(),
+		"skipped", skip,
+	)
+
 	endTime := time.Now().UnixMilli()

 	fillRatio := swb.Bloom.ScalableBloomFilter.FillRatio()
@@ -215,8 +226,15 @@ func (bt *BloomTokenizer) Populate(swb *SeriesWithBloom, chks Iterator[ChunkRefW
 		float64(estimatedCount(swb.Bloom.ScalableBloomFilter.Capacity(), fillRatio)),
 	)
 	bt.metrics.bloomSize.Observe(float64(swb.Bloom.ScalableBloomFilter.Capacity() / eightBits))
-	bt.metrics.sbfCreationTime.Add(float64(endTime - startTime))
-	return sourceBytes, nil
+
+	ty := bloomCreationTypeIndexed
+	if skip {
+		ty = bloomCreationTypeSkipped
+	}
+	bt.metrics.sbfCreationTime.WithLabelValues(ty).Add(float64(endTime - startTime))
+	bt.metrics.bloomsTotal.WithLabelValues(ty).Inc()
+
+	return sourceBytes, skip, nil
 }

 // n ≈ −m ln(1 − p).
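The heart of the change is the loop above: after each insert, the vendored scalable bloom filter reports its current size via `HeavyAdd`, and once the configured cap is exceeded the tokenizer abandons the series with a labelled `break` and returns `skip = true` instead of an error. A compressed, self-contained sketch of that pattern, with a toy filter standing in for the real one:

```go
package main

import "fmt"

// toyFilter is an illustration-only stand-in for the scalable bloom filter;
// it just remembers tokens and reports a rough byte size.
type toyFilter struct{ tokens map[string]struct{} }

func (f *toyFilter) testAndAdd(tok string) (collision bool, sizeBytes int) {
	if _, ok := f.tokens[tok]; ok {
		collision = true
	} else {
		f.tokens[tok] = struct{}{}
	}
	return collision, len(f.tokens) * 8 // pretend each token costs 8 bytes
}

// populate mirrors the shape of BloomTokenizer.Populate: it stops adding
// tokens once the filter exceeds maxBloomSize and reports skip=true.
func populate(f *toyFilter, lines [][]string, maxBloomSize int) (bytesAdded int, skip bool) {
entries:
	for _, toks := range lines {
		for _, tok := range toks {
			_, sz := f.testAndAdd(tok)
			bytesAdded += len(tok)
			if maxBloomSize > 0 && sz > maxBloomSize {
				skip = true
				break entries // labelled break: abandon the whole series
			}
		}
	}
	return bytesAdded, skip
}

func main() {
	f := &toyFilter{tokens: map[string]struct{}{}}
	lines := [][]string{{"foo", "bar"}, {"baz", "qux", "quux"}}
	added, skip := populate(f, lines, 24) // cap of 24 "bytes" => room for 3 tokens
	fmt.Println("bytes added:", added, "skipped:", skip)
}
```

Returning a flag rather than an error keeps one oversized stream from failing the whole pass: the series is simply not indexed, and the diff counts it under the new skipped creation type in the `sbfCreationTime` and `bloomsTotal` metrics.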

pkg/storage/bloom/v1/bloom_tokenizer_test.go (+7 -7)

@@ -79,7 +79,7 @@ func TestPrefixedKeyCreation(t *testing.T) {

 func TestSetLineTokenizer(t *testing.T) {
 	t.Parallel()
-	bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, metrics)
+	bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics)

 	// Validate defaults
 	require.Equal(t, bt.lineTokenizer.N(), DefaultNGramLength)
@@ -94,7 +94,7 @@ func TestSetLineTokenizer(t *testing.T) {
 func TestTokenizerPopulate(t *testing.T) {
 	t.Parallel()
 	var testLine = "this is a log line"
-	bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, metrics)
+	bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics)

 	sbf := filter.NewScalableBloomFilter(1024, 0.01, 0.8)
 	var lbsList []labels.Labels
@@ -125,7 +125,7 @@ func TestTokenizerPopulate(t *testing.T) {
 		Series: &series,
 	}

-	_, err = bt.Populate(&swb, NewSliceIter([]ChunkRefWithIter{{Ref: ChunkRef{}, Itr: itr}}))
+	_, _, err = bt.Populate(&swb, NewSliceIter([]ChunkRefWithIter{{Ref: ChunkRef{}, Itr: itr}}))
 	require.NoError(t, err)
 	tokenizer := NewNGramTokenizer(DefaultNGramLength, DefaultNGramSkip)
 	toks := tokenizer.Tokens(testLine)
@@ -138,7 +138,7 @@ func TestTokenizerPopulate(t *testing.T) {
 func BenchmarkPopulateSeriesWithBloom(b *testing.B) {
 	for i := 0; i < b.N; i++ {
 		var testLine = lorem + lorem + lorem
-		bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, metrics)
+		bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics)

 		sbf := filter.NewScalableBloomFilter(1024, 0.01, 0.8)
 		var lbsList []labels.Labels
@@ -169,13 +169,13 @@ func BenchmarkPopulateSeriesWithBloom(b *testing.B) {
 			Series: &series,
 		}

-		_, err = bt.Populate(&swb, NewSliceIter([]ChunkRefWithIter{{Ref: ChunkRef{}, Itr: itr}}))
+		_, _, err = bt.Populate(&swb, NewSliceIter([]ChunkRefWithIter{{Ref: ChunkRef{}, Itr: itr}}))
 		require.NoError(b, err)
 	}
 }

 func BenchmarkMapClear(b *testing.B) {
-	bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, metrics)
+	bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics)
 	for i := 0; i < b.N; i++ {
 		for k := 0; k < cacheSize; k++ {
 			bt.cache[fmt.Sprint(k)] = k
@@ -186,7 +186,7 @@ func BenchmarkMapClear(b *testing.B) {
 }

 func BenchmarkNewMap(b *testing.B) {
-	bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, metrics)
+	bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics)
 	for i := 0; i < b.N; i++ {
 		for k := 0; k < cacheSize; k++ {
 			bt.cache[fmt.Sprint(k)] = k
