@@ -5,6 +5,7 @@
 	"math"
 	"time"
 
+	"github.com/c2h5oh/datasize"
 	"github.com/go-kit/log/level"
 	"github.com/pkg/errors"
 
@@ -25,6 +26,7 @@ Bloom filters are utilized for faster lookups of log lines.
 type BloomTokenizer struct {
 	metrics *Metrics
 
+	maxBloomSize  int
 	lineTokenizer *NGramTokenizer
 	cache         map[string]interface{}
 }
@@ -38,13 +40,14 @@ const eightBits = 8
 // 1) The token slices generated must not be mutated externally
 // 2) The token slice must not be used after the next call to `Tokens()` as it will repopulate the slice.
 // 3) This is not thread safe.
-func NewBloomTokenizer(nGramLen, nGramSkip int, metrics *Metrics) *BloomTokenizer {
+func NewBloomTokenizer(nGramLen, nGramSkip int, maxBloomSize int, metrics *Metrics) *BloomTokenizer {
 	// TODO(chaudum): Replace logger
 	level.Info(util_log.Logger).Log("msg", "create new bloom tokenizer", "ngram length", nGramLen, "ngram skip", nGramSkip)
 	return &BloomTokenizer{
 		metrics:       metrics,
 		cache:         make(map[string]interface{}, cacheSize),
 		lineTokenizer: NewNGramTokenizer(nGramLen, nGramSkip),
+		maxBloomSize:  maxBloomSize,
 	}
 }
 
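Callers now have to thread the size cap through the constructor. A minimal sketch of that wiring; the n-gram settings and the 128KB cap are illustrative placeholders, not values from this diff:

```go
// newTokenizerWithCap is a hypothetical helper showing how the new parameter
// is threaded through. nGramLen=4 and nGramSkip=0 are placeholders; per the
// Populate check below, a cap of 0 disables the limit entirely.
func newTokenizerWithCap(metrics *Metrics) *BloomTokenizer {
	const maxBloomSize = 128 << 10 // bytes
	return NewBloomTokenizer(4, 0, maxBloomSize, metrics)
}
```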
@@ -89,7 +92,9 @@ type ChunkRefWithIter struct {
 }
 
 // Populate adds the tokens from the given chunks to the given seriesWithBloom.
-func (bt *BloomTokenizer) Populate(swb *SeriesWithBloom, chks Iterator[ChunkRefWithIter]) (int, error) {
+// The `skip` return value indicates whether this series should be discarded and is used to short-circuit
+// bloom generation for series that are too large. We will undoubtedly improve this in the future.
+func (bt *BloomTokenizer) Populate(swb *SeriesWithBloom, chks Iterator[ChunkRefWithIter]) (bytesAdded int, skip bool, err error) {
 	startTime := time.Now().UnixMilli()
 
 	clearCache(bt.cache)
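Since the signature grows from two return values to three, every call site must decide what a skipped series means. A sketch of the expected handling, assuming the caller simply drops skipped series (the helper itself is hypothetical):

```go
// populateSeries is a hypothetical call site illustrating the new contract:
// an error aborts, while skip=true means the series' bloom blew past
// maxBloomSize and the series should be dropped rather than indexed.
func populateSeries(bt *BloomTokenizer, swb *SeriesWithBloom, chks Iterator[ChunkRefWithIter]) (int, error) {
	bytes, skip, err := bt.Populate(swb, chks)
	if err != nil {
		return bytes, err
	}
	if skip {
		return bytes, nil // discard swb; do not write it to the block
	}
	// ... persist swb as before ...
	return bytes, nil
}
```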
@@ -119,61 +124,53 @@ func (bt *BloomTokenizer) Populate(swb *SeriesWithBloom, chks Iterator[ChunkRefW
 			tokenBuf, prefixLn = prefixedToken(bt.lineTokenizer.N(), chk.Ref, tokenBuf)
 
 			// Iterate over lines in the chunk
+		entries:
 			for itr.Next() && itr.Error() == nil {
 				// TODO(owen-d): rather than iterate over the line twice, once for prefixed tokenizer & once for
 				// raw tokenizer, we could iterate once and just return (prefix, token) pairs from the tokenizer.
 				// Double points for them being different-ln references to the same data.
 				line := itr.Entry().Line
 				chunkBytes += len(line)
-				chunkTokenizer := NewPrefixedTokenIter(tokenBuf, prefixLn, bt.lineTokenizer.Tokens(line))
-				for chunkTokenizer.Next() {
-					tok := chunkTokenizer.At()
-					tokens++
-					// TODO(owen-d): [n]byte this
-					str := string(tok)
-					_, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters
-					if found {
-						cachedInserts++
-						continue
-					}
-
-					bt.cache[str] = nil
-					collision := swb.Bloom.ScalableBloomFilter.TestAndAdd(tok)
-					if collision {
-						collisionInserts++
-					} else {
-						successfulInserts++
-					}
 
-					if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other
-						clearCache(bt.cache)
-					}
+				tokenItrs := []Iterator[[]byte]{
+					// two iterators, one for the raw tokens and one for the chunk prefixed tokens.
+					// Warning: the underlying line tokenizer (used in both iterators) uses the same buffer for tokens.
+					// They are NOT SAFE for concurrent use.
+					NewPrefixedTokenIter(tokenBuf, prefixLn, bt.lineTokenizer.Tokens(line)),
+					bt.lineTokenizer.Tokens(line),
 				}
 
-				lineTokenizer := bt.lineTokenizer.Tokens(line)
-				for lineTokenizer.Next() {
-					tok := lineTokenizer.At()
-					tokens++
-					str := string(tok)
-					_, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters
-					if found {
-						chunkCachedInserts++
-						continue
+				for _, itr := range tokenItrs {
+					for itr.Next() {
+						tok := itr.At()
+						tokens++
+						// TODO(owen-d): [n]byte this
+						str := string(tok)
+						_, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters
+						if found {
+							cachedInserts++
+							continue
+						}
+
+						bt.cache[str] = nil
+						collision, sz := swb.Bloom.ScalableBloomFilter.HeavyAdd(tok)
+						if collision {
+							collisionInserts++
+						} else {
+							successfulInserts++
+						}
+
+						if bt.maxBloomSize > 0 && sz > bt.maxBloomSize {
+							skip = true
+							break entries
+						}
+
+						if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other
+							clearCache(bt.cache)
+						}
 					}
-					bt.cache[str] = nil
 
-					collision := swb.Bloom.ScalableBloomFilter.TestAndAdd(tok)
-					if collision {
-						chunkCollisionInserts++
-					} else {
-						chunkSuccessfulInserts++
-					}
-
-					if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other
-						clearCache(bt.cache)
-					}
 				}
-
 			}
 
 			// add the recorded chunkbytes to the sourcebytes counter in case we return early via error
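`HeavyAdd` replaces `TestAndAdd` here but is not defined in this hunk. From the call site it must report, alongside the collision result, a size comparable against `maxBloomSize`. A rough sketch of that shape, purely an assumption about the method added elsewhere in this PR:

```go
// Assumed shape of HeavyAdd: TestAndAdd semantics plus a post-insert size
// report in bytes (Capacity() is in bits, hence the /8, mirroring the
// conversions used below). The real method is defined elsewhere in this
// change and may differ.
func (s *ScalableBloomFilter) HeavyAdd(tok []byte) (collision bool, sz int) {
	collision = s.TestAndAdd(tok)
	sz = int(s.Capacity() / 8)
	return collision, sz
}
```

The labeled `break entries` is what makes the cap cheap to enforce: it unwinds both token iterators and the line loop in a single jump as soon as the filter crosses the limit.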
@@ -187,7 +184,7 @@ func (bt *BloomTokenizer) Populate(swb *SeriesWithBloom, chks Iterator[ChunkRefW
 			es.Add(errors.Wrapf(err, "error iterating chunk: %#v", chk.Ref))
 		}
 		if combined := es.Err(); combined != nil {
-			return sourceBytes, combined
+			return sourceBytes, skip, combined
 		}
 		swb.Series.Chunks = append(swb.Series.Chunks, chk.Ref)
 
@@ -200,13 +197,27 @@ func (bt *BloomTokenizer) Populate(swb *SeriesWithBloom, chks Iterator[ChunkRefW
 		bt.metrics.insertsTotal.WithLabelValues(tokenTypeChunkPrefixed, collisionTypeCache).Add(float64(chunkCachedInserts))
 		bt.metrics.insertsTotal.WithLabelValues(tokenTypeChunkPrefixed, collisionTypeTrue).Add(float64(chunkCollisionInserts))
 		bt.metrics.sourceBytesAdded.Add(float64(chunkBytes))
+
+		// Exit early if the series is too large
+		if skip {
+			break
+		}
 	}
 
 	if err := chks.Err(); err != nil {
 		level.Error(util_log.Logger).Log("msg", "error downloading chunks batch", "err", err)
-		return sourceBytes, fmt.Errorf("error downloading chunks batch: %w", err)
+		return sourceBytes, skip, fmt.Errorf("error downloading chunks batch: %w", err)
 	}
 
+	level.Debug(util_log.Logger).Log(
+		"msg", "bloom filter populated",
+		"chunks", len(swb.Series.Chunks),
+		"fp", swb.Series.Fingerprint,
+		"sourceBytes", datasize.ByteSize(sourceBytes).HumanReadable(),
+		"bloomSize", datasize.ByteSize(swb.Bloom.Capacity()/8).HumanReadable(),
+		"skipped", skip,
+	)
+
 	endTime := time.Now().UnixMilli()
 
 	fillRatio := swb.Bloom.ScalableBloomFilter.FillRatio()
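One detail worth noting in the new debug line: `swb.Bloom.Capacity()` is in bits, hence the `/8` before handing the value to `datasize`. A quick standalone illustration of the formatting (the example capacity is made up, and exact output strings are up to the datasize library):

```go
package main

import (
	"fmt"

	"github.com/c2h5oh/datasize"
)

func main() {
	capacityBits := uint64(8 << 20) // e.g. a bloom with 8Mib of capacity
	capacityBytes := capacityBits / 8
	// Prints a human-readable size such as "1 MB"; formatting details
	// depend on the library version.
	fmt.Println(datasize.ByteSize(capacityBytes).HumanReadable())
}
```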
@@ -215,8 +226,15 @@ func (bt *BloomTokenizer) Populate(swb *SeriesWithBloom, chks Iterator[ChunkRefW
 		float64(estimatedCount(swb.Bloom.ScalableBloomFilter.Capacity(), fillRatio)),
 	)
 	bt.metrics.bloomSize.Observe(float64(swb.Bloom.ScalableBloomFilter.Capacity() / eightBits))
-	bt.metrics.sbfCreationTime.Add(float64(endTime - startTime))
-	return sourceBytes, nil
+
+	ty := bloomCreationTypeIndexed
+	if skip {
+		ty = bloomCreationTypeSkipped
+	}
+	bt.metrics.sbfCreationTime.WithLabelValues(ty).Add(float64(endTime - startTime))
+	bt.metrics.bloomsTotal.WithLabelValues(ty).Inc()
+
+	return sourceBytes, skip, nil
 }
 
 // n ≈ −m ln(1 − p).
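That trailing context line is the standard bloom-filter cardinality inversion: with m bits of capacity and fill ratio p, the estimated item count is n ≈ −m·ln(1−p). The `estimatedCount` helper used above presumably implements exactly this; a sketch consistent with the comment (the real implementation sits outside this diff):

```go
// estimatedCount inverts the bloom fill-ratio formula: given capacity m
// (bits) and fill ratio p, n ≈ −m·ln(1−p). Sketch only; the actual helper
// lives outside this diff and may differ in signature.
func estimatedCount(m uint, p float64) uint {
	return uint(-float64(m) * math.Log(1-p))
}
```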