Skip to content

Commit bc01e6f

Browse files
authored
feat: Drain uses different tokenizer based on log format (#13384)
1 parent 69b805d commit bc01e6f

8 files changed

+360
-103
lines changed

pkg/pattern/drain/drain.go

+36-22
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ func DefaultConfig() *Config {
147147
}
148148
}
149149

150-
func New(config *Config, metrics *Metrics) *Drain {
150+
func New(config *Config, format string, metrics *Metrics) *Drain {
151151
if config.LogClusterDepth < 3 {
152152
panic("depth argument must be at least 3")
153153
}
@@ -156,14 +156,24 @@ func New(config *Config, metrics *Metrics) *Drain {
156156
if metrics != nil {
157157
evictFn = func(int, *LogCluster) { metrics.PatternsEvictedTotal.Inc() }
158158
}
159+
var tokenizer LineTokenizer
160+
switch format {
161+
case FormatJSON:
162+
tokenizer = newJSONTokenizer(config.ParamString)
163+
case FormatLogfmt:
164+
tokenizer = newLogfmtTokenizer(config.ParamString)
165+
default:
166+
tokenizer = newPunctuationTokenizer()
167+
}
159168

160169
d := &Drain{
161170
config: config,
162171
rootNode: createNode(),
163172
idToCluster: createLogClusterCache(config.MaxClusters, evictFn),
164173
metrics: metrics,
165-
tokenizer: newPunctuationTokenizer(),
174+
tokenizer: tokenizer,
166175
maxAllowedLineLength: 3000,
176+
format: format,
167177
}
168178
return d
169179
}
@@ -176,6 +186,9 @@ type Drain struct {
176186
metrics *Metrics
177187
tokenizer LineTokenizer
178188
maxAllowedLineLength int
189+
format string
190+
tokens []string
191+
state interface{}
179192
}
180193

181194
func (d *Drain) Clusters() []*LogCluster {
@@ -190,8 +203,8 @@ func (d *Drain) Train(content string, ts int64) *LogCluster {
190203
if len(content) > d.maxAllowedLineLength {
191204
return nil
192205
}
193-
tokens, state := d.tokenizer.Tokenize(content)
194-
return d.train(tokens, state, ts)
206+
d.tokens, d.state = d.tokenizer.Tokenize(content, d.tokens, d.state)
207+
return d.train(d.tokens, d.state, ts)
195208
}
196209

197210
func (d *Drain) train(tokens []string, state interface{}, ts int64) *LogCluster {
@@ -200,13 +213,16 @@ func (d *Drain) train(tokens []string, state interface{}, ts int64) *LogCluster
200213
}
201214
if d.metrics != nil {
202215
d.metrics.TokensPerLine.Observe(float64(len(tokens)))
203-
d.metrics.StatePerLine.Observe(float64(len(state.([]int))))
216+
if stateInts, ok := state.([]int); ok {
217+
d.metrics.StatePerLine.Observe(float64(len(stateInts)))
218+
}
204219
}
205220
matchCluster := d.treeSearch(d.rootNode, tokens, d.config.SimTh, false)
206221
// Match no existing log cluster
207222
if matchCluster == nil {
208223
d.clustersCounter++
209224
clusterID := d.clustersCounter
225+
tokens, state = d.tokenizer.Clone(tokens, state)
210226
matchCluster = &LogCluster{
211227
Tokens: tokens,
212228
TokenState: state,
@@ -222,8 +238,7 @@ func (d *Drain) train(tokens []string, state interface{}, ts int64) *LogCluster
222238
d.metrics.PatternsDetectedTotal.Inc()
223239
}
224240
} else {
225-
newTemplateTokens := d.createTemplate(tokens, matchCluster.Tokens)
226-
matchCluster.Tokens = newTemplateTokens
241+
matchCluster.Tokens = d.createTemplate(tokens, matchCluster.Tokens)
227242
matchCluster.append(model.TimeFromUnixNano(ts))
228243
// Touch cluster to update its state in the cache.
229244
d.idToCluster.Get(matchCluster.id)
@@ -232,12 +247,13 @@ func (d *Drain) train(tokens []string, state interface{}, ts int64) *LogCluster
232247
}
233248

234249
func (d *Drain) TrainPattern(content string, samples []*logproto.PatternSample) *LogCluster {
235-
tokens, state := d.tokenizer.Tokenize(content)
250+
tokens, state := d.tokenizer.Tokenize(content, d.tokens, d.state)
236251
matchCluster := d.treeSearch(d.rootNode, tokens, d.config.SimTh, true)
237252
// Match no existing log cluster
238253
if matchCluster == nil {
239254
d.clustersCounter++
240255
clusterID := d.clustersCounter
256+
tokens, state = d.tokenizer.Clone(tokens, state)
241257
matchCluster = &LogCluster{
242258
Tokens: tokens,
243259
TokenState: state,
@@ -246,8 +262,7 @@ func (d *Drain) TrainPattern(content string, samples []*logproto.PatternSample)
246262
d.idToCluster.Set(clusterID, matchCluster)
247263
d.addSeqToPrefixTree(d.rootNode, matchCluster)
248264
} else {
249-
newTemplateTokens := d.createTemplate(tokens, matchCluster.Tokens)
250-
matchCluster.Tokens = newTemplateTokens
265+
matchCluster.Tokens = d.createTemplate(tokens, matchCluster.Tokens)
251266
// Touch cluster to update its state in the cache.
252267
d.idToCluster.Get(matchCluster.id)
253268
}
@@ -277,7 +292,7 @@ func deduplicatePlaceholders(line string, placeholder string) string {
277292
}
278293
builder = append(builder, line[low:]...)
279294

280-
return unsafe.String(unsafe.SliceData(builder), len(builder))
295+
return unsafeString(builder)
281296
}
282297

283298
func (d *Drain) PatternString(c *LogCluster) string {
@@ -313,13 +328,6 @@ func (d *Drain) Delete(cluster *LogCluster) {
313328
d.idToCluster.cache.Remove(cluster.id)
314329
}
315330

316-
// Match against an already existing cluster. Match shall be perfect (sim_th=1.0). New cluster will not be created as a result of this call, nor any cluster modifications.
317-
func (d *Drain) Match(content string) *LogCluster {
318-
contentTokens, _ := d.tokenizer.Tokenize(content)
319-
matchCluster := d.treeSearch(d.rootNode, contentTokens, 1.0, true)
320-
return matchCluster
321-
}
322-
323331
func (d *Drain) treeSearch(rootNode *Node, tokens []string, simTh float64, includeParams bool) *LogCluster {
324332
tokenCount := len(tokens)
325333

@@ -511,12 +519,18 @@ func (d *Drain) createTemplate(tokens, matchClusterTokens []string) []string {
511519
if len(tokens) != len(matchClusterTokens) {
512520
panic("seq1 seq2 be of same length")
513521
}
514-
retVal := make([]string, len(matchClusterTokens))
515-
copy(retVal, matchClusterTokens)
516522
for i := range tokens {
517523
if tokens[i] != matchClusterTokens[i] {
518-
retVal[i] = d.config.ParamString
524+
matchClusterTokens[i] = d.config.ParamString
519525
}
520526
}
521-
return retVal
527+
return matchClusterTokens
528+
}
529+
530+
func unsafeString(s []byte) string {
531+
return unsafe.String(unsafe.SliceData(s), len(s))
532+
}
533+
534+
func unsafeBytes(s string) []byte {
535+
return unsafe.Slice(unsafe.StringData(s), len(s))
522536
}

pkg/pattern/drain/drain_benchmark_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ func BenchmarkDrain_TrainExtractsPatterns(b *testing.B) {
3535
line := scanner.Text()
3636
lines = append(lines, line)
3737
}
38-
drain := New(DefaultConfig(), nil)
38+
drain := New(DefaultConfig(), DetectLogFormat(lines[0]), nil)
3939

4040
b.ReportAllocs()
4141
b.ResetTimer()

pkg/pattern/drain/drain_test.go

+17-63
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ func TestDrain_TrainExtractsPatterns(t *testing.T) {
2727
format string
2828
}{
2929
{
30-
drain: New(DefaultConfig(), nil),
30+
drain: New(DefaultConfig(), "", nil),
3131
inputFile: `testdata/agent-logfmt.txt`,
3232
format: FormatLogfmt,
3333
patterns: []string{
@@ -56,7 +56,7 @@ func TestDrain_TrainExtractsPatterns(t *testing.T) {
5656
},
5757
},
5858
{
59-
drain: New(DefaultConfig(), nil),
59+
drain: New(DefaultConfig(), "", nil),
6060
inputFile: `testdata/ingester-logfmt.txt`,
6161
format: FormatLogfmt,
6262
patterns: []string{
@@ -66,7 +66,7 @@ func TestDrain_TrainExtractsPatterns(t *testing.T) {
6666
},
6767
},
6868
{
69-
drain: New(DefaultConfig(), nil),
69+
drain: New(DefaultConfig(), "", nil),
7070
inputFile: `testdata/drone-json.txt`,
7171
format: FormatJSON,
7272
patterns: []string{
@@ -79,7 +79,7 @@ func TestDrain_TrainExtractsPatterns(t *testing.T) {
7979
},
8080
},
8181
{
82-
drain: New(DefaultConfig(), nil),
82+
drain: New(DefaultConfig(), "", nil),
8383
inputFile: "testdata/distributor-logfmt.txt",
8484
format: FormatLogfmt,
8585
patterns: []string{
@@ -91,7 +91,7 @@ func TestDrain_TrainExtractsPatterns(t *testing.T) {
9191
},
9292
},
9393
{
94-
drain: New(DefaultConfig(), nil),
94+
drain: New(DefaultConfig(), "", nil),
9595
inputFile: "testdata/journald.txt",
9696
format: FormatUnknown,
9797
patterns: []string{
@@ -211,7 +211,7 @@ func TestDrain_TrainExtractsPatterns(t *testing.T) {
211211
},
212212
},
213213
{
214-
drain: New(DefaultConfig(), nil),
214+
drain: New(DefaultConfig(), "", nil),
215215
inputFile: "testdata/kafka.txt",
216216
format: FormatUnknown,
217217
patterns: []string{
@@ -232,7 +232,7 @@ func TestDrain_TrainExtractsPatterns(t *testing.T) {
232232
},
233233
},
234234
{
235-
drain: New(DefaultConfig(), nil),
235+
drain: New(DefaultConfig(), "", nil),
236236
inputFile: "testdata/kubernetes.txt",
237237
format: FormatUnknown,
238238
patterns: []string{
@@ -273,15 +273,15 @@ func TestDrain_TrainExtractsPatterns(t *testing.T) {
273273
},
274274
},
275275
{
276-
drain: New(DefaultConfig(), nil),
276+
drain: New(DefaultConfig(), "", nil),
277277
inputFile: "testdata/vault.txt",
278278
format: FormatUnknown,
279279
patterns: []string{
280280
`<_> [INFO] expiration: revoked lease: lease_id=<_>`,
281281
},
282282
},
283283
{
284-
drain: New(DefaultConfig(), nil),
284+
drain: New(DefaultConfig(), "", nil),
285285
inputFile: "testdata/calico.txt",
286286
format: FormatUnknown,
287287
patterns: []string{
@@ -374,7 +374,7 @@ func TestDrain_TrainExtractsPatterns(t *testing.T) {
374374
},
375375
},
376376
{
377-
drain: New(DefaultConfig(), nil),
377+
drain: New(DefaultConfig(), "", nil),
378378
inputFile: "testdata/grafana-ruler.txt",
379379
format: FormatLogfmt,
380380
patterns: []string{
@@ -426,6 +426,7 @@ func TestDrain_TrainExtractsPatterns(t *testing.T) {
426426
}
427427

428428
for _, tt := range tests {
429+
tt := tt
429430
t.Run(tt.inputFile, func(t *testing.T) {
430431
file, err := os.Open(tt.inputFile)
431432
require.NoError(t, err)
@@ -461,53 +462,6 @@ func TestDrain_TrainExtractsPatterns(t *testing.T) {
461462
}
462463
}
463464

464-
func TestDrain_TrainGeneratesMatchablePatterns(t *testing.T) {
465-
t.Parallel()
466-
tests := []struct {
467-
name string
468-
drain *Drain
469-
inputLines []string
470-
}{
471-
{
472-
name: "should match each line against a pattern",
473-
drain: New(DefaultConfig(), nil),
474-
inputLines: []string{
475-
"test test test test",
476-
"test test test test",
477-
"test test test test",
478-
"test test test test",
479-
},
480-
},
481-
{
482-
name: "should also match newlines",
483-
drain: New(DefaultConfig(), nil),
484-
inputLines: []string{
485-
`test test test test
486-
`,
487-
`test test test test
488-
`,
489-
`test test test test
490-
`,
491-
`test test test test
492-
`,
493-
},
494-
},
495-
}
496-
for _, tt := range tests {
497-
tt := tt
498-
t.Run(tt.name, func(t *testing.T) {
499-
for _, line := range tt.inputLines {
500-
tt.drain.Train(line, 0)
501-
}
502-
503-
for _, line := range tt.inputLines {
504-
match := tt.drain.Match(line)
505-
require.NotNil(t, match, `Line should match a cluster`)
506-
}
507-
})
508-
}
509-
}
510-
511465
func TestDrain_TrainGeneratesPatternsMatchableByLokiPatternFilter(t *testing.T) {
512466
t.Parallel()
513467
tests := []struct {
@@ -517,7 +471,7 @@ func TestDrain_TrainGeneratesPatternsMatchableByLokiPatternFilter(t *testing.T)
517471
}{
518472
{
519473
name: "should extract patterns that all lines match",
520-
drain: New(DefaultConfig(), nil),
474+
drain: New(DefaultConfig(), "", nil),
521475
inputLines: []string{
522476
"test 1 test test",
523477
"test 2 test test",
@@ -527,7 +481,7 @@ func TestDrain_TrainGeneratesPatternsMatchableByLokiPatternFilter(t *testing.T)
527481
},
528482
{
529483
name: "should extract patterns that match if line ends with newlines",
530-
drain: New(DefaultConfig(), nil),
484+
drain: New(DefaultConfig(), "", nil),
531485
inputLines: []string{
532486
`test 1 test test
533487
`,
@@ -541,7 +495,7 @@ func TestDrain_TrainGeneratesPatternsMatchableByLokiPatternFilter(t *testing.T)
541495
},
542496
{
543497
name: "should extract patterns that match if line ends with empty space",
544-
drain: New(DefaultConfig(), nil),
498+
drain: New(DefaultConfig(), "", nil),
545499
inputLines: []string{
546500
`test 1 test test `,
547501
`test 2 test test `,
@@ -551,7 +505,7 @@ func TestDrain_TrainGeneratesPatternsMatchableByLokiPatternFilter(t *testing.T)
551505
},
552506
{
553507
name: "should extract patterns that match if line starts with empty space",
554-
drain: New(DefaultConfig(), nil),
508+
drain: New(DefaultConfig(), "", nil),
555509
inputLines: []string{
556510
` test 1 test test`,
557511
` test 2 test test`,
@@ -561,7 +515,7 @@ func TestDrain_TrainGeneratesPatternsMatchableByLokiPatternFilter(t *testing.T)
561515
},
562516
{
563517
name: "Scheduler patterns are matchable",
564-
drain: New(DefaultConfig(), nil),
518+
drain: New(DefaultConfig(), "", nil),
565519
inputLines: []string{
566520
`ts=2024-05-30T12:50:36.648377186Z caller=scheduler_processor.go:143 level=warn msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"error reading server preface: EOF\"" addr=10.0.151.101:9095`,
567521
`ts=2024-05-30T12:50:36.350575929Z caller=scheduler_processor.go:143 level=warn msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"error reading server preface: EOF\"" addr=10.0.151.101:9095`,
@@ -659,7 +613,7 @@ func TestDrain_PruneTreeClearsOldBranches(t *testing.T) {
659613
}{
660614
{
661615
name: "should prune old branches",
662-
drain: New(DefaultConfig(), nil),
616+
drain: New(DefaultConfig(), "", nil),
663617
inputLines: []string{
664618
"test test test A",
665619
"test test test B",

0 commit comments

Comments
 (0)