From 78cf37ca054bc5c964ad22feb9a983838a1914d9 Mon Sep 17 00:00:00 2001 From: Dan Jaglowski Date: Mon, 21 Aug 2023 16:03:31 -0400 Subject: [PATCH 1/2] [pkg/stanza] Move encoding concerns to dedicated package --- pkg/stanza/operator/helper/encoding.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/stanza/operator/helper/encoding.go b/pkg/stanza/operator/helper/encoding.go index 2ebd4c7e5f11..9a625fd623f2 100644 --- a/pkg/stanza/operator/helper/encoding.go +++ b/pkg/stanza/operator/helper/encoding.go @@ -20,6 +20,7 @@ type EncodingConfig struct { // Deprecated: [v0.84.0] Use decoder.Decoder instead type Decoder = decoder.Decoder +// Deprecated: [v0.84.0] Use decoder.New instead var NewDecoder = decoder.New // Deprecated: [v0.84.0] Use decoder.LookupEncoding instead From 6e827f1b4a00fdbe41b590ccb745c0aecb5f4808 Mon Sep 17 00:00:00 2001 From: Dan Jaglowski Date: Mon, 21 Aug 2023 14:47:09 -0400 Subject: [PATCH 2/2] [chore] Move tokenization test into subpackage of tokenize --- .../operator/input/syslog/syslog_test.go | 54 +-- pkg/stanza/operator/internal/test_common.go | 119 ----- pkg/stanza/tokenize/multiline_test.go | 414 ++++++++---------- .../tokenize.go} | 46 +- 4 files changed, 242 insertions(+), 391 deletions(-) delete mode 100644 pkg/stanza/operator/internal/test_common.go rename pkg/stanza/tokenize/{util_test.go => tokenizetest/tokenize.go} (60%) diff --git a/pkg/stanza/operator/input/syslog/syslog_test.go b/pkg/stanza/operator/input/syslog/syslog_test.go index 4decaf3f5e12..a19e763cc01c 100644 --- a/pkg/stanza/operator/input/syslog/syslog_test.go +++ b/pkg/stanza/operator/input/syslog/syslog_test.go @@ -15,10 +15,10 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator/input/tcp" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator/input/udp" - "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator/internal" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator/parser/syslog" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/pipeline" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/testutil" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/tokenize/tokenizetest" ) var ( @@ -182,68 +182,68 @@ func NewConfigWithUDP(syslogCfg *syslog.BaseConfig) *Config { } func TestOctetFramingSplitFunc(t *testing.T) { - testCases := []internal.TokenizerTestCase{ + testCases := []tokenizetest.TestCase{ { - Name: "OneLogSimple", - Raw: []byte(`17 my log LOGEND 123`), - ExpectedTokenized: []string{ + Name: "OneLogSimple", + Input: []byte(`17 my log LOGEND 123`), + ExpectedTokens: []string{ `17 my log LOGEND 123`, }, }, { - Name: "TwoLogsSimple", - Raw: []byte(`17 my log LOGEND 12317 my log LOGEND 123`), - ExpectedTokenized: []string{ + Name: "TwoLogsSimple", + Input: []byte(`17 my log LOGEND 12317 my log LOGEND 123`), + ExpectedTokens: []string{ `17 my log LOGEND 123`, `17 my log LOGEND 123`, }, }, { - Name: "NoMatches", - Raw: []byte(`no matches in it`), - ExpectedTokenized: []string{ + Name: "NoMatches", + Input: []byte(`no matches in it`), + ExpectedTokens: []string{ `no matches in it`, }, }, { - Name: "NonMatchesAfter", - Raw: []byte(`17 my log LOGEND 123my log LOGEND 12317 my log LOGEND 123`), - ExpectedTokenized: []string{ + Name: "NonMatchesAfter", + Input: []byte(`17 my log LOGEND 123my log LOGEND 12317 my log LOGEND 123`), + 
ExpectedTokens: []string{ `17 my log LOGEND 123`, `my log LOGEND 12317 my log LOGEND 123`, }, }, { Name: "HugeLog100", - Raw: func() []byte { - newRaw := internal.GeneratedByteSliceOfLength(100) + Input: func() []byte { + newRaw := tokenizetest.GenerateBytes(100) newRaw = append([]byte(`100 `), newRaw...) return newRaw }(), - ExpectedTokenized: []string{ - `100 ` + string(internal.GeneratedByteSliceOfLength(100)), + ExpectedTokens: []string{ + `100 ` + string(tokenizetest.GenerateBytes(100)), }, }, { Name: "maxCapacity", - Raw: func() []byte { - newRaw := internal.GeneratedByteSliceOfLength(4091) + Input: func() []byte { + newRaw := tokenizetest.GenerateBytes(4091) newRaw = append([]byte(`4091 `), newRaw...) return newRaw }(), - ExpectedTokenized: []string{ - `4091 ` + string(internal.GeneratedByteSliceOfLength(4091)), + ExpectedTokens: []string{ + `4091 ` + string(tokenizetest.GenerateBytes(4091)), }, }, { Name: "over capacity", - Raw: func() []byte { - newRaw := internal.GeneratedByteSliceOfLength(4092) + Input: func() []byte { + newRaw := tokenizetest.GenerateBytes(4092) newRaw = append([]byte(`5000 `), newRaw...) return newRaw }(), - ExpectedTokenized: []string{ - `5000 ` + string(internal.GeneratedByteSliceOfLength(4091)), + ExpectedTokens: []string{ + `5000 ` + string(tokenizetest.GenerateBytes(4091)), `j`, }, }, @@ -251,7 +251,7 @@ func TestOctetFramingSplitFunc(t *testing.T) { for _, tc := range testCases { splitFunc, err := OctetMultiLineBuilder(nil) require.NoError(t, err) - t.Run(tc.Name, tc.RunFunc(splitFunc)) + t.Run(tc.Name, tc.Run(splitFunc)) } } diff --git a/pkg/stanza/operator/internal/test_common.go b/pkg/stanza/operator/internal/test_common.go deleted file mode 100644 index 39210f43aef7..000000000000 --- a/pkg/stanza/operator/internal/test_common.go +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright The OpenTelemetry Authors -// SPDX-License-Identifier: Apache-2.0 - -package internal // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator/internal" - -import ( - "bufio" - "io" - "testing" - "time" - - "github.com/stretchr/testify/assert" -) - -// state is going to keep processing state of the TestReader -type state struct { - ReadFrom int - Processed int -} - -// TestReader is a TestReader which keeps state of readed and processed data -type TestReader struct { - State *state - Data []byte -} - -// NewTestReader creates TestReader with empty state -func NewTestReader(data []byte) TestReader { - return TestReader{ - State: &state{ - ReadFrom: 0, - Processed: 0, - }, - Data: data, - } -} - -// Read reads data from TestReader and remebers where reading has been finished -func (r TestReader) Read(p []byte) (n int, err error) { - // return eof if data has been fully readed - if len(r.Data)-r.State.ReadFrom == 0 { - return 0, io.EOF - } - - // iterate over data char by char and write into p - // until p is full or no more data left to read - i := 0 - for ; i < len(r.Data)-r.State.ReadFrom; i++ { - if i == len(p) { - break - } - p[i] = r.Data[r.State.ReadFrom+i] - } - - // update state - r.State.ReadFrom += i - return i, nil -} - -// Reset resets TestReader state (sets last readed position to last processed position) -func (r *TestReader) Reset() { - r.State.ReadFrom = r.State.Processed -} - -func (r *TestReader) SplitFunc(splitFunc bufio.SplitFunc) bufio.SplitFunc { - return func(data []byte, atEOF bool) (advance int, token []byte, err error) { - advance, token, err = splitFunc(data, atEOF) - r.State.Processed += advance - return - } -} - -type 
TokenizerTestCase struct { - Name string - Pattern string - Raw []byte - ExpectedTokenized []string - ExpectedError error - Sleep time.Duration - AdditionalIterations int - PreserveLeadingWhitespaces bool - PreserveTrailingWhitespaces bool -} - -func (tc TokenizerTestCase) RunFunc(splitFunc bufio.SplitFunc) func(t *testing.T) { - reader := NewTestReader(tc.Raw) - - return func(t *testing.T) { - var tokenized []string - for i := 0; i < 1+tc.AdditionalIterations; i++ { - // sleep before next iterations - if i > 0 { - time.Sleep(tc.Sleep) - } - reader.Reset() - scanner := bufio.NewScanner(reader) - scanner.Split(reader.SplitFunc(splitFunc)) - for { - ok := scanner.Scan() - if !ok { - assert.Equal(t, tc.ExpectedError, scanner.Err()) - break - } - tokenized = append(tokenized, scanner.Text()) - } - } - - assert.Equal(t, tc.ExpectedTokenized, tokenized) - } -} - -func GeneratedByteSliceOfLength(length int) []byte { - chars := []byte(`abcdefghijklmnopqrstuvwxyz`) - newSlice := make([]byte, length) - for i := 0; i < length; i++ { - newSlice[i] = chars[i%len(chars)] - } - return newSlice -} diff --git a/pkg/stanza/tokenize/multiline_test.go b/pkg/stanza/tokenize/multiline_test.go index af0dffc69e35..23482f25ba9b 100644 --- a/pkg/stanza/tokenize/multiline_test.go +++ b/pkg/stanza/tokenize/multiline_test.go @@ -12,10 +12,11 @@ import ( "testing" "time" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/text/encoding" "golang.org/x/text/encoding/unicode" + + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/tokenize/tokenizetest" ) const ( @@ -25,29 +26,29 @@ const ( ) type MultiLineTokenizerTestCase struct { - TokenizerTestCase + tokenizetest.TestCase Flusher *Flusher } func TestLineStartSplitFunc(t *testing.T) { testCases := []MultiLineTokenizerTestCase{ { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "OneLogSimple", Pattern: `LOGSTART \d+ `, - Raw: []byte("LOGSTART 123 log1LOGSTART 123 a"), - ExpectedTokenized: []string{ + Input: []byte("LOGSTART 123 log1LOGSTART 123 a"), + ExpectedTokens: []string{ `LOGSTART 123 log1`, }, }, nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "TwoLogsSimple", Pattern: `LOGSTART \d+ `, - Raw: []byte(`LOGSTART 123 log1 LOGSTART 234 log2 LOGSTART 345 foo`), - ExpectedTokenized: []string{ + Input: []byte(`LOGSTART 123 log1 LOGSTART 234 log2 LOGSTART 345 foo`), + ExpectedTokens: []string{ `LOGSTART 123 log1`, `LOGSTART 234 log2`, }, @@ -55,11 +56,11 @@ func TestLineStartSplitFunc(t *testing.T) { nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "TwoLogsLineStart", Pattern: `^LOGSTART \d+ `, - Raw: []byte("LOGSTART 123 LOGSTART 345 log1\nLOGSTART 234 log2\nLOGSTART 345 foo"), - ExpectedTokenized: []string{ + Input: []byte("LOGSTART 123 LOGSTART 345 log1\nLOGSTART 234 log2\nLOGSTART 345 foo"), + ExpectedTokens: []string{ "LOGSTART 123 LOGSTART 345 log1", "LOGSTART 234 log2", }, @@ -67,19 +68,19 @@ func TestLineStartSplitFunc(t *testing.T) { nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "NoMatches", Pattern: `LOGSTART \d+ `, - Raw: []byte(`file that has no matches in it`), + Input: []byte(`file that has no matches in it`), }, nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "PrecedingNonMatches", Pattern: `LOGSTART \d+ `, - Raw: []byte(`part that doesn't match LOGSTART 123 part that matchesLOGSTART 123 foo`), - ExpectedTokenized: []string{ + Input: []byte(`part that doesn't match LOGSTART 123 part that matchesLOGSTART 123 foo`), + ExpectedTokens: []string{ 
`part that doesn't match`, `LOGSTART 123 part that matches`, }, @@ -87,57 +88,57 @@ func TestLineStartSplitFunc(t *testing.T) { nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "HugeLog100", Pattern: `LOGSTART \d+ `, - Raw: func() []byte { - newRaw := []byte(`LOGSTART 123 `) - newRaw = append(newRaw, GeneratedByteSliceOfLength(100)...) - newRaw = append(newRaw, []byte(`LOGSTART 234 endlog`)...) - return newRaw + Input: func() []byte { + newInput := []byte(`LOGSTART 123 `) + newInput = append(newInput, tokenizetest.GenerateBytes(100)...) + newInput = append(newInput, []byte(`LOGSTART 234 endlog`)...) + return newInput }(), - ExpectedTokenized: []string{ - `LOGSTART 123 ` + string(GeneratedByteSliceOfLength(100)), + ExpectedTokens: []string{ + `LOGSTART 123 ` + string(tokenizetest.GenerateBytes(100)), }, }, nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "HugeLog10000", Pattern: `LOGSTART \d+ `, - Raw: func() []byte { - newRaw := []byte(`LOGSTART 123 `) - newRaw = append(newRaw, GeneratedByteSliceOfLength(10000)...) - newRaw = append(newRaw, []byte(`LOGSTART 234 endlog`)...) - return newRaw + Input: func() []byte { + newInput := []byte(`LOGSTART 123 `) + newInput = append(newInput, tokenizetest.GenerateBytes(10000)...) + newInput = append(newInput, []byte(`LOGSTART 234 endlog`)...) + return newInput }(), - ExpectedTokenized: []string{ - `LOGSTART 123 ` + string(GeneratedByteSliceOfLength(10000)), + ExpectedTokens: []string{ + `LOGSTART 123 ` + string(tokenizetest.GenerateBytes(10000)), }, }, nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "ErrTooLong", Pattern: `LOGSTART \d+ `, - Raw: func() []byte { - newRaw := []byte(`LOGSTART 123 `) - newRaw = append(newRaw, GeneratedByteSliceOfLength(1000000)...) - newRaw = append(newRaw, []byte(`LOGSTART 234 endlog`)...) - return newRaw + Input: func() []byte { + newInput := []byte(`LOGSTART 123 `) + newInput = append(newInput, tokenizetest.GenerateBytes(1000000)...) + newInput = append(newInput, []byte(`LOGSTART 234 endlog`)...) 
+ return newInput }(), ExpectedError: errors.New("bufio.Scanner: token too long"), }, nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "MultipleMultilineLogs", Pattern: `^LOGSTART \d+`, - Raw: []byte("LOGSTART 12 log1\t \nLOGPART log1\nLOGPART log1\t \nLOGSTART 17 log2\nLOGPART log2\nanother line\nLOGSTART 43 log5"), - ExpectedTokenized: []string{ + Input: []byte("LOGSTART 12 log1\t \nLOGPART log1\nLOGPART log1\t \nLOGSTART 17 log2\nLOGPART log2\nanother line\nLOGSTART 43 log5"), + ExpectedTokens: []string{ "LOGSTART 12 log1\t \nLOGPART log1\nLOGPART log1", "LOGSTART 17 log2\nLOGPART log2\nanother line", }, @@ -145,19 +146,19 @@ func TestLineStartSplitFunc(t *testing.T) { nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "LogsWithoutFlusher", Pattern: `^LOGSTART \d+`, - Raw: []byte("LOGPART log1\nLOGPART log1\t \n"), + Input: []byte("LOGPART log1\nLOGPART log1\t \n"), }, nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "LogsWithFlusher", Pattern: `^LOGSTART \d+`, - Raw: []byte("LOGPART log1\nLOGPART log1\t \n"), - ExpectedTokenized: []string{ + Input: []byte("LOGPART log1\nLOGPART log1\t \n"), + ExpectedTokens: []string{ "LOGPART log1\nLOGPART log1", }, @@ -169,11 +170,11 @@ func TestLineStartSplitFunc(t *testing.T) { }, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "LogsWithFlusherWithMultipleLogsInBuffer", Pattern: `^LOGSTART \d+`, - Raw: []byte("LOGPART log1\nLOGSTART 123\nLOGPART log1\t \n"), - ExpectedTokenized: []string{ + Input: []byte("LOGPART log1\nLOGSTART 123\nLOGPART log1\t \n"), + ExpectedTokens: []string{ "LOGPART log1", "LOGSTART 123\nLOGPART log1", }, @@ -185,11 +186,11 @@ func TestLineStartSplitFunc(t *testing.T) { }, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "LogsWithLongFlusherWithMultipleLogsInBuffer", Pattern: `^LOGSTART \d+`, - Raw: []byte("LOGPART log1\nLOGSTART 123\nLOGPART log1\t \n"), - ExpectedTokenized: []string{ + Input: []byte("LOGPART log1\nLOGSTART 123\nLOGPART log1\t \n"), + ExpectedTokens: []string{ "LOGPART log1", }, AdditionalIterations: 1, @@ -200,11 +201,11 @@ func TestLineStartSplitFunc(t *testing.T) { }, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "LogsWithFlusherWithLogStartingWithWhiteChars", Pattern: `^LOGSTART \d+`, - Raw: []byte("\nLOGSTART 333"), - ExpectedTokenized: []string{ + Input: []byte("\nLOGSTART 333"), + ExpectedTokens: []string{ "", "LOGSTART 333", }, @@ -224,7 +225,7 @@ func TestLineStartSplitFunc(t *testing.T) { splitFunc, err := cfg.getSplitFunc(unicode.UTF8, false, tc.Flusher, 0, tc.PreserveLeadingWhitespaces, tc.PreserveTrailingWhitespaces) require.NoError(t, err) - t.Run(tc.Name, tc.RunFunc(splitFunc)) + t.Run(tc.Name, tc.Run(splitFunc)) } t.Run("FirstMatchHitsEndOfBuffer", func(t *testing.T) { @@ -250,22 +251,22 @@ func TestLineStartSplitFunc(t *testing.T) { func TestLineEndSplitFunc(t *testing.T) { testCases := []MultiLineTokenizerTestCase{ { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "OneLogSimple", Pattern: `LOGEND \d+`, - Raw: []byte(`my log LOGEND 123`), - ExpectedTokenized: []string{ + Input: []byte(`my log LOGEND 123`), + ExpectedTokens: []string{ `my log LOGEND 123`, }, }, nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "TwoLogsSimple", Pattern: `LOGEND \d+`, - Raw: []byte(`log1 LOGEND 123log2 LOGEND 234`), - ExpectedTokenized: []string{ + Input: []byte(`log1 LOGEND 123log2 LOGEND 234`), + ExpectedTokens: []string{ `log1 LOGEND 123`, `log2 LOGEND 234`, }, @@ -273,11 +274,11 @@ func TestLineEndSplitFunc(t *testing.T) { 
nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "TwoLogsLineEndSimple", Pattern: `LOGEND$`, - Raw: []byte("log1 LOGEND LOGEND\nlog2 LOGEND\n"), - ExpectedTokenized: []string{ + Input: []byte("log1 LOGEND LOGEND\nlog2 LOGEND\n"), + ExpectedTokens: []string{ "log1 LOGEND LOGEND", "log2 LOGEND", }, @@ -285,73 +286,73 @@ func TestLineEndSplitFunc(t *testing.T) { nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "NoMatches", Pattern: `LOGEND \d+`, - Raw: []byte(`file that has no matches in it`), + Input: []byte(`file that has no matches in it`), }, nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "NonMatchesAfter", Pattern: `LOGEND \d+`, - Raw: []byte(`part that matches LOGEND 123 part that doesn't match`), - ExpectedTokenized: []string{ + Input: []byte(`part that matches LOGEND 123 part that doesn't match`), + ExpectedTokens: []string{ `part that matches LOGEND 123`, }, }, nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "HugeLog100", Pattern: `LOGEND \d`, - Raw: func() []byte { - newRaw := GeneratedByteSliceOfLength(100) - newRaw = append(newRaw, []byte(`LOGEND 1 `)...) - return newRaw + Input: func() []byte { + newInput := tokenizetest.GenerateBytes(100) + newInput = append(newInput, []byte(`LOGEND 1 `)...) + return newInput }(), - ExpectedTokenized: []string{ - string(GeneratedByteSliceOfLength(100)) + `LOGEND 1`, + ExpectedTokens: []string{ + string(tokenizetest.GenerateBytes(100)) + `LOGEND 1`, }, }, nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "HugeLog10000", Pattern: `LOGEND \d`, - Raw: func() []byte { - newRaw := GeneratedByteSliceOfLength(10000) - newRaw = append(newRaw, []byte(`LOGEND 1 `)...) - return newRaw + Input: func() []byte { + newInput := tokenizetest.GenerateBytes(10000) + newInput = append(newInput, []byte(`LOGEND 1 `)...) + return newInput }(), - ExpectedTokenized: []string{ - string(GeneratedByteSliceOfLength(10000)) + `LOGEND 1`, + ExpectedTokens: []string{ + string(tokenizetest.GenerateBytes(10000)) + `LOGEND 1`, }, }, nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "HugeLog1000000", Pattern: `LOGEND \d`, - Raw: func() []byte { - newRaw := GeneratedByteSliceOfLength(1000000) - newRaw = append(newRaw, []byte(`LOGEND 1 `)...) - return newRaw + Input: func() []byte { + newInput := tokenizetest.GenerateBytes(1000000) + newInput = append(newInput, []byte(`LOGEND 1 `)...) 
+ return newInput }(), ExpectedError: errors.New("bufio.Scanner: token too long"), }, nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "MultipleMultilineLogs", Pattern: `^LOGEND.*$`, - Raw: []byte("LOGSTART 12 log1\t \nLOGPART log1\nLOGEND log1\t \nLOGSTART 17 log2\nLOGPART log2\nLOGEND log2\nLOGSTART 43 log5"), - ExpectedTokenized: []string{ + Input: []byte("LOGSTART 12 log1\t \nLOGPART log1\nLOGEND log1\t \nLOGSTART 17 log2\nLOGPART log2\nLOGEND log2\nLOGSTART 43 log5"), + ExpectedTokens: []string{ "LOGSTART 12 log1\t \nLOGPART log1\nLOGEND log1", "LOGSTART 17 log2\nLOGPART log2\nLOGEND log2", }, @@ -359,19 +360,19 @@ func TestLineEndSplitFunc(t *testing.T) { nil, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "LogsWithoutFlusher", Pattern: `^LOGEND.*$`, - Raw: []byte("LOGPART log1\nLOGPART log1\t \n"), + Input: []byte("LOGPART log1\nLOGPART log1\t \n"), }, &Flusher{}, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "LogsWithFlusher", Pattern: `^LOGEND.*$`, - Raw: []byte("LOGPART log1\nLOGPART log1\t \n"), - ExpectedTokenized: []string{ + Input: []byte("LOGPART log1\nLOGPART log1\t \n"), + ExpectedTokens: []string{ "LOGPART log1\nLOGPART log1", }, @@ -383,11 +384,11 @@ func TestLineEndSplitFunc(t *testing.T) { }, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "LogsWithFlusherWithMultipleLogsInBuffer", Pattern: `^LOGEND.*$`, - Raw: []byte("LOGPART log1\nLOGEND\nLOGPART log1\t \n"), - ExpectedTokenized: []string{ + Input: []byte("LOGPART log1\nLOGEND\nLOGPART log1\t \n"), + ExpectedTokens: []string{ "LOGPART log1\nLOGEND", "LOGPART log1", }, @@ -400,11 +401,11 @@ func TestLineEndSplitFunc(t *testing.T) { }, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "LogsWithLongFlusherWithMultipleLogsInBuffer", Pattern: `^LOGEND.*$`, - Raw: []byte("LOGPART log1\nLOGEND\nLOGPART log1\t \n"), - ExpectedTokenized: []string{ + Input: []byte("LOGPART log1\nLOGEND\nLOGPART log1\t \n"), + ExpectedTokens: []string{ "LOGPART log1\nLOGEND", }, @@ -416,11 +417,11 @@ func TestLineEndSplitFunc(t *testing.T) { }, }, { - TokenizerTestCase{ + tokenizetest.TestCase{ Name: "LogsWithFlusherWithLogStartingWithWhiteChars", Pattern: `LOGEND \d+$`, - Raw: []byte("\nLOGEND 333"), - ExpectedTokenized: []string{ + Input: []byte("\nLOGEND 333"), + ExpectedTokens: []string{ "LOGEND 333", }, @@ -440,33 +441,33 @@ func TestLineEndSplitFunc(t *testing.T) { splitFunc, err := cfg.getSplitFunc(unicode.UTF8, false, tc.Flusher, 0, tc.PreserveLeadingWhitespaces, tc.PreserveTrailingWhitespaces) require.NoError(t, err) - t.Run(tc.Name, tc.RunFunc(splitFunc)) + t.Run(tc.Name, tc.Run(splitFunc)) } } func TestNewlineSplitFunc(t *testing.T) { testCases := []MultiLineTokenizerTestCase{ { - TokenizerTestCase{Name: "OneLogSimple", - Raw: []byte("my log\n"), - ExpectedTokenized: []string{ + tokenizetest.TestCase{Name: "OneLogSimple", + Input: []byte("my log\n"), + ExpectedTokens: []string{ `my log`, }, }, nil, }, { - TokenizerTestCase{Name: "OneLogCarriageReturn", - Raw: []byte("my log\r\n"), - ExpectedTokenized: []string{ + tokenizetest.TestCase{Name: "OneLogCarriageReturn", + Input: []byte("my log\r\n"), + ExpectedTokens: []string{ `my log`, }, }, nil, }, { - TokenizerTestCase{Name: "TwoLogsSimple", - Raw: []byte("log1\nlog2\n"), - ExpectedTokenized: []string{ + tokenizetest.TestCase{Name: "TwoLogsSimple", + Input: []byte("log1\nlog2\n"), + ExpectedTokens: []string{ `log1`, `log2`, }, @@ -474,9 +475,9 @@ func TestNewlineSplitFunc(t *testing.T) { nil, }, { - TokenizerTestCase{Name: 
"TwoLogsCarriageReturn", - Raw: []byte("log1\r\nlog2\r\n"), - ExpectedTokenized: []string{ + tokenizetest.TestCase{Name: "TwoLogsCarriageReturn", + Input: []byte("log1\r\nlog2\r\n"), + ExpectedTokens: []string{ `log1`, `log2`, }, @@ -484,59 +485,59 @@ func TestNewlineSplitFunc(t *testing.T) { nil, }, { - TokenizerTestCase{Name: "NoTailingNewline", - Raw: []byte(`foo`), + tokenizetest.TestCase{Name: "NoTailingNewline", + Input: []byte(`foo`), }, nil, }, { - TokenizerTestCase{Name: "HugeLog100", - Raw: func() []byte { - newRaw := GeneratedByteSliceOfLength(100) - newRaw = append(newRaw, '\n') - return newRaw + tokenizetest.TestCase{Name: "HugeLog100", + Input: func() []byte { + newInput := tokenizetest.GenerateBytes(100) + newInput = append(newInput, '\n') + return newInput }(), - ExpectedTokenized: []string{ - string(GeneratedByteSliceOfLength(100)), + ExpectedTokens: []string{ + string(tokenizetest.GenerateBytes(100)), }, }, nil, }, { - TokenizerTestCase{Name: "HugeLog10000", - Raw: func() []byte { - newRaw := GeneratedByteSliceOfLength(10000) - newRaw = append(newRaw, '\n') - return newRaw + tokenizetest.TestCase{Name: "HugeLog10000", + Input: func() []byte { + newInput := tokenizetest.GenerateBytes(10000) + newInput = append(newInput, '\n') + return newInput }(), - ExpectedTokenized: []string{ - string(GeneratedByteSliceOfLength(10000)), + ExpectedTokens: []string{ + string(tokenizetest.GenerateBytes(10000)), }, }, nil, }, { - TokenizerTestCase{Name: "HugeLog1000000", - Raw: func() []byte { - newRaw := GeneratedByteSliceOfLength(1000000) - newRaw = append(newRaw, '\n') - return newRaw + tokenizetest.TestCase{Name: "HugeLog1000000", + Input: func() []byte { + newInput := tokenizetest.GenerateBytes(1000000) + newInput = append(newInput, '\n') + return newInput }(), ExpectedError: errors.New("bufio.Scanner: token too long"), }, nil, }, { - TokenizerTestCase{Name: "LogsWithoutFlusher", - Raw: []byte("LOGPART log1"), + tokenizetest.TestCase{Name: "LogsWithoutFlusher", + Input: []byte("LOGPART log1"), }, &Flusher{}, }, { - TokenizerTestCase{Name: "LogsWithFlusher", - Raw: []byte("LOGPART log1"), - ExpectedTokenized: []string{ + tokenizetest.TestCase{Name: "LogsWithFlusher", + Input: []byte("LOGPART log1"), + ExpectedTokens: []string{ "LOGPART log1", }, AdditionalIterations: 1, @@ -547,9 +548,9 @@ func TestNewlineSplitFunc(t *testing.T) { }, }, { - TokenizerTestCase{Name: "DefaultFlusherSplits", - Raw: []byte("log1\nlog2\n"), - ExpectedTokenized: []string{ + tokenizetest.TestCase{Name: "DefaultFlusherSplits", + Input: []byte("log1\nlog2\n"), + ExpectedTokens: []string{ "log1", "log2", }, @@ -557,9 +558,9 @@ func TestNewlineSplitFunc(t *testing.T) { nil, }, { - TokenizerTestCase{Name: "LogsWithLogStartingWithWhiteChars", - Raw: []byte("\nLOGEND 333\nAnother one"), - ExpectedTokenized: []string{ + tokenizetest.TestCase{Name: "LogsWithLogStartingWithWhiteChars", + Input: []byte("\nLOGEND 333\nAnother one"), + ExpectedTokens: []string{ "", "LOGEND 333", }, @@ -567,9 +568,9 @@ func TestNewlineSplitFunc(t *testing.T) { nil, }, { - TokenizerTestCase{Name: "PreserveLeadingWhitespaces", - Raw: []byte("\n LOGEND 333 \nAnother one "), - ExpectedTokenized: []string{ + tokenizetest.TestCase{Name: "PreserveLeadingWhitespaces", + Input: []byte("\n LOGEND 333 \nAnother one "), + ExpectedTokens: []string{ "", " LOGEND 333", }, @@ -578,9 +579,9 @@ func TestNewlineSplitFunc(t *testing.T) { nil, }, { - TokenizerTestCase{Name: "PreserveTrailingWhitespaces", - Raw: []byte("\n LOGEND 333 \nAnother one "), - 
ExpectedTokenized: []string{ + tokenizetest.TestCase{Name: "PreserveTrailingWhitespaces", + Input: []byte("\n LOGEND 333 \nAnother one "), + ExpectedTokens: []string{ "", "LOGEND 333 ", }, @@ -589,9 +590,9 @@ func TestNewlineSplitFunc(t *testing.T) { nil, }, { - TokenizerTestCase{Name: "PreserveBothLeadingAndTrailingWhitespaces", - Raw: []byte("\n LOGEND 333 \nAnother one "), - ExpectedTokenized: []string{ + tokenizetest.TestCase{Name: "PreserveBothLeadingAndTrailingWhitespaces", + Input: []byte("\n LOGEND 333 \nAnother one "), + ExpectedTokens: []string{ "", " LOGEND 333 ", }, @@ -608,99 +609,68 @@ func TestNewlineSplitFunc(t *testing.T) { if tc.Flusher != nil { splitFunc = tc.Flusher.SplitFunc(splitFunc) } - t.Run(tc.Name, tc.RunFunc(splitFunc)) - } -} - -type noSplitTestCase struct { - Name string - Raw []byte - ExpectedTokenized [][]byte -} - -func (tc noSplitTestCase) RunFunc(splitFunc bufio.SplitFunc) func(t *testing.T) { - return func(t *testing.T) { - scanner := bufio.NewScanner(bytes.NewReader(tc.Raw)) - scanner.Split(splitFunc) - var tokenized [][]byte - for { - ok := scanner.Scan() - if !ok { - break - } - tokenized = append(tokenized, scanner.Bytes()) - } - - assert.Equal(t, tc.ExpectedTokenized, tokenized) + t.Run(tc.Name, tc.Run(splitFunc)) } } func TestNoSplitFunc(t *testing.T) { const largeLogSize = 100 - testCases := []noSplitTestCase{ + testCases := []tokenizetest.TestCase{ { - Name: "OneLogSimple", - Raw: []byte("my log\n"), - ExpectedTokenized: [][]byte{ - []byte("my log\n"), - }, + Name: "OneLogSimple", + Input: []byte("my log\n"), + ExpectedTokens: []string{"my log\n"}, }, { - Name: "TwoLogsSimple", - Raw: []byte("log1\nlog2\n"), - ExpectedTokenized: [][]byte{ - []byte("log1\nlog2\n"), - }, + Name: "TwoLogsSimple", + Input: []byte("log1\nlog2\n"), + ExpectedTokens: []string{"log1\nlog2\n"}, }, { - Name: "TwoLogsCarriageReturn", - Raw: []byte("log1\r\nlog2\r\n"), - ExpectedTokenized: [][]byte{ - []byte("log1\r\nlog2\r\n"), - }, + Name: "TwoLogsCarriageReturn", + Input: []byte("log1\r\nlog2\r\n"), + ExpectedTokens: []string{"log1\r\nlog2\r\n"}, }, { - Name: "NoTailingNewline", - Raw: []byte(`foo`), - ExpectedTokenized: [][]byte{[]byte("foo")}, + Name: "NoTailingNewline", + Input: []byte(`foo`), + ExpectedTokens: []string{"foo"}, }, { Name: "HugeLog100", - Raw: func() []byte { - return GeneratedByteSliceOfLength(largeLogSize) + Input: func() []byte { + return tokenizetest.GenerateBytes(largeLogSize) }(), - ExpectedTokenized: [][]byte{ - GeneratedByteSliceOfLength(100), - }, + ExpectedTokens: []string{string(tokenizetest.GenerateBytes(largeLogSize))}, }, { Name: "HugeLog300", - Raw: func() []byte { - return GeneratedByteSliceOfLength(largeLogSize * 3) + Input: func() []byte { + return tokenizetest.GenerateBytes(largeLogSize * 3) }(), - ExpectedTokenized: [][]byte{ - []byte("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuv"), - []byte("wxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqr"), - []byte("stuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmn"), + ExpectedTokens: []string{ + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuv", + "wxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqr", + "stuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmn", }, }, { Name: 
"EOFBeforeMaxLogSize", - Raw: func() []byte { - return GeneratedByteSliceOfLength(largeLogSize * 3.5) + Input: func() []byte { + return tokenizetest.GenerateBytes(largeLogSize * 3.5) }(), - ExpectedTokenized: [][]byte{ - []byte("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuv"), - []byte("wxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqr"), - []byte("stuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmn"), - []byte("opqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijkl"), + ExpectedTokens: []string{ + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuv", + "wxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqr", + "stuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmn", + "opqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijkl", }, }, } for _, tc := range testCases { splitFunc := SplitNone(largeLogSize) - t.Run(tc.Name, tc.RunFunc(splitFunc)) + t.Run(tc.Name, tc.Run(splitFunc)) } } diff --git a/pkg/stanza/tokenize/util_test.go b/pkg/stanza/tokenize/tokenizetest/tokenize.go similarity index 60% rename from pkg/stanza/tokenize/util_test.go rename to pkg/stanza/tokenize/tokenizetest/tokenize.go index 9a357c5c27b7..996ced63ef50 100644 --- a/pkg/stanza/tokenize/util_test.go +++ b/pkg/stanza/tokenize/tokenizetest/tokenize.go @@ -1,7 +1,7 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 -package tokenize +package tokenizetest // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/tokenize/tokenizetest" import ( "bufio" @@ -12,21 +12,21 @@ import ( "github.com/stretchr/testify/assert" ) -// state is going to keep processing state of the TestReader +// state is going to keep processing state of the testReader type state struct { ReadFrom int Processed int } -// TestReader is a TestReader which keeps state of readed and processed data -type TestReader struct { +// testReader is a testReader which keeps state of readed and processed data +type testReader struct { State *state Data []byte } -// NewTestReader creates TestReader with empty state -func NewTestReader(data []byte) TestReader { - return TestReader{ +// newTestReader creates testReader with empty state +func newTestReader(data []byte) testReader { + return testReader{ State: &state{ ReadFrom: 0, Processed: 0, @@ -35,8 +35,8 @@ func NewTestReader(data []byte) TestReader { } } -// Read reads data from TestReader and remebers where reading has been finished -func (r TestReader) Read(p []byte) (n int, err error) { +// Read reads data from testReader and remebers where reading has been finished +func (r testReader) Read(p []byte) (n int, err error) { // return eof if data has been fully readed if len(r.Data)-r.State.ReadFrom == 0 { return 0, io.EOF @@ -57,24 +57,24 @@ func (r TestReader) Read(p []byte) (n int, err error) { return i, nil } -// Reset resets TestReader state (sets last readed position to last processed position) -func (r *TestReader) Reset() { +// Reset resets testReader state (sets last readed position to last processed position) +func (r *testReader) Reset() { r.State.ReadFrom = r.State.Processed } -func (r *TestReader) SplitFunc(splitFunc bufio.SplitFunc) bufio.SplitFunc { +func (r *testReader) splitFunc(split bufio.SplitFunc) bufio.SplitFunc { return func(data []byte, atEOF bool) (advance 
int, token []byte, err error) { - advance, token, err = splitFunc(data, atEOF) + advance, token, err = split(data, atEOF) r.State.Processed += advance return } } -type TokenizerTestCase struct { +type TestCase struct { Name string Pattern string - Raw []byte - ExpectedTokenized []string + Input []byte + ExpectedTokens []string ExpectedError error Sleep time.Duration AdditionalIterations int @@ -82,11 +82,11 @@ type TokenizerTestCase struct { PreserveTrailingWhitespaces bool } -func (tc TokenizerTestCase) RunFunc(splitFunc bufio.SplitFunc) func(t *testing.T) { - reader := NewTestReader(tc.Raw) +func (tc TestCase) Run(split bufio.SplitFunc) func(t *testing.T) { + reader := newTestReader(tc.Input) return func(t *testing.T) { - var tokenized []string + var tokens []string for i := 0; i < 1+tc.AdditionalIterations; i++ { // sleep before next iterations if i > 0 { @@ -94,22 +94,22 @@ func (tc TokenizerTestCase) RunFunc(splitFunc bufio.SplitFunc) func(t *testing.T } reader.Reset() scanner := bufio.NewScanner(reader) - scanner.Split(reader.SplitFunc(splitFunc)) + scanner.Split(reader.splitFunc(split)) for { ok := scanner.Scan() if !ok { assert.Equal(t, tc.ExpectedError, scanner.Err()) break } - tokenized = append(tokenized, scanner.Text()) + tokens = append(tokens, scanner.Text()) } } - assert.Equal(t, tc.ExpectedTokenized, tokenized) + assert.Equal(t, tc.ExpectedTokens, tokens) } } -func GeneratedByteSliceOfLength(length int) []byte { +func GenerateBytes(length int) []byte { chars := []byte(`abcdefghijklmnopqrstuvwxyz`) newSlice := make([]byte, length) for i := 0; i < length; i++ {
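
For context on how the relocated helpers are meant to be consumed after these patches, below is a minimal sketch of a split-function test written against the new tokenizetest package. It is illustrative only and not part of either patch: the package name example_test, the test name, and the use of the standard library's bufio.ScanLines as the split function under test are assumptions made so the example is self-contained; real callers such as syslog_test.go and multiline_test.go pass their own bufio.SplitFunc into TestCase.Run.

// Hypothetical usage sketch (not part of the patches above): exercising the
// relocated tokenizetest helpers from an arbitrary test package.
package example_test

import (
	"bufio"
	"testing"

	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/tokenize/tokenizetest"
)

func TestScanLinesTokens(t *testing.T) {
	tc := tokenizetest.TestCase{
		Name:  "TwoLines",
		Input: append(tokenizetest.GenerateBytes(10), []byte("\nsecond line\n")...),
		ExpectedTokens: []string{
			string(tokenizetest.GenerateBytes(10)), // "abcdefghij"
			"second line",
		},
	}
	// bufio.ScanLines stands in for the split function under test here;
	// real tests pass the tokenizer's own bufio.SplitFunc instead.
	t.Run(tc.Name, tc.Run(bufio.ScanLines))
}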