[pkg/stanza] Extract trim func from split package (#26536)

Follows #26241. Previously, split funcs were responsible for applying trim funcs. This PR increases composability by applying trim funcs as a wrapper around split funcs. One nuance that was surfaced here is that the newline split func was not handling the case where a line starts with a newline. When this happens, we need to tell the scanner to advance, but we still want to return a `""` token, rather than nil. This is covered by existing tests, but previously it was "fixed" by the trim func, which would return an empty slice when the token was nil. Now, the newline split func will explicitly handle this case, while the trim func will return the original value if it is nil or empty.
open-telemetry · Sep 13, 2023 · 82d0db2 · 82d0db2
1 parent a3eacd6
commit 82d0db2
Show file tree

Hide file tree

Showing 16 changed files with 328 additions and 201 deletions.
diff --git a/.chloggen/pkg-stanza-extract-trim-split.yaml b/.chloggen/pkg-stanza-extract-trim-split.yaml
@@ -0,0 +1,30 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: breaking
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: pkg/stanza
+
+# A brief description of the change.  Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Make trim func composable
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [26536]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext: |
+  - Adds trim.WithFunc to allow trim funcs to wrap bufio.SplitFuncs.
+  - Removes trim.Func from split.Config.Func. Use trim.WithFunc instead.
+  - Removes trim.Func from flush.WithPeriod. Use trim.WithFunc instead.
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: [api]
diff --git a/pkg/stanza/fileconsumer/config.go b/pkg/stanza/fileconsumer/config.go
@@ -119,7 +119,7 @@ func (c Config) BuildWithSplitFunc(logger *zap.SugaredLogger, emit emit.Callback
 	}
 
 	// Ensure that splitter is buildable
-	factory := splitter.NewCustomFactory(splitFunc, c.FlushPeriod)
+	factory := splitter.NewCustomFactory(splitFunc, c.TrimConfig.Func(), c.FlushPeriod)
 	if _, err := factory.SplitFunc(); err != nil {
 		return nil, err
 	}

diff --git a/pkg/stanza/fileconsumer/internal/header/config.go b/pkg/stanza/fileconsumer/internal/header/config.go
@@ -16,6 +16,7 @@ import (
 	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator"
 	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/pipeline"
 	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/split"
+	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/trim"
 )
 
 type Config struct {
@@ -69,13 +70,16 @@ func NewConfig(matchRegex string, metadataOperators []operator.Config, enc encod
 		return nil, fmt.Errorf("failed to compile `pattern`: %w", err)
 	}
 
-	splitFunc, err := split.NewlineSplitFunc(enc, false, func(b []byte) []byte {
-		return bytes.Trim(b, "\r\n")
-	})
+	splitFunc, err := split.NewlineSplitFunc(enc, false)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create split func: %w", err)
 	}
 
+	var trimFunc trim.Func = func(b []byte) []byte {
+		return bytes.Trim(b, "\r\n")
+	}
+	splitFunc = trim.WithFunc(splitFunc, trimFunc)
+
 	return &Config{
 		regex:             regex,
 		SplitFunc:         splitFunc,

diff --git a/pkg/stanza/fileconsumer/internal/splitter/custom.go b/pkg/stanza/fileconsumer/internal/splitter/custom.go
@@ -13,19 +13,21 @@ import (
 
 type customFactory struct {
 	splitFunc   bufio.SplitFunc
+	trimFunc    trim.Func
 	flushPeriod time.Duration
 }
 
 var _ Factory = (*customFactory)(nil)
 
-func NewCustomFactory(splitFunc bufio.SplitFunc, flushPeriod time.Duration) Factory {
+func NewCustomFactory(splitFunc bufio.SplitFunc, trimFunc trim.Func, flushPeriod time.Duration) Factory {
 	return &customFactory{
 		splitFunc:   splitFunc,
+		trimFunc:    trimFunc,
 		flushPeriod: flushPeriod,
 	}
 }
 
 // SplitFunc builds a bufio.SplitFunc based on the configuration
 func (f *customFactory) SplitFunc() (bufio.SplitFunc, error) {
-	return flush.WithPeriod(f.splitFunc, trim.Nop, f.flushPeriod), nil
+	return trim.WithFunc(flush.WithPeriod(f.splitFunc, f.flushPeriod), f.trimFunc), nil
 }
diff --git a/pkg/stanza/fileconsumer/internal/splitter/custom_test.go b/pkg/stanza/fileconsumer/internal/splitter/custom_test.go
@@ -9,10 +9,12 @@ import (
 	"time"
 
 	"github.com/stretchr/testify/assert"
+
+	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/trim"
 )
 
 func TestCustom(t *testing.T) {
-	factory := NewCustomFactory(bufio.ScanLines, 0)
+	factory := NewCustomFactory(bufio.ScanLines, trim.Nop, 0)
 	splitFunc, err := factory.SplitFunc()
 	assert.NoError(t, err)
 	assert.NotNil(t, splitFunc)
@@ -35,9 +37,33 @@ func TestCustom(t *testing.T) {
 	assert.Nil(t, token)
 }
 
+func TestCustomWithTrim(t *testing.T) {
+	factory := NewCustomFactory(bufio.ScanLines, trim.Whitespace, 0)
+	splitFunc, err := factory.SplitFunc()
+	assert.NoError(t, err)
+	assert.NotNil(t, splitFunc)
+
+	input := []byte(" hello \n world \n extra ")
+
+	advance, token, err := splitFunc(input, false)
+	assert.NoError(t, err)
+	assert.Equal(t, 8, advance)
+	assert.Equal(t, []byte("hello"), token)
+
+	advance, token, err = splitFunc(input[8:], false)
+	assert.NoError(t, err)
+	assert.Equal(t, 8, advance)
+	assert.Equal(t, []byte("world"), token)
+
+	advance, token, err = splitFunc(input[16:], false)
+	assert.NoError(t, err)
+	assert.Equal(t, 0, advance)
+	assert.Nil(t, token)
+}
+
 func TestCustomWithFlush(t *testing.T) {
 	flushPeriod := 100 * time.Millisecond
-	factory := NewCustomFactory(bufio.ScanLines, flushPeriod)
+	factory := NewCustomFactory(bufio.ScanLines, trim.Nop, flushPeriod)
 	splitFunc, err := factory.SplitFunc()
 	assert.NoError(t, err)
 	assert.NotNil(t, splitFunc)
@@ -66,3 +92,35 @@ func TestCustomWithFlush(t *testing.T) {
 	assert.Equal(t, 7, advance)
 	assert.Equal(t, []byte(" extra "), token)
 }
+
+func TestCustomWithFlushTrim(t *testing.T) {
+	flushPeriod := 100 * time.Millisecond
+	factory := NewCustomFactory(bufio.ScanLines, trim.Whitespace, flushPeriod)
+	splitFunc, err := factory.SplitFunc()
+	assert.NoError(t, err)
+	assert.NotNil(t, splitFunc)
+
+	input := []byte(" hello \n world \n extra ")
+
+	advance, token, err := splitFunc(input, false)
+	assert.NoError(t, err)
+	assert.Equal(t, 8, advance)
+	assert.Equal(t, []byte("hello"), token)
+
+	advance, token, err = splitFunc(input[8:], false)
+	assert.NoError(t, err)
+	assert.Equal(t, 8, advance)
+	assert.Equal(t, []byte("world"), token)
+
+	advance, token, err = splitFunc(input[16:], false)
+	assert.NoError(t, err)
+	assert.Equal(t, 0, advance)
+	assert.Nil(t, token)
+
+	time.Sleep(2 * flushPeriod)
+
+	advance, token, err = splitFunc(input[16:], false)
+	assert.NoError(t, err)
+	assert.Equal(t, 7, advance)
+	assert.Equal(t, []byte("extra"), token) // Ensure trim applies to flushed token
+}
diff --git a/pkg/stanza/fileconsumer/internal/splitter/multiline.go b/pkg/stanza/fileconsumer/internal/splitter/multiline.go
@@ -42,9 +42,14 @@ func NewSplitFuncFactory(
 
 // SplitFunc builds a bufio.SplitFunc based on the configuration
 func (f *splitFuncFactory) SplitFunc() (bufio.SplitFunc, error) {
-	splitFunc, err := f.splitConfig.Func(f.encoding, false, f.maxLogSize, f.trimFunc)
+	splitFunc, err := f.splitConfig.Func(f.encoding, false, f.maxLogSize)
 	if err != nil {
 		return nil, err
 	}
-	return flush.WithPeriod(splitFunc, f.trimFunc, f.flushPeriod), nil
+	splitFunc = flush.WithPeriod(splitFunc, f.flushPeriod)
+	if f.encoding == encoding.Nop {
+		// Special case where we should never trim
+		return splitFunc, nil
+	}
+	return trim.WithFunc(splitFunc, f.trimFunc), nil
 }
diff --git a/pkg/stanza/fileconsumer/internal/splitter/multiline_test.go b/pkg/stanza/fileconsumer/internal/splitter/multiline_test.go
@@ -49,6 +49,30 @@ func TestSplitFunc(t *testing.T) {
 	assert.Nil(t, token)
 }
 
+func TestSplitFuncWithTrim(t *testing.T) {
+	factory := NewSplitFuncFactory(split.Config{}, unicode.UTF8, 1024, trim.Whitespace, 0)
+	splitFunc, err := factory.SplitFunc()
+	assert.NoError(t, err)
+	assert.NotNil(t, splitFunc)
+
+	input := []byte(" hello \n world \n extra ")
+
+	advance, token, err := splitFunc(input, false)
+	assert.NoError(t, err)
+	assert.Equal(t, 8, advance)
+	assert.Equal(t, []byte("hello"), token)
+
+	advance, token, err = splitFunc(input[8:], false)
+	assert.NoError(t, err)
+	assert.Equal(t, 8, advance)
+	assert.Equal(t, []byte("world"), token)
+
+	advance, token, err = splitFunc(input[16:], false)
+	assert.NoError(t, err)
+	assert.Equal(t, 0, advance)
+	assert.Nil(t, token)
+}
+
 func TestSplitFuncWithFlush(t *testing.T) {
 	flushPeriod := 100 * time.Millisecond
 	factory := NewSplitFuncFactory(split.Config{}, unicode.UTF8, 1024, trim.Nop, flushPeriod)
@@ -81,7 +105,7 @@ func TestSplitFuncWithFlush(t *testing.T) {
 	assert.Equal(t, []byte(" extra "), token)
 }
 
-func TestSplitFuncWithTrim(t *testing.T) {
+func TestSplitFuncWithFlushTrim(t *testing.T) {
 	flushPeriod := 100 * time.Millisecond
 	factory := NewSplitFuncFactory(split.Config{}, unicode.UTF8, 1024, trim.Whitespace, flushPeriod)
 	splitFunc, err := factory.SplitFunc()

diff --git a/pkg/stanza/flush/flush.go b/pkg/stanza/flush/flush.go
@@ -6,12 +6,10 @@ package flush // import "github.com/open-telemetry/opentelemetry-collector-contr
 import (
 	"bufio"
 	"time"
-
-	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/trim"
 )
 
 // Wrap a bufio.SplitFunc with a flusher
-func WithPeriod(splitFunc bufio.SplitFunc, trimFunc trim.Func, period time.Duration) bufio.SplitFunc {
+func WithPeriod(splitFunc bufio.SplitFunc, period time.Duration) bufio.SplitFunc {
 	if period <= 0 {
 		return splitFunc
 	}
@@ -20,7 +18,7 @@ func WithPeriod(splitFunc bufio.SplitFunc, trimFunc trim.Func, period time.Durat
 		forcePeriod:        period,
 		previousDataLength: 0,
 	}
-	return f.splitFunc(splitFunc, trimFunc)
+	return f.splitFunc(splitFunc)
 }
 
 // flusher keeps information about flush state
@@ -61,7 +59,7 @@ func (f *flusher) shouldFlush() bool {
 	return f.forcePeriod > 0 && time.Since(f.lastDataChange) > f.forcePeriod && f.previousDataLength > 0
 }
 
-func (f *flusher) splitFunc(splitFunc bufio.SplitFunc, trimFunc trim.Func) bufio.SplitFunc {
+func (f *flusher) splitFunc(splitFunc bufio.SplitFunc) bufio.SplitFunc {
 	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
 		advance, token, err = splitFunc(data, atEOF)
 
@@ -81,7 +79,7 @@ func (f *flusher) splitFunc(splitFunc bufio.SplitFunc, trimFunc trim.Func) bufio
 		if f.shouldFlush() {
 			// Inform flusher that we just flushed
 			f.flushed()
-			token = trimFunc(data)
+			token = data
 			advance = len(data)
 			return
 		}

diff --git a/pkg/stanza/flush/flush_test.go b/pkg/stanza/flush/flush_test.go
@@ -9,8 +9,6 @@ import (
 	"time"
 
 	"github.com/stretchr/testify/assert"
-
-	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/trim"
 )
 
 func TestFlusher(t *testing.T) {
@@ -22,7 +20,7 @@ func TestFlusher(t *testing.T) {
 	// always use atEOF=false.
 
 	flushPeriod := 100 * time.Millisecond
-	f := WithPeriod(bufio.ScanWords, trim.Nop, flushPeriod)
+	f := WithPeriod(bufio.ScanWords, flushPeriod)
 
 	content := []byte("foo bar hellowo")
 
@@ -64,7 +62,7 @@ func TestNoFlushPeriod(t *testing.T) {
 	// In other words, we should expect exactly the behavior of bufio.ScanWords.
 
 	flushPeriod := time.Duration(0)
-	f := WithPeriod(bufio.ScanWords, trim.Nop, flushPeriod)
+	f := WithPeriod(bufio.ScanWords, flushPeriod)
 
 	content := []byte("foo bar hellowo")
 

diff --git a/pkg/stanza/operator/input/tcp/tcp.go b/pkg/stanza/operator/input/tcp/tcp.go
@@ -81,13 +81,8 @@ type BaseConfig struct {
 
 type SplitFuncBuilder func(enc encoding.Encoding) (bufio.SplitFunc, error)
 
-func (c Config) defaultMultilineBuilder(enc encoding.Encoding) (bufio.SplitFunc, error) {
-	trimFunc := c.TrimConfig.Func()
-	splitFunc, err := c.SplitConfig.Func(enc, true, int(c.MaxLogSize), trimFunc)
-	if err != nil {
-		return nil, err
-	}
-	return splitFunc, nil
+func (c Config) defaultSplitFuncBuilder(enc encoding.Encoding) (bufio.SplitFunc, error) {
+	return c.SplitConfig.Func(enc, true, int(c.MaxLogSize))
 }
 
 // Build will build a tcp input operator.
@@ -121,14 +116,15 @@ func (c Config) Build(logger *zap.SugaredLogger) (operator.Operator, error) {
 	}
 
 	if c.SplitFuncBuilder == nil {
-		c.SplitFuncBuilder = c.defaultMultilineBuilder
+		c.SplitFuncBuilder = c.defaultSplitFuncBuilder
 	}
 
 	// Build split func
 	splitFunc, err := c.SplitFuncBuilder(enc)
 	if err != nil {
 		return nil, err
 	}
+	splitFunc = trim.WithFunc(splitFunc, c.TrimConfig.Func())
 
 	var resolver *helper.IPResolver
 	if c.AddAttributes {

diff --git a/pkg/stanza/operator/input/udp/udp.go b/pkg/stanza/operator/input/udp/udp.go
@@ -89,11 +89,12 @@ func (c Config) Build(logger *zap.SugaredLogger) (operator.Operator, error) {
 		return nil, err
 	}
 
-	// Build SplitFunc
-	splitFunc, err := c.SplitConfig.Func(enc, true, MaxUDPSize, c.TrimConfig.Func())
+	// Build split func
+	splitFunc, err := c.SplitConfig.Func(enc, true, MaxUDPSize)
 	if err != nil {
 		return nil, err
 	}
+	splitFunc = trim.WithFunc(splitFunc, c.TrimConfig.Func())
 
 	var resolver *helper.IPResolver
 	if c.AddAttributes {